# Subsetting and Sorting


### 1. Subsetting

##### Quick Review

In [4]:
# Build a small reproducible example data frame.
set.seed(13435)

X <- data.frame(
  "var1" = sample(1:5),
  "var2" = sample(6:10),
  "var3" = sample(11:15)
)
# Shuffle the row order, then blank out two var2 entries to create NAs.
X <- X[sample(1:5), ]
X$var2[c(1, 3)] <- NA
X

Unnamed: 0,var1,var2,var3
1,2,,15
4,1,10.0,11
2,3,,12
3,5,6.0,14
5,4,9.0,13


In [6]:
# First column, selected by position (returned as a plain vector).
X[, 1]

In [7]:
# Same column, selected by name instead of position.
X[, "var1"]

In [11]:
# Extract the var1 column as a vector with the $ operator.
X$var1

In [8]:
# Subset rows and a column in one call: the first two rows of var2.
X[1:2, "var2"]

##### Using Logical Statements


In [9]:
# Keep rows where BOTH conditions hold (elementwise AND).
X[X$var1 <= 3 & X$var3 > 11, ]

var1,var2,var3
2,,15
3,,12


In [10]:
# Keep rows where EITHER condition holds (elementwise OR).
X[X$var1 <= 3 | X$var3 > 15, ]

Unnamed: 0,var1,var2,var3
1,2,,15
4,1,10.0,11
2,3,,12


##### Dealing with NAs

In [14]:
# which() drops NAs: it returns only the positions where the condition is
# TRUE, so rows whose var2 is NA are left out of the result.
X[which(X$var2 > 8), ]

Unnamed: 0,var1,var2,var3
4,1,10,11
5,4,9,13


### 2. Sorting

In [15]:
# Sort the values of var1 in increasing order (the default).
sort(X$var1)

In [16]:
# Sort the values of var1 from largest to smallest.
sort(X$var1, decreasing = TRUE)

In [19]:
# Sort var2 and keep its NA values, placing them at the end of the result
# (by default sort() removes NAs). TRUE is spelled out because the shorthand
# T is an ordinary variable that can be reassigned.
sort(X$var2, na.last = TRUE)

##### Ordering

In [22]:
# order() returns the positions that would sort var1, not the sorted values.
order(X$var1)

In [21]:
# Reorder the whole data frame so that its rows are sorted by var1.
X[order(X$var1), ]

Unnamed: 0,var1,var2,var3
4,1,10.0,11
1,2,,15
2,3,,12
5,4,9.0,13
3,5,6.0,14


In [24]:
# Order by several keys: sort by var1 first, then break ties using var3.
X[order(X$var1, X$var3), ]

Unnamed: 0,var1,var2,var3
4,1,10.0,11
1,2,,15
2,3,,12
5,4,9.0,13
3,5,6.0,14


##### Ordering with plyr

In [28]:
library(plyr)

In [29]:
# Sort the rows of X by var1 in ascending order with arrange().
arrange(X, var1)

var1,var2,var3
1,10.0,11
2,,15
3,,12
4,9.0,13
5,6.0,14


In [30]:
# Sort the rows of X by var1 in descending order by wrapping the key in desc().
arrange(X, desc(var1))

var1,var2,var3
5,6.0,14
4,9.0,13
3,,12
2,,15
1,10.0,11


### 3. Adding rows and columns

In [34]:
# Add a new column of random normal draws, one per row. Sizing the draw with
# nrow(X) keeps this valid if X ever has a different number of rows.
X$var4 <- rnorm(nrow(X))
X

Unnamed: 0,var1,var2,var3,var4
1,2,,15,1.1495053
4,1,10.0,11,-0.8705105
2,3,,12,-0.9870139
3,5,6.0,14,0.326253
5,4,9.0,13,-1.1025739


In [35]:
# Append a column on the right with cbind(). Because the new column is passed
# as an unnamed expression, its name in Y is the expression text "rnorm(5)".
Y <- cbind(X, rnorm(5))

In [36]:
# Display the combined data frame.
Y

Unnamed: 0,var1,var2,var3,var4,rnorm(5)
1,2,,15,1.1495053,-1.0105164
4,1,10.0,11,-0.8705105,0.6095613
2,3,,12,-0.9870139,0.5041528
3,5,6.0,14,0.326253,1.3798872
5,4,9.0,13,-1.1025739,0.4906615


# Summarizing Data

https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD

In [41]:
# Download the Baltimore restaurants CSV (once) and load it into restData.
dataDir <- "./data"
if (!dir.exists(dataDir)) {
  dir.create(dataDir)
}
fileURL <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
restFile <- file.path(dataDir, "restaurants.csv")
# Skip the network round-trip when a local copy already exists.
# method = "curl" shells out to the curl command-line tool, which must be
# installed and on the PATH.
if (!file.exists(restFile)) {
  download.file(fileURL, destfile = restFile, method = "curl")
}
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 (where the
# default became FALSE), so summary() and str() report factor levels and counts.
restData <- read.csv(restFile, stringsAsFactors = TRUE)

In [40]:
# Peek at the first three rows.
head(restData, n = 3)

name,zipCode,neighborhood,councilDistrict,policeDistrict,Location.1
410,21206,Frankford,2,NORTHEASTERN,"4509 BELAIR ROAD Baltimore, MD"
1919,21231,Fells Point,1,SOUTHEASTERN,"1919 FLEET ST Baltimore, MD"
SAUTE,21224,Canton,1,SOUTHEASTERN,"2844 HUDSON ST Baltimore, MD"


In [39]:
# Peek at the last three rows.
tail(restData, n = 3)

Unnamed: 0,name,zipCode,neighborhood,councilDistrict,policeDistrict,Location.1
1325,ZINK'S CAF,21213,Belair-Edison,13,NORTHEASTERN,"3300 LAWNVIEW AVE Baltimore, MD"
1326,ZISSIMOS BAR,21211,Hampden,7,NORTHERN,"1023 36TH ST Baltimore, MD"
1327,ZORBAS,21224,Greektown,2,SOUTHEASTERN,"4710 EASTERN Ave Baltimore, MD"


In [43]:
summary(restData)

# Factor columns are summarised as counts per level.
# Numeric columns are summarised with the minimum, quartiles, mean and maximum.
# The output reveals a negative zip code (Min. of zipCode is -21226) -- an obvious data error.

                           name         zipCode             neighborhood
 MCDONALD'S                  :   8   Min.   :-21226   Downtown    :128  
 POPEYES FAMOUS FRIED CHICKEN:   7   1st Qu.: 21202   Fells Point : 91  
 SUBWAY                      :   6   Median : 21218   Inner Harbor: 89  
 KENTUCKY FRIED CHICKEN      :   5   Mean   : 21185   Canton      : 81  
 BURGER KING                 :   4   3rd Qu.: 21226   Federal Hill: 42  
 DUNKIN DONUTS               :   4   Max.   : 21287   Mount Vernon: 33  
 (Other)                     :1293                    (Other)     :863  
 councilDistrict       policeDistrict                          Location.1    
 Min.   : 1.000   SOUTHEASTERN:385    1101 RUSSELL ST\nBaltimore, MD\n:   9  
 1st Qu.: 2.000   CENTRAL     :288    201 PRATT ST\nBaltimore, MD\n   :   8  
 Median : 9.000   SOUTHERN    :213    2400 BOSTON ST\nBaltimore, MD\n :   8  
 Mean   : 7.191   NORTHERN    :157    300 LIGHT ST\nBaltimore, MD\n   :   5  
 3rd Qu.:11.000   NORTHEAS

In [46]:
# Compact structure overview: dimensions, column types and the first few values.
str(restData)

'data.frame':	1327 obs. of  6 variables:
 $ name           : Factor w/ 1277 levels "#1 CHINESE KITCHEN",..: 9 3 992 1 2 4 5 6 7 8 ...
 $ zipCode        : int  21206 21231 21224 21211 21223 21218 21205 21211 21205 21231 ...
 $ neighborhood   : Factor w/ 173 levels "Abell","Arlington",..: 53 52 18 66 104 33 98 133 98 157 ...
 $ councilDistrict: int  2 1 1 14 9 14 13 7 13 1 ...
 $ policeDistrict : Factor w/ 9 levels "CENTRAL","EASTERN",..: 3 6 6 4 8 3 6 4 6 6 ...
 $ Location.1     : Factor w/ 1210 levels "1 BIDDLE ST\nBaltimore, MD\n",..: 835 334 554 755 492 537 505 530 507 569 ...


In [48]:
# Default quantiles (0%, 25%, 50%, 75%, 100%) of councilDistrict, ignoring NAs.
# TRUE is spelled out because the shorthand T can be reassigned.
quantile(restData$councilDistrict, na.rm = TRUE)

In [49]:
# Quantiles at chosen probabilities (median, 75th and 90th percentiles),
# ignoring NAs. TRUE is spelled out because the shorthand T can be reassigned.
quantile(restData$councilDistrict, probs = c(0.5, 0.75, 0.9), na.rm = TRUE)

In [51]:
# useNA = "ifany": add an NA entry to the table only when missing values exist
# (by default table() leaves NAs out of the counts).
table(restData$zipCode, useNA="ifany")


-21226  21201  21202  21205  21206  21207  21208  21209  21210  21211  21212 
     1    136    201     27     30      4      1      8     23     41     28 
 21213  21214  21215  21216  21217  21218  21220  21222  21223  21224  21225 
    31     17     54     10     32     69      1      7     56    199     19 
 21226  21227  21229  21230  21231  21234  21237  21239  21251  21287 
    18      4     13    156    127      7      1      3      2      1 

In [52]:
# Two-way table: councilDistrict (rows) against zipCode (columns).
table(restData$councilDistrict, restData$zipCode)

    
     -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213
  1       0     0    37     0     0     0     0     0     0     0     0     2
  2       0     0     0     3    27     0     0     0     0     0     0     0
  3       0     0     0     0     0     0     0     0     0     0     0     2
  4       0     0     0     0     0     0     0     0     0     0    27     0
  5       0     0     0     0     0     3     0     6     0     0     0     0
  6       0     0     0     0     0     0     0     1    19     0     0     0
  7       0     0     0     0     0     0     0     1     0    27     0     0
  8       0     0     0     0     0     1     0     0     0     0     0     0
  9       0     1     0     0     0     0     0     0     0     0     0     0
  10      1     0     1     0     0     0     0     0     0     0     0     0
  11      0   115   139     0     0     0     1     0     0     0     1     0
  12      0    20    24     4     0     0     0     0     0

##### Check for missing values

In [53]:
# Count the missing values in councilDistrict (each TRUE counts as 1 when summed).
sum(is.na(restData$councilDistrict))

In [54]:
# any() checks whether at least one value is TRUE, i.e. whether any NA exists.
any(is.na(restData$councilDistrict))

In [55]:
# all() checks whether every value satisfies the condition.
all(restData$zipCode > 0)

In [56]:
# Number of missing values in each column.
colSums(is.na(restData))

In [57]:
# TRUE only when no column contains any missing value.
all(colSums(is.na(restData)) == 0)

##### Values with specific characteristics

In [58]:
# Count how many restaurants have zip code 21212. %in% never yields NA; the
# character value is matched against the integer zipCode column via coercion.
table(restData$zipCode %in% c("21212"))


FALSE  TRUE 
 1299    28 

In [60]:
# Count how many restaurants fall in either of the two zip codes.
table(restData$zipCode %in% c("21212", "21213"))


FALSE  TRUE 
 1268    59 

In [62]:
# Use the same membership test as a row filter to pull the matching records.
restData[restData$zipCode %in% c("21212", "21213"), ]

Unnamed: 0,name,zipCode,neighborhood,councilDistrict,policeDistrict,Location.1
29,BAY ATLANTIC CLUB,21212,Downtown,11,CENTRAL,"206 REDWOOD ST Baltimore, MD"
39,BERMUDA BAR,21213,Broadway East,12,EASTERN,"1801 NORTH AVE Baltimore, MD"
92,ATWATER'S,21212,Chinquapin Park-Belvedere,4,NORTHERN,"529 BELVEDERE AVE Baltimore, MD"
111,BALTIMORE ESTONIAN SOCIETY,21213,South Clifton Park,12,EASTERN,"1932 BELAIR RD Baltimore, MD"
187,CAFE ZEN,21212,Rosebank,4,NORTHERN,"438 BELVEDERE AVE Baltimore, MD"
220,CERIELLO FINE FOODS,21212,Chinquapin Park-Belvedere,4,NORTHERN,"529 BELVEDERE AVE Baltimore, MD"
266,CLIFTON PARK GOLF COURSE SNACK BAR,21213,Darley Park,14,NORTHEASTERN,"2701 ST LO DR Baltimore, MD"
276,CLUB HOUSE BAR & GRILL,21213,Orangeville Industrial Area,13,EASTERN,"4217 ERDMAN AVE Baltimore, MD"
289,CLUBHOUSE BAR & GRILL,21213,Orangeville Industrial Area,13,EASTERN,"4217 ERDMAN AVE Baltimore, MD"
291,COCKY LOU'S,21213,Broadway East,12,EASTERN,"2101 NORTH AVE Baltimore, MD"


##### Cross tabs

In [64]:
# Load the UC Berkeley admissions table and flatten it into a data frame with
# one row per Admit x Gender x Dept combination.
data(UCBAdmissions)
DF <- as.data.frame(UCBAdmissions)
summary(DF)
# 4 variables: Admit, Gender, Dept, Freq

      Admit       Gender   Dept       Freq      
 Admitted:12   Male  :12   A:4   Min.   :  8.0  
 Rejected:12   Female:12   B:4   1st Qu.: 80.0  
                           C:4   Median :170.0  
                           D:4   Mean   :188.6  
                           E:4   3rd Qu.:302.5  
                           F:4   Max.   :512.0  

In [67]:
# Cross-tabulate the Freq counts by Gender (rows) and Admit (columns).
# The data set must be named explicitly through the data argument.
xt <- xtabs(Freq ~ Gender + Admit, data = DF)
xt

        Admit
Gender   Admitted Rejected
  Male       1198     1493
  Female      557     1278

##### Flat tables

In [69]:
# Tag each observation with a replicate number (1-9, recycled over all rows)
# so the cross tabulation gains a third dimension.
# length.out is spelled out in full (the original relied on partial matching
# of `len`), and the length follows the data instead of a hard-coded 54.
warpbreaks$replicate <- rep(1:9, length.out = nrow(warpbreaks))

# Break down by all the other variables in the dataset: `.` on the right-hand
# side stands for every column except the response (breaks).
xt <- xtabs(breaks ~ ., data = warpbreaks)
xt

, , replicate = 1

    tension
wool  L  M  H
   A 26 18 36
   B 27 42 20

, , replicate = 2

    tension
wool  L  M  H
   A 30 21 21
   B 14 26 21

, , replicate = 3

    tension
wool  L  M  H
   A 54 29 24
   B 29 19 24

, , replicate = 4

    tension
wool  L  M  H
   A 25 17 18
   B 19 16 17

, , replicate = 5

    tension
wool  L  M  H
   A 70 12 10
   B 29 39 13

, , replicate = 6

    tension
wool  L  M  H
   A 52 18 43
   B 31 28 15

, , replicate = 7

    tension
wool  L  M  H
   A 51 35 28
   B 41 21 15

, , replicate = 8

    tension
wool  L  M  H
   A 26 30 15
   B 20 39 16

, , replicate = 9

    tension
wool  L  M  H
   A 67 36 26
   B 44 29 28


In [70]:
# Collapse the multi-way table into a single flat table that is easier to read.
ftable(xt)

             replicate  1  2  3  4  5  6  7  8  9
wool tension                                     
A    L                 26 30 54 25 70 52 51 26 67
     M                 18 21 29 17 12 18 35 30 36
     H                 36 21 24 18 10 43 28 15 26
B    L                 27 14 29 19 29 31 41 20 44
     M                 42 26 19 16 39 28 21 39 29
     H                 20 21 24 17 13 15 15 16 28

##### Size of a data set

In [74]:
# Simulate 100,000 standard-normal draws and report the memory they occupy,
# first in bytes (the default) and then in megabytes.
fakeData <- rnorm(1e5)
object.size(fakeData)
print(object.size(fakeData), units = "Mb")

800040 bytes

0.8 Mb
