# Basic Data Types

## Library Used

In [9]:
library(stringr)
# str_detect       (x, pattern)
# str_subset       (x, pattern)
# str_count        (x, pattern)
# str_replace      (x, pattern, replacment)
# str_replace_all  (x, pattern, replacement)
# str_length       (x)
# str_c            (..., sep),  (...,collapse),  (..., sep, collapse)
# str_sub          (string, start, end )
# str_split        (x, pattern)
# str_trim         (x)

## remark
# x      = multiple elements vector
# ...    = multiple vectors, vectorized execution


library(tidyverse)
# count


library(lubridate)
# year
# month
# day
# ddays            (integer)
# dhours           (integer)
# ceiling_date     ( x, unit)

## Number

### Integer
Use **L** to denote integer

In [10]:
print( 2L )
print( c(2L, 3L, 4L) )
print( is.integer(2L) )

[1] 2
[1] 2 3 4
[1] TRUE


### Double

In [11]:
print( 2.3 )
print( c(2.3,4.1,5.2) )
print( is.double( 2.3 ) )

[1] 2.3
[1] 2.3 4.1 5.2
[1] TRUE


### Operations
#### Sample Data

In [12]:
my.num = c(10/3,14/3,20/3)
my.num2 = c(7/3, 16/3, 26/3)
print(my.num)
print(my.num2)

[1] 3.333333 4.666667 6.666667
[1] 2.333333 5.333333 8.666667


#### Additon / Substraction

In [13]:
print( my.num + my.num2 )
print( my.num - my.num2 )

[1]  5.666667 10.000000 15.333333
[1]  1.0000000 -0.6666667 -2.0000000


#### Division

In [14]:
print( 15 /   4 )  # floting division
print( 15 %/% 4 )  # integer division (quotiant)
print( 21 %%  4 )  # remainder

[1] 3.75
[1] 3
[1] 1


#### Rounding

**Round To 0 decimal**

In [15]:
print( round(c(4.49, 4.50, 4.51) ))

[1] 4 4 5


**Round To X decimals**

In [16]:
print( round( c(3.844, 3.845, 3.846), 2 ))

[1] 3.84 3.85 3.85


#### Celing and Floor

In [17]:
print( ceiling(my.num) )
print( floor(my.num) )

[1] 4 5 7
[1] 3 4 6


#### Absolute (positive)

In [18]:
abs(0-my.num)

## String

In [19]:
print( 'abc' )
print( c('abc','efg','xyz'))
print( is.character('abc') )

[1] "abc"
[1] "abc" "efg" "xyz"
[1] TRUE


### Properties
#### Number of Chars

In [20]:
x = c('One','Two','Three','Four')
nchar(x)

### Splitting
#### strsplit

In [21]:
fruits_string = 'apple, banana, durian'
cars_string   = 'nissan, toyota, bmw'
print(cars_string)
print(fruits_string)

[1] "nissan, toyota, bmw"
[1] "apple, banana, durian"


In [22]:
print( strsplit(c(fruits_string, cars_string), ",") )

[[1]]
[1] "apple"   " banana" " durian"

[[2]]
[1] "nissan"  " toyota" " bmw"   



#### stringr::str_split

In [23]:
print( str_split(c(fruits_string, cars_string), pattern = ',') )

[[1]]
[1] "apple"   " banana" " durian"

[[2]]
[1] "nissan"  " toyota" " bmw"   



### Trimming

In [24]:
fat_string = c(' toyota  ', 'bmw  ','nissan')

#### trimws
Trim trailing and tailing white spaces

In [25]:
print( trimws(fat_string) )

[1] "toyota" "bmw"    "nissan"


#### stringr::str_trim

In [26]:
print( str_trim(fat_string) )

[1] "toyota" "bmw"    "nissan"


### Concatenante

#### Sample Data

In [27]:
fruits = c('banana','mango', 'durian')
cars   = c('toyota','nissan','bms')
print(fruits)
print(cars)

[1] "banana" "mango"  "durian"
[1] "toyota" "nissan" "bms"   


#### paste
**Seperator**  
Vectorized operation

In [28]:
print( paste(fruits, cars, sep = '-') )

[1] "banana-toyota" "mango-nissan"  "durian-bms"   


**Collapse**  
Objective is have result as one single string

In [29]:
print( paste(fruits, collapse = '-') )

[1] "banana-mango-durian"


**Seperator, Then Collapse**  
Combine both operation together

In [30]:
print( paste(fruits, cars, sep='|', collapse = '-'))

[1] "banana|toyota-mango|nissan-durian|bms"


Use **paste0( )  if no seperator required**  
**paste0 is shorthand** for paste(x, sep="")

In [31]:
print( paste0("file", "number", "32")  )

[1] "filenumber32"


#### **stringr::str_c**

In [32]:
library(stringr)

print( str_c( fruits, cars, sep='-') )
print( str_c(fruits, cars, collapse = '|', sep=','))

[1] "banana-toyota" "mango-nissan"  "durian-bms"   
[1] "banana,toyota|mango,nissan|durian,bms"


### Subposition

**stringr::str_sub**

In [33]:
str_sub(fruits, 1, 3)  # substring position 1 to 3

### Find

In [34]:
cars = c('Nissan serena','toyota camry','Toyota Estima','nissan sentra','TOYOTA vios')

#### grep
**grep( pattern=, x=, value=FALSE, ignore.case=FALSE)**  
can return either indices or values

**Find (Return Indices) the Matching Pattern**

In [35]:
print( cars ) 
print( grep('toyota', cars) )                      # case sensitive
print( grep('toyota', cars, ignore.case = TRUE) )  # case insensitive

[1] "Nissan serena" "toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] 2
[1] 2 3 5


**Find The Matching Values**

In [36]:
## Return **value instead of indices**
print( cars )
print( grep('toyota', cars, value=TRUE) )
print( grep('toyota', cars, value=TRUE, ignore.case = TRUE) )

[1] "Nissan serena" "toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] "toyota camry"
[1] "toyota camry"  "Toyota Estima" "TOYOTA vios"  


#### stringr::str_detect/subset
**str_detect** (string, pattern)  - which element contain pattern  
**str_subset** (string, pattern)  - return all elements that matches  

Default to case sensitive,
use pattern=fixed( , ignore_case=TRUE) for case insensitive.

**Detect (Return Logical Vector) The Matching Pattern**

In [37]:
print(cars)
print( str_detect(cars, 'toyota') )                           # case sensitive
print( str_detect(cars, fixed('toyota', ignore_case=TRUE)) )  # case insensitive

[1] "Nissan serena" "toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] FALSE  TRUE FALSE FALSE FALSE
[1] FALSE  TRUE  TRUE FALSE  TRUE


**Find The Matching Values**

In [38]:
print(cars)
print( str_subset(cars, 'toyota') )                           # case sensitive
print( str_subset(cars, fixed('toyota', ignore_case=TRUE)) )  # case insensitive

[1] "Nissan serena" "toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] "toyota camry"
[1] "toyota camry"  "Toyota Estima" "TOYOTA vios"  


### Replace
#### gsub
**gsub( pattern=, replacement=, x=, ignore.case=TRUE)**  
return a new string vector

In [39]:
print( cars )
print( gsub(pattern='toyota', replacement='Toyota', cars) ) 
print( gsub(pattern='toyota',replacement='Toyota',ignore.case = TRUE, cars) )

[1] "Nissan serena" "toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] "Nissan serena" "Toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] "Nissan serena" "Toyota camry"  "Toyota Estima" "nissan sentra"
[5] "Toyota vios"  


#### stringr::str_replace/replace_all
**str_replace** (x, pattern, replacement)      # replace first occurance for each elements  
**str_replace_all** (x, pattern, replacement)  # replace all occurances on every elements  

Default to case sensitive,  
use **pattern=fixed( , ignore_case=TRUE)** for case insensitive.

In [40]:
print( cars )
print( str_replace     (cars, 'toyota','Toyota') )
print( str_replace_all (cars, 
                        fixed('toyota', ignore_case=TRUE),  # case insensitive
                        'Toyota') )

[1] "Nissan serena" "toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] "Nissan serena" "Toyota camry"  "Toyota Estima" "nissan sentra"
[5] "TOYOTA vios"  
[1] "Nissan serena" "Toyota camry"  "Toyota Estima" "nissan sentra"
[5] "Toyota vios"  


### Case Conversion

In [41]:
x = c('One','Two','Three')
print( tolower(x)  )
print( toupper(x)  )

[1] "one"   "two"   "three"
[1] "ONE"   "TWO"   "THREE"


## Logical

### Logical Comparison
Take note, R will coerce different data type before comparing

#### Equality

In [42]:
print( 3 == 3 )          # compare two integers
print( 3L == 3.0 )       # compare integer and double
print( 'abc' == 'abc' )  # compare two char
print( 3.0 == '3' )      # compare integer and character
print( 3.0 == 'three' )  # compare integer and character

[1] TRUE
[1] TRUE
[1] TRUE
[1] TRUE
[1] FALSE


#### Not Equal

In [43]:
print( 3     != 3 )        # compare two integers
print( 3L    != 3.0 )      # compare integer and double
print( 'abc' != 'a' )      # compare two char
print( 3.0   != '33' )     # compare integer and character
print( 3.0   != 'three' )  # compare integer and character

[1] FALSE
[1] FALSE
[1] TRUE
[1] TRUE
[1] TRUE


### Logical Operator

#### Negate

In [44]:
!(3L == 3.0)

#### AND ( & and &&)
DONT'T Fall into trop of &&  
**Select numbers between 2 and 6**  

In [45]:
z = 1:8
print( z>2 & z<6    )
print( z[z>2 & z<6] )

[1] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE
[1] 3 4 5


**However && only evaluate the first elemnt)**  
When will you use it ? No idea

In [46]:
print( z>2    && z<6     )
print( z[1]>2 && z[1]<6  )

[1] FALSE
[1] FALSE


#### OR ( | and ||)
**DONT'T Fall into trop of ||  **
**Select odd numbers or >5**

In [47]:
z = 1:10
print( z[ z%%2==1 | z>5 ] )

[1]  1  3  5  6  7  8  9 10


**However || only evaluate the first elemnt)**  
When will you use it ? No idea

In [48]:
print( z%%2==1 || z>5 )
print( z[1]%%2==1 || z[1]>5 )

[1] TRUE
[1] TRUE


### Evaluation using IF

#### Evaluating String
**String Has no relevance in logic**

In [49]:
x = c('ali','abu','ah kow', NULL, NA)
ifelse(x, 'Human','Unknown')

#### Evaluating Numbers
- General behaviour of **IF** is to evaluate actual FALSE, any other condition is considered TRUE
- Zero evaluates to FASE, other numbers considered TRUE

In [50]:
x= c(0:3,NULL,NA)
print(x)
print( x ==TRUE )
print( x ==FALSE )
print( !x==FALSE )

print( ifelse(x, 'Not Zero','Zero') )
print( ifelse(!x==FALSE, 'Not Zero','Zero') )

if (2.2) print('Not Zero') # same applies to double

[1]  0  1  2  3 NA
[1] FALSE  TRUE FALSE FALSE    NA
[1]  TRUE FALSE FALSE FALSE    NA
[1] FALSE  TRUE  TRUE  TRUE    NA
[1] "Zero"     "Not Zero" "Not Zero" "Not Zero" NA        
[1] "Zero"     "Not Zero" "Not Zero" "Not Zero" NA        
[1] "Not Zero"


### Comparing NA
Any operation against NA will **return NA**

In [51]:
nas = c(1,2,NA,4,5,6)
nas = nas + 1
print (nas)
print( nas == NA )

[1]  2  3 NA  5  6  7
[1] NA NA NA NA NA NA


Use **is.na()** to check if value is NA, NEVER use x==NA to check

In [52]:
is.na(nas)

### Comparing NULL

**Null is neither TRUE or FALSE**

In [53]:
NULL==TRUE | NULL==FALSE

- Comparing NULL with anything get **NOTHING**  
- **NOTHING is not NULL)**

In [54]:
result = (NULL=='abc')
print( typeof(result)  )
print( length(result)  )
print( is.null(result) )

[1] "logical"
[1] 0
[1] FALSE


## Factor

### Creating
#### Non Ordered Factor
There is no need to specify levels

In [55]:
X = c('apple','banana','apple','durian','rambutan','durian')
f1 = factor(X)
print( f1 )

[1] apple    banana   apple    durian   rambutan durian  
Levels: apple banana durian rambutan


#### Ordered Factor
- Specify ordered=T and its levels in order

In [56]:
y = c('slow','turbo','fast','slow','fast','slow')
f2 = factor(y, ordered=TRUE, 
               levels= c('slow','fast','turbo'))
print( f2 )

[1] slow  turbo fast  slow  fast  slow 
Levels: slow < fast < turbo


- Each element is comparible according to the order specified

In [57]:
print( f2[1] < f2[2] )

[1] TRUE


### Levels

In [58]:
X = c('apple','banana','apple','durian','rambutan','durian')
f1 = factor(X, levels = c('apple','banana','durian','rambutan','mango'))
print( levels(f1) )

[1] "apple"    "banana"   "durian"   "rambutan" "mango"   


#### Dropping Unused Levels
**droplevels remove unused category**

In [59]:
print( f1 )

[1] apple    banana   apple    durian   rambutan durian  
Levels: apple banana durian rambutan mango


In [60]:
f2 = droplevels(f1)
print( f2 )

[1] apple    banana   apple    durian   rambutan durian  
Levels: apple banana durian rambutan


#### Levels Convert To Integer
Levels can be converted to integer. Each integer represent a category

In [61]:
print( f1 )
print( as.integer(f1) )

[1] apple    banana   apple    durian   rambutan durian  
Levels: apple banana durian rambutan mango
[1] 1 2 1 3 4 3


## Date
- Date is in fact type of **double**
- There is **no is.date()** function

In [62]:
d = as.Date('2019-01-03')
print( d )
print( typeof(d) )
print( is.double(d) )

[1] "2019-01-03"
[1] "double"
[1] TRUE


### Creating Date
#### Base R

**System Date**

In [63]:
print( Sys.Date()  ) 

[1] "2019-01-04"


**Create From String**  
Default string format is "yyyy-mm-dd" or "yyyy/mm/dd", customizable

In [64]:
birthdays = as.Date(c('1971-07-11','1980-12-11','1981-11-20'))
birthdays = as.Date(c('1971/7/11','1980/12/11','1981/11/20'))
str(birthdays)

 Date[1:3], format: "1971-07-11" "1980-12-11" "1981-11-20"


In [65]:
str(birthdays)

 Date[1:3], format: "1971-07-11" "1980-12-11" "1981-11-20"


#### lubridate
**System Date**

In [66]:
print( today())

[1] "2019-01-04"


**Create From String**

In [67]:
birthdays = ymd(c('1971-07-11','1980-12-11','1981-11-20'))
birthdays = dmy(c('11-07-1973','11-12-1980','20-11-1981'))
birthdays = dmy(c('11/7/1973','11/12/1980','20/11/1981'))
str(birthdays)

 Date[1:3], format: "1973-07-11" "1980-12-11" "1981-11-20"


**Create From Number Vectors**

In [68]:
print( make_date(year = 2018, month=11, day=30) )

[1] "2018-11-30"


### Date Formating

In [69]:
print( format(today, format="%m %d %y") )
print( format(today, format="%b %d %y") )
print( format(today, format="%B %d %Y") )
print( format(today, format="%a %B %d %Y") )
print( format(today, format="%A %B %d %Y") )

[1] "function (tzone = \"\") " "{"                       
[3] "    as_date(now(tzone))"  "}"                       
[1] "function (tzone = \"\") " "{"                       
[3] "    as_date(now(tzone))"  "}"                       
[1] "function (tzone = \"\") " "{"                       
[3] "    as_date(now(tzone))"  "}"                       
[1] "function (tzone = \"\") " "{"                       
[3] "    as_date(now(tzone))"  "}"                       
[1] "function (tzone = \"\") " "{"                       
[3] "    as_date(now(tzone))"  "}"                       


### Year, Month, Day, Weekday

#### Base R
All functions below return string

In [70]:
print( strftime(birthdays,'%Y') )
print( strftime(birthdays,'%m') )
print( strftime(birthdays,'%d') )
print(  weekdays( birthdays )  )

[1] "1973" "1980" "1981"
[1] "07" "12" "11"
[1] "11" "11" "20"
[1] "Wednesday" "Thursday"  "Friday"   


#### lubridate

In [71]:
print( year(birthdays) )
print( month(birthdays))
print( day(birthdays)  )

[1] 1973 1980 1981
[1]  7 12 11
[1] 11 11 20


**Weekday**  
label=T, return factor of Weekday

In [72]:
print( wday(birthdays) )   # 1:Sunday, 7:Saturday

[1] 4 5 6


In [73]:
wd = wday( birthdays, label = T )
str(wd)

 Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 4 5 6


### Date Arithmetic (lubridate)

#### First Day Of Next Month

In [74]:
first_day_of_next_month = ceiling_date(today() ,unit = 'month')
print( first_day_of_next_month )

[1] "2019-02-01"


#### Last Day Of This Month
Use **-days(1)**

In [75]:
last_day_of_this_month = first_day_of_next_month - days(1)
print( last_day_of_this_month )

[1] "2019-01-31"


#### First Day of Next year

In [76]:
print( ceiling_date( today() ,unit = 'year') )

[1] "2020-01-01"


### Date Span

#### lubridate
**Today Date**

In [77]:
today()

**Tomorrow Date**

In [78]:
tomorrow = today() + ddays(1)
print( tomorrow )

[1] "2019-01-05"


**One year ago date**

In [79]:
last_year = today() - dyears(1)
print( last_year )

[1] "2018-01-04"


**One year and 12 weeks from Today**

In [80]:
future = today() + dyears(1) + dweeks(12)
print( future )

[1] "2020-03-28"


**Duration (difftime Object)**  
- Difference between two dates object is duration (difftime object)
- units can only be **auto, days, weeks, secs, hours, mins**
- units cannot be year or month, because their length is conistent

In [81]:
date1 = as.Date('2019-01-01')
date2 = as.Date('1973-07-26')
d.diff  = difftime (date1, date2, units = 'days')
w.diff  = difftime (date1, date2, units = 'auto')
str(d.diff)
str(y.diff)
str(date2-date1)

 'difftime' num 16595
 - attr(*, "units")= chr "days"


ERROR: Error in str(y.diff): object 'y.diff' not found


In [None]:
print( my.duration / ddays(1) )
print( my.duration / dyears(1) )
print( my.duration / dhours(1) )
print( my.duration / dminutes(1) )
print( my.duration / dseconds(1) )

**Age**

In [82]:
print( (today()-as.Date('1973-07-26'))/dyears(1))

[1] 45.47397


## Class and Typeof

### Class

In [83]:
print( class(2L) )
print( class(2.3) )
print( class('abc') )
print( class(factor('abc') ))
print( class(as.Date('2018-03-31')))
print( class(TRUE) )

[1] "integer"
[1] "numeric"
[1] "character"
[1] "factor"
[1] "Date"
[1] "logical"


### Typeof

In [84]:
print( typeof(2L) )
print( typeof(2.3) )
print( typeof('abc') )
print( typeof(factor('abc') ))
print( typeof(as.Date('2018-03-31')))
print( typeof(TRUE) )

[1] "integer"
[1] "double"
[1] "character"
[1] "integer"
[1] "double"
[1] "logical"


### is...

In [85]:
print( is.integer(2L) )
print( is.double(2.3) )
print( is.character('abc') )
print( is.factor(factor('abc') ))
print( is.double(as.Date('2018-03-31')))  # date is double

[1] TRUE
[1] TRUE
[1] TRUE
[1] TRUE
[1] TRUE


#### is.numeric
Observe that, although **date** is tyoepf double, bowever, it is not numeric

In [86]:
print( is.numeric(2L))
print( is.numeric(2.3))

# date is double, but not numeric
print( is.numeric(as.Date('2018-03-31')))  

[1] TRUE
[1] TRUE
[1] FALSE


# Built-In Data Structure

## Vector
- Vector can only contain single data type (atomic) elements

### Creating Vector

#### Empty Vector
- Empty vector is vector with no elements, also known as **NULL**
- Use **is.null()** to test NULL

In [87]:
empty = c()
print   (empty)
typeof  (empty)
is.null (empty)

NULL


#### Assignment

In [88]:
x1 = c(1,2,3,4,5)    
x2 = c('yongks','mahathir','limge','annuar','limks')
x3 = c(1,4:6,10:15)  ## assign with range of number
str (x1)
str (x2)
str (x3)

 num [1:5] 1 2 3 4 5
 chr [1:5] "yongks" "mahathir" "limge" "annuar" "limks"
 num [1:10] 1 4 5 6 10 11 12 13 14 15


### Naming & Attributes
- Each element of vector can have a name

#### Creating Names

In [89]:
y        = c(7,8,9,10)
names(y) = c('seven','eight','nine')
print(y)

seven eight  nine  <NA> 
    7     8     9    10 


#### Retrieving Names
- **names()** return a vector of all names of a vector
- Elements that has not been assinged names will have **NA**

In [90]:
names(y)

#### Name as Attribute

Naming vector **creates "name" attributes**

In [91]:
attributes (y)

### Accessing Elements
- Accessing element(s) will always return as a new vector

#### Access with Index Vector
Supply a **number vector in [  ]** to select the elements.   
Index number starts at 1.

In [92]:
z = c('aaa','bbb','ccc','ddd','eee','fff','ggg','hhh','iii','jjj')

print( z[3]   )       ## retrieve single element
print( z[c(1,3,4)]  ) ## retrieve element 1,3,4
print( z[1:3] )       ## retrieve first 3 elements
print( z[c(1:3, 7:9)])

[1] "ccc"
[1] "aaa" "ccc" "ddd"
[1] "aaa" "bbb" "ccc"
[1] "aaa" "bbb" "ccc" "ggg" "hhh" "iii"


Use **negate (-num)** to deselect item(s)

In [93]:
print( z[c(-1,-4)] )  ## negate specific elements
print( z[c(-1:-3)] )  ## deselect first 3 elements

[1] "bbb" "ccc" "eee" "fff" "ggg" "hhh" "iii" "jjj"
[1] "ddd" "eee" "fff" "ggg" "hhh" "iii" "jjj"


#### Access with Logical Vector
Conceptual wize, index a vector with T/F will return elements which match T

In [94]:
z = c('aaa','bbb','ccc','ddd','eee','fff','ggg','hhh','iii','jjj')
print( z[ c(T,T,T,F,F,F,F,F,F,F)] ) ## retrieve first 3 elements

[1] "aaa" "bbb" "ccc"


**This method can be used as 'filtering'**, 
by first 
- Form a logical vector based on matching criteria of the vector itself
- Index the data vector with this logical vector 
- If logical vector is shorter than data, **logical elements repeat itself**

In [95]:
## create the logical vector
criteria = z %in% c('fff','iii')
print( criteria )
print( z[criteria] )   ## apply the logical vector to vector indexing

 [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
[1] "fff" "iii"


Logical vector **repeat itself** if shorter than data length.  

In [96]:
## select every alternate element (odd position)
z[c(T,F)]

#### Access with Name
Index with a vector of names

In [97]:
z = c(1,2,3,4,5,6,7,8,9)
names(z) = c('one','two','three','four','five','six','seven','eight','nine')

In [98]:
print( z['seven']  )            # does not require c() if single element
print( z[c('seven','nine')]  )  # choose two elements by name

seven 
    7 
seven  nine 
    7     9 


### Remove Item(s)

#### Characteristic

In [99]:
is.atomic(x2)

### Characteristic of Vector

In [100]:
z = c(1,2,3,4,5,6,7,8,9)
names(z) = c('one','two','three','four','five','six','seven','eight','nine')
attributes()

ERROR: Error in attributes(): 0 arguments passed to 'attributes' which requires 1


## Data Frame
### Class and Typeof
Observe that dataframe is a type of **list**

In [101]:
print( class(iris) )
print( typeof(iris) )
print( is.data.frame(iris) )
print( is.list(iris) )

[1] "data.frame"
[1] "list"
[1] TRUE
[1] TRUE


### Creating
- Use **data.frame()** to create data frame from vectors
- All original vectors must have same length, otherwise will have error

#### Create From Vectors
- Character is auto converted to **Factor**
- **Rownames** default to 1,2,3, etc

In [102]:
x1 = c(1,2,3,4,5)
x2 = c('one','two','three','four','firve')
x3 = c('satu','dua','tiga','empat','lima')
X  = data.frame(X1=x1, x2, x3)
str(X)

'data.frame':	5 obs. of  3 variables:
 $ X1: num  1 2 3 4 5
 $ x2: Factor w/ 5 levels "firve","four",..: 3 5 4 2 1
 $ x3: Factor w/ 5 levels "dua","empat",..: 4 1 5 2 3


- Disable Auto Factor conversion

In [103]:
X = data.frame(x1,x2,x3, stringsAsFactors = FALSE)
str( X )

'data.frame':	5 obs. of  3 variables:
 $ x1: num  1 2 3 4 5
 $ x2: chr  "one" "two" "three" "four" ...
 $ x3: chr  "satu" "dua" "tiga" "empat" ...


- Specify Row Names

In [104]:
X = data.frame(x1,x2, row.names = x3)
print( X )

      x1    x2
satu   1   one
dua    2   two
tiga   3 three
empat  4  four
lima   5 firve


### Column Manipulation

#### Sample Data

In [105]:
x1 = c(1,2,3,4,5)
x2 = c('one','two','three','four','five')
x3 = c('satu','dua','tiga','empat','lima')
rn = c('_1_','_2_','_3_','_4_','_5_')
X = data.frame(x1,x2,x3, 
               stringsAsFactors = FALSE, 
               row.names=rn)
print( X )

    x1    x2    x3
_1_  1   one  satu
_2_  2   two   dua
_3_  3 three  tiga
_4_  4  four empat
_5_  5  five  lima


#### Column Names
Notice both **names** and **colnames** return all column names in vector

In [125]:
print( names(X) )
print( colnames( X )  )

[1] "x1" "x2" "x3" "x4" "x5" "x6"
[1] "x1" "x2" "x3" "x4" "x5" "x6"


#### Selecting One Column

**Index subsetting return vector**

In [287]:
print( X[,2] )    # column number
print( X[,'x2'] ) # column name

[1] "one"   "two"   "three" "four"  "five" 
[1] "one"   "two"   "three" "four"  "five" 


**subset( )  return dataframe** with rownames retained

In [258]:
subset(X, select=2)  # return Dataframe, not Vector, notice rownames maintained

Unnamed: 0,x2
_1_,one
_2_,two
_3_,three
_4_,four
_5_,five


In [259]:
subset(X, select='x2')

Unnamed: 0,x2
_1_,one
_2_,two
_3_,three
_4_,four
_5_,five


#### Select Multiple Columns
**By specific column names**

In [284]:
print( X[,c('x1','x2','x3')] )   # by column names

    x1    x2    x3
_1_  1   one  satu
_2_  2   two   dua
_3_  3 three  tiga
_4_  4  four empat
_5_  5  five  lima


**subset( )** way is better, because it support **minus -** 

In [291]:
subset (X, select = c('x1','x2','x3'))

Unnamed: 0,x1,x2,x3
_1_,1,one,satu
_2_,2,two,dua
_3_,3,three,tiga
_4_,4,four,empat
_5_,5,five,lima


In [292]:
subset (X, select = -c(x5,x6))   # use -

Unnamed: 0,x1,x2,x3
_1_,1,one,satu
_2_,2,two,dua
_3_,3,three,tiga
_4_,4,four,empat
_5_,5,five,lima


**By Column Number**  

In [253]:
X[ , c(1:3)]

Unnamed: 0,x1,x2,x3
_1_,1,one,satu
_2_,2,two,dua
_3_,3,three,tiga
_4_,4,four,empat
_5_,5,five,lima


In [295]:
X[, -c(4:6)]    # use -

Unnamed: 0,x1,x2,x3
_1_,1,one,satu
_2_,2,two,dua
_3_,3,three,tiga
_4_,4,four,empat
_5_,5,five,lima


**subset( )** does the same job

In [273]:
subset(X, select=1:3) 

Unnamed: 0,x1,x2,x3
_1_,1,one,satu
_2_,2,two,dua
_3_,3,three,tiga
_4_,4,four,empat
_5_,5,five,lima


In [274]:
subset(X, select=-(4:6))  # use -

Unnamed: 0,x1,x2,x3
_1_,1,one,satu
_2_,2,two,dua
_3_,3,three,tiga
_4_,4,four,empat
_5_,5,five,lima


**By logical vector**

In [254]:
print( X[, colnames(X) %in% c('x2','x3')] )  # by column names

       x2    x3
_1_   one  satu
_2_   two   dua
_3_ three  tiga
_4_  four empat
_5_  five  lima


**subset( ) does th same job**

In [300]:
subset( X, select = colnames(X) %in% c('x1','x2','x3'))

Unnamed: 0,x1,x2,x3
_1_,1,one,satu
_2_,2,two,dua
_3_,3,three,tiga
_4_,4,four,empat
_5_,5,five,lima


#### Duplicate Column

In [117]:
X$x4 = X$x3
X$x5 = X$x2
X$x6 = X$x1
str(X)

'data.frame':	5 obs. of  6 variables:
 $ x1: num  1 2 3 4 5
 $ x2: chr  "one" "two" "three" "four" ...
 $ x3: chr  "satu" "dua" "tiga" "empat" ...
 $ x4: chr  "satu" "dua" "tiga" "empat" ...
 $ x5: chr  "one" "two" "three" "four" ...
 $ x6: num  1 2 3 4 5


#### Remove Column(s)
**Remove Single Column**

In [111]:
X$x4 = NULL
str(X)

'data.frame':	5 obs. of  3 variables:
 $ x1: num  1 2 3 4 5
 $ x2: chr  "one" "two" "three" "four" ...
 $ x3: chr  "satu" "dua" "tiga" "empat" ...


**Remove Multiple Columns**  
There is no inplace method for column deletion. Use column selction method to deselect columns you want to remove, and save into the same variable name

#### Combining Column(s)
Use **cbind**, stands for column bind

In [116]:
## create a new vector
x5 = X$x1
x6 = X$x2
## combine it into dataframe
cbind(X,x5, x6)

Unnamed: 0,x1,x2,x3,x5,x6
_1_,1,one,satu,1,one
_2_,2,two,dua,2,two
_3_,3,three,tiga,3,three
_4_,4,four,empat,4,four
_5_,5,five,lima,5,five


### Select Rows
Single or Multi Rows selection **always return dataframe**

#### Select Single Row

In [304]:
print( X[3,] )     # by row number

    x1    x2   x3    x5 x6
_3_  3 three tiga three  3


In [305]:
print( X['_3_',])  # by row names

    x1    x2   x3    x5 x6
_3_  3 three tiga three  3


#### Select Multiple Rows

In [302]:
print( X[3:5,] )                  # by row number

    x1    x2    x3    x5 x6
_3_  3 three  tiga three  3
_4_  4  four empat  four  4
_5_  5  five  lima  five  5
    x1    x2    x3    x5 x6
_3_  3 three  tiga three  3
_4_  4  four empat  four  4
_5_  5  five  lima  five  5


In [303]:
print( X[c('_3_','_4_','_5_'),] ) # by row names

    x1    x2    x3    x5 x6
_3_  3 three  tiga three  3
_4_  4  four empat  four  4
_5_  5  five  lima  five  5


In [314]:
criteria = X$x1>=4 | X$x3=='tiga'  # construct a logical vector
X[criteria, ]

Unnamed: 0,x1,x2,x3,x5,x6
_3_,3,three,tiga,three,3
_4_,4,four,empat,four,4
_5_,5,five,lima,five,5


**subset( ) Method**

In [317]:
criteria = X$x1>=4 | X$x3=='tiga'  # construct a logical vector
subset( X, criteria )

Unnamed: 0,x1,x2,x3,x5,x6
_3_,3,three,tiga,three,3
_4_,4,four,empat,four,4
_5_,5,five,lima,five,5


### Data Frame Properties

#### Dimension
**dim(  )** returns vector of two elemens (rows,cols)

In [337]:
print( dim( iris )  )  # vector of two elements
print( dim(iris)[1] )  # get rows
print( dim(iris)[2] )  # get columns

[1] 150   5
[1] 150
[1] 5


**Number of Rows**

In [338]:
print( nrow(iris) )

[1] 150


**Number of Columns**  
It may not be intuitive, however, length() on dataframe return number of columns

In [339]:
print( ncol(iris) )
print( length(iris) )

[1] 5
[1] 5


#### Attributes
Let's see all available attributes for data frame

In [340]:
attributes(iris)

**Retrieve single attribute**

In [341]:
attr(iris,'names')

#### Names
**Column Names**  
- Observe that both **names()  and colnames()** return dataframe column names 
- However, **colnames(  )** is more intuitive

In [342]:
print( names(iris) )
print( colnames(iris) )

[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     


**Row Names**

In [343]:
rownames(iris)

## List
- List can hold many types of data including list
- Each data that it holds can have different length (unlike dataframe)

# Control Structure

## if..then..else
If else is **not vectorized**. It doesn't support multiple elements input

In [86]:
X = 1
if (X==1) 
    print('1')

[1] "1"


**{  } is necessary** when **else** presence. Otherwise error

In [345]:
X = 1
if (X ==1) {
    print('1')
} else {
    print('not one')
}

[1] "1"


## Looped ifelse
ifelse loop through all elements, and return value for True and False match

In [111]:
V = 0:5
print( V )
print( ifelse(V%%2, "Odd", "Even")) 

[1] 0 1 2 3 4 5
[1] "Even" "Odd"  "Even" "Odd"  "Even" "Odd" 


Since it is **vectorized**, the input can be **passed to function** to calculate return value 

In [1]:
V = -2:4
print( V )
print( ifelse(V>0, sqrt(V), sqrt(V*-1) )) 

[1] -2 -1  0  1  2  3  4


"NaNs produced"

[1] 1.414214 1.000000 0.000000 1.000000 1.414214 1.732051 2.000000


## For Loop

Loop through any vector, numeric or character

### Loop through Vectors
**Number Vector**

In [346]:
for(i in 1:5)
    print(i)

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5


**Character Vector**  

In [347]:
## Create the Data vector
students = c(10,15,16,18,20)
names(students) = c('ali','abu','ah kow','sammy','david')
print(students)

   ali    abu ah kow  sammy  david 
    10     15     16     18     20 


In [348]:
## Loop
for (i in 1:length(students)) {
    cat('Student name: ', names(students)[i], '\tage: ', students[i],'\n')
}

Student name:  ali 	age:  10 
Student name:  abu 	age:  15 
Student name:  ah kow 	age:  16 
Student name:  sammy 	age:  18 
Student name:  david 	age:  20 


### Break and Next
**Next Loop, skip anything remaining**

In [349]:
for(i in 1:10) {
    if (i%%2)  # if even number
        next
    else
        print(i)
}

[1] 2
[1] 4
[1] 6
[1] 8
[1] 10


**Exit Loop Immediately**

In [350]:
for(i in 1:10) {
    if (i>5)  # stop the loop entirely if i>5
        break
    else
        print(i)
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5


## While Loop

### Loop through Number Vector

In [351]:
i = 1
while (i <= 5) {
    print(i)
    i = i+1
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5


### Loop through Character Vector

In [352]:
students = c(10,15,16,18,20)
names(students) = c('ali','abu','ah kow','sammy','david')
print(students)

   ali    abu ah kow  sammy  david 
    10     15     16     18     20 


In [353]:
i = 1
while (i <= length(students)) {
    cat('Student name: ', names(students)[i], '\tage: ', students[i],'\n')
    i = i+1
}

Student name:  ali 	age:  10 
Student name:  abu 	age:  15 
Student name:  ah kow 	age:  16 
Student name:  sammy 	age:  18 
Student name:  david 	age:  20 


### Next and Break

**Next will continue to next loop, skip remaining statements**

In [354]:
i = 0
while (i <= 5) {
    i = i + 1
    if ( i%%2 ) ( # if odd number, skip
        next
    )
    print(i)
}

[1] 2
[1] 4
[1] 6


**Break will entirely exit the loop**

In [355]:
i = 0
while (i <= 5) {
    i = i + 1
    if ( i==3 ) ( 
        break
    )
    print(i)
}

[1] 1
[1] 2


## Repeat Loop
Repeat is a forever loop with no conditional matching  
Ensure you program has a if condition to **break**

In [356]:
i = 0
repeat {
    i = i + 1
    if (i%%2) { # if odd number, skip
        next
        }
    if (i>10) { # if >5, stop
        break
        }
    print(i)
}

[1] 2
[1] 4
[1] 6
[1] 8
[1] 10


## Apply Family
**apply( )** are much faster than for loop

#### Loop Through Each Element In Vector (sapply)

In [36]:
vc = 1:4
sapply( vc, function(x) sqrt((x+3)) ) 

#### Loop Through Data Frame (apply)
**Column Wise**  
Use **apply** with **MARGIN=2** for column

Return **mean** for each column. Result is a **vector**.

In [2]:
print( apply( mtcars, MARGIN=2, mean) )

       mpg        cyl       disp         hp       drat         wt       qsec 
 20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
        vs         am       gear       carb 
  0.437500   0.406250   3.687500   2.812500 


**Scale Min-Max** on all elements by Column. Return is a **dataframe**

In [5]:
apply( mtcars, MARGIN=2,
      function(x) { x/(max(x)-min(x))}
) %>% head

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,0.893617,1.5,0.399102,0.3886926,1.797235,0.6699054,1.959524,0,1,2.0,0.5714286
Mazda RX4 Wag,0.893617,1.5,0.399102,0.3886926,1.797235,0.7351061,2.02619,0,1,2.0,0.5714286
Datsun 710,0.9702128,1.0,0.2693939,0.3286219,1.774194,0.5931987,2.215476,1,1,2.0,0.1428571
Hornet 4 Drive,0.9106383,1.5,0.643552,0.3886926,1.419355,0.8220404,2.314286,1,0,1.5,0.1428571
Hornet Sportabout,0.7957447,2.0,0.8979795,0.6183746,1.451613,0.8795704,2.02619,0,0,1.5,0.2857143
Valiant,0.7702128,1.5,0.5612372,0.3710247,1.271889,0.8846842,2.407143,1,0,1.5,0.1428571


**Row Wise**  
Use **apply** with **MARGIN=1** for column

In [34]:
print( apply( mtcars, MARGIN=1, mean) )

       mpg        cyl       disp         hp       drat         wt       qsec 
 20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
        vs         am       gear       carb 
  0.437500   0.406250   3.687500   2.812500 
          Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
           29.90727            29.98136            23.59818            38.73955 
  Hornet Sportabout             Valiant          Duster 360           Merc 240D 
           53.66455            35.04909            59.72000            24.63455 
           Merc 230            Merc 280           Merc 280C          Merc 450SE 
           27.23364            31.86000            31.78727            46.43091 
         Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
           46.50000            46.35000            66.23273            66.05855 
  Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
           65.97227      

#### Loop Through Data Frame Groups
Example below group each category of cyl, and calculate mean for each group.

In [47]:
print( tapply(mtcars$mpg,mtcars$cyl, mean, na.rm=T) )

       4        6        8 
26.66364 19.74286 15.10000 


# Data Generation

## Numbers

### Sequantial Number
#### Incremental 
**Incremental by 1 Step**

In [357]:
print( 3:12)
print( seq (3, 12) )        # integer increment, default by=1

 [1]  3  4  5  6  7  8  9 10 11 12
 [1]  3  4  5  6  7  8  9 10 11 12


In [358]:
print( 3.3:12.5 )
print( seq (3.3, 12.5) )    # double increment, default by 1

 [1]  3.3  4.3  5.3  6.3  7.3  8.3  9.3 10.3 11.3 12.3
 [1]  3.3  4.3  5.3  6.3  7.3  8.3  9.3 10.3 11.3 12.3


**Incrementaal by x Step**

In [359]:
print( seq (3, 12, by = 4) )           # increment of integer
print( seq (3.25, 12.50, by = 2.25))   # increment of decimal 

[1]  3  7 11
[1]  3.25  5.50  7.75 10.00 12.25


In [360]:
seq(3, 17, length.out=6)   # 

**Incremental by Equal Spreading**

In [361]:
seq(15,3, length.out=6)

#### Decremental

**Decrement by 1 Step**

In [362]:
print( 12:3 )   
print( seq(12,3)  )            # default by -1

 [1] 12 11 10  9  8  7  6  5  4  3
 [1] 12 11 10  9  8  7  6  5  4  3


**Decrement by X Step**

In [363]:
print( seq(12,3, by = -3)  )               # integer
print( seq(12.50, 3.25, by = -1.25)  )     # double

[1] 12  9  6  3
[1] 12.50 11.25 10.00  8.75  7.50  6.25  5.00  3.75


**Decrement by Equal Spreading**

In [364]:
print( seq(12.50, 3.25, length.out=6) )    # double

[1] 12.50 10.65  8.80  6.95  5.10  3.25


## Random Numbers

### Normal Distribution

In [365]:
print( rnorm(8, mean=3,sd=1.25) )

[1] 0.1049548 3.7030897 2.0202811 2.7174325 1.0161213 3.6844053 5.3640338
[8] 1.9024036


### Unified Distribution

In [366]:
print (runif(8,min = 3, max=10))

[1] 6.186331 4.427298 9.820346 3.695859 8.773605 3.707184 9.804908 3.037540


## Factor

### Random Non Ordered Factor

In [367]:
rf2 = gl(n=2, k=3, length = 12, c('Aaa','Bbb','Ccc','Ddd'))
print( rf2 )


 [1] Aaa Aaa Aaa Bbb Bbb Bbb Aaa Aaa Aaa Bbb Bbb Bbb
Levels: Aaa Bbb Ccc Ddd


### Random Ordered Factor

In [368]:
rf1 = gl( 3, 5, 9, c('Aaa','Bbb','Ccc'), ordered=TRUE )
print( rf1 )

[1] Aaa Aaa Aaa Aaa Aaa Bbb Bbb Bbb Bbb
Levels: Aaa < Bbb < Ccc


## Sampling

### Without Replacement

Sample size MUST not be larger than population**

In [369]:
X = c(1,2,3,4,5,6,7,8)
print( sample(X, size = 5) )

[1] 4 7 3 2 5


### Sampling With Replacement

In [370]:
print( sample(X,size = 8, replace=TRUE) )

[1] 8 8 1 1 5 5 7 4


# Statistics

## Summary Stats
### Sample Data

In [372]:
set.seed(1234)
my.df = data.frame(
  com  = paste('C',sample(1:3, 50, replace = T),sep=''),
  dept = paste('D',sample(1:5, 50, replace = T),sep=''),
  x1 = rnorm(1:100, mean = 50, sd = 5),
  x2 = rnorm(1:100, mean = 20, sd = 3),
  x3 = rnorm(1:100, mean =  5, sd = 1),
  stringsAsFactors = F
)

In [373]:
print( head(my.df) )

  com dept       x1       x2       x3
1  C1   D1 40.96984 18.86829 5.436931
2  C2   D2 47.08962 20.29286 6.060124
3  C2   D4 44.45555 24.91623 5.452190
4  C2   D3 44.92519 17.37322 5.663199
5  C3   D1 49.18845 20.36528 3.863626
6  C2   D3 52.81528 24.08639 4.629502


In [374]:
str(my.df)

'data.frame':	100 obs. of  5 variables:
 $ com : chr  "C1" "C2" "C2" "C2" ...
 $ dept: chr  "D1" "D2" "D4" "D3" ...
 $ x1  : num  41 47.1 44.5 44.9 49.2 ...
 $ x2  : num  18.9 20.3 24.9 17.4 20.4 ...
 $ x3  : num  5.44 6.06 5.45 5.66 3.86 ...


### Mean

#### Singlve Vector Mean
Input **MUST be numeric vector**, otherwise error

In [375]:
print( mean( my.df$x1, na.rm=T) )

[1] 50.39877


#### Columns Means
Input **MUST be matrix or numeric dataframe**, otherwise error

In [376]:
print( colMeans(my.df[,c('x1','x2','x3')]) )

      x1       x2       x3 
50.39877 20.40973  5.05275 


###  Summation
#### Single Vector Sum
Supply matrix or multiple columns of numeric data will sum all accross all columns

In [377]:
sum(my.df[,c('x1','x2','x3')])

#### Columns Sum

In [378]:
print( colSums(my.df[,c('x1','x2','x3')], na.rm=T) )

      x1       x2       x3 
5039.877 2040.973  505.275 


## Frequency Table

### One Dimension

#### table

In [379]:
t1 = table(my.df$com)
print( t1 )


C1 C2 C3 
46 30 24 


In [380]:
print( class(t1)  )
print( typeof(t1) )

[1] "table"
[1] "integer"


#### tidyverse

In [381]:
my.df %>% count(com)

com,n
C1,46
C2,30
C3,24


**count( )** is shorthand for below

In [382]:
my.df %>% group_by(com) %>% 
   summarize(n = n())

com,n
C1,46
C2,30
C3,24


### Two Dimension
#### table

In [383]:
table(my.df$com, my.df$dep)

    
     D1 D2 D3 D4 D5
  C1 18 12  8  6  2
  C2 10  4  8  2  6
  C3  8  4  2  4  6

#### tidyverse

In [384]:
my.df %>% count(com,dept) %>%
   spread(key=dept, value=n)

com,D1,D2,D3,D4,D5
C1,18,12,8,6,2
C2,10,4,8,2,6
C3,8,4,2,4,6


## Proportion Table
### One Dimension
#### prop.table

In [385]:
t1 = table(my.df$com)
prop.table( t1 )


  C1   C2   C3 
0.46 0.30 0.24 

#### tidyverse

In [386]:
my.df %>% count(com) %>%
  mutate(pct=n/sum(n))

com,n,pct
C1,46,0.46
C2,30,0.3
C3,24,0.24


### Two Dimension

#### prop.table
Table-wise proportion

In [387]:
t2 = table(my.df$com, my.df$dep)
prop.table(t2)

    
       D1   D2   D3   D4   D5
  C1 0.18 0.12 0.08 0.06 0.02
  C2 0.10 0.04 0.08 0.02 0.06
  C3 0.08 0.04 0.02 0.04 0.06

**Row-wise proportion (margin=1)**

In [388]:
t2 = table(my.df$com, my.df$dep)
prop.table(t2,margin = 1)

    
             D1         D2         D3         D4         D5
  C1 0.39130435 0.26086957 0.17391304 0.13043478 0.04347826
  C2 0.33333333 0.13333333 0.26666667 0.06666667 0.20000000
  C3 0.33333333 0.16666667 0.08333333 0.16666667 0.25000000

**Column-wise proportion (margin=2)**

In [389]:
t2 = table(my.df$com, my.df$dep)
prop.table(t2,margin = 2)

    
            D1        D2        D3        D4        D5
  C1 0.5000000 0.6000000 0.4444444 0.5000000 0.1428571
  C2 0.2777778 0.2000000 0.4444444 0.1666667 0.4285714
  C3 0.2222222 0.2000000 0.1111111 0.3333333 0.4285714

#### tidyverse
**Table-Wise Proportion**

In [390]:
my.df %>% count(com,dept) %>%
    mutate(pct=n/sum(n)) %>%
    select(com, dept, pct) %>%
    spread(key=dept, value=pct)

com,D1,D2,D3,D4,D5
C1,0.18,0.12,0.08,0.06,0.02
C2,0.1,0.04,0.08,0.02,0.06
C3,0.08,0.04,0.02,0.04,0.06


**Row-Wise Proportion**

In [391]:
my.df %>% group_by(com, dept) %>%
    summarize (n = n()) %>%
    mutate(pct=n/sum(n)) %>%
    select(com, dept, pct) %>%
    spread(key=dept, value=pct)

com,D1,D2,D3,D4,D5
C1,0.3913043,0.2608696,0.17391304,0.13043478,0.04347826
C2,0.3333333,0.1333333,0.26666667,0.06666667,0.2
C3,0.3333333,0.1666667,0.08333333,0.16666667,0.25


**Column-Wise Proportion**

In [392]:
my.df %>% group_by(dept,com) %>%
    summarize(n=n()) %>%
    mutate(pct=n/sum(n)) %>%
    select(com,dept, pct) %>%
    spread(key=dept,value=pct)

com,D1,D2,D3,D4,D5
C1,0.5,0.6,0.4444444,0.5,0.1428571
C2,0.2777778,0.2,0.4444444,0.1666667,0.4285714
C3,0.2222222,0.2,0.1111111,0.3333333,0.4285714
