## Working with Data

In [1]:
# Set the working directory.
# NOTE(review): this is a hard-coded absolute path, so the notebook only runs
# on a machine that has this folder; the relative file names used later
# ("Cars.txt", "Fuel_Efficiency.csv") are resolved against it.
setwd("C:/DataScience")

In [2]:
# Load the car data from a tab-delimited text file.
# The first line supplies the column names, and only the double quote is
# treated as a quoting character (apostrophes in values are kept literally).
cars <- read.table(
  file = "Cars.txt",
  header = TRUE,
  sep = "\t",
  quote = "\""
)

In [3]:
# Peek at the data: show the first few rows of the freshly loaded table
head(cars)

Name,Transmission,Cylinders,Fuel.Economy
Mazda RX4,Manual,6,21.0
Mazda RX4 Wag,Manual,6,21.0
Datsun 710,Manual,4,22.8
Hornet 4 Drive,Automatic,6,21.4
Hornet Sport,Automatic,8,18.7
Valiant,Automatic,6,18.1


In [4]:
# Attach the dplyr package, which provides the data-manipulation verbs used
# below (select, filter, mutate, group_by, summarize, arrange) and the %>% pipe.
# The start-up message about masked objects is expected.
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [5]:
# Keep only the three columns needed for the analysis (the Name column is dropped)
temp <- select(cars, Transmission, Cylinders, Fuel.Economy)

In [6]:
# Inspect the results: first few rows with only the selected columns
head(temp)

Transmission,Cylinders,Fuel.Economy
Manual,6,21.0
Manual,6,21.0
Manual,4,22.8
Automatic,6,21.4
Automatic,8,18.7
Automatic,6,18.1


In [7]:
# Keep only the rows for cars with an automatic transmission
temp <- filter(temp, Transmission == "Automatic")

In [8]:
# Inspect the results: every remaining row should have Transmission "Automatic"
head(temp)

Transmission,Cylinders,Fuel.Economy
Automatic,6,21.4
Automatic,8,18.7
Automatic,6,18.1
Automatic,8,14.3
Automatic,4,24.4
Automatic,4,22.8


In [9]:
# Compute a new column from Fuel.Economy.
# 0.425 is the conversion factor from miles per gallon to kilometres per litre,
# so Consumption holds distance per litre (a higher value is more economical).
temp <- mutate(temp, Consumption = Fuel.Economy * 0.425)

In [10]:
# Inspect the results: the table now has the additional Consumption column
head(temp)

Transmission,Cylinders,Fuel.Economy,Consumption
Automatic,6,21.4,9.095
Automatic,8,18.7,7.9475
Automatic,6,18.1,7.6925
Automatic,8,14.3,6.0775
Automatic,4,24.4,10.37
Automatic,4,22.8,9.69


In [11]:
# Group the rows by number of cylinders so later verbs operate per group
temp <- group_by(temp, Cylinders)

In [12]:
# Inspect the results: the rows look unchanged because group_by() only records
# the grouping; it does not reorder, add, or drop rows
head(temp)

Transmission,Cylinders,Fuel.Economy,Consumption
Automatic,6,21.4,9.095
Automatic,8,18.7,7.9475
Automatic,6,18.1,7.6925
Automatic,8,14.3,6.0775
Automatic,4,24.4,10.37
Automatic,4,22.8,9.69


In [13]:
# Collapse each Cylinders group to a single row holding its mean Consumption
temp <- summarize(temp, Avg.Consumption = mean(Consumption))

In [14]:
# Inspect the results: one row per Cylinders value with its average
head(temp)

Cylinders,Avg.Consumption
4,9.7325
6,8.128125
8,6.39625


In [15]:
# Sort the rows from highest to lowest Avg.Consumption
temp <- arrange(temp, desc(Avg.Consumption))

In [16]:
# Inspect the results: the order is the same as before sorting because the
# summary rows already happened to be in descending Avg.Consumption order
head(temp)

Cylinders,Avg.Consumption
4,9.7325
6,8.128125
8,6.39625


In [17]:
# Convert the dplyr result to a plain base-R data frame
efficiency <- as.data.frame(temp)

In [18]:
# Inspect the results: print the final table built step by step
print(efficiency)

  Cylinders Avg.Consumption
1         4        9.732500
2         6        8.128125
3         8        6.396250


In [19]:
# Run the whole transformation as a single pipeline.
# %>% passes the result on its left as the first argument of the call on its
# right, so no intermediate variable is needed.
efficiency <- cars %>%
  select(Fuel.Economy, Cylinders, Transmission) %>%
  filter(Transmission == "Automatic") %>%
  mutate(Consumption = Fuel.Economy * 0.425) %>%
  group_by(Cylinders) %>%
  summarize(Avg.Consumption = mean(Consumption)) %>%
  arrange(desc(Avg.Consumption)) %>%
  as.data.frame()

In [20]:
# Inspect the results: the pipeline should print the same table as the
# step-by-step version
print(efficiency)

  Cylinders Avg.Consumption
1         4        9.732500
2         6        8.128125
3         8        6.396250


In [21]:
# Save the results to a CSV file, omitting the row-name column
write.csv(efficiency, file = "Fuel_Efficiency.csv", row.names = FALSE)