/
Data-Manipulation.R
186 lines (136 loc) · 5.62 KB
/
Data-Manipulation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
##### Data Manipulation #####
# Make sure packages are installed
install.packages(c("tidyr", "dplyr", "knitr",
"rmarkdown", "formatR"))
library(dplyr) # Needed for subsetting data
library(tidyr) # Needed for reshaping data
library(ggplot2) # Needed for graphing
# Read in our dataset
gap <- read.csv(file = "data/gapminder-FiveYearData.csv")
##### dplyr #####
### Subsetting Data
# Always a good idea to take a look at our dataset
head(gap)
str(gap)
# Using dplyr's select() function to subset our data by column
# We can specify only the columns we want
yr_country_gdp <- select(gap, year, country, gdpPercap)
# Check the result
head(yr_country_gdp)
# Pipes: (the %>%)
# Think of this like an assembly line, the pipe sends the data to the next step
# the gap dataset is piped to the select() function.
# It always becomes the first argument for the next function in line (in this case, select)
yr_country_gdp <- gap %>% select(year, country, gdpPercap)
# We can pipe through multiple steps
# gap dataset gets piped to filter().
# Filter subsets the dataset to only include rows from Europe
# Then, the result gets piped to select, where we subset by column
yr_country_gdp_eu <- gap %>%
filter(continent == "Europe") %>%
select(year, country, gdpPercap)
# Let's look at the result
head(yr_country_gdp_eu)
# Instead of using pipes, we could nest functions this way. This does the same as above
# Get's hard to read and understand if there are many (10, 20, 30+) steps.
yr_country_gdp_eu <- select(filter(gap, continent == "Europe"), year, country, gdpPercap)
# the filter function can do multiple filters at once to subset rows
Africa_07 <- gap %>%
filter(year == 2007, continent == "Africa") %>%
select(year, country, lifeExp)
# Check the result
head(Africa_07)
### Summarizing Data
# We can summarize data, making new columns in the process
# The following calculates the mean GDP for the entire dataset
mean_gdp <- gap %>%
summarize(meanGDP = mean(gdpPercap))
# Show the result
mean_gdp
# The above is not so useful, we could just do it this way instead
mean(gap$gdpPercap)
# Summarizing becomes more useful when we summarize separately for different continents
# The following calculates meanGDP for each continent by using a group_by() term
# group_by() and summarize() work hand-in-hand, you usually need both
gdp_by_cont <- gap %>%
group_by(continent) %>%
summarize(meanGDP = mean(gdpPercap))
# Display the result, 4 rows of data with continent name, and mean GDP
gdp_by_cont
# You can group by multiple columns to show summarizes for each continent-year combination
# Summarize can create multiple summaries: in this case: mean of 2 columns, sd, sample size
gdp_by_cont <- gap %>%
group_by(continent, year) %>%
summarize(meanGDP = mean(gdpPercap),
sd_gdp = sd(gdpPercap),
mean_pop = mean(pop),
sample_size = n()
)
# Take a look at the result
head(gdp_by_cont)
# summarize() and other
gdp_by_cont %>%
data.frame() %>%
head()
# We can even use pipes to send datasets to ggplot
# Again, whatever is piped to ggplot becomes ggplot()'s first argument
gdp_by_cont %>%
ggplot(aes(x = mean_pop, y = meanGDP)) +
geom_point()
# We can create new columns (without summarizing), using the mutate() function
# You can do math with column names (as below), or any other R function
bill_gdp <- gap %>%
filter(year == 2007) %>%
mutate(billGDP = gdpPercap * pop / 10^9)
# Take a look at the result
head(bill_gdp)
# Most of data wrangling with dplyr can be accomplised with these five functions
# Subsetting with select() and filter()
# Summarizing with group_by() and summarize()
# Making new columns with mutate()
##### tidyr #####
### Wide vs Long Data
# Read in the same dataset, but in wide format
gap_wide <- read.csv(file = "data/gapminder_wide.csv")
# Take a look at this new dataset
# More columns and less rows than the original dataset
# Wide format is more like datasheets.
# But ggplot and many analyses need long format, more like the original gap dataset
head(gap_wide)
str(gap_wide)
# Remind ourselves what the gap dataset looks like
head(gap)
### Let's reshape from wide to long
# First, we use gather to put all the pop, lifeExp and gdpPerCap columns into two columns
# Obstype_year column becomes the original column names
# obs_values becomes the values
# This will make a dataset with many more rows and many fewer columns
# Note: tidyr works with pipes too
gap_long <- gap_wide %>%
gather(obstype_year, obs_values, starts_with('pop'),
starts_with('lifeExp'), starts_with('gdpPerCap'))
# Take a look at the result
str(gap_long)
head(gap_long)
# In addition to the above step, we also need to separate the obstype_year column
# It contains two kinds of information in it: the observation type, and the year
# We give it two column names to put the separated data into, and a separator to use
gap_long <- gap_long %>%
separate(obstype_year, into = c("obs_type", "year"),
sep = "_")
# Take a look at the result
head(gap_long)
str(gap_long)
# This should look like the same format as the gap dataset, though perhaps sorted differently
head(gap)
str(gap)
# The following takes the long dataset and goes back to the original wide format.
# We're just reversing the separate() and gather() steps
wide <- gap_long %>%
unite(var_names, obs_type, year, sep = "_") %>%
spread(var_names, obs_values)
# Take a look at the result
str(wide)
# Most of reshaping data can be done with four functions
# gather() and spread() to go back and forth between wide and long
# separate() and unite() to combine or separate columns