-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdplyr.R
111 lines (81 loc) Β· 3.35 KB
/
dplyr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#################################### dplyr #####################################
library(tidyverse)
library(lubridate)
library(nycflights13)
##################### using the select() function from dplyr ###################

# Print the full flights table from the nycflights13 package.
nycflights13::flights

# Build a 1000-row working copy called `df`.
# Variant 1: the first 1000 rows.
df <- nycflights13::flights %>%
  slice(1:1000)
# Variant 2: 1000 randomly drawn rows. This assignment replaces the one above,
# so every example below runs on the random sample.
df <- nycflights13::flights %>%
  sample_n(1000)

# Take a quick look at the data.
glimpse(df)

# select() keeps only the columns that are listed. Written as a plain call ...
select(df, origin, dep_delay, arr_delay)
# ... or, equivalently, with the pipe.
df %>% select(origin, dep_delay, arr_delay)

# A leading minus keeps everything except the named column.
df %>% select(-origin)

# select() can also reorder columns.
# Listing only two columns drops all the others ...
df %>% select(hour, minute)
# ... while adding everything() keeps the rest, placed after the listed ones.
df %>% select(hour, minute, everything())

# A range of columns can be selected ...
df %>% select(year:day) # ... by name
df %>% select(1:3) # ... or by position

# Columns can be renamed while selecting, using new_name = old_name.
df %>% select(flight_year = year, flight_day = day, origin)

# Helper functions select columns by matching on their names.
df %>% select(starts_with("arr_"))
df %>% select(ends_with("time"))
df %>% select(contains("_"))
df %>% select(-contains("_"))
################## using filter(), arrange(), and mutate() #####################

# filter() returns only the rows for which a condition holds. The condition
# must evaluate to a TRUE/FALSE logical vector.
# Here: keep only the flights (rows) from before September.
df %>% dplyr::filter(month < 9)

# arrange() sorts the rows by a column.
df %>%
  select(time_hour, everything()) %>%
  arrange(time_hour)

# Wrap the column in desc() to sort in descending order.
df %>%
  select(time_hour, everything()) %>%
  arrange(desc(time_hour))

# Several sort keys can be combined; each one may be ascending or descending.
df %>% arrange(year, desc(month), day)

# Renaming a column.
# select() can do it, but needs everything() to keep the remaining columns and
# also moves the renamed column to the front.
df %>% select(flight_day = day, everything())
# rename() is the simpler way: it only changes the name.
df %>% rename(flight_day = day)

# mutate() adds a new variable/field computed from existing ones.
# The trailing select() only moves the new column to the front.
df %>%
  mutate(date = as_date(time_hour)) %>%
  select(date, everything())
df %>%
  mutate(log_distance = log(distance)) %>%
  select(log_distance, everything())
####################### more advanced dplyr functions ##########################

# mutate_all() applies one function (here log) to every column it receives,
# so select() is used first to narrow the data down to three columns.
df %>%
  select(distance, arr_time, dep_time) %>%
  mutate_all(log)
# The same "apply to all" idea works for other verbs too, such as rename:
# the function is given the column names and returns the new names.
#
# rename_function() appends the suffix "_123" to every element of `x`.
rename_function <- function(x) {
  paste0(x, "_123")
}
# Three spellings of the same rename, from most to least verbose.
# 1. Pass a helper function that was defined and named beforehand.
df %>%
  rename_all(rename_function) %>%
  glimpse()
# 2. Same result with an anonymous function written inline.
df %>%
  rename_all(function(x) paste0(x, "_123")) %>%
  glimpse()
# 3. Same result again with the formula shorthand, where `.` is the argument.
df %>% rename_all(~ paste0(., "_123"))

# The _if variants only work on the columns for which a predicate is TRUE.
df %>% select_if(is.numeric)
df %>% mutate_if(is.numeric, function(x) log(abs(x)))
df %>% mutate_if(is.numeric, ~ log(abs(.)))

# The _at variants only work on the columns specified inside vars().
df %>%
  rename_at(
    vars(contains("time")),
    function(x) paste0(x, "__123")
  ) %>%
  glimpse()
df %>%
  mutate_at(
    vars(month, day, dep_time),
    function(x) x^2
  )

# Pipes can also be used inside the function handed to a dplyr verb.
df %>%
  mutate_if(
    is.character,
    ~ str_to_lower(.) %>% str_c("CONCAT")
  ) %>%
  glimpse()