# 3. Data transformation

Taken from [the book](https://r4ds.hadley.nz/data-transform).

In [2]:
library(nycflights13)
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


## Exploring the data

In [3]:
glimpse(flights)

Rows: 336,776
Columns: 19
$ year           [3m[90m<int>[39m[23m 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            [3m[90m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       [3m[90m<int>[39m[23m 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time [3m[90m<int>[39m[23m 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      [3m[90m<dbl>[39m[23m 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       [3m[90m<int>[39m[23m 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time [3m[90m<int>[39m[23m 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      [3m[90m<dbl>[39m[23m 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        [3m[90m<chr>[39m[23m "UA", "UA", "AA", "B6", "DL", "UA", "B6",

In [4]:
head(flights)

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,1,517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2013-01-01 05:00:00
2013,1,1,533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2013-01-01 05:00:00
2013,1,1,542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2013-01-01 05:00:00
2013,1,1,544,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2013-01-01 05:00:00
2013,1,1,554,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2013-01-01 06:00:00
2013,1,1,554,558,-4,740,728,12,UA,1696,N39463,EWR,ORD,150,719,5,58,2013-01-01 05:00:00


## Intro to `dplyr`

In [6]:
flights |>
  filter(dest == "IAH") |> 
  group_by(year, month, day) |> 
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  ) |>
  head()

[1m[22m`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.


year,month,day,arr_delay
<int>,<int>,<int>,<dbl>
2013,1,1,17.85
2013,1,2,7.0
2013,1,3,18.315789
2013,1,4,-3.2
2013,1,5,20.230769
2013,1,6,9.277778


In [9]:
flights |> 
  filter(dep_delay > 120) |>
  head(10)

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,1,848,1835,853,1001,1950,851,MQ,3944,N942MQ,JFK,BWI,41,184,18,35,2013-01-01 18:00:00
2013,1,1,957,733,144,1056,853,123,UA,856,N534UA,EWR,BOS,37,200,7,33,2013-01-01 07:00:00
2013,1,1,1114,900,134,1447,1222,145,UA,1086,N76502,LGA,IAH,248,1416,9,0,2013-01-01 09:00:00
2013,1,1,1540,1338,122,2020,1825,115,B6,705,N570JB,JFK,SJU,193,1598,13,38,2013-01-01 13:00:00
2013,1,1,1815,1325,290,2120,1542,338,EV,4417,N17185,EWR,OMA,213,1134,13,25,2013-01-01 13:00:00
2013,1,1,1842,1422,260,1958,1535,263,EV,4633,N18120,EWR,BTV,46,266,14,22,2013-01-01 14:00:00
2013,1,1,1856,1645,131,2212,2005,127,AA,181,N323AA,JFK,LAX,336,2475,16,45,2013-01-01 16:00:00
2013,1,1,1934,1725,129,2126,1855,151,MQ,4255,N909MQ,JFK,BNA,154,765,17,25,2013-01-01 17:00:00
2013,1,1,1938,1703,155,2109,1823,166,EV,4300,N18557,EWR,RIC,68,277,17,3,2013-01-01 17:00:00
2013,1,1,1942,1705,157,2124,1830,174,MQ,4410,N835MQ,JFK,DCA,60,213,17,5,2013-01-01 17:00:00


In [21]:
# Flights that departed on January 1
flights |> 
  filter(month == 1 & day == 1) |>
  sample_n(10)


year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,1,1318,1322,-4,1358,1416,-18,EV,4106,N19554,EWR,BDL,25,116,13,22,2013-01-01 13:00:00
2013,1,1,805,800,5,1118,1106,12,B6,3,N570JB,JFK,FLL,165,1069,8,0,2013-01-01 08:00:00
2013,1,1,857,905,-8,1107,1120,-13,DL,181,N321NB,LGA,DTW,110,502,9,5,2013-01-01 09:00:00
2013,1,1,1251,1252,-1,1611,1555,16,B6,85,N657JB,JFK,FLL,173,1069,12,52,2013-01-01 12:00:00
2013,1,1,629,630,-1,824,810,14,AA,303,N3CYAA,LGA,ORD,140,733,6,30,2013-01-01 06:00:00
2013,1,1,1515,1437,38,1834,1742,52,B6,347,N178JB,JFK,SRQ,171,1041,14,37,2013-01-01 14:00:00
2013,1,1,1005,1000,5,1239,1234,5,UA,1625,N81449,EWR,DEN,254,1605,10,0,2013-01-01 10:00:00
2013,1,1,1720,1725,-5,2121,2105,16,DL,513,N723TW,JFK,LAX,363,2475,17,25,2013-01-01 17:00:00
2013,1,1,1456,1500,-4,1649,1632,17,UA,685,N802UA,LGA,ORD,140,733,15,0,2013-01-01 15:00:00
2013,1,1,1422,1410,12,1613,1555,18,MQ,4491,N737MQ,LGA,CLE,93,419,14,10,2013-01-01 14:00:00


In [20]:
# Flights that departed in January or February
flights |> 
  filter(month == 1 | month == 2) |>
  sample_n(10)

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,16,2008,2005,3,2254,2258,-4,UA,405,N411UA,EWR,MCO,141,937,20,5,2013-01-16 20:00:00
2013,1,28,900,905,-5,1125,1115,10,MQ,4478,N739MQ,LGA,DTW,109,502,9,5,2013-01-28 09:00:00
2013,2,16,1758,1745,13,2114,2136,-22,DL,31,N721TW,JFK,SFO,348,2586,17,45,2013-02-16 17:00:00
2013,1,18,1541,1530,11,1755,1734,21,US,1665,N716UW,LGA,CLT,86,544,15,30,2013-01-18 15:00:00
2013,1,24,2348,2359,-11,418,444,-26,B6,739,N605JB,JFK,PSE,193,1617,23,59,2013-01-24 23:00:00
2013,2,18,1600,1604,-4,1718,1739,-21,UA,1053,N76254,EWR,ORD,120,719,16,4,2013-02-18 16:00:00
2013,2,27,2020,1900,80,2144,2018,86,EV,5714,N827AS,JFK,IAD,48,228,19,0,2013-02-27 19:00:00
2013,2,18,2022,2010,12,2325,2321,4,UA,1299,N37255,EWR,RSW,164,1068,20,10,2013-02-18 20:00:00
2013,1,21,601,608,-7,654,725,-31,UA,733,N822UA,EWR,BOS,32,200,6,8,2013-01-21 06:00:00
2013,2,28,1459,1500,-1,1747,1742,5,DL,2347,N6708D,LGA,ATL,112,762,15,0,2013-02-28 15:00:00


In [23]:
# A shorter way to select flights that departed in January or February
flights |> 
  filter(month %in% c(1, 2)) |>
  sample_n(10)

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>
2013,1,24,919.0,920,-1.0,1242.0,1233,9.0,UA,1275,N57869,EWR,LAX,332.0,2454,9,20,2013-01-24 09:00:00
2013,1,4,810.0,810,0.0,1029.0,1030,-1.0,FL,346,N899AT,LGA,ATL,115.0,762,8,10,2013-01-04 08:00:00
2013,1,6,1126.0,1130,-4.0,1253.0,1306,-13.0,EV,4431,N16151,EWR,RDU,75.0,416,11,30,2013-01-06 11:00:00
2013,1,22,1105.0,1105,0.0,1241.0,1245,-4.0,WN,542,N443WN,LGA,MDW,121.0,725,11,5,2013-01-22 11:00:00
2013,1,31,1801.0,1800,1.0,1922.0,1913,9.0,US,2185,N748UW,LGA,DCA,47.0,214,18,0,2013-01-31 18:00:00
2013,1,30,1523.0,1345,98.0,1754.0,1641,73.0,B6,1783,N805JB,JFK,MCO,139.0,944,13,45,2013-01-30 13:00:00
2013,1,25,730.0,710,20.0,933.0,850,43.0,MQ,3737,N507MQ,EWR,ORD,129.0,719,7,10,2013-01-25 07:00:00
2013,1,5,537.0,540,-3.0,831.0,850,-19.0,AA,1141,N5DBAA,JFK,MIA,153.0,1089,5,40,2013-01-05 05:00:00
2013,2,26,1556.0,1605,-9.0,1912.0,1911,1.0,B6,157,N794JB,JFK,MCO,157.0,944,16,5,2013-02-26 16:00:00
2013,2,9,,1600,,,1730,,9E,3453,,JFK,BOS,,187,16,0,2013-02-09 16:00:00
