# Waze (R)

In [1]:
# Libraries
library(tidyverse)
library(readxl)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Course 2 tasks:

- Import data
- Create a dataframe 
- Inspect data 
- Identify outliers
- Create a data visualization
- Share an executive summary with the Waze data team 

In [2]:
# Import the Waze dataset into a tibble. The first row is used as the header
# and column types are guessed by readr (both are read_csv() defaults).
df <- read_csv(file.path("data", "waze_dataset.csv"))

[1mRows: [22m[34m14999[39m [1mColumns: [22m[34m13[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): label, device
[32mdbl[39m (11): ID, sessions, drives, total_sessions, n_days_after_onboarding, tot...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
# Check df dimensions
# Compare (rows, columns) of df against the expected shape element-wise.
# `==` is vectorised, so this yields one logical per dimension without an
# index loop: c(TRUE, TRUE) means both the row and column counts match.
expected_shape <- c(14999, 13)
dim(df) == expected_shape

In [4]:
# Preview the first ten rows of the dataset
head(df, n = 10)

ID,label,sessions,drives,total_sessions,n_days_after_onboarding,total_navigations_fav1,total_navigations_fav2,driven_km_drives,duration_minutes_drives,activity_days,driving_days,device
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
0,retained,283,226,296.74827,2276,208,0,2628.8451,1985.7751,28,19,Android
1,retained,133,107,326.8966,1225,19,64,13715.9206,3160.4729,13,11,iPhone
2,retained,114,95,135.52293,2651,0,0,3059.1488,1610.7359,14,8,Android
3,retained,49,40,67.58922,15,322,7,913.5911,587.1965,7,3,iPhone
4,retained,84,68,168.24702,1562,166,5,3950.202,1219.5559,27,18,Android
5,retained,113,103,279.54444,2637,0,0,901.2387,439.1014,15,11,iPhone
6,retained,3,2,236.72531,360,185,18,5249.1728,726.5772,28,23,iPhone
7,retained,39,35,176.07284,2999,0,0,7892.0525,2466.9817,22,20,iPhone
8,retained,57,46,183.53202,424,0,26,2651.7098,1594.343,25,20,Android
9,churned,84,68,244.80211,2997,72,0,6043.4603,2341.8385,7,3,iPhone


In [5]:
# Show each column's type together with its first few values
str(df)

spc_tbl_ [14,999 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ ID                     : num [1:14999] 0 1 2 3 4 5 6 7 8 9 ...
 $ label                  : chr [1:14999] "retained" "retained" "retained" "retained" ...
 $ sessions               : num [1:14999] 283 133 114 49 84 113 3 39 57 84 ...
 $ drives                 : num [1:14999] 226 107 95 40 68 103 2 35 46 68 ...
 $ total_sessions         : num [1:14999] 296.7 326.9 135.5 67.6 168.2 ...
 $ n_days_after_onboarding: num [1:14999] 2276 1225 2651 15 1562 ...
 $ total_navigations_fav1 : num [1:14999] 208 19 0 322 166 0 185 0 0 72 ...
 $ total_navigations_fav2 : num [1:14999] 0 64 0 7 5 0 18 0 26 0 ...
 $ driven_km_drives       : num [1:14999] 2629 13716 3059 914 3950 ...
 $ duration_minutes_drives: num [1:14999] 1986 3160 1611 587 1220 ...
 $ activity_days          : num [1:14999] 28 13 14 7 27 15 28 22 25 7 ...
 $ driving_days           : num [1:14999] 19 11 8 3 18 11 23 20 20 3 ...
 $ device                 : chr [1:14999] "Andr

In [11]:
# Keep only the rows in which at least one column is NA
df_filtered_null <- filter(df, if_any(everything(), \(col) is.na(col)))

# Preview the first ten of those rows
head(df_filtered_null, n = 10)

ID,label,sessions,drives,total_sessions,n_days_after_onboarding,total_navigations_fav1,total_navigations_fav2,driven_km_drives,duration_minutes_drives,activity_days,driving_days,device
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
77,,63,50,133.10416,783,201,0,2649.016,1517.21,19,13,iPhone
80,,116,93,436.06018,1584,283,62,4183.41,3121.89,18,15,iPhone
98,,78,64,583.49279,3414,0,0,1811.141,642.1891,12,11,Android
111,,106,102,113.37906,2228,14,0,2817.482,2011.7243,17,13,Android
142,,32,26,222.12931,208,55,10,2459.816,874.4276,11,7,iPhone
162,,3,3,17.25022,3203,22,0,11819.366,3351.0196,3,0,iPhone
176,,283,226,529.53394,1110,6,0,3028.601,2489.697,17,11,iPhone
199,,27,23,196.83961,2800,156,0,11579.565,3373.4826,23,18,iPhone
266,,70,58,137.15128,3264,45,0,1305.947,996.3944,21,18,Android
283,,170,137,354.29766,869,18,25,3914.401,2500.267,3,2,Android


In [12]:
# Per-column summary statistics for the rows that contain an NA
summary(df_filtered_null)

       ID           label              sessions          drives     
 Min.   :   77   Length:700         Min.   :  0.00   Min.   :  0.0  
 1st Qu.: 3744   Class :character   1st Qu.: 23.00   1st Qu.: 20.0  
 Median : 7443   Mode  :character   Median : 56.00   Median : 47.5  
 Mean   : 7406                      Mean   : 80.84   Mean   : 67.8  
 3rd Qu.:11007                      3rd Qu.:112.25   3rd Qu.: 94.0  
 Max.   :14993                      Max.   :556.00   Max.   :445.0  
 total_sessions     n_days_after_onboarding total_navigations_fav1
 Min.   :   5.583   Min.   :  16            Min.   :   0.0        
 1st Qu.:  94.056   1st Qu.: 869            1st Qu.:   4.0        
 Median : 177.256   Median :1650            Median :  62.5        
 Mean   : 198.483   Mean   :1709            Mean   : 118.7        
 3rd Qu.: 266.058   3rd Qu.:2509            3rd Qu.: 169.2        
 Max.   :1076.880   Max.   :3498            Max.   :1096.0        
 total_navigations_fav2 driven_km_drives  durati

In [15]:
# Keep only the rows in which every column is non-NA (complete rows)
df_filtered_nonnull <- filter(df, if_all(everything(), \(col) !is.na(col)))

# Preview the first ten complete rows
head(df_filtered_nonnull, n = 10)

ID,label,sessions,drives,total_sessions,n_days_after_onboarding,total_navigations_fav1,total_navigations_fav2,driven_km_drives,duration_minutes_drives,activity_days,driving_days,device
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
0,retained,283,226,296.74827,2276,208,0,2628.8451,1985.7751,28,19,Android
1,retained,133,107,326.8966,1225,19,64,13715.9206,3160.4729,13,11,iPhone
2,retained,114,95,135.52293,2651,0,0,3059.1488,1610.7359,14,8,Android
3,retained,49,40,67.58922,15,322,7,913.5911,587.1965,7,3,iPhone
4,retained,84,68,168.24702,1562,166,5,3950.202,1219.5559,27,18,Android
5,retained,113,103,279.54444,2637,0,0,901.2387,439.1014,15,11,iPhone
6,retained,3,2,236.72531,360,185,18,5249.1728,726.5772,28,23,iPhone
7,retained,39,35,176.07284,2999,0,0,7892.0525,2466.9817,22,20,iPhone
8,retained,57,46,183.53202,424,0,26,2651.7098,1594.343,25,20,Android
9,churned,84,68,244.80211,2997,72,0,6043.4603,2341.8385,7,3,iPhone


In [17]:
# Per-column summary statistics for the complete (NA-free) rows
summary(df_filtered_nonnull)

       ID           label              sessions          drives      
 Min.   :    0   Length:14299       Min.   :  0.00   Min.   :  0.00  
 1st Qu.: 3750   Class :character   1st Qu.: 23.00   1st Qu.: 20.00  
 Median : 7504   Mode  :character   Median : 56.00   Median : 48.00  
 Mean   : 7504                      Mean   : 80.62   Mean   : 67.26  
 3rd Qu.:11258                      3rd Qu.:111.00   3rd Qu.: 93.00  
 Max.   :14998                      Max.   :743.00   Max.   :596.00  
 total_sessions      n_days_after_onboarding total_navigations_fav1
 Min.   :   0.2202   Min.   :   4.0          Min.   :   0.0        
 1st Qu.:  90.4577   1st Qu.: 878.5          1st Qu.:  10.0        
 Median : 158.7186   Median :1749.0          Median :  71.0        
 Mean   : 189.5474   Mean   :1751.8          Mean   : 121.7        
 3rd Qu.: 253.5404   3rd Qu.:2627.5          3rd Qu.: 178.0        
 Max.   :1216.1546   Max.   :3500.0          Max.   :1236.0        
 total_navigations_fav2 driven_km_

In [20]:
# Count the NA values in every column, separately for each device type.
# summarise() over a single grouping variable already returns an ungrouped
# tibble, so no explicit ungroup() step is needed afterwards.
df |>
    group_by(device) |>
    summarise(across(everything(), \(col) sum(is.na(col))))

device,ID,label,sessions,drives,total_sessions,n_days_after_onboarding,total_navigations_fav1,total_navigations_fav2,driven_km_drives,duration_minutes_drives,activity_days,driving_days
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
Android,0,253,0,0,0,0,0,0,0,0,0,0
iPhone,0,447,0,0,0,0,0,0,0,0,0,0
