-
Notifications
You must be signed in to change notification settings - Fork 8
/
regression_model.R
129 lines (112 loc) · 4.09 KB
/
regression_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
source("helpers.R")
library(glmnet)
# set up regression data
regression_collisions = query("
SELECT
unique_key,
extract(hour FROM collision_time) AS hour_of_day,
extract(dow FROM collision_time) AS day_of_week,
extract(year FROM collision_time) AS year,
coalesce(on_street_name, off_street_name) AS street_name,
borough,
number_of_motorists_injured + number_of_cyclists_injured + number_of_pedestrians_injured > 0 AS has_injury,
number_of_motorists_killed + number_of_cyclists_killed + number_of_pedestrians_killed > 0 AS has_fatality
FROM collisions
WHERE borough IS NOT NULL
ORDER BY unique_key
")
regression_contributing_factors = query("
WITH candidates AS (
SELECT contributing_factor
FROM collisions_contributing_factors
GROUP BY contributing_factor
HAVING COUNT(*) >= 5000
)
SELECT *
FROM collisions_contributing_factors
WHERE contributing_factor IN (SELECT contributing_factor FROM candidates)
")
regression_vehicle_types = query("
WITH candidates AS (
SELECT vehicle_type
FROM collisions_vehicles
GROUP BY vehicle_type
HAVING COUNT(*) >= 5000
)
SELECT *
FROM collisions_vehicles
WHERE vehicle_type IN (SELECT vehicle_type FROM candidates)
")
regression_vehicles_involved = query("
SELECT collision_unique_key, count(*)::int AS num_vehicles
FROM collisions_vehicles
GROUP BY collision_unique_key
ORDER BY collision_unique_key
")
regression_collisions = regression_collisions %>%
inner_join(regression_vehicles_involved, by = c("unique_key" = "collision_unique_key")) %>%
mutate(
hour_of_day = factor(hour_of_day, levels = c(12:23, 0:11)),
day_of_week = factor(day_of_week),
weekday = factor(day_of_week %in% 1:5),
year = factor(year),
borough = fct_relevel(factor(borough), "Manhattan"),
num_vehicles = factor(num_vehicles, levels = c(2, 1, 3, 4, 5)),
street_type = fct_relevel(factor(case_when(
grepl("expressway|expy|expwy|parkway|pkwy|highway|bqe|turnpike|fdr|thruway", street_name) ~ "highway",
grepl("street| st$", street_name) ~ "street",
grepl("avenue|broadway|bowery| ave$", street_name) ~ "avenue",
grepl(" road| rd$", street_name) ~ "road",
grepl(" lane| ln$", street_name) ~ "lane",
grepl(" drive| dr$", street_name) ~ "drive",
grepl("boulevard|blvd", street_name) ~ "boulevard",
grepl(" place| pl$", street_name) ~ "place",
grepl("bridge", street_name) ~ "bridge",
grepl("tunnel", street_name) ~ "tunnel",
!is.na(street_name) ~ "other",
TRUE ~ "unknown"
)), "unknown")
)
for(f in sort(unique(regression_contributing_factors$contributing_factor))) {
fname = paste0("cf_", gsub(".", "_", make.names(f), fixed = TRUE))
factor_unique_keys = regression_contributing_factors %>%
filter(contributing_factor == f) %>%
pull(collision_unique_key) %>%
unique()
regression_collisions = regression_collisions %>%
mutate(!!fname := as.numeric(unique_key %in% factor_unique_keys))
}
for(v in sort(unique(regression_vehicle_types$vehicle_type))) {
vname = paste0("vt_", gsub(".", "_", make.names(v), fixed = TRUE))
vehicle_unique_keys = regression_vehicle_types %>%
filter(vehicle_type == v) %>%
pull(collision_unique_key) %>%
unique()
regression_collisions = regression_collisions %>%
mutate(!!vname := as.numeric(unique_key %in% vehicle_unique_keys))
}
# build model matrices
injury_model_matrix = sparse.model.matrix(
has_injury ~ . - 1,
select(regression_collisions, -day_of_week, -street_name, -unique_key, -has_fatality)
)
fatality_model_matrix = sparse.model.matrix(
has_fatality ~ . - 1,
select(regression_collisions, -day_of_week, -street_name, -unique_key, -has_injury)
)
# run regularized regressions
injury_cvfit = cv.glmnet(
x = injury_model_matrix,
y = regression_collisions$has_injury,
family = "binomial"
)
fatality_cvfit = cv.glmnet(
x = fatality_model_matrix,
y = regression_collisions$has_fatality,
family = "binomial"
)
# check on lambda values and coefficients
plot(injury_cvfit)
coef(injury_cvfit, s = "lambda.1se")
plot(fatality_cvfit)
coef(fatality_cvfit, s = "lambda.1se")