In [1]:
import zipfile
import urllib2
import os

import shutil
from contextlib import closing

# Remote location of the archive and where it is cached locally (a 'GES2012'
# folder under the current working directory).
source_url = 'ftp://ftp.nhtsa.dot.gov/GES/GES12/GES12_Flatfile.zip'
zip_name = 'GES12_Flatfile.zip'
cwd = os.getcwd()
dir_path = os.path.join(cwd, 'GES2012')
zip_path = os.path.join(dir_path, zip_name)

if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# Download only when the archive is not cached yet.  The payload is streamed
# into a temporary ".part" file and renamed once complete, so an interrupted
# transfer never leaves a truncated archive that a later run would accept as
# already downloaded.
if not os.path.exists(zip_path):
    partial_path = zip_path + '.part'
    try:
        # urllib2 responses are not context managers; closing() makes sure
        # the connection is released even if the copy fails.
        with closing(urllib2.urlopen(source_url)) as response:
            with open(partial_path, 'wb') as fh:
                # Copy in chunks instead of holding the whole archive in memory.
                shutil.copyfileobj(response, fh)
        os.rename(partial_path, zip_path)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Unpack every member of the archive next to the archive itself.
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(dir_path)

In [2]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is stable from run to run.
sorted(os.listdir(dir_path))

['2012GESFlatFileTXT.sas',
 'ACCIDENT.TXT',
 'CEVENT.TXT',
 'DAMAGE.TXT',
 'DISTRACT.TXT',
 'DRIMPAIR.TXT',
 'FACTOR.TXT',
 'GES12_Flatfile.zip',
 'MANEUVER.TXT',
 'NMCRASH.TXT',
 'NMIMPAIR.TXT',
 'NMPRIOR.TXT',
 'PARKWORK.TXT',
 'PERSON.TXT',
 'SAFETYEQ.TXT',
 'VEHICLE.TXT',
 'VEVENT.TXT',
 'VIOLATN.TXT',
 'VISION.TXT',
 'VSOE.TXT']

In [3]:
import pandas as pd
import numpy as np
import sklearn

# Resolve PERSON.TXT inside the 'GES2012' folder under the current working
# directory.
cwd = os.getcwd()
dir_path = os.path.join(cwd, 'GES2012')
input_file_path = os.path.join(dir_path, 'PERSON.TXT')

# The file is parsed as tab-separated text.
input_data = pd.read_csv(input_file_path, sep='\t')

In [4]:
# Alphabetical list of the column names in the loaded table.
sorted(input_data.columns.tolist())

['AGE',
 'AGE_IM',
 'AIR_BAG',
 'ALC_RES',
 'ALC_STATUS',
 'ATST_TYP',
 'BODY_TYP',
 'CASENUM',
 'DRINKING',
 'DRUGRES1',
 'DRUGRES2',
 'DRUGRES3',
 'DRUGS',
 'DRUGTST1',
 'DRUGTST2',
 'DRUGTST3',
 'DSTATUS',
 'EJECTION',
 'EJECT_IM',
 'EMER_USE',
 'FIRE_EXP',
 'HARM_EV',
 'HOSPITAL',
 'HOUR',
 'IMPACT1',
 'INJSEV_IM',
 'INJ_SEV',
 'LOCATION',
 'MAKE',
 'MAN_COLL',
 'MINUTE',
 'MOD_YEAR',
 'MONTH',
 'PERALCH_IM',
 'PER_NO',
 'PER_TYP',
 'PJ',
 'PSU',
 'PSUSTRAT',
 'P_SF1',
 'P_SF2',
 'P_SF3',
 'REGION',
 'REST_MIS',
 'REST_USE',
 'ROLLOVER',
 'SCH_BUS',
 'SEAT_IM',
 'SEAT_POS',
 'SEX',
 'SEX_IM',
 'SPEC_USE',
 'STRATUM',
 'STR_VEH',
 'TOW_VEH',
 'VEH_NO',
 'VE_FORMS',
 'WEIGHT']

In [5]:
# How many rows carry each INJSEV_IM code.
input_data['INJSEV_IM'].value_counts()

0    100840
2     20758
1     19380
3      9738
5      1179
4      1178
6         4
dtype: int64

In [6]:
# Exclude the rows whose INJSEV_IM code is 6.
keep_rows = input_data.INJSEV_IM != 6
input_data = input_data[keep_rows]

# Report each column that still contains missing values, with its NaN count.
for column_name in input_data.columns:
    n_nans = input_data[column_name].isnull().sum()
    if n_nans == 0:
        continue
    print('{} {}'.format(column_name, n_nans))

MAKE 5162
BODY_TYP 5162
MOD_YEAR 5162
TOW_VEH 5162
SPEC_USE 5162
EMER_USE 5162
ROLLOVER 5162
IMPACT1 5162
FIRE_EXP 5162


In [7]:
# Shape before filtering, for comparison with the shape printed at the end.
print(input_data.shape)

# Keep only the rows that have a MAKE value.
# NOTE(review): presumably the other NaN-bearing columns are missing on the
# same rows - confirm before relying on this to remove every NaN.
has_make = input_data.MAKE.notnull()
data = input_data[has_make]

# Take both injury-severity columns out of the feature table: INJSEV_IM
# becomes the prediction target, INJ_SEV is set aside and not used as a feature.
target = data.pop('INJSEV_IM')
discarded = data.pop('INJ_SEV')
print(data.shape)

(153073, 58)
(147911, 56)


In [8]:
# Turn the severity code into a binary float target: 1.0 where the code is 4,
# 0.0 otherwise.
# NOTE(review): code 4 is presumably the fatal-injury category - confirm
# against the GES codebook.
# `target` is overwritten, so re-running this cell on its own would compare
# the already-binarised values against 4 and produce all zeros.
is_code_4 = target == 4
target = is_code_4.astype('float')

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV


# Imported explicitly: a bare `import sklearn` does not by itself guarantee
# that the cross_validation submodule is loaded, so reaching it as an
# attribute of the package only works if something else already imported it.
from sklearn.cross_validation import train_test_split

# 50/50 train/test split.  The seed makes the split - and every score computed
# from it in the following cells - reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(
    data.values, target.values, train_size=0.5, random_state=0)

# Ordinary least squares fitted to the 0/1 target; its real-valued predictions
# are passed to roc_auc_score as ranking scores.
linreg = LinearRegression()
linreg.fit(xtrain, ytrain)

lr_preds = linreg.predict(xtest)
lr_perf = roc_auc_score(ytest, lr_preds)
print('OLS: Area under the ROC curve = {}'.format(lr_perf))

OLS: Area under the ROC curve = 0.934323972017


In [10]:
from sklearn.linear_model import Ridge

# Ridge regression with alpha picked by grid search over 10 log-spaced values
# between 1e-10 and 1e10.
ridge_alphas = np.logspace(-10, 10, 10)
ridge = GridSearchCV(Ridge(), {'alpha': ridge_alphas})
ridge.fit(xtrain, ytrain)

# Evaluate on the held-out split, using the predictions as ranking scores.
ridge_preds = ridge.predict(xtest)
ridge_performance = roc_auc_score(ytest, ridge_preds)
print('Ridge: Area under the ROC curve = {}'.format(ridge_performance))

Ridge: Area under the ROC curve = 0.934324583909


In [11]:
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

# Lasso regression with alpha picked by grid search over 5 log-spaced values
# between 1e-10 and 1e-8.
lasso_alphas = np.logspace(-10, -8, 5)
lasso = GridSearchCV(Lasso(), {'alpha': lasso_alphas})
lasso.fit(xtrain, ytrain)

# Evaluate on the held-out split, using the predictions as ranking scores.
lasso_preds = lasso.predict(xtest)
lasso_performance = roc_auc_score(ytest, lasso_preds)
print('Lasso: Area under the ROC curve = {}'.format(lasso_performance))

Lasso: Area under the ROC curve = 0.934325729579




In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

# Gradient boosting classifier with n_estimators=500, fitted directly on the
# training split (no grid search here).
gbm = GradientBoostingClassifier(n_estimators=500)
gbm.fit(xtrain, ytrain)

# The second predict_proba column (index 1) is used as the score for the ROC AUC.
gbm_class_probs = gbm.predict_proba(xtest)
gbm_preds = gbm_class_probs[:, 1]
gbm_performance = roc_auc_score(ytest, gbm_preds)

print('GBM: Area under the ROC curve = {}'.format(gbm_performance))

GBM: Area under the ROC curve = 0.973040837762


In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Decision tree with max_depth picked by grid search over the depths 3..9.
tree_depths = np.arange(3, 10)
tree = GridSearchCV(DecisionTreeClassifier(), {'max_depth': tree_depths})
tree.fit(xtrain, ytrain)

# The second predict_proba column (index 1) is used as the score for the ROC AUC.
tree_class_probs = tree.predict_proba(xtest)
tree_preds = tree_class_probs[:, 1]
tree_performance = roc_auc_score(ytest, tree_preds)

print('DecisionTree: Area under the ROC curve = {}'.format(tree_performance))

DecisionTree: Area under the ROC curve = 0.915532659042


In [14]:
# Label the fitted GBM's feature importances with the feature column names,
# then show the ten largest.
importances = pd.Series(gbm.feature_importances_, index=data.columns)
top_importances = importances.order(ascending=False).head(10)
print(top_importances)

STRATUM     0.116825
EJECT_IM    0.062802
HOSPITAL    0.058124
WEIGHT      0.056561
HARM_EV     0.053303
DRINKING    0.047378
MOD_YEAR    0.043622
ALC_RES     0.041724
HOUR        0.034329
AGE         0.031445
dtype: float64
