# Package Import

### Standard Packages

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
import bokeh.plotting as bkp
import bokeh.io as bki
import bokeh.palettes as bkpal
from bokeh.transform import dodge


In [3]:
bki.output_notebook()

In [4]:
from bokeh.io import curdoc
from bokeh.themes import Theme

fsz = '16pt'

curdoc().theme = Theme(json={'attrs': {
    # apply defaults to Title properties
    'Title': {
        'text_font_size': fsz
    },
    # apply defaults to Axis properties
    'Axis': {
        'axis_label_text_font_size': fsz,
        'major_label_text_font_size' : fsz
    },
     # apply defaults to Legend properties
    'Legend': {
        'label_text_font_size': fsz
    }
}})

### Local Exchangeability Package

In [11]:
import localexch

ModuleNotFoundError: No module named 'localexch'

# Data Preprocessing

### Extract Relevant Columns, Simplify, Tidy

In [101]:
df = pd.read_csv('bicycle-crash-data-chapel-hill-region.csv', sep=';')
#print(df.columns)

#preprocessing data

#extract used columns
cols_used = ['BikeInjury', 'Biker Intox.', 'BikeAge', 'BikeSex', 'BikeDir', 'BikePos', 
        'DrvrVehTyp', 'LightCond', 'SpeedLimit', 'Day of Week', 'CrashHour']
df = df[cols_used]

#extract rows without missing data
ageknown = (df.BikeAge != '999') & (df.BikeAge != '70+')
lightknown = (df['LightCond'] != 'Unknown') & (df['LightCond'] != 'Other')
vehknown = df['DrvrVehTyp'] != 'Unknown'
spdknown = df['SpeedLimit'] != 'Unknown'
injknown = df['BikeInjury'] != 'Unknown Injury'
dirknown = df['BikeDir'] != 'Unknown'
posknown = df['BikePos'] != 'Unknown'
df = df[ageknown & lightknown & vehknown & spdknown & injknown & dirknown & posknown]

#convert ages to integers
df['BikeAge'] = df['BikeAge'].astype(int)

#simplify injury type
def simplify_inj(inj):
    if 'Minor' in inj:
        return 'Minor'
    elif 'Serious' in inj:
        return 'Serious'
    elif 'Killed' in inj:
        return 'Killed'
    elif 'Possible' in inj:
        return 'Possible'
    elif 'No In' in inj:
        return 'None'
    else:
        return 'NA'
df['BikeInjury'] = df['BikeInjury'].apply(simplify_inj)

#simplify light condition
df['LightCond'] = df['LightCond'].apply(lambda x : x[:4])

#simplify vehicle types
def simplify_veh(veh):
    if 'Truck' in veh or 'truck' in veh or 'Motor Home' in veh:
        return 'Truck'
    elif 'Bus' in veh:
        return 'Bus'
    elif 'EMS' in veh or 'Police' in veh:
        return 'Emergency'
    else:
        return veh
df['DrvrVehTyp'] = df['DrvrVehTyp'].apply(simplify_veh)

#simplify bike direction
def simplify_dir(dr):
    if 'With' in dr:
        return 'With'
    elif 'Not' in dr:
        return 'NA'
    else:
        return 'Against'
df['BikeDir'] = df['BikeDir'].apply(simplify_dir)

#simplify bike position
def simplify_pos(pos):
    if 'With' in pos:
        return 'With'
    elif 'Not' in dr:
        return 'NA'
    else:
        return 'Against'
df['BikeDir'] = df['BikeDir'].apply(simplify_dir)

#simplify day of week
day_map = {'Sunday': 0,
           'Monday': 1,
           'Tuesday': 2,
           'Wednesday': 3,
           'Thursday': 4,
           'Friday': 5,
           'Saturday': 6}
df['Day of Week'] = df['Day of Week'].apply(lambda x : day_map[x])

#simplify speed limit
def simplify_spd(spd):
    return spd.strip().split(' ')[2]
df['SpeedLimit'] = df['SpeedLimit'].apply(simplify_spd)
df['SpeedLimit'] = df['SpeedLimit'].astype(int)

print('Single Dataframe Version:')
print(df.head())

print('')
print('')
print('Split Covariate/Outcome:')

#create a version of the data with covariates and observations separate
Y = df['BikeInjury']
X = df.drop(['BikeInjury'], axis=1)

print('X:')
print(X.head())
print('Y:')
print(Y.head())

Single Dataframe Version:
  BikeInjury Biker Intox.  BikeAge BikeSex  BikeDir  \
0       None           No       12    Male     With   
2   Possible           No       36    Male     With   
3   Possible           No       63    Male     With   
4   Possible           No       14    Male     With   
5       None           No       57    Male  Against   

                      BikePos     DrvrVehTyp LightCond  SpeedLimit  \
0                 Travel Lane  Passenger Car      Dayl          35   
2  Bike Lane / Paved Shoulder  Passenger Car      Dayl          45   
3                 Travel Lane  Sport Utility      Dayl          35   
4                 Travel Lane  Passenger Car      Dayl          35   
5                 Non-Roadway         Pickup      Dayl          15   

   Day of Week  CrashHour  
0            6         18  
2            4         17  
3            3         18  
4            3         15  
5            6         13  


Split Covariate/Outcome:
X:
  Biker Intox.  BikeAge 

# Data Visualization

In [88]:
#obtain rows of df for intoxicated biker / severe outcomes
intox = (df['Biker Intox.'] == 'Yes')
severe = (df['BikeInjury'] == 'Serious') | (df['BikeInjury'] == 'Killed') 

indices = [ (~severe & ~intox), (~severe & intox), (severe & ~intox),  (severe & intox) ]
colors = [bkpal.Category10[10][0], bkpal.Category10[10][0], bkpal.Category10[10][1], bkpal.Category10[10][1]]
alphas = [0.3, 0.7, 0.3, 0.7]
labels=['Not Severe, Not Intoxicated', 'Not Severe, Intoxicated', 'Severe, Not Intoxicated', 'Severe, Intoxicated']

coln = df.DrvrVehTyp.unique()
vals = np.zeros(coln.shape[0])
totalcts = df.DrvrVehTyp.value_counts()[coln]
fig = bkp.figure(width=800, height=400, y_range=coln,
                x_axis_label='Fraction')
print(totalcts)
for i in range(4):
    cts = df.DrvrVehTyp[indices[i]].value_counts()[coln]
    cts[np.isnan(cts)] = 0
    fig.hbar(y = coln, left = vals/totalcts, right=(vals+cts)/totalcts,
                 fill_color=colors[i], line_color='black', line_width=0.5, height=.7, fill_alpha=alphas[i])
    vals = vals + cts
bkp.show(fig)


coln = np.sort(df.SpeedLimit.unique())
vals = np.zeros(coln.shape[0])
totalcts = df.SpeedLimit.value_counts()[coln]
fig = bkp.figure(width=600, height=250, y_range=[str(x) for x in coln],
                x_axis_label='Fraction',
                y_axis_label='Speed Limit (MPH)')
for i in range(4):
    cts = df.SpeedLimit[indices[i]].value_counts()[coln]
    cts[np.isnan(cts)] = 0
    fig.hbar(y = [str(x) for x in coln], left = vals/totalcts, right=(vals+cts)/totalcts,
                 fill_color=colors[i], line_color='black', line_width=0.5, height=.7, fill_alpha=alphas[i])
    vals = vals + cts
bkp.show(fig)


coln = np.sort(df.BikeSex.unique())
vals = np.zeros(coln.shape[0])
totalcts = df.BikeSex.value_counts()[coln]
fig = bkp.figure(width=600, height=250, y_range=[str(x) for x in coln],
                x_axis_label='Fraction',
                y_axis_label='Speed Limit (MPH)')
for i in range(4):
    cts = df.BikeSex[indices[i]].value_counts()[coln]
    cts[np.isnan(cts)] = 0
    fig.hbar(y = [str(x) for x in coln], left = vals/totalcts, right=(vals+cts)/totalcts,
                 fill_color=colors[i], line_color='black', line_width=0.5, height=.7, fill_alpha=alphas[i])
    vals = vals + cts
bkp.show(fig)

coln = np.array(['Dawn', 'Dayl', 'Dusk', 'Dark'])
vals = np.zeros(coln.shape[0])
totalcts = df.LightCond.value_counts()[coln]
fig = bkp.figure(width=600, height=200, y_range=[str(x) for x in coln],
                x_axis_label='Fraction')
for i in range(4):
    cts = df.LightCond[indices[i]].value_counts()[coln]
    cts[np.isnan(cts)] = 0
    fig.hbar(y = [str(x) for x in coln], left = vals/totalcts, right=(vals+cts)/totalcts,
                 fill_color=colors[i], line_color='black', line_width=0.5, height=.7, fill_alpha=alphas[i])
    vals = vals + cts
bkp.show(fig)


coln = np.sort(df.BikeAge.unique())
vals = np.zeros(coln.shape[0])
totalcts = df.BikeAge.value_counts()[coln]
fig = bkp.figure(width=600, height=2000, y_range=[str(x) for x in coln],
                x_axis_label='Fraction',
                y_axis_label='Speed Limit (MPH)')
for i in range(4):
    cts = df.BikeAge[indices[i]].value_counts()[coln]
    cts[np.isnan(cts)] = 0
    fig.hbar(y = [str(x) for x in coln], left = vals/totalcts, right=(vals+cts)/totalcts,
                 fill_color=colors[i], line_color='black', line_width=0.5, height=.7, fill_alpha=alphas[i])
    vals = vals + cts
bkp.show(fig)


coln = np.sort(df.LightCond.unique())
vals = np.zeros(coln.shape[0])
totalcts = df.LightCond.value_counts()
fig = bkp.figure(width=600, height=400, y_range=[str(x) for x in coln],
                x_axis_label='Fraction', x_range=(0, 5))
for i in range(4):
    cts = df.LightCond[indices[i]].value_counts()[coln]
    cts[np.isnan(cts)] = 0
    fig.hbar(y = [str(x) for x in coln], left = vals/totalcts, right=(vals+cts)/totalcts,
                 fill_color=colors[i], line_color='black', line_width=0.5, height=.7, fill_alpha=alphas[i],
                legend=labels[i])
    vals = vals + cts
bkp.show(fig)

Passenger Car           5051
Sport Utility           1623
Pickup                  1271
Truck                    261
Van                      466
Motorcycle                50
Bus                       50
Emergency                 50
Tractor/Semi-Trailer      24
Taxicab                   17
Moped                      6
Name: DrvrVehTyp, dtype: int64


# 

In [85]:
unique_bin_cts = df.groupby(['BikeAge','BikeSex','Biker Intox.', 'BikeDir', 'BikePos', 'Day of Week', 'CrashHour', 'DrvrVehTyp','LightCond','SpeedLimit']).count()
#print(unique_bin_cts)
#unique_bin_cts = df.groupby(['BikeSex','Biker Intox.', 'BikeDir', 'BikePos', 'DrvrVehTyp','LightCond','SpeedLimit']).count()


fig = bkp.figure(x_axis_type='log')
hist, edges = np.histogram(unique_bin_cts['BikeInjury'], bins=np.logspace(0, 3, 12))
fig.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color=bkpal.Category10[10][0], line_color="white", alpha=0.4, legend='Frac Severe, Not Intox')
bkp.show(fig)

# Local Permutation Test

Does biker intoxication affect probability of severe injury?

The null hypothesis is that Biker Intoxication doesn't affect the probability of severe injury. Therefore we assume the observations are exchangeable w.r.t. that covariate, while the others are dealt with via the premetric:

### Premetric

In [None]:
#X columns: ['Biker Intox.', 'BikeAge', 'BikeSex', 'BikeDir', 'BikePos', 
#        'DrvrVehTyp', 'LightCond', 'SpeedLimit', 'Day of Week', 'CrashHour']

age_w = 0.1
spd_w = 0.1
day_w = 0.1
hr_w = 0.1

def premetric(x1, x2):
    cat_idcs = [2,3,4,5,6]
    if (x1[cat_idcs] != x2[cat_idcs]).any():
        return 1
    else:
        return min(1, 
                  age_w*np.fabs(x1[1]-x2[1]) + 
                  spd_w*np.fabs(x1[7]-x2[7]) +
                  day_w*np.fabs(x1[8]-x2[8]) + 
                  hr_w*np.fabs(x1[9]-x2[9])
                  )           

### Test Statistic

In [94]:
def test_stat(X, Y):
    intox = (X['Biker Intox.'] == 'Yes')
    severe = (Y == 'Serious') | (Y == 'Killed') 
    return (severe & intox).sum()/intox.sum() - (severe & ~intox).sum()/(~intox).sum()

### Permutation Group Sampler

In [36]:
def construct_groups(X, premetric, alpha):
    groups = [[0]]
    for i in range(1, X.shape[0]):
        for j in range(len(groups)):
            if (np.array([premetric(X.loc[i], X.loc[k]) for k in groups[j]]) <= alpha/2).all():
                groups[j].append(i)
        groups.append([i])
    return groups

def construct_pairs(X, match_coln, premetric, alpha):
    

auto_sampler = localexch.design_clustered_sampler(X, premetric)

auto_sampler_2 = localexch.design_paired_sampler(X, premetric, 'Biker Intox.')

def sampler(X, premetric, alpha):
    #paired matching sampler

NameError: name 'localexch' is not defined

### Run the test

In [37]:
localexch.test(X, Y, test_stat, group_sampler, premetric, n_samples)

NameError: name 'localexch' is not defined

### Visualization

# Local Estimation

What is the probability of severe outcome as a function of biker age / intoxication state

In [None]:
localexch.estimate(prediction_pts, data, premetric)

In [219]:

intox = (df['Biker Intox.'] == 'Yes')
severe = (df['BikeInjury'] == 'Serious') | (df['BikeInjury'] == 'Killed')

print((severe & intox).sum() / ((severe & intox).sum() +(~severe & intox).sum() ))
print((severe & ~intox).sum() / ((severe & ~intox).sum() +(~severe & ~intox).sum() ))


fig = bkp.figure()

hist1, edges = np.histogram(df['BikeAge'][~severe & ~intox], bins=np.arange(0, 100, 1))
hist2, edges = np.histogram(df['BikeAge'][severe & ~intox], bins=np.arange(0, 100, 1))
fig.quad(top=hist2/(hist1+hist2), bottom=0, left=edges[:-1], right=edges[1:],
           fill_color=bkpal.Category10[10][0], line_color="white", alpha=0.4, legend='Frac Severe, Not Intox')

hist1, edges = np.histogram(df['BikeAge'][~severe & intox], bins=np.arange(0, 100, 1))
hist2, edges = np.histogram(df['BikeAge'][severe & intox], bins=np.arange(0, 100, 1))
fig.quad(top=hist2/(hist1+hist2), bottom=0, left=edges[:-1], right=edges[1:],
           fill_color=bkpal.Category10[10][1], line_color="white", alpha=0.4, legend='Frac Severe, Intox')


bkp.show(fig)

0.1984251968503937
0.07363245236631838


  if sys.path[0] == '':


In [None]:
fig = bkp.figure()
hist, edges = np.histogram(df['BikeAge'], bins=np.arange(0, 100, 1))
fig.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color=bkpal.Category10[10][0], line_color="white", alpha=0.4)
bkp.show(fig)

In [185]:
print(df.BikeInjury.unique())



['None' 'Possible' 'Minor' 'Serious' 'Killed']


In [119]:
intoxrows = (df['Biker Intox.'] == 'Yes')
intox = df[cols_used][intoxrows]
nintox = df[cols_used][~intoxrows]
print(intox.shape)
print(nintox.shape)
injuries = (intox['BikeInjury'] == 'A: Suspected Serious Injury') | (intox['BikeInjury'] == 'K: Killed')
print(injuries.sum())
injuries = (nintox['BikeInjury'] == 'A: Suspected Serious Injury') | (nintox['BikeInjury'] == 'K: Killed')
print(injuries.sum())

(796, 29)
(10470, 29)
139
688


In [120]:
139/796

0.17462311557788945

In [121]:
688/10470

0.06571155682903534