In [5]:
# Import packages
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import itertools
import math
import os

<h1>Load Data</h1>

In [6]:
# Load data from files
df = pd.DataFrame()
path = 'data/'
for file in os.listdir(path):     # Concatinates all the csv's in the path folder and gives them labels based on year
    if file.lower().endswith(".csv"):
        file_name = path+"/"+file
        data = pd.read_csv(file_name) 
        data['Year'] = file[:file.index('.')]
        df = pd.concat([df, data]) # classified based on folder path

        
df = df[["SYSTEM", "DATE", "TIME", "SEVERITY", "ROAD_DESC", "VEHICLES", "CONTOUR", "CONDITION", "LIGHTING", "WEATHER", "CITY", "COUNTY", "LATITUDE", "LONGITUDE", "AGE_1", "AGE_2"]]
df

Unnamed: 0,SYSTEM,DATE,TIME,SEVERITY,ROAD_DESC,VEHICLES,CONTOUR,CONDITION,LIGHTING,WEATHER,CITY,COUNTY,LATITUDE,LONGITUDE,AGE_1,AGE_2
0,CITY STREET,1/1/2008,0.0,PDO,AT INTERSECTION,1,UNKNOWN,UNKNOWN,UNKNOWN,NONE,FAIRPLAY,PARK,0.000000,0.000000,50,0
1,CITY STREET,1/1/2008,12.0,PDO,UNKNOWN,1,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,PUEBLO,PUEBLO,0.000000,0.000000,0,0
2,CITY STREET,1/1/2008,13.0,INJ,INTERSECTION RELATED,2,STRAIGHT ON-LEVEL,WET W/VIS ICY ROAD TREATMENT,DARK-LIGHTED,UNKNOWN,DENVER,DENVER,39.738436,-104.982325,59,42
3,CITY STREET,1/1/2008,29.0,PDO,NON-INTERSECTION,1,CURVE ON-GRADE,DRY,DARK-UNLIGHTED,UNKNOWN,DURANGO,LA PLATA,0.000000,0.000000,18,0
4,CITY STREET,1/1/2008,30.0,PDO,NON-INTERSECTION,2,STRAIGHT ON-LEVEL,SNOWY,DARK-LIGHTED,UNKNOWN,LAKEWOOD,JEFFERSON,39.672860,-105.147678,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112186,STATE HIGHWAY,12/31/2007,2342.0,INJ,AT INTERSECTION,2,STRAIGHT ON-LEVEL,DRY,DARK-LIGHTED,NONE,PUEBLO,PUEBLO,38.219990,-104.643721,22,23
112187,CITY STREET,12/31/2007,2343.0,PDO,INTERSECTION RELATED,2,STRAIGHT ON-LEVEL,DRY,DARK-LIGHTED,NONE,DENVER,DENVER,39.733690,-105.000150,19,26
112188,INTERSTATE,12/31/2007,2343.0,PDO,NON-INTERSECTION,1,STRAIGHT ON-LEVEL,SNOWY,DARK-UNLIGHTED,NONE,VAIL,EAGLE,39.634044,-106.411499,34,0
112189,COUNTY ROAD,12/31/2007,2350.0,PDO,INTERSECTION RELATED,1,STRAIGHT ON-GRADE,ICY,DARK-LIGHTED,NONE,,DOUGLAS,39.559040,-104.896640,18,0


<h1>Function Definitions</h1>

In [120]:
# This function will get the nominal data points and count of a given column. Must be entered with quotations
def get_count(column):
    
    dict = {}
    for i in column:
        if not i in dict:
            dict[i] = 0

    for i in column:
        dict[i] = dict[i] + 1
    count = pd.DataFrame(columns=(column.name, "COUNT"))   

    for i in dict:
        data = {column.name:[i], "COUNT": [dict[i]]}
        temp = pd.DataFrame(data)
        count = pd.concat([count, temp])
    return count

In [122]:
# This is how we can access individual elements of the data frame
# for j in range(count_a.shape[0]):
#     print(count_a[col_a][j], count_a["COUNT"][j])

#This function will take 2 columns with nominal data points and will calculate the lift for all the data points
def lift(col_a, col_b, dataframe):
    col_a = pd.Series(col_a)
    col_b = pd.Series(col_b)
    lift_vals = []
    df_len = len(dataframe)
    count_a = get_count(col_a)
    count_b = get_count(col_b)
    combos = list(itertools.product(count_a[col_a.name], count_b[col_b.name]))
    
    count_a = count_a.rename({col_a.name: "NAME"}, axis=1)
    count_b = count_b.rename({col_b.name: "NAME"}, axis=1)
    count = pd.concat((count_a, count_b))
    
    dict = {}
    for (i, j) in count.iterrows():
        dict[j.NAME] = j.COUNT
    
    #Now that we got all the combinations, we gotta calculate the lift of each one
    for(i,j) in combos:
        a_and_b = len(df.loc[(df[col_a.name]== i) & (df[col_b.name]== j)])
        lift = (a_and_b/df_len)/((dict[i]/df_len)*(dict[j]/df_len))
        print("The lift of", i,"and",j, "=", lift)

#---------------------------------------------------------------------------------------------------------------
newframe = pre_processing2(df.LIGHTING, "LIGHTING", df.SEVERITY, "SEVERITY")
lift(newframe.LIGHTING, newframe.SEVERITY, newframe)

The lift of DARK-LIGHTED and INJ = 1.0083382249706225
The lift of DARK-LIGHTED and PDO = 0.9958633456245762
The lift of DARK-LIGHTED and FAT = 1.2455616476979654
The lift of DARK-UNLIGHTED and INJ = 0.9502644977631927
The lift of DARK-UNLIGHTED and PDO = 1.0078363196149338
The lift of DARK-UNLIGHTED and FAT = 2.41290114058314
The lift of DAYLIGHT and INJ = 1.0061998418941558
The lift of DAYLIGHT and PDO = 0.9995331061019059
The lift of DAYLIGHT and FAT = 0.7367211652057148
The lift of DAWN OR DUSK and INJ = 0.9782549232647272
The lift of DAWN OR DUSK and PDO = 1.0053604434083223
The lift of DAWN OR DUSK and FAT = 1.2871664827487923


In [49]:
# Pass in the data frame and an array of the column names you want to convert
def create_labels(df, columns):
    for i in columns:
        labels = LabelEncoder().fit_transform(df[i])
        df[i] = labels
    return df

In [125]:
#In the case of our chi squared, c = 2, r = 2, therefore d = 1
#Somethings gotta be wrong with my logic here, according to this everything's correlated >:(
def chi_squared(col_a, col_b, dataframe):
    N = len(dataframe)
    col_a = pd.Series(col_a)
    col_b = pd.Series(col_b)
    count_a = get_count(col_a)
    count_b = get_count(col_b)
    combos = list(itertools.product(count_a[col_a.name], count_b[col_b.name]))
    
    count_a = count_a.rename({col_a.name: "NAME"}, axis=1)
    count_b = count_b.rename({col_b.name: "NAME"}, axis=1)
    count = pd.concat((count_a, count_b))
    
    dict = {}
    for (i, j) in count.iterrows():
        dict[j.NAME] = j.COUNT
            
    for(i,j) in combos:
        # This gets us all of actual values between 2 nominal sets of data
        a_and_b = len(df.loc[(df[col_a.name]== i) & (df[col_b.name]== j)])
        a_and_notb = len(df.loc[(df[col_a.name]== i) & (df[col_b.name]!= j)])
        nota_and_b = len(df.loc[(df[col_a.name]!= i) & (df[col_b.name]== j)])
        nota_and_notb = len(df.loc[(df[col_a.name]!= i) & (df[col_b.name]!= j)])
        
        # This gets us all of the expected values between 2 nominal sets of data
        e_a_and_b = (dict[i]*dict[j])/len(df)
        e_a_and_notb = (dict[i]*(N-dict[j]))/len(df)
        e_nota_and_b = ((N-dict[i])*dict[j])/len(df)
        e_nota_and_notb = ((N-dict[i])*(N-dict[j]))/len(df)
        
        #plugging it all into chi squared
        chi_a_b = pow((a_and_b - e_a_and_b),2)/e_a_and_b
        chi_a_notb = pow((a_and_notb - e_a_and_notb),2)/e_a_and_notb
        chi_nota_b = pow((nota_and_b - e_nota_and_b),2)/e_nota_and_b
        chi_nota_notb = pow((nota_and_notb - e_nota_and_notb),2)/e_nota_and_notb
        
        ans = chi_a_b + chi_a_notb + chi_nota_b + chi_nota_notb
        
        corr = check_correlation(ans,.001)
        
        print("Chi squared of ",i,"and ",j, "= ", ans, "which indicates these are ", corr)

#-----------------------------------------------------------------------------------------------------------        
newframe = pre_processing2(df.LIGHTING, "LIGHTING", df.SEVERITY, "SEVERITY")
print(chi_squared(newframe.LIGHTING, newframe.SEVERITY, newframe))        

Chi squared of  DARK-LIGHTED and  INJ =  97.10617962990213 which indicates these are  correlated
Chi squared of  DARK-LIGHTED and  PDO =  106.81577605852266 which indicates these are  correlated
Chi squared of  DARK-LIGHTED and  FAT =  160.89308710714295 which indicates these are  correlated
Chi squared of  DARK-UNLIGHTED and  INJ =  196.5644611790934 which indicates these are  correlated
Chi squared of  DARK-UNLIGHTED and  PDO =  109.82237987378788 which indicates these are  correlated
Chi squared of  DARK-UNLIGHTED and  FAT =  1413.0665678706482 which indicates these are  correlated
Chi squared of  DAYLIGHT and  INJ =  204.42090099256654 which indicates these are  correlated
Chi squared of  DAYLIGHT and  PDO =  145.12631619095507 which indicates these are  correlated
Chi squared of  DAYLIGHT and  FAT =  1149.226734577513 which indicates these are  correlated
Chi squared of  DAWN OR DUSK and  INJ =  93.94420389508915 which indicates these are  correlated
Chi squared of  DAWN OR DUSK a

In [124]:
#This function assumes d = 1, and tells if there is a correlation or not according to the critical values of the Chi Squared Distribution

def check_correlation(value, prob):
    if prob == .05:
        if value > 3.841:
            return "correlated"
        else:
            return "not correlated"
    elif prob == .01:
        if value > 6.635:
            return "correlated"
        else:
            return "not correlated"
    elif prob == .001:
        if value > 10.828:
            return "correlated"
        else:
            return "not correlated"
    else:
        print("Enter a probability of .05, .01, or .001")
        return

<h1>Pre Processing</h1>

In [37]:
# Returns a data frame use .squeeze() to turn into a Series
# .name can only be used on a Series
def pre_processing(data, name):
    # remove all 0, null, and UNKONW rows
    data = data.to_frame()
    
#     print(data[data[name] == "UNKNOWN"])
    
    if (data[name].dtypes == object):
        data = data[data[name] != "UNKNOWN"]
    else:
        data = data[data[name] != 0]
        
    data = data.dropna()
    return data

In [131]:
# Returns a dataframe that contains the 2 entered names. It removes a row if one of the columns contains
# 0, null, or UNKNOWN
def pre_processing2(data1, name1,  data2, name2):
    data = data1.to_frame()
    data[name2] = data2
    
    if (data[name1].dtypes == object) or (data[name2].dtypes == object):
        data = data[(data[[name1,name2]] != "UNKNOWN").all(axis=1)]
    else:
        data = data[(data[[name1,name2]] != 0).all(axis=1)]
        
    data = data.dropna()
    
    return data

In [118]:
# pre_processing(df.SEVERITY, "SEVERITY").squeeze()
data = pre_processing2(df.LIGHTING, "LIGHTING", df.SEVERITY, "SEVERITY")
    

2           DARK-LIGHTED
3         DARK-UNLIGHTED
4           DARK-LIGHTED
5           DARK-LIGHTED
6           DARK-LIGHTED
               ...      
112186      DARK-LIGHTED
112187      DARK-LIGHTED
112188    DARK-UNLIGHTED
112189      DARK-LIGHTED
112190    DARK-UNLIGHTED
Name: LIGHTING, Length: 1443640, dtype: object


<h1>Hypothesis</h1>

<h3>Weather Conditions: Wet, Icy, Snowy, and Dark road conditions show an increase
in accidents/severity</h3>

In [130]:
newframe = pre_processing2(df.WEATHER, "WEATHER",df.SEVERITY, "SEVERITY")
lift(newframe.WEATHER, newframe.SEVERITY, newframe)
chi_squared(newframe.WEATHER, newframe.SEVERITY, newframe)

The lift of NONE and PDO = 0.9930095899358082
The lift of NONE and INJ = 1.0199469612367908
The lift of NONE and FAT = 1.0677330273212862
The lift of WIND and PDO = 0.9568740551639691
The lift of WIND and INJ = 1.11373617121853
The lift of WIND and FAT = 1.848202603229225
The lift of RAIN and PDO = 0.9969170555732916
The lift of RAIN and INJ = 1.0156792752906552
The lift of RAIN and FAT = 0.7121879402296735
The lift of SNOW/SLEET/HAIL and PDO = 1.0708853452328773
The lift of SNOW/SLEET/HAIL and INJ = 0.7950367443835328
The lift of SNOW/SLEET/HAIL and FAT = 0.43750377910038907
The lift of DUST and PDO = 0.861901716908675
The lift of DUST and INJ = 1.4073482200971188
The lift of DUST and FAT = 1.7247030502219918
The lift of FOG and PDO = 0.9795453458630856
The lift of FOG and INJ = 1.0510410362752658
The lift of FOG and FAT = 1.5363564049433178
Chi squared of  NONE and  PDO =  561361.7775487015 which indicates these are  correlated
Chi squared of  NONE and  INJ =  560414.4611499908 which

<h3>Lighting: Poor Lighting will show a correlation to fatal accidents</h3>

In [126]:
# lighting = pre_processing(df.LIGHTING, "LIGHTING").squeeze()
# severity = pre_processing(df.SEVERITY, "SEVERITY").squeeze()
newframe = pre_processing2(df.LIGHTING, "LIGHTING",df.SEVERITY, "SEVERITY")
lift(newframe.LIGHTING, newframe.SEVERITY, newframe)
chi_squared(newframe.LIGHTING, newframe.SEVERITY, newframe)

The lift of DARK-LIGHTED and INJ = 1.0083382249706225
The lift of DARK-LIGHTED and PDO = 0.9958633456245762
The lift of DARK-LIGHTED and FAT = 1.2455616476979654
The lift of DARK-UNLIGHTED and INJ = 0.9502644977631927
The lift of DARK-UNLIGHTED and PDO = 1.0078363196149338
The lift of DARK-UNLIGHTED and FAT = 2.41290114058314
The lift of DAYLIGHT and INJ = 1.0061998418941558
The lift of DAYLIGHT and PDO = 0.9995331061019059
The lift of DAYLIGHT and FAT = 0.7367211652057148
The lift of DAWN OR DUSK and INJ = 0.9782549232647272
The lift of DAWN OR DUSK and PDO = 1.0053604434083223
The lift of DAWN OR DUSK and FAT = 1.2871664827487923
Chi squared of  DARK-LIGHTED and  INJ =  97.10617962990213 which indicates these are  correlated
Chi squared of  DARK-LIGHTED and  PDO =  106.81577605852266 which indicates these are  correlated
Chi squared of  DARK-LIGHTED and  FAT =  160.89308710714295 which indicates these are  correlated
Chi squared of  DARK-UNLIGHTED and  INJ =  196.5644611790934 which 

<h3>Road Conditions: Rough road conditions show an increase in accidents</h3>

In [133]:
# road = pre_processing(df.ROAD_DESC, "ROAD_DESC").squeeze()
# cont = pre_processing(df.CONTOUR, "CONTOUR").squeeze()
newframe = pre_processing2(df.ROAD_DESC, "ROAD_DESC",df.SEVERITY, "SEVERITY")
lift(newframe.ROAD_DESC, newframe.SEVERITY, newframe)
chi_squared(newframe.ROAD_DESC, newframe.SEVERITY, newframe)

The lift of AT INTERSECTION and PDO = 0.9451966582168044
The lift of AT INTERSECTION and INJ = 1.1742252380055493
The lift of AT INTERSECTION and FAT = 0.7462292533934156
The lift of INTERSECTION RELATED and PDO = 1.017089778001948
The lift of INTERSECTION RELATED and INJ = 0.9584872339478292
The lift of INTERSECTION RELATED and FAT = 0.36923658743835513
The lift of NON-INTERSECTION and PDO = 1.0234690915073053
The lift of NON-INTERSECTION and INJ = 0.9194044311393419
The lift of NON-INTERSECTION and FAT = 1.4401601985969
The lift of AT DRIVEWAY ACCESS and PDO = 1.0315422989790621
The lift of AT DRIVEWAY ACCESS and INJ = 0.91174857792152
The lift of AT DRIVEWAY ACCESS and FAT = 0.48006239290136915
The lift of RAMP and PDO = 1.0212248522792429
The lift of RAMP and INJ = 0.9387040196298474
The lift of RAMP and FAT = 0.7560027977585895
The lift of ROUNDABOUT and PDO = 1.1717300100585375
The lift of ROUNDABOUT and INJ = 0.4859170129414694
The lift of ROUNDABOUT and FAT = 0.0304100998592300

<h3>Contour: A harsher contour to the road will be correlated with higher fatalities</h3>

In [132]:
newframe = pre_processing2(df.CONTOUR, "CONTOUR",df.SEVERITY, "SEVERITY")
lift(newframe.CONTOUR, newframe.SEVERITY, newframe)
chi_squared(newframe.CONTOUR, newframe.SEVERITY, newframe)

The lift of STRAIGHT ON-LEVEL and INJ = 0.9865068542707448
The lift of STRAIGHT ON-LEVEL and PDO = 1.0055891728831923
The lift of STRAIGHT ON-LEVEL and FAT = 0.7897209941593247
The lift of CURVE ON-GRADE and INJ = 1.124256433387627
The lift of CURVE ON-GRADE and PDO = 0.9504459879336999
The lift of CURVE ON-GRADE and FAT = 2.6082601176759437
The lift of STRAIGHT ON-GRADE and INJ = 0.991278083234747
The lift of STRAIGHT ON-GRADE and PDO = 1.0036492389098128
The lift of STRAIGHT ON-GRADE and FAT = 0.8578382236917824
The lift of CURVE ON-LEVEL and INJ = 1.038308408748852
The lift of CURVE ON-LEVEL and PDO = 0.9809101163643315
The lift of CURVE ON-LEVEL and FAT = 2.14883851219767
The lift of HILLCREST and INJ = 1.077331315368525
The lift of HILLCREST and PDO = 0.9758405523946292
The lift of HILLCREST and FAT = 0.8565858964430374
Chi squared of  STRAIGHT ON-LEVEL and  INJ =  168.3894305910089 which indicates these are  correlated
Chi squared of  STRAIGHT ON-LEVEL and  PDO =  257.98142309712

<h3>Age: is there a correlation between age and accidents</h3>

<h3>System: type of road/speed limit will be correlated with severity of accident</h3>

In [135]:
newframe = pre_processing2(df.SYSTEM, "SYSTEM",df.SEVERITY, "SEVERITY")

lift(newframe.SYSTEM, newframe.SEVERITY, newframe)
chi_squared(newframe.SYSTEM, newframe.SEVERITY, newframe)

The lift of CITY STREET and PDO = 1.030666779219851
The lift of CITY STREET and INJ = 0.9140406079908453
The lift of CITY STREET and FAT = 0.5017380586883238
The lift of COUNTY ROAD and PDO = 0.9306355556239375
The lift of COUNTY ROAD and INJ = 1.1971514207986031
The lift of COUNTY ROAD and FAT = 1.976197729570637
The lift of STATE HIGHWAY and PDO = 0.9776203973897087
The lift of STATE HIGHWAY and INJ = 1.062941385096866
The lift of STATE HIGHWAY and FAT = 1.3519214212649315
The lift of INTERSTATE and PDO = 0.9952312061084693
The lift of INTERSTATE and INJ = 1.01147967636742
The lift of INTERSTATE and FAT = 1.1820475258469834
The lift of FRONTAGE ROAD and PDO = 0.9872722405508428
The lift of FRONTAGE ROAD and INJ = 1.0225707924773964
The lift of FRONTAGE ROAD and FAT = 1.9328888190537523
The lift of UNKNOWN/PRIVATE and PDO = 1.3287988284563335
The lift of UNKNOWN/PRIVATE and INJ = 0.0
The lift of UNKNOWN/PRIVATE and FAT = 0.0
Chi squared of  CITY STREET and  PDO =  3402.815211068571 wh

<h3>Road Conditions: Rough road conditions show an increase in accidents</h3>

In [136]:
newframe = pre_processing2(df.CONDITION, "CONDITION",df.SEVERITY, "SEVERITY")

lift(newframe.CONDITION, newframe.SEVERITY, newframe)
chi_squared(newframe.CONDITION, newframe.SEVERITY, newframe)

The lift of WET W/VIS ICY ROAD TREATMENT and INJ = 1.0634216824319558
The lift of WET W/VIS ICY ROAD TREATMENT and PDO = 0.9785061140287686
The lift of WET W/VIS ICY ROAD TREATMENT and FAT = 1.1660914802454145
The lift of DRY and INJ = 1.0358749262734535
The lift of DRY and PDO = 0.9878404831877825
The lift of DRY and FAT = 1.0941811170873623
The lift of SNOWY and INJ = 0.6539584431708662
The lift of SNOWY and PDO = 1.1157411570465967
The lift of SNOWY and FAT = 0.35597171062967536
The lift of ICY and INJ = 0.7649403916607098
The lift of ICY and PDO = 1.0784231159199151
The lift of ICY and FAT = 0.5963198237731716
The lift of SLUSHY and INJ = 0.863551907637802
The lift of SLUSHY and PDO = 1.0464054096646553
The lift of SLUSHY and FAT = 0.6148762795666357
The lift of SNOWY W/VIS ICY ROAD TREATMENT and INJ = 0.7027311611290573
The lift of SNOWY W/VIS ICY ROAD TREATMENT and PDO = 1.1007285649361929
The lift of SNOWY W/VIS ICY ROAD TREATMENT and FAT = 0.22443044926706357
The lift of DRY W/

<h3>City: Does the city your in have a correlation with likelyhood of accident</h3>

In [55]:
lift(df.CITY, df.SEVERITY)
chi_squared(df.CITY, df.SEVERITY)

The lift of FAIRPLAY and PDO = 1.0362192698971406
The lift of FAIRPLAY and INJ = 0.8304124636046625
The lift of FAIRPLAY and FAT = 4.18260589713283
The lift of PUEBLO and PDO = 0.9595259191422915
The lift of PUEBLO and INJ = 1.1296584213537602
The lift of PUEBLO and FAT = 0.7595523620844016
The lift of DENVER and PDO = 1.058240573852441
The lift of DENVER and INJ = 0.8287800377606337
The lift of DENVER and FAT = 0.4953517317696174
The lift of DURANGO and PDO = 1.1135472028332698
The lift of DURANGO and INJ = 0.6610046664072418
The lift of DURANGO and FAT = 0.3031680029175943
The lift of LAKEWOOD and PDO = 1.0228905518653593
The lift of LAKEWOOD and INJ = 0.9339978476370828
The lift of LAKEWOOD and FAT = 0.7300071139234586
The lift of LONGMONT and PDO = 0.9759389096908075
The lift of LONGMONT and INJ = 1.0828275480981076
The lift of LONGMONT and FAT = 0.5385915366717636
The lift of THORNTON and PDO = 1.0311235564784946
The lift of THORNTON and INJ = 0.911539490195978
The lift of THORNTO

The lift of LITTLETON and INJ = 0.7386103751059497
The lift of LITTLETON and FAT = 0.40084281597492305
The lift of GUNNISON and PDO = 1.100791153250043
The lift of GUNNISON and INJ = 0.6770393906374031
The lift of GUNNISON and FAT = 1.6030381251317805
The lift of VAIL and PDO = 1.1851069509163212
The lift of VAIL and INJ = 0.4304163227172458
The lift of VAIL and FAT = 0.802930684726098
The lift of EVANS and PDO = 1.0534562211273968
The lift of EVANS and INJ = 0.8413568520313432
The lift of EVANS and FAT = 0.6192818560956116
The lift of GREENWOOD VILLAGE and PDO = 1.0572034595479025
The lift of GREENWOOD VILLAGE and INJ = 0.8375285167577574
The lift of GREENWOOD VILLAGE and FAT = 0.18856010191992265
The lift of ERIE and PDO = 0.9854309044254287
The lift of ERIE and INJ = 1.0523102987585662
The lift of ERIE and FAT = 0.601060043226735
The lift of CRAIG and PDO = 1.1309720365794074
The lift of CRAIG and INJ = 0.6055903762926954
The lift of CRAIG and FAT = 0.38418880010180767
The lift of G

The lift of CRESTED BUTTE and FAT = 0.0
The lift of TRINIDAD and PDO = 1.112084856433412
The lift of TRINIDAD and INJ = 0.6657069716524852
The lift of TRINIDAD and FAT = 0.2935004567301363
The lift of MORRISON and PDO = 1.031203257499967
The lift of MORRISON and INJ = 0.9000066899578941
The lift of MORRISON and FAT = 1.187250111425725
The lift of LARKSPUR and PDO = 1.1419364932046614
The lift of LARKSPUR and INJ = 0.5785757292586462
The lift of LARKSPUR and FAT = 0.0
The lift of SILT and PDO = 1.138970424391143
The lift of SILT and INJ = 0.5502429090146397
The lift of SILT and FAT = 2.0785898607331843
The lift of GRANBY and PDO = 1.1141467100133873
The lift of GRANBY and INJ = 0.6487960314421742
The lift of GRANBY and FAT = 0.8767385438220738
The lift of CALHAN and PDO = 0.9686570899027477
The lift of CALHAN and INJ = 1.1150950711360823
The lift of CALHAN and FAT = 0.0
The lift of PARACHUTE and PDO = 1.14107378620904
The lift of PARACHUTE and INJ = 0.5812469006771995
The lift of PARACH

The lift of JULESBURG and INJ = 0.9946698739880024
The lift of JULESBURG and FAT = 2.504967268063069
The lift of RAYMER and PDO = 0.9301591799194334
The lift of RAYMER and INJ = 1.234294889085112
The lift of RAYMER and FAT = 0.0
The lift of COLORADO CITY and PDO = 1.1161910159033201
The lift of COLORADO CITY and INJ = 0.6582906075120597
The lift of COLORADO CITY and FAT = 0.0
The lift of LYONS and PDO = 1.0418281894469845
The lift of LYONS and INJ = 0.8692217528768394
The lift of LYONS and FAT = 1.070197283538682
The lift of HOT SULPHUR SPRINGS and PDO = 1.008054283656529
The lift of HOT SULPHUR SPRINGS and INJ = 0.9931108302983658
The lift of HOT SULPHUR SPRINGS and FAT = 0.0
The lift of YAMPA and PDO = 1.3287988284563335
The lift of YAMPA and INJ = 0.0
The lift of YAMPA and FAT = 0.0
The lift of CARR and PDO = 0.5315195313825334
The lift of CARR and INJ = 2.468589778170224
The lift of CARR and FAT = 0.0
The lift of SOMERSET and PDO = 0.9441465360084473
The lift of SOMERSET and INJ = 

The lift of HILLROSE and INJ = 0.0
The lift of HILLROSE and FAT = 0.0
The lift of RICO and PDO = 1.1811545141834077
The lift of RICO and INJ = 0.45714625521670815
The lift of RICO and FAT = 0.0
The lift of GRANADA and PDO = 1.0630390627650668
The lift of GRANADA and INJ = 0.8228632593900747
The lift of GRANADA and FAT = 0.0
The lift of ADAMS CITY and PDO = 1.0796490481207708
The lift of ADAMS CITY and INJ = 0.7714343056781949
The lift of ADAMS CITY and FAT = 0.0
The lift of GILL and PDO = 1.1475989882122881
The lift of GILL and INJ = 0.5610431314023235
The lift of GILL and FAT = 0.0
The lift of HASWELL and PDO = 0.6643994142281667
The lift of HASWELL and INJ = 2.0571581484751866
The lift of HASWELL and FAT = 0.0
The lift of SANFORD and PDO = 1.0335101999104817
The lift of SANFORD and INJ = 0.9142925104334163
The lift of SANFORD and FAT = 0.0
The lift of ARRIBA and PDO = 0.7751326499328611
The lift of ARRIBA and INJ = 1.7142984570626554
The lift of ARRIBA and FAT = 0.0
The lift of GROVE

The lift of LIVERMORE and PDO = 0.7972792970738001
The lift of LIVERMORE and INJ = 1.6457265187801493
The lift of LIVERMORE and FAT = 0.0
The lift of LAPORTE and PDO = 1.0630390627650668
The lift of LAPORTE and INJ = 0.8228632593900747
The lift of LAPORTE and FAT = 0.0
The lift of NATURITA and PDO = 0.9491420203259525
The lift of NATURITA and INJ = 0.5877594709929105
The lift of NATURITA and FAT = 32.564574484819886
The lift of VICTOR and PDO = 1.1959189456107002
The lift of VICTOR and INJ = 0.4114316296950373
The lift of VICTOR and FAT = 0.0
The lift of MCCLAVE and PDO = 0.8858658856375556
The lift of MCCLAVE and INJ = 1.3714387656501243
The lift of MCCLAVE and FAT = 0.0
The lift of DE BEQUE and PDO = 1.1243682394630514
The lift of DE BEQUE and INJ = 0.6329717379923651
The lift of DE BEQUE and FAT = 0.0
The lift of WILD HORSE and PDO = 1.3287988284563335
The lift of WILD HORSE and INJ = 0.0
The lift of WILD HORSE and FAT = 0.0
The lift of ILIFF and PDO = 1.0335101999104817
The lift of

The lift of HESPERUS and FAT = 0.0
The lift of HYGIENE and PDO = 1.3287988284563335
The lift of HYGIENE and INJ = 0.0
The lift of HYGIENE and FAT = 0.0
The lift of WESTON and PDO = 0.8858658856375556
The lift of WESTON and INJ = 1.3714387656501243
The lift of WESTON and FAT = 0.0
The lift of GOLD HILL and PDO = 0.6643994142281667
The lift of GOLD HILL and INJ = 2.0571581484751866
The lift of GOLD HILL and FAT = 0.0
The lift of BRISTOL and PDO = 0.0
The lift of BRISTOL and INJ = 0.0
The lift of BRISTOL and FAT = 227.95202139373922
The lift of ORCHARD and PDO = 1.3287988284563335
The lift of ORCHARD and INJ = 0.0
The lift of ORCHARD and FAT = 0.0
The lift of LAY and PDO = 1.3287988284563335
The lift of LAY and INJ = 0.0
The lift of LAY and FAT = 0.0
The lift of ALMONT and PDO = 1.3287988284563335
The lift of ALMONT and INJ = 0.0
The lift of ALMONT and FAT = 0.0
The lift of GOULD and PDO = 1.3287988284563335
The lift of GOULD and INJ = 0.0
The lift of GOULD and FAT = 0.0
The lift of LAWSO

The lift of PETERSON FIELD and INJ = 0.0
The lift of PETERSON FIELD and FAT = 0.0
The lift of MOSCA and PDO = 0.0
The lift of MOSCA and INJ = 4.114316296950373
The lift of MOSCA and FAT = 0.0
The lift of MILNER and PDO = 1.3287988284563335
The lift of MILNER and INJ = 0.0
The lift of MILNER and FAT = 0.0
The lift of GARFIELD and PDO = 0.0
The lift of GARFIELD and INJ = 4.114316296950373
The lift of GARFIELD and FAT = 0.0
The lift of ATWOOD and PDO = 1.3287988284563335
The lift of ATWOOD and INJ = 0.0
The lift of ATWOOD and FAT = 0.0
The lift of KIM and PDO = 1.3287988284563335
The lift of KIM and INJ = 0.0
The lift of KIM and FAT = 0.0


In [56]:
columns = ['SYSTEM', 'DATE', 'TIME', 'SEVERITY', 'ROAD_DESC', 'VEHICLES', 'CONTOUR', 'CONDITION', 'LIGHTING', 
             'WEATHER', 'CITY', 'COUNTY', 'LATITUDE', 'LONGITUDE', 'AGE_1', 'AGE_2']
df_all = df[columns]
print(len(df_all))
for i in columns:
    df_all = df_all[df_all[i] != "UNKOWN"]
    df_all = df_all[df_all[i] != 0]
    
df_all = df_all.dropna()
print(len(df_all))



df_all = create_labels(df_all, columns)
df_all.corr(method='pearson')




1449091
453546


Unnamed: 0,SYSTEM,DATE,TIME,SEVERITY,ROAD_DESC,VEHICLES,CONTOUR,CONDITION,LIGHTING,WEATHER,CITY,COUNTY,LATITUDE,LONGITUDE,AGE_1,AGE_2
SYSTEM,1.0,0.007521,-0.004583,0.014081,0.10212,0.02626,-0.004274,-0.01467,-0.00621,-0.517414,0.183329,0.21727,0.200653,-0.325183,0.016372,0.020628
DATE,0.007521,1.0,0.01714,-0.018361,0.002119,0.002621,0.014976,-0.100984,0.19467,-0.047904,-0.000807,0.001119,-0.004545,0.000844,-0.002546,0.006461
TIME,-0.004583,0.01714,1.0,-0.022576,-0.035274,-0.001442,0.017712,-0.001722,-0.367608,0.006614,-0.000702,0.002053,-0.009929,-0.002344,-0.048435,-0.06126
SEVERITY,0.014081,-0.018361,-0.022576,1.0,0.066554,-0.12067,-0.012963,0.015877,0.051919,0.006577,0.029645,0.002318,0.011861,-0.045228,-0.007802,-0.005335
ROAD_DESC,0.10212,0.002119,-0.035274,0.066554,1.0,0.105394,-0.140894,-0.012547,0.035232,-0.085481,0.035378,-0.044558,0.051559,-0.023302,-0.057233,0.000608
VEHICLES,0.02626,0.002621,-0.001442,-0.12067,0.105394,1.0,0.017445,-0.00913,0.013406,-0.019842,0.000355,-0.025751,0.013237,0.002102,-0.017714,-0.027686
CONTOUR,-0.004274,0.014976,0.017712,-0.012963,-0.140894,0.017445,1.0,-0.021204,-0.005125,-0.000475,-0.024493,0.01578,0.054888,0.002912,-0.002924,-0.001321
CONDITION,-0.01467,-0.100984,-0.001722,0.015877,-0.012547,-0.00913,-0.021204,1.0,-0.061277,0.212061,-0.004028,0.007905,0.00594,-0.021322,-0.023931,-0.010668
LIGHTING,-0.00621,0.19467,-0.367608,0.051919,0.035232,0.013406,-0.005125,-0.061277,1.0,-0.023577,0.017943,0.028121,-0.013277,-0.026123,0.07594,0.102664
WEATHER,-0.517414,-0.047904,0.006614,0.006577,-0.085481,-0.019842,-0.000475,0.212061,-0.023577,1.0,-0.10478,-0.081854,-0.125728,0.169089,-0.018184,-0.00862
