# Imports 

In [18]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import functions as f

In [19]:
df = pd.read_csv('master_modeling_csv.csv')

In [20]:
df.drop(columns=['Unnamed: 0'], inplace=True)

## Combining values for easier target varibale capture

In [21]:
# standardize the dataframes text
df.person_injury_severity = df.person_injury_severity.str.lower()
# combine values to make a more accurate target variable
df.loc[df['person_injury_severity'] == 'c - possible injury', 'person_injury_severity'] = 'b - suspected minor injury'
# drop all the instances where 99 is. inplace = true means to save the change 
df.drop(df.loc[df['person_injury_severity'] == '99 - unknown'].index, inplace=True)

In [22]:
# Trust but verify our code. 
df.person_injury_severity.value_counts()

person_injury_severity
b - suspected minor injury      8125
a - suspected serious injury    4006
n - not injured                 1155
k - fatal injury                 848
Name: count, dtype: int64

# Preprocessing

In [23]:
# First I want to see what my data looks like 
df.head()

Unnamed: 0,crash_id,person_age,charge,person_ethnicity,crash_date,day_of_week,person_gender,person_helmet,driver_license_class,has_motocycle_endorsment,driver_license_state,driver_license_type,person_injury_severity,license_plate_state,vehicle_body_style,vehicle_color,vehicle_defect_1,vehicle_make,vehicle_model_name,vehicle_model_year
0,16189632.0,37.0,operate unregistered motor vehicle,w - white,2018-01-01,monday,1 - male,1 - not worn,c - class c,0.0,tx - texas,1 - driver license,a - suspected serious injury,tx - texas,mc - motorcycle,blu - blue,no data,other (explain in narrative),other (explain in narrative) (other (explain i...,no data
1,16203470.0,30.0,"no class ""m"" license",h - hispanic,2018-01-04,thursday,1 - male,"3 - worn, not damaged",c - class c,0.0,tx - texas,1 - driver license,b - suspected minor injury,tx - texas,mc - motorcycle,gry - gray,no data,suzuki,gsx-r600 (suzuki),2004
2,16192023.0,21.0,no charges,w - white,2018-01-05,friday,1 - male,"2 - worn, damaged",c - class c,0.0,tx - texas,1 - driver license,a - suspected serious injury,tx - texas,mc - motorcycle,blu - blue,no data,yamaha,yzfr6 (yamaha),2017
3,16196720.0,18.0,no driver license no insurance,h - hispanic,2018-01-05,friday,1 - male,1 - not worn,5 - unlicensed,0.0,tx - texas,4 - id card,b - suspected minor injury,tx - texas,mc - motorcycle,blu - blue,no data,yamaha,rz500 (yamaha),2002
4,16189103.0,28.0,no charges,w - white,2018-01-06,saturday,1 - male,"3 - worn, not damaged",cm - class c and m,1.0,tx - texas,1 - driver license,b - suspected minor injury,tx - texas,mc - motorcycle,blk - black,no data,harley-davidson,fxdf (harley-davidson),2009


    Takeaways:
    - My values will need to be converted to dummie variables
    - person_injury_severity is our targert variable and i dont want to encode that 
    

In [24]:
# Checking my dataframe infomartion to see what needs to be done 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14184 entries, 0 to 14183
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   crash_id                  14134 non-null  float64
 1   person_age                14068 non-null  float64
 2   charge                    14091 non-null  object 
 3   person_ethnicity          14134 non-null  object 
 4   crash_date                14134 non-null  object 
 5   day_of_week               14134 non-null  object 
 6   person_gender             14134 non-null  object 
 7   person_helmet             14134 non-null  object 
 8   driver_license_class      14060 non-null  object 
 9   has_motocycle_endorsment  14134 non-null  float64
 10  driver_license_state      13537 non-null  object 
 11  driver_license_type       14060 non-null  object 
 12  person_injury_severity    14134 non-null  object 
 13  license_plate_state       14134 non-null  object 
 14  vehicl

In [25]:
object_cols = []
encoded_df = df.copy()

# Iterate over each column in the DataFrame
for col in df.columns:
    # Check if the column is of type 'object' and not the target column 'person_injury_severity'
    if df[col].dtype == 'object' and col != 'person_injury_severity':
        # Add the column name to the list of object columns
        object_cols.append(col)
        
        # Perform one-hot encoding using pd.get_dummies
        # drop_first=True drops the first category to avoid multicollinearity
        # prefix=col adds the column name as a prefix to the encoded columns
        # dtype=int sets the data type of the encoded columns to integer
        dummies = pd.get_dummies(data=df[col], columns=[col], drop_first=True, prefix=col, dtype=int)
        
        # Concatenate the encoded columns with the original DataFrame
        encoded_df = pd.concat([encoded_df, dummies], axis=1)
        
        # Drop the original categorical column from the encoded DataFrame
        encoded_df.drop(col, axis=1, inplace=True)

# Remove the target column 'person_injury_severity' from the encoded DataFrame
encoded_df = encoded_df[[col for col in encoded_df.columns if col != 'person_injury_severity']]

# Add the target column 'person_injury_severity' back to the encoded DataFrame
encoded_df['person_injury_severity'] = df['person_injury_severity']


In [26]:
encoded_df.head()

Unnamed: 0,crash_id,person_age,has_motocycle_endorsment,"charge_00001031 - no operator license, 00003197 - fail to have liability ins., 00003340 - unregistered motorcycle, 00002071 - motorcycle operator w/o helmet","charge_00001032 - no valid motorcycle license, 00002071 - motorcycle operator w/o helmet","charge_00001032-no valid motorcycle license, 00003341-unregistered motor vehicle/expired registration, 00003197-fail to have liability ins.",charge_00001037 - driving while license invalid,"charge_00001037 - driving while license invalid, 00001032 - no valid motorcycle license, 00003340 - operate unregistered motorcycle",charge_020.01,charge_19428683,...,vehicle_model_year_2016,vehicle_model_year_2017,vehicle_model_year_2018,vehicle_model_year_2019,vehicle_model_year_2020,vehicle_model_year_2021,vehicle_model_year_2022,vehicle_model_year_2023,vehicle_model_year_no data,person_injury_severity
0,16189632.0,37.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,a - suspected serious injury
1,16203470.0,30.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,b - suspected minor injury
2,16192023.0,21.0,0.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,a - suspected serious injury
3,16196720.0,18.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,b - suspected minor injury
4,16189103.0,28.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,b - suspected minor injury


     This seems to be end of our preproceessing part

# Splitting the data 

In [27]:
# Split our data into train, validate and test
train, validate, test = f.split(encoded_df)

In [28]:
# split our values for modeling

x_train = train.drop(columns= 'person_injury_severity')
y_train = train['person_injury_severity']

x_validate = validate.drop(columns= 'person_injury_severity')
y_validate = validate['person_injury_severity']

x_test= test.drop(columns= 'person_injury_severity')
y_test = test['person_injury_severity']

# Make our baseline model

In [29]:
df.person_injury_severity.value_counts()

person_injury_severity
b - suspected minor injury      8125
a - suspected serious injury    4006
n - not injured                 1155
k - fatal injury                 848
Name: count, dtype: int64

    Since suspected minor injury occurs most often this will be our baseline

In [30]:
pred_df_train = pd.DataFrame()
pred_df_train['actual'] = y_train

In [31]:
pred_df_train['baseline_predictions'] = 'b - suspected minor injury' 

In [32]:
pred_df_train['baseline_predictions']

8812     b - suspected minor injury
4411     b - suspected minor injury
6370     b - suspected minor injury
13586    b - suspected minor injury
9801     b - suspected minor injury
                    ...            
740      b - suspected minor injury
11119    b - suspected minor injury
6639     b - suspected minor injury
6437     b - suspected minor injury
3381     b - suspected minor injury
Name: baseline_predictions, Length: 7942, dtype: object

In [33]:
pred_df_train.actual.value_counts()

actual
b - suspected minor injury      4599
a - suspected serious injury    2201
n - not injured                  655
k - fatal injury                 466
Name: count, dtype: int64

In [34]:
print(classification_report(pred_df_train['actual'],pred_df_train['baseline_predictions']))

TypeError: '<' not supported between instances of 'float' and 'str'

# Descion Tree Classifier:

### make the model

In [None]:
clf = DecisionTreeClassifier(max_depth= 5, random_state= 666)

### Fit the model 

In [None]:
clf.fit(x_train,y_train)

In [None]:
plt.figure(figsize=(13, 7))
plot_tree(clf, feature_names=x_train.columns, class_names=clf.classes_, rounded=True)
plt.show()

In [None]:
pred_df_train['clf_prediction'] = clf_pred = clf.predict(x_train)

In [None]:
pred_df_train

In [None]:
print(classification_report(pred_df_train.actual,pred_df_train.clf_prediction))

# KNN: trainingn

In [None]:
knn = KNeighborsClassifier(n_neighbors=50, weights='uniform')
knn.fit(x_train, y_train)
pred_df_train['knn'] = knn.predict(x_train)
print(classification_report(pred_df_train.actual,pred_df_train.knn))

# Logistic Regression:

In [None]:
# First we get the weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# make the model
logit = LogisticRegression(C=1, class_weight=dict(zip(np.unique(y_train), class_weights)), random_state=666, intercept_scaling=1, solver='lbfgs')
# fit the model
logit.fit(x_train, y_train)


In [None]:
# check the intercepts and the coefficients of the logistic regression model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)


In [None]:
# make predictions 
pred_df_train['logistic'] = logit.predict(x_train)

In [None]:
print(classification_report(pred_df_train['actual'], pred_df_train['logistic']))

In [None]:
for cols in pred_df_train.columns:
    if (pred_df_train[cols] != pred_df_train['actual']).any():
        print(f'classification report for {cols}:')
        print('=====================================')
        print(classification_report(pred_df_train['actual'], pred_df_train[cols]))
    else:
        print(f"The {cols} column matches the 'actual' column.")
