# Imports 

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore', category=UndefinedMetricWarning, module='sklearn')
import functions as f

NameError: name 'UndefinedMetricWarning' is not defined

In [None]:
df = pd.read_csv('person_updated.csv')

## Conbining values for easier target varibale capture

In [None]:
df.person_injury_severity = df.person_injury_severity.str.lower()
df.loc[df['person_injury_severity'] == 'c - possible injury', 'person_injury_severity'] = 'b - suspected minor injury'
df.drop(df.loc[df['person_injury_severity'] == '99 - unknown'].index, inplace=True)

In [None]:
df.person_injury_severity.value_counts()

# Preprocessing

In [None]:
df.head()

    First I am going to convert non numerical values to numerical values

In [None]:
df.info()

In [None]:
object_cols = []
encoded_df = df.copy()  
for col in df.columns:
    if df[col].dtype == 'object' and col != 'person_injury_severity':
        object_cols.append(col)
        dummies = pd.get_dummies(data=df[col], columns=[col], drop_first=True, prefix=col, dtype=int)
        encoded_df = pd.concat([encoded_df, dummies], axis=1)
        encoded_df.drop(col, axis=1, inplace=True)
encoded_df = encoded_df[[col for col in encoded_df.columns if col != 'person_injury_severity']]
encoded_df['person_injury_severity'] = df['person_injury_severity']

In [None]:
encoded_df.head()

     This seems to be end of our preproceessing part

# Splitting the data 

In [None]:
train, validate, test = f.split(encoded_df)

In [None]:
x_train = train.drop(columns= 'person_injury_severity')
y_train = train['person_injury_severity']

x_validate = validate.drop(columns= 'person_injury_severity')
y_validate = validate['person_injury_severity']

x_test= test.drop(columns= 'person_injury_severity')
y_test = test['person_injury_severity']

# Make our baseline model

In [None]:
df.person_injury_severity.value_counts()

    Since suspected minor injury occurs most often this will be our baseline

In [None]:
pred_df_train = pd.DataFrame()
pred_df_train['actual'] = y_train

In [None]:
pred_df_train['baseline_predictions'] = 'b - suspected minor injury' 

In [None]:
pred_df_train['baseline_predictions']

In [None]:
pred_df_train.actual.value_counts()

In [None]:
print(classification_report(pred_df_train['actual'],pred_df_train['baseline_predictions']))

# Descion Tree Classifier:

### make the model

In [None]:
clf = DecisionTreeClassifier(max_depth= 5, random_state= 666)

### Fit the model 

In [None]:
clf.fit(x_train,y_train)

In [None]:
plt.figure(figsize=(13, 7))
plot_tree(clf, feature_names=x_train.columns, class_names=clf.classes_, rounded=True)
plt.show()

In [None]:
pred_df_train['clf_prediction'] = clf_pred = clf.predict(x_train)

In [None]:
pred_df_train

In [None]:
print(classification_report(pred_df_train.actual,pred_df_train.clf_prediction))

# KNN: 

In [None]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(x_train, y_train)
pred_df_train['knn'] = knn.predict(x_train)
print(classification_report(pred_df_train.actual,pred_df_train.knn))

# Logistic Regression:

In [None]:
# First we get the weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# make the model
logit = LogisticRegression(C=1, class_weight=dict(zip(np.unique(y_train), class_weights)), random_state=666, intercept_scaling=1, solver='lbfgs')
# fit the model
logit.fit(x_train, y_train)


In [None]:
# check the intercepts and the coefficients of the logistic regression model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)


In [None]:
# make predictions 
pred_df_train['logistic'] = logit.predict(x_train)

In [None]:
print(classification_report(pred_df_train['actual'], pred_df_train['logistic']))

In [None]:
for cols in pred_df_train.columns:
    if (pred_df_train[cols] != pred_df_train['actual']).any():
        print(f'classification report for {cols}:')
        print('=====================================')
        print(classification_report(pred_df_train['actual'], pred_df_train[cols]))
    else:
        print(f"The {cols} column matches the 'actual' column.")
