# **Hypothesis Testing - Driver Substance Abuse vs. Hit/Run**

In [12]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# Load the crash records from crash.csv (path is relative to the working directory).
# Later code in this notebook reads the 'Driver Substance Abuse', 'Hit/Run' and
# 'Crash Date/Time' columns, plus 'Weather', 'Light', 'Traffic Control',
# 'Road Condition' and 'Surface Condition', so the file must provide them.
df = pd.read_csv('crash.csv')

# Collapse a free-text substance-abuse description into a binary flag
def simplify_substance(description):
    """Map a raw 'Driver Substance Abuse' value to 'Yes' or 'No'.

    Returns 'No' when the value is missing or when it contains any of the
    markers 'NONE DETECTED', 'UNKNOWN' or 'N/A'; every other value is
    reported as 'Yes'. The marker match is a case-sensitive substring test.
    """
    # Missing values are handled first so the substring checks below only
    # ever run on actual text.
    if pd.isna(description):
        return 'No'

    no_evidence_markers = ('NONE DETECTED', 'UNKNOWN', 'N/A')
    has_marker = any(marker in description for marker in no_evidence_markers)
    return 'No' if has_marker else 'Yes'

# Collapse the raw substance-abuse descriptions into a binary 'Yes'/'No' flag.
df['Driver Substance Abuse'] = df['Driver Substance Abuse'].apply(simplify_substance)

# No imputation step follows: simplify_substance already maps missing values to
# 'No', so the column cannot contain NaN at this point. The previous
# LabelEncoder -> KNNImputer -> inverse_transform round trip therefore returned
# the column unchanged while suggesting that values had been imputed.
# The assertion makes that invariant explicit and fails loudly if the
# simplification is ever changed to leave missing values in place.
assert df['Driver Substance Abuse'].notna().all(), \
    "'Driver Substance Abuse' still contains missing values after simplification"

from scipy.stats import chi2_contingency

# Chi-square test of independence between one column and 'Hit/Run'
def perform_chi_square(column, data=None, alpha=0.05):
    """Run a chi-square test of independence between `column` and 'Hit/Run'.

    The contingency table is built with pd.crosstab directly from the category
    labels, so neither column needs to be numerically encoded beforehand.
    The statistic, the p-value and a one-sentence conclusion are printed.

    Parameters
    ----------
    column : str
        Name of the categorical column to test against 'Hit/Run'.
    data : pandas.DataFrame, optional
        Frame holding both columns. Defaults to the notebook-level `df`.
    alpha : float, optional
        Significance level; the null hypothesis of independence is rejected
        when the p-value is strictly below it. Defaults to 0.05.

    Returns
    -------
    None
    """
    # Fall back to the notebook's global frame only when no frame is supplied.
    frame = df if data is None else data
    contingency_table = pd.crosstab(frame[column], frame['Hit/Run'])
    # Degrees of freedom and expected counts are returned but not reported.
    chi2, p, _dof, _expected = chi2_contingency(contingency_table)
    print(f"Chi-square Statistic for {column}: {chi2}, P-value: {p}")
    if p < alpha:
        print(f"The null hypothesis can be rejected for {column}. There is a statistically significant association between {column} and hit-and-run incidents.")
    else:
        print(f"The null hypothesis cannot be rejected for {column}. There is no statistically significant association between {column} and hit-and-run incidents.")

# Chi-square test for 'Driver Substance Abuse': checks whether this column is
# associated with 'Hit/Run'; the statistic, p-value and conclusion are printed.
perform_chi_square('Driver Substance Abuse')



Chi-square Statistic for Driver Substance Abuse: 81.65375509944661, P-value: 1.6214344512600915e-19
The null hypothesis can be rejected for Driver Substance Abuse. There is a statistically significant association between Driver Substance Abuse and hit-and-run incidents.


# **PREDICTION MODEL FOR HIT/RUN**

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Data source: `df` from the hypothesis-testing cell is reused instead of
# re-reading crash.csv, because the model uses the simplified Yes/No
# 'Driver Substance Abuse' column created there.
# NOTE: this cell rebinds `df` to the filtered, one-hot-encoded frame, which no
# longer contains 'Crash Date/Time'; re-run the cell above before re-running
# this one.

TARGET = 'Hit/Run'
RANDOM_STATE = 42  # shared by the split and the forest so metrics are repeatable

# Preliminary data cleaning
# Parse the crash timestamp once and derive the calendar features from it.
# NOTE(review): the saved output shows a warning raised on this to_datetime
# call; presumably pandas could not infer a single format. Pass format=...
# once the timestamp layout in crash.csv is confirmed.
crash_time = pd.to_datetime(df['Crash Date/Time'])
df = df.assign(
    Hour=crash_time.dt.hour,
    Month=crash_time.dt.month,
    DayOfWeek=crash_time.dt.weekday,
)

# Assume these columns are relevant based on the description
relevant_columns = ['Weather', 'Light', 'Traffic Control', 'Road Condition', 'Surface Condition', 'Hour', 'Month', 'DayOfWeek', 'Hit/Run','Driver Substance Abuse']

# Filter the dataset
df = df[relevant_columns]

# Rows without a label cannot be used for supervised training. Drop them
# instead of letting the imputer invent a 'Hit/Run' value for them.
df = df.dropna(subset=[TARGET]).reset_index(drop=True)

# Handle missing values in the features only (the target is never imputed).
feature_columns = [name for name in relevant_columns if name != TARGET]
imputer = SimpleImputer(strategy='most_frequent')
features_imputed = pd.DataFrame(
    imputer.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)
df = features_imputed.assign(**{TARGET: df[TARGET]})

# The imputer returns an object array for mixed-type input; restore integer
# dtypes so 'Hour' is a real numeric feature and the dummy column names for
# 'Month' / 'DayOfWeek' are stable (e.g. 'Month_1' rather than 'Month_1.0').
df = df.astype({'Hour': int, 'Month': int, 'DayOfWeek': int})

# Encode categorical variables
categorical_features = ['Weather', 'Light', 'Traffic Control', 'Road Condition', 'Surface Condition', 'DayOfWeek', 'Month', 'Driver Substance Abuse']
df = pd.get_dummies(df, columns=categorical_features)

# Prepare the target and features for the model
df[TARGET] = (df[TARGET] == 'Yes').astype(int)
y = df[TARGET]
X = df.drop(TARGET, axis=1)

# Split the data into training and testing sets.
# A fixed random_state makes the reported metrics reproducible; stratify=y
# keeps the share of hit-and-run cases the same in both splits, which matters
# because the positive class is the minority in the saved report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=RANDOM_STATE, stratify=y
)

# Train a RandomForest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Predict on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


  df['Crash Date/Time'] = pd.to_datetime(df['Crash Date/Time'])


Accuracy: 0.7993022778575826
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.96      0.89     15876
           1       0.35      0.10      0.15      3616

    accuracy                           0.80     19492
   macro avg       0.59      0.53      0.52     19492
weighted avg       0.74      0.80      0.75     19492

