In [None]:
import pandas as pd
import pyarrow as pa
import polars as pl
import vaex as vx
#import pyarrow.parquet as pq
#import dask.dataframe as dd

import numpy as np
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt

import plotly.graph_objects as go
from nltk.corpus import stopwords

from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import CategoricalNB, BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.impute import SimpleImputer

In [None]:
# Read the raw accidents CSV: the first column becomes the index and the two
# timestamp columns are requested to be parsed as datetimes.
DATA_PATH = "/kaggle/input/us-accidents/US_Accidents_March23.csv"
df_original = pd.read_csv(
    DATA_PATH,
    index_col=0,
    parse_dates=['Start_Time', 'End_Time'],
    infer_datetime_format=True,
)
# All further work happens on a copy so the freshly loaded frame stays intact.
df_base = df_original.copy()
df_base.info()

In [None]:
# Summary statistics of the raw working copy (displayed as the cell output).
df_base.describe()

In [None]:
# Split the raw frame by dtype: object columns on one side, everything else
# (numbers, booleans, datetimes) on the other.
df_obj = df_base.select_dtypes(include='object')
df_num = df_base.select_dtypes(exclude='object')

In [None]:
# Cardinality of every object column, as a count and as a share of all rows.
df_obj_nunique = (
    df_obj.nunique()
    .rename_axis('columns_name')
    .reset_index(name='unique_value_count')
)
df_obj_nunique['unique_ratio'] = df_obj_nunique['unique_value_count'] / len(df_base)
df_obj_nunique.sort_values(by='unique_ratio', ascending=False)

In [None]:
# Cardinality of every non-object column, as a count and as a share of all rows.
df_num_nunique = (
    df_num.nunique()
    .rename_axis('columns_name')
    .reset_index(name='unique_value_count')
)
df_num_nunique['unique_ratio'] = df_num_nunique['unique_value_count'] / len(df_base)
df_num_nunique.sort_values(by='unique_ratio', ascending=False)

In [None]:
# Missing values per non-object column; only columns with at least one gap are shown.
df_missing = (
    df_num.isna().sum()
    .rename_axis('columns_name')
    .reset_index(name='missing_count')
)
df_missing['missing_ratio'] = df_missing['missing_count'] / len(df_base)
df_missing[df_missing['missing_ratio'] > 0].sort_values(by='missing_ratio', ascending=False)

In [None]:
# Missing values per object column; only columns with at least one gap are shown.
df_missing = (
    df_obj.isna().sum()
    .rename_axis('columns_name')
    .reset_index(name='missing_count')
)
df_missing['missing_ratio'] = df_missing['missing_count'] / len(df_base)
df_missing[df_missing['missing_ratio'] > 0].sort_values(by='missing_ratio', ascending=False)

In [None]:
# Columns removed before cleaning; names that are absent are silently skipped.
unused_cols = ['End_Lat', 'End_Lng', 'Precipitation(in)', 'Wind_Chill(F)', 'Turning_Loop', 'Country']
df_clean = df_base.drop(columns=unused_cols, errors='ignore')

df_clean_obj = df_clean.select_dtypes(include='object')
# Datetime columns are left out of the numeric part: they have no nulls and
# the imputer used afterwards cannot handle that dtype.
df_clean_num = df_clean.select_dtypes(exclude=['object', 'datetime64'])

# Number of columns that go through imputation.
len(df_clean_obj.columns) + len(df_clean_num.columns)

In [None]:
def _impute(frame, strategy):
    """Fit a SimpleImputer with `strategy` on `frame`; return (imputer, imputed frame)."""
    imputer = SimpleImputer(strategy=strategy)
    filled = pd.DataFrame(
        imputer.fit_transform(frame),
        columns=frame.columns,
        index=frame.index,
    )
    return imputer, filled


# Object columns get the most frequent value, numeric columns get the mean.
obj_imputer, df_cl_obj = _impute(df_clean_obj, 'most_frequent')
num_imputer, df_cl_num = _impute(df_clean_num, 'mean')

# Sanity check: imputation must not alter existing entries, so a column
# compared against its pre-imputation version should be equal on every row.
print(df_clean_obj.County.eq(df_cl_obj.County).all())

print(df_clean_num['Distance(mi)'].eq(df_cl_num['Distance(mi)']).all())

In [None]:
# Replace the working frames with their imputed versions; the two timestamp
# columns (skipped by the imputation) are attached to the object frame again.
timestamps = df_base[['Start_Time', 'End_Time']]
df_clean_obj = pd.concat([df_cl_obj, timestamps], axis=1)
df_clean_num = df_cl_num

In [None]:
# Day/Night indicators -> 1/0.
mapping_1 = {'Day': 1, 'Night': 0}
times = ["Civil_Twilight", "Nautical_Twilight", "Astronomical_Twilight", "Sunrise_Sunset"]
df_clean_obj[times] = df_clean_obj[times].replace(mapping_1)

# Timezone names -> small integer codes.
mapping_2 = {'US/Eastern': 1, 'US/Pacific': 2, 'US/Central': 3, 'US/Mountain': 4}
df_clean_obj['Timezone'] = df_clean_obj['Timezone'].replace(mapping_2)

# Calendar features derived from the accident start time.
start_ts = df_clean_obj['Start_Time']
df_clean_obj['Year'] = start_ts.dt.year.astype('uint16')
df_clean_obj['Month'] = start_ts.dt.strftime('%b').astype('category')
df_clean_obj['Day'] = start_ts.dt.day.astype('uint8')
df_clean_obj['Hour'] = start_ts.dt.hour.astype('uint8')
df_clean_obj['Min'] = start_ts.dt.minute.astype('float32')
df_clean_obj['Weekday'] = start_ts.dt.strftime("%a").astype('category')

# Duration of each accident in minutes, rounded to three decimals.
td = "Time_Duration(min)"
elapsed = df_clean_obj['End_Time'] - start_ts
df_clean_obj[td] = (elapsed / np.timedelta64(1, 'm')).round(3).astype('float32')

In [None]:
# Accidents whose duration is zero or negative are treated as invalid records.
neg_outliers = df_clean_obj[td] <= 0

# Keep only rows with a strictly positive duration. Filtering directly (instead
# of overwriting whole rows with NaN and dropping them afterwards) leaves the
# dtypes of the remaining columns untouched and makes the cell safe to re-run.
# Rows with a missing duration are removed as well, because NaN > 0 is False.
df_clean_obj = df_clean_obj.loc[df_clean_obj[td] > 0].copy()

In [None]:
# Target dtype for every column of the object-derived frame, grouped by dtype.
obj_dtypes = {
    **dict.fromkeys(
        ['Source', 'Street', 'City', 'County', 'State', 'Zipcode',
         'Airport_Code', 'Wind_Direction', 'Weekday', 'Month'],
        'category',
    ),
    **dict.fromkeys(['Description', 'Weather_Timestamp', 'Weather_Condition'], 'string'),
    **dict.fromkeys(
        ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'],
        'bool',
    ),
    **dict.fromkeys(['Start_Time', 'End_Time'], 'datetime64[ns]'),
    **dict.fromkeys(['Timezone', 'Day', 'Hour', 'Min'], 'uint8'),
    'Year': 'uint16',
    'Time_Duration(min)': 'float32',
}

# Target dtype for every column of the numeric frame, grouped by dtype.
num_dtypes = {
    'Severity': 'uint8',
    **dict.fromkeys(
        ['Start_Lat', 'Start_Lng', 'Distance(mi)', 'Temperature(F)', 'Humidity(%)',
         'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)'],
        'float32',
    ),
    **dict.fromkeys(
        ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
         'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal'],
        'bool',
    ),
}

df_clean_obj = df_clean_obj.astype(obj_dtypes)
df_clean_num = df_clean_num.astype(num_dtypes)

In [None]:
# Combine the numeric and object-derived frames column-wise.
# join='inner' keeps only the index labels present in BOTH frames. With the
# default outer join, any row that exists in just one frame comes back filled
# with NaN in the other frame's columns, which also upcasts the compact
# bool / uint columns.
df_clean = pd.concat([df_clean_num, df_clean_obj], axis=1, join='inner')
df_clean.info()

In [None]:
# Missing values per column of the combined frame, together with each column's dtype.
df_missing = (
    df_clean.isna().sum()
    .rename_axis('columns_name')
    .reset_index(name='missing_count')
)
df_missing['missing_ratio'] = df_missing['missing_count'] / len(df_clean)
df_missing['data_type'] = df_clean.dtypes.loc[df_missing['columns_name']].to_list()
df_missing[df_missing['missing_ratio'] > 0].sort_values(by=['missing_ratio', 'data_type'], ascending=False)

In [None]:
%matplotlib inline
state_acc_counts = pd.DataFrame(df_clean['State'].value_counts())
z = state_acc_counts.values.flatten()
x = state_acc_counts.index.to_list()
fig = go.Figure(data = go.Choropleth(locations = x, z = z, locationmode = "USA-states", colorscale = 'ylorrd'))

fig.update_layout(title_text = "Number of Accidents for each State in US", geo_scope = "usa")
fig.show()
## Observing which states have most accidents
fig,axs = plt.subplots(figsize = (10,6))

x = state_acc_counts[0:15].index.to_list()
y = state_acc_counts[0:15].values.flatten()

sns.barplot(x=x, y = y, palette='rainbow')
axs.tick_params(axis = 'x', rotation = 90)
axs.set_ylabel("Number of Accidents")
axs.set_xlabel("States")
plt.title("Top 15 States with Highest Number of Accidents")
plt.savefig("Top_15_States_Accidents.png",bbox_inches = 'tight', dpi = 300)
plt.show()

In [None]:
# Accident counts per city, largest first.
city_acc_counts = df_clean['City'].value_counts().to_frame().reset_index()
city_acc_counts.columns = ['City', "Number of Accidents"]
city_acc_counts = city_acc_counts.sort_values(by='Number of Accidents', ascending=False)

# Top 15 cities by number of accidents (log-scaled y axis).
top_cities = city_acc_counts.head(15)
x = top_cities['City'].to_list()
y = top_cities["Number of Accidents"]

fig, axs = plt.subplots(figsize=(10, 6))
sns.barplot(x=x, y=y, ax=axs, palette='rainbow')
axs.tick_params(axis='x', rotation=90)
axs.set_yscale("log")
axs.set_ylabel("Number of Accidents")
axs.set_xlabel("Cities")
axs.set_title("Top 15 Cities with Highest Number of Accidents")
plt.savefig("Top_15_Cities_Accidents.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Accident counts per weekday, in the order returned by value_counts.
weekday_acc_counts = df_clean['Weekday'].value_counts().to_frame().reset_index()
weekday_acc_counts.columns = ["Day", "Number of Accidents"]
days = weekday_acc_counts['Day']
acc = weekday_acc_counts["Number of Accidents"]
# weekday -> count for the seven rows of the table
dc = {day: acc[pos] for pos, day in enumerate(days[:7])}

plt.figure(figsize=(20, 8))
plt.title("Number of accidents for each weekday")
sns.barplot(x=list(dc), y=list(dc.values()), palette='rainbow')
plt.xlabel("Weekday")
plt.ylabel("Number of Accidents")
plt.savefig("Accidents_Weekday_Distribution.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# The 15 most frequent weather conditions and how often each one occurs.
counts = df_clean["Weather_Condition"].value_counts().iloc[:15]

plt.figure(figsize=(20, 8))
weather_ax = plt.gca()
weather_ax.set_title("Histogram distribution of the top 15 weather conditions")
sns.barplot(x=counts.index, y=counts.values, ax=weather_ax)
weather_ax.set_xlabel("Weather Condition")
weather_ax.set_ylabel("Value")
plt.savefig("Weather_Accident_Distribution.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Ten most frequent non-stopword description words, one panel per severity level.
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus
# to be available in this environment - confirm.
stop = stopwords.words("english") + ["-"]
severity_levels = [1, 2, 3, 4]

fig, axs = plt.subplots(2, 2, figsize=(18, 12))
axs = axs.ravel()

for ax, severity in zip(axs, severity_levels):
    df_desc = df_clean[df_clean["Severity"] == severity]["Description"]

    # Join every description of this level into one lower-cased string and
    # split it on whitespace to get the individual words.
    df_words = df_desc.str.cat(sep=' ').lower().split()

    # Count the words, discard stopwords, keep the ten most frequent.
    counts = pd.Series(df_words).value_counts().loc[lambda s: ~s.index.isin(stop)][:10]

    sns.barplot(x=counts.values, y=counts.index, orient="h", ax=ax, palette='rainbow')
    ax.set_title(f"Top 10 words used to describe an accident with severity {severity}")
    ax.set_xlabel("Value")
    ax.set_ylabel("Word")

# The layout must be adjusted BEFORE saving; otherwise the PNG is written with
# the un-adjusted subplot spacing.
plt.tight_layout()
plt.savefig("Frequent_Words_PerSeverityLevel.png", bbox_inches='tight', dpi=300)
plt.show()
    

In [None]:
# Accidents per severity level, bars ordered from most to least frequent.
severity_order = df_clean['Severity'].value_counts().index

fig, axs = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_clean, x='Severity', order=severity_order, palette='rainbow', ax=axs)
axs.tick_params(axis='x', rotation=0)
axs.set_ylabel("Accident Count")
axs.set_title("Number of Accidents per Severity Level")
plt.savefig("Num_Accidents_Per_Severity_Level.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Accidents per data source, bars ordered from most to least frequent.
source_order = df_clean['Source'].value_counts().index

fig, axs = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_clean, x='Source', order=source_order, palette='rainbow', ax=axs)
axs.tick_params(axis='x', rotation=0)
axs.set_ylabel("Accident Count")
axs.set_title("Number of Accidents per Source Level")
plt.savefig("Accidents_Per_Source_Level.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Pairwise correlation of the numeric/boolean columns, drawn as an annotated heatmap.
# numeric_only=True is required because df_clean also holds string, category
# and datetime columns; without it, newer pandas versions raise instead of
# silently skipping those columns.
corr_matrix = df_clean.corr(numeric_only=True)
plt.figure(figsize=(30, 30))
sns.heatmap(
    corr_matrix,
    annot=True,
    linewidths=1,
    linecolor='k',
    square=True,
    mask=False,
    vmin=-1,
    vmax=1,
    cbar=True,
    cbar_kws={"orientation": "vertical"},
)
# Hatch the axes background so cells without a drawn value stand out.
plt.gca().patch.set(hatch="X", edgecolor="#666")
plt.show()

In [None]:
# Candidate predictors: every column except the target whose dtype is one of
# the compact numeric/boolean dtypes assigned during cleaning.
x_cols = [
    col for col in df_clean.columns
    if col != 'Severity' and df_clean[col].dtype in ['float32', 'uint8', 'bool', 'uint16']
]

# Pearson correlation of each candidate with Severity.
severity_values = df_clean.Severity.values
labels = list(x_cols)
values = [np.corrcoef(df_clean[name].values, severity_values)[0, 1] for name in x_cols]

corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values', ascending=False).reset_index()

# Horizontal bar chart of the coefficients, in the sorted order of corr_df.
ind = np.arange(len(labels))
width = 0.9
fig, ax = plt.subplots(figsize=(20, 12))
rects = ax.barh(ind, corr_df['corr_values'].to_numpy(), color='orange')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df['col_labels'].to_numpy(), rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation coefficient of the variables")
plt.show()

In [None]:
# Features whose correlation with Severity lies within [-0.01, 0.01] (bounds included).
weakly_correlated = corr_df['corr_values'].between(-0.01, 0.01)
less_corr_features = corr_df.loc[weakly_correlated, 'col_labels']
less_corr = less_corr_features.to_list()
less_corr

In [None]:
# Number of accidents for every (Year, Severity) pair.
grouped = df_clean.groupby(['Year', 'Severity'])['Start_Lat'].count()

# Years as rows, severity levels as columns.
df_pivot = grouped.unstack('Severity')

# Stacked bars per year on a logarithmic y axis.
ax = df_pivot.plot.bar(stacked=True)
ax.set_yscale("log")
ax.set_title("Accidents Organized by Severity Level per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Accidents")
plt.savefig("Accidents_Organized_by_Severity_Level_per_Year.png", bbox_inches='tight', dpi=300)
plt.show()


In [None]:
# Columns removed before modelling: the weakly correlated features plus
# free-text, location-identifier and raw timestamp columns.
identifier_cols = [
    "Description", 'Street', 'County', 'Zipcode', 'State',
    'Airport_Code', 'Weather_Timestamp', 'Start_Time', 'End_Time',
]
dropped_cols = less_corr + identifier_cols
df_clean = df_clean.drop(columns=dropped_cols)
df_clean.describe().round(3)

In [None]:
# Remove fully duplicated rows and report the row count before and after.
rows_before = len(df_clean.index)
print("Number of rows:", rows_before)
df_clean = df_clean.drop_duplicates()
rows_after = len(df_clean.index)
print("Number of rows after drop of duplicates:", rows_after)

In [None]:
# Discard rows that report a pressure of exactly 0.
# .copy() makes the result an independent frame, so the in-place column
# assignments done on df_clean afterwards do not raise SettingWithCopyWarning.
df_clean = df_clean[df_clean["Pressure(in)"] != 0].copy()
len(df_clean.index)

In [None]:
# Distinct weather descriptions: first how many there are, then the values themselves.
unique_weather = df_clean["Weather_Condition"].unique()

print(len(unique_weather), list(unique_weather), sep="\n")

In [None]:
# Collapse the detailed weather descriptions into coarse groups.
# The order matters: a description is relabelled by the first pattern that
# matches it, and the new label is no longer matched by a later pattern that
# maps to a different group.
weather_groups = [
    ("Rain|Drizzle|Shower|Precipitation", "Rain"),
    ("Cloudy|Overcast", "Cloudy"),
    ("Snow|Sleet|Wintry", "Snow"),
    ("Fog|Mist", "Fog"),
    ("Clear|Fair", "Clear"),
    ("Smoke|Volcanic Ash", "Smoke"),
    ("Thunder|T-Storm", "Thunderstorm"),
    ("Sand|Dust", "Sand"),
    ("Wind|Squalls", "Windy"),
    ("Hail|Pellets", "Hail"),
]
for pattern, label in weather_groups:
    matches = df_clean["Weather_Condition"].str.contains(pattern, na=False)
    df_clean.loc[matches, "Weather_Condition"] = label

df_clean["Weather_Condition"] = df_clean["Weather_Condition"].astype('category')
df_clean.Weather_Condition.value_counts()

In [None]:
# Distinct wind-direction labels.
# tolist() is available on both possible results of unique() (a Categorical
# for category columns, a NumPy array otherwise); to_list() is not.
df_clean["Wind_Direction"].unique().tolist()

In [None]:
# Remaining missing values per column.
df_clean.isna().sum()   

In [None]:
# NOTE(review): only a cell's last expression is displayed, so the describe()
# table below is computed but not shown; info() prints its own report.
df_clean.describe().round(3)
df_clean.info()

In [None]:
# Class distribution of the target before balancing.
severity_counts = df_clean["Severity"].value_counts()

plt.figure(figsize=(10, 8))
severity_ax = plt.gca()
severity_ax.set_title("Histogram for the severity")
sns.barplot(x=severity_counts.index, y=severity_counts.values, ax=severity_ax)
severity_ax.set_xlabel("Severity")
severity_ax.set_ylabel("Value")
plt.savefig("UnBalanced_Severity.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Rows to draw per severity level when balancing the classes.
# The size of the smallest class is used rather than assuming that level 1 is
# the rarest: sampling more rows than a class contains (without replacement)
# would raise a ValueError.
size = int(df_clean["Severity"].value_counts().min())
size

In [None]:
# Under-sample every severity level (1-4) to `size` rows.
# Each level is sampled with the same fixed seed, and the pieces are
# concatenated once instead of growing a DataFrame inside the loop (which
# re-copies the accumulated rows on every iteration).
balanced_parts = [
    df_clean[df_clean["Severity"] == level].sample(size, random_state=42)
    for level in range(1, 5)
]
df_balanced = pd.concat(balanced_parts, axis=0)
df_balanced.info()

In [None]:
# (rows, columns) of the balanced sample.
df_balanced.shape

In [None]:
# Class distribution of the target after balancing.
severity_counts = df_balanced["Severity"].value_counts()

plt.figure(figsize=(10, 8))
severity_ax = plt.gca()
severity_ax.set_title("Histogram for the severity")
sns.barplot(x=severity_counts.index, y=severity_counts.values, ax=severity_ax)
severity_ax.set_xlabel("Severity")
severity_ax.set_ylabel("Value")
plt.savefig("Balanced_Severity.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Group the columns by role, based on their dtype:
#   numeric     -> float32 / uint8 / uint16
#   boolean     -> bool
#   categorical -> everything that is none of the above (int64 is excluded too)
numeric_kinds = ['float32', 'uint8', 'uint16']
num_features = list(df_balanced.select_dtypes(include=numeric_kinds, exclude=['bool']).columns)
cat_features = list(df_balanced.select_dtypes(exclude=numeric_kinds + ['bool', 'int64']).columns)
bool_features = list(df_balanced.select_dtypes(include=['bool']).columns)
cat_features, num_features, bool_features

In [None]:
# Scale the numeric features with MinMaxScaler (default settings); the target
# column is left out.
# Filtering the list (instead of list.remove) keeps the cell re-runnable: a
# second run no longer fails because "Severity" has already been removed.
num_features = [col for col in num_features if col != "Severity"]
scaler = MinMaxScaler()
df_balanced[num_features] = scaler.fit_transform(df_balanced[num_features])
# Show a preview only, not the whole frame.
df_balanced.head()

In [None]:
# Summary statistics of the balanced frame after scaling.
df_balanced.describe()

In [None]:
# Turn True/False into 1/0 across the frame, then store the former boolean
# columns as uint8.
df_balanced = df_balanced.replace(to_replace=[True, False], value=[1, 0])
df_balanced = df_balanced.astype({col: 'uint8' for col in bool_features})
df_balanced.head()

In [None]:
# Checkpoint the balanced frame on disk and load it back.
name_p = "test.pickle"
df_balanced.to_pickle(name_p)
# The file was written by the line above; read_pickle must never be pointed at
# an untrusted file, because unpickling can execute arbitrary code.
df_balanced = pd.read_pickle(name_p)

# Categorical columns to one-hot encode: all of them except "City".
# A list comprehension keeps the original column order; a set difference would
# return the names in an order that can change between interpreter runs, and
# with it the column order of the one-hot encoded frame.
onehot_cols = [col for col in cat_features if col != "City"]
onehot_cols

In [None]:
# Number of distinct values in each column that is about to be one-hot encoded.
df_balanced[onehot_cols].nunique()

In [None]:
# One-hot encode the selected categorical columns; drop_first=True omits the
# first level of every column.
df_balanced = pd.get_dummies(
    df_balanced,
    columns=onehot_cols,
    drop_first=True,
)
df_balanced.head()

In [None]:
# Column names and dtypes after one-hot encoding.
df_balanced.info()

In [None]:
# Number of rows per city in the balanced sample.
city_counts = df_balanced["City"].value_counts()
city_counts


In [None]:
# Cities listed in the counts with zero rows, and any rows that refer to them.
zero_count_cities = city_counts.loc[city_counts.eq(0)].index
in_zero_count_city = df_balanced["City"].isin(zero_count_cities)
instances = df_balanced[in_zero_count_city]
instances.sort_values(by="City")

In [None]:
# Cities that occur at least once (this result is not displayed) ...
city_counts[city_counts.gt(0)]
# ... and cities that occur between 1 and 5 times (displayed).
city_counts[city_counts.gt(0) & city_counts.lt(6)]

In [None]:
# Mean target encoding: every row receives the average Severity of its city.
city_severity_mean = df_balanced.groupby("City")["Severity"].transform("mean")
df_balanced["city_mean_encoded"] = city_severity_mean.astype('float32')

df_balanced[["City", "Severity", "city_mean_encoded"]].sort_values(by="Severity")

In [None]:
from category_encoders import MEstimateEncoder

# Features and target are separated on a copy, so df_balanced itself is unchanged.
y = df_balanced['Severity'].copy()
X = df_balanced.drop(columns='Severity')

# M-estimate encoder for the City column; m controls the amount of smoothing.
encoder = MEstimateEncoder(cols=["City"], m=3)

# The encoder is fitted on the complete balanced frame ...
encoder.fit(X, y)

# ... and then applied to that same frame to encode the City column.
X_train = encoder.transform(X)

In [None]:
# Compare both city encodings (density curves) with the target distribution (bars).
plt.figure(dpi=90)
ax = sns.countplot(x=y, palette='rainbow')
for encoded, colour in ((X_train.city_mean_encoded, 'purple'), (X_train.City, 'red')):
    ax = sns.kdeplot(encoded, color=colour, ax=ax)
ax.set_xlabel("Severity")
ax.set_ylim([0, 10])
ax.legend(labels=['mean_encoding', 'm_encoding'], loc='upper right')
plt.savefig("Target_Encoding.png", bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# The raw City column is dropped; its mean-encoded counterpart stays in the frame.
df_balanced = df_balanced.drop(columns=["City"])
df_balanced.describe()

In [None]:
# Persist the final frame and load it back as the modelling frame.
final_pickle = "final_df.pickle"
df_balanced.to_pickle(final_pickle)
df_model = pd.read_pickle(final_pickle)
df_model.info()