In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

## Loading Dataset


In [3]:
# Load the raw accident records from an Excel workbook into `df`.
# NOTE(review): the path is absolute ("/content/..."); confirm the file is present
# at that location in the environment where the notebook is run.
df=pd.read_excel("/content/Accident_Information.xlsx")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame's numeric columns.
df.describe()

## Cleaning Data

In [None]:
# Work on a copy so the raw frame `df` is left untouched by the cleaning steps.
df_clean=df.copy()

In [None]:
# Per-column count and percentage of missing values in the working frame.
# Both figures are derived from df_clean (the original mixed `df` and `df_clean`),
# so they stay consistent if this cell is re-run after df_clean has been modified.
Missing_count=df_clean.isnull().sum()
percentage=Missing_count*100/df_clean.shape[0]

In [None]:
# Tabulate the missing-value statistics and display only the columns that
# actually have gaps, ordered from most to fewest missing entries.
missing_data = pd.DataFrame(
    {'Missing_Count': Missing_count, 'Missing_Percentage': percentage}
)
missing_data.loc[missing_data['Missing_Count'] > 0].sort_values(
    by='Missing_Count', ascending=False
)

In [None]:
# Columns removed before analysis. Each label is listed once (the original list
# repeated 'Police_Force' and 'Junction_Control').
# 'Day_of_Week' is dropped here; a day_of_week column is derived from the
# accident date during feature engineering.
drop_cols = [
    'Accident_Index',
    'Local_Authority_(District)',
    'Local_Authority_(Highway)',
    'LSOA_of_Accident_Location',
    'Police_Force',
    '1st_Road_Number',
    '2nd_Road_Number',
    'Carriageway_Hazards',
    'Special_Conditions_at_Site',
    'Location_Easting_OSGR',
    'Location_Northing_OSGR',
    '1st_Road_Class',
    '2nd_Road_Class',
    'InScotland',
    'Pedestrian_Crossing-Physical_Facilities',
    'Pedestrian_Crossing-Human_Control',
    'Junction_Control',
    'Day_of_Week',
]
# Assign the result instead of mutating in place so the lineage of df_clean is explicit.
df_clean = df_clean.drop(columns=drop_cols)


In [None]:
# Remove every row that still contains at least one missing value (modifies df_clean in place).
df_clean.dropna(inplace=True)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_clean.duplicated().sum()

In [None]:
# Remove the duplicated rows in place (pandas keeps the first occurrence by default).
df_clean.drop_duplicates(inplace=True)

In [None]:
# Show the column names that remain after dropping.
df_clean.columns

In [None]:
# Normalise column names: lowercase, surrounding whitespace removed,
# hyphens replaced by underscores. The final line displays the result.
df_clean.columns = (
    df_clean.columns
    .str.lower()
    .str.strip()
    .str.replace('-', '_')
)
df_clean.columns

In [None]:
# List the distinct category labels of the two condition columns so they can
# be consolidated in the next step.
print(df_clean['road_surface_conditions'].unique())
print("----------------------------------------------------------------")
print(df_clean['weather_conditions'].unique())

In [None]:
# Consolidate raw category labels.
#
# Road surface: the original mapping used the column name itself as the key,
# which never occurs as a value, so nothing was replaced.
# NOTE(review): 'Data missing or out of range' is assumed to be the intended
# placeholder (it is the one the weather mapping uses); confirm against the
# unique values printed for this column.
#
# The results are assigned back to the columns: calling
# replace(..., inplace=True) on a selected column operates on an intermediate
# object and does not reliably update df_clean under pandas copy-on-write.
df_clean['road_surface_conditions'] = df_clean['road_surface_conditions'].replace(
    {'Data missing or out of range': 'Unknown'}
)

# Weather: merge the wind variants and map placeholder labels to 'Unknown'.
weather_map = {
    'Data missing or out of range': 'Unknown',
    'Other': 'Unknown',
    'Raining no high winds': 'Raining',
    'Raining + high winds': 'Raining',
    'Snowing no high winds': 'Snowing',
    'Snowing + high winds': 'Snowing',
    'Fog or mist': 'Fog',
    'Fine no high winds': 'Normal',
    'Fine + high winds': 'Normal',
}
df_clean['weather_conditions'] = df_clean['weather_conditions'].replace(weather_map)

In [None]:
# Keep only rows whose urban/rural classification is known.
# The explicit .copy() makes df_clean an independent frame, so the column
# assignments made later in the notebook do not trigger SettingWithCopyWarning.
df_clean=df_clean[df_clean['urban_or_rural_area']!='Unallocated'].copy()

In [None]:
# Display the urban/rural column after filtering.
df_clean['urban_or_rural_area']

In [None]:
# Give the police-attendance column a shorter name.
# NOTE(review): the source column name reads like an attendance indicator rather
# than a count of officers; confirm the new name reflects the data.
df_clean = df_clean.rename(
    columns={'did_police_officer_attend_scene_of_accident': 'no_of_police_at_scene'}
)

In [None]:
# Binary-encode lighting: 1 for daylight, 0 for every darkness variant.
# Labels not listed in the mapping are left unchanged by replace().
# The result is assigned back to the column; replace(..., inplace=True) on a
# selected column acts on an intermediate object and does not reliably update
# df_clean under pandas copy-on-write.
light_map = {
    'Daylight': 1,
    'Darkness - no lighting': 0,
    'Darkness - lights lit': 0,
    'Darkness - lighting unknown': 0,
    'Darkness - lights unlit': 0,
}
df_clean['light_conditions'] = df_clean['light_conditions'].replace(light_map)

## Performing feature engineering

In [None]:
# Combine the date and time columns (both rendered as strings and joined with a
# space) and parse the result into a single datetime column.
df_clean['datetime'] = pd.to_datetime(df_clean['date'].astype(str) + ' ' + df_clean['time'].astype(str))

In [None]:
# Calendar features derived from the combined datetime column.
df_clean['month'] = df_clean['datetime'].dt.month
df_clean['hour'] = df_clean['datetime'].dt.hour
df_clean['day_of_week'] = df_clean['datetime'].dt.day_name()

# dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
weekend_mask = df_clean['datetime'].dt.dayofweek.isin([5, 6])
df_clean['is_weekend'] = weekend_mask.astype(int)

# Night is defined here as before 06:00 or from 20:00 onwards.
night_mask = (df_clean['hour'] < 6) | (df_clean['hour'] > 19)
df_clean['is_night'] = night_mask.astype(int)


In [None]:
def season(month):
    """Return the season name for a month number.

    December-February map to 'Winter', March-May to 'Spring' and
    June-August to 'Summer'. Every other value, including anything
    outside 1-12, yields 'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for label, months in season_months:
        if month in months:
            return label
    return 'Autumn'

In [None]:
# Label each row with its season, derived element-wise from the month number.
df_clean['season'] = df_clean['month'].map(season)


In [None]:
# The separate date and time columns are no longer needed once the combined
# datetime column exists.
df_clean = df_clean.drop(columns=['date', 'time'])

## Removing Outliers

In [None]:
# Summary statistics of the cleaned frame, used to spot extreme values.
df_clean.describe()

In [None]:
# Distribution of vehicles involved per accident, split by severity.
sns.boxplot(x='accident_severity', y='number_of_vehicles', data=df_clean)

In [None]:
# Inspect the accidents that involve more than 20 vehicles.
df_clean[df_clean['number_of_vehicles']>20]

In [None]:
# Keep only accidents involving at most 10 vehicles.
# NOTE(review): the inspection cell looks at rows with more than 20 vehicles,
# while this filter cuts at 10; confirm the intended threshold.
df_clean=df_clean[df_clean['number_of_vehicles']<=10]

In [None]:
# Distribution of casualties per accident, split by severity.
sns.boxplot(x='accident_severity', y='number_of_casualties', data=df_clean)

In [None]:
# Inspect the accidents with 20 or more casualties.
df_clean[df_clean['number_of_casualties']>=20]

In [None]:
# Keep only accidents with at most 15 casualties.
# NOTE(review): the inspection cell looks at rows with 20 or more casualties,
# while this filter cuts at 15; confirm the intended threshold.
# The explicit .copy() makes df_clean an independent frame, so the later
# assignment to its accident_severity column does not trigger SettingWithCopyWarning.
df_clean=df_clean[df_clean['number_of_casualties']<=15].copy()

In [None]:
# Optional export, currently disabled.
# NOTE(review): the commented-out call writes `df` (the raw frame) to
# "accident_clean.xlsx", not `df_clean`; confirm which frame is meant before enabling.
#df.to_excel("accident_clean.xlsx")

## EDA

In [None]:
# Show one row of the cleaned frame as a reminder of the available columns.
df_clean.head(1)

In [None]:
# Dtypes and non-null counts of the cleaned frame.
df_clean.info()

In [None]:
# Total casualties per season, plotted in calendar order.
# groupby sorts the season labels alphabetically (Autumn, Spring, Summer,
# Winter); a line drawn through that order suggests a trend that is not there,
# so the result is reindexed explicitly, as is done for the weekday plot.
season_order = ['Winter', 'Spring', 'Summer', 'Autumn']
season_trends = (
    df_clean.groupby('season')['number_of_casualties']
    .sum()
    .reindex(season_order)
)
ax = season_trends.plot(kind='line', marker='o', color='teal')
ax.set_title("Total Number of Casualties by Season")
ax.set_xlabel("Season")
ax.set_ylabel("Number of Casualties")
plt.tight_layout()
plt.show()



In [None]:
# Hourly casualty totals, one line per is_weekend value (0 = weekday, 1 = weekend).
hourly_totals = df_clean.groupby(['hour', 'is_weekend'])['number_of_casualties'].sum()
weekend_trends = hourly_totals.unstack()

ax = weekend_trends.plot(kind='line', marker='o', figsize=(10, 5))
ax.set_title("Casualties by Hour: Weekday vs Weekend")
plt.tight_layout()
plt.show()

In [None]:
# Total casualties per weekday, reordered from alphabetical to Monday-Sunday.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
dow_casualties = (
    df_clean.groupby('day_of_week')['number_of_casualties']
    .sum()
    .reindex(weekday_order)
)
dow_casualties.plot(kind='line', marker='o', color='orange')


In [None]:
# Total casualties per month as an annotated line chart.
monthly_casualties = df_clean.groupby('month')['number_of_casualties'].sum()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    monthly_casualties.index,
    monthly_casualties.values,
    marker='o',
    linestyle='-',
    color='teal',
    linewidth=2,
)
ax.set_title("Total Number of Casualties by Month", fontsize=14, fontweight='bold')
ax.set_xlabel("Month", fontsize=12)
ax.set_ylabel("Number of Casualties", fontsize=12)
ax.set_xticks(monthly_casualties.index)

# Write each monthly total just above its marker.
for x, y in zip(monthly_casualties.index, monthly_casualties.values):
    ax.text(x, y + 1, str(y), ha='center', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Share of accidents in each severity class.
counts = df_clean['accident_severity'].value_counts()
labels = counts.index

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(counts, labels=labels, autopct='%1.1f%%')
ax.set_title("Accident Severity Distribution", fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
# Total casualties in each severity class.
severity_totals = df_clean.groupby('accident_severity')['number_of_casualties'].sum()

fig, ax = plt.subplots(figsize=(8, 5))
severity_totals.plot(kind='bar', ax=ax)
ax.set_title('Casualities based on accident severity')
ax.set_xlabel("Accident Severity")
plt.tight_layout()
plt.show()

In [None]:
# Casualties against vehicles involved, coloured by accident severity.
sns.scatterplot(data=df_clean,x='number_of_casualties',y='number_of_vehicles',hue='accident_severity')

In [None]:
# Total casualties per severity class, split by weather condition.
# The bars are grouped by `weather_conditions`, so the title and legend name
# that variable (the original text said "Road Surface Conditions").
plt.figure(figsize=(8,5))
sns.barplot(data=df_clean,x='accident_severity',y='number_of_casualties',hue='weather_conditions',
            estimator=sum)
plt.title("Number of Casualties by Accident Severity and Weather Conditions", fontsize=14)
plt.xlabel("Accident Severity")
plt.ylabel("Total Number of Casualties")
plt.legend(title="Weather Conditions")
plt.tight_layout()
plt.show()

In [None]:
# Number of accidents per severity class, stacked by urban/rural area.
counts = (
    df_clean
    .groupby(['accident_severity', 'urban_or_rural_area'])
    .size()
    .unstack()
)
ax = counts.plot(kind='bar', stacked=True, figsize=(8, 6))
ax.set_title("Accident Severity by Urban/Rural Area (Stacked)")
ax.set_xlabel("Accident Severity")
ax.set_ylabel("Number of Accidents")
ax.legend(title="Urban/Rural")
plt.tight_layout()
plt.show()


In [None]:
# Casualties against the speed limit, coloured by accident severity.
sns.scatterplot(x='speed_limit', y='number_of_casualties', hue='accident_severity', data=df_clean)


In [None]:
# Total casualties for every severity / area combination.
# fill_value=0 keeps the table integer-typed when a combination has no rows;
# without it the missing cell becomes NaN, the table turns float, and the
# integer annotation format fmt="d" raises.
pivot = df_clean.pivot_table(
    index='accident_severity',
    columns='urban_or_rural_area',
    values='number_of_casualties',
    aggfunc='sum',
    fill_value=0,
)
sns.heatmap(pivot, annot=True, fmt="d", cmap='YlGnBu')
plt.title("Heatmap of Number of Casualties by Severity and Area")
plt.show()



In [None]:
# Dtypes and non-null counts of df_clean at this point in the notebook.
df_clean.info()

In [None]:
# Number of accidents recorded during the day (is_night == 0) and at night (is_night == 1).
df_clean['is_night'].map({0:"Day",1:"Night"}).value_counts().plot(kind='bar')

In [None]:
# Total casualties per severity class, split into day and night.
# aggfunc='sum' adds up the casualties, which is what the title and y-axis
# label describe; the original 'count' tallied rows, i.e. the number of accidents.
time_of_day = df_clean['is_night'].map({0:"Day",1:"Night"})
piv=df_clean.pivot_table(index='accident_severity',columns=time_of_day,values='number_of_casualties',
                         aggfunc='sum')
piv.plot(kind='bar', figsize=(8,6))
plt.title("Number of Casualties by Accident Severity and Time of Day")
plt.xlabel("Accident Severity")
plt.ylabel("Number of Casualties")
plt.xticks(rotation=0)
plt.legend(title="Time of Day")
plt.tight_layout()
plt.show()

In [None]:
# Dtypes and non-null counts of df_clean before preparing the modelling data.
df_clean.info()

## Feature selection for machine learning

In [None]:
# Encode the target as integers: Slight -> 0, Serious -> 1, Fatal -> 2.
# Caution: Series.map turns any value missing from the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1/2) would
# replace the whole column with NaN.
df_clean['accident_severity']=df_clean['accident_severity'].map({'Slight':0,'Serious':1,'Fatal':2})

In [None]:
# Split the cleaned data by year: df_clean1 holds 2007, df_clean2 holds 2008.
# NOTE(review): in the cells visible here only df_clean1 is used for
# modelling; confirm how df_clean2 is meant to be used.
df_clean1=df_clean[df_clean['year']==2007]
df_clean2=df_clean[df_clean['year']==2008]

In [None]:
# Columns excluded from the modelling frames. The list is defined once and
# applied to both yearly frames so they cannot drift apart.
model_drop_cols = [
    'no_of_police_at_scene', 'junction_detail', 'latitude', 'longitude', 'year',
    'datetime', 'month', 'hour', 'light_conditions',
]
# Both frames are filtered slices of df_clean; assigning the result of drop()
# avoids the SettingWithCopyWarning that an in-place drop on a slice triggers.
df_clean1 = df_clean1.drop(columns=model_drop_cols)
df_clean2 = df_clean2.drop(columns=model_drop_cols)

In [None]:
# Pairwise correlation between the numeric columns of the 2007 frame.
num = df_clean1.select_dtypes(include='number')
correlations = num.corr()
sns.heatmap(correlations, annot=True, fmt=".2g")

In [None]:
# Dtypes and non-null counts of the 2007 modelling frame.
df_clean1.info()

In [None]:
# Print the number of distinct values in each column of the modelling frame.
for column_name, distinct_count in df_clean1.nunique().items():
  print(column_name, distinct_count)

In [None]:
# Show one row of the modelling frame to check the column layout.
df_clean1.head(1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Separate the features from the target by column name. The original positional
# split (iloc[:, 0] as target) only works while accident_severity happens to be
# the first column; selecting by name gives the same result in that case and
# stays correct if the column order changes.
target_col = 'accident_severity'
X=df_clean1.drop(columns=target_col)
y=df_clean1[target_col]

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified; if the severity classes are
# imbalanced, consider stratify=y so both sets keep the same class proportions.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=10)

In [None]:
# Column groups, one per preprocessing treatment:
#   catcols - categorical columns to be one-hot encoded
#   two     - column to be ordinal-encoded
#   num     - names of all numeric columns in X
# Note: `num` was previously bound to a DataFrame (correlation cell); from here on it is a list of names.
catcols=['road_surface_conditions','road_type','weather_conditions','day_of_week','season']
two=['urban_or_rural_area']
num=X.select_dtypes(include='number').columns.tolist()

In [None]:
# Preprocessing pipeline applied column-wise:
#   'ohe'    - one-hot encode catcols, dropping the first level of each feature
#   'scaler' - standardise the numeric columns
#   'oe'     - ordinal-encode the urban/rural column
# No `remainder` is given, so columns outside these three groups fall under
# ColumnTransformer's default handling (documented default: 'drop').
processor=ColumnTransformer(
    transformers=[
        ('ohe',OneHotEncoder(drop='first'),catcols),
        ('scaler',StandardScaler(),num),
        ('oe',OrdinalEncoder(),two)
    ]
)

In [None]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split.
X_train_scaled=processor.fit_transform(X_train)
X_test_scaled=processor.transform(X_test)

In [None]:
# Class distribution of the training target.
y_train.value_counts()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report,accuracy_score,f1_score,confusion_matrix

In [None]:
# Candidate classifiers, keyed by the display name used in the evaluation output.
# The randomised scikit-learn estimators get a fixed random_state (same seed as
# the CatBoost model) so a fresh run of the notebook reproduces the same scores.
models={
    'Random Forest':RandomForestClassifier(random_state=42),
    'Gradient Boosting':GradientBoostingClassifier(random_state=42),
    'Decision Tree':DecisionTreeClassifier(random_state=42),
    'AdaBoost':AdaBoostClassifier(random_state=42),
    'SVC':SVC(),
    # multi_class is deprecated in recent scikit-learn releases; the default
    # solver already fits a multinomial model for a multiclass target.
    'Logistic Regression':LogisticRegression(),
    'KNN':KNeighborsClassifier(),
    'XGB':XGBClassifier(objective='multi:softmax'),
    'CAT':CatBoostClassifier(verbose=0, random_seed=42)
}
def showperformance(test,pred):
  """Print evaluation metrics for one set of predictions.

  Prints the classification report, accuracy, weighted F1-score and the
  confusion matrix, in that order.

  Parameters
  ----------
  test : array-like
      True class labels.
  pred : array-like
      Predicted class labels, aligned with `test`.

  Returns
  -------
  None
  """
  print("Classification report")
  # zero_division=0 reports 0 for a class that is never predicted (the same
  # value scikit-learn uses by default) without emitting an
  # UndefinedMetricWarning for every model.
  print(classification_report(test,pred,zero_division=0))
  print("Accuracy: ",accuracy_score(test,pred))
  print("F1-score: ",f1_score(test,pred,average='weighted',zero_division=0))
  print('Confusion Matrix')
  print(confusion_matrix(test,pred))

# Fit every candidate on the preprocessed training data and report its
# performance on the held-out test data.
for model_name, estimator in models.items():
  print("\n")
  print(f'-------------{model_name}-------------')
  model=estimator.fit(X_train_scaled,y_train)
  y_pred=model.predict(X_test_scaled)
  showperformance(y_test,y_pred)
  print("="*35)