In [102]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import pickle


In [84]:
import warnings 
warnings.filterwarnings('ignore')

In [66]:
# Load the raw accident records; the path is relative to the notebook's working directory.
df = pd.read_csv('RTA Dataset.csv')

In [67]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [68]:
# Column dtypes and non-null counts: shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Time                         12316 non-null  object
 1   Day_of_week                  12316 non-null  object
 2   Age_band_of_driver           12316 non-null  object
 3   Sex_of_driver                12316 non-null  object
 4   Educational_level            11575 non-null  object
 5   Vehicle_driver_relation      11737 non-null  object
 6   Driving_experience           11487 non-null  object
 7   Type_of_vehicle              11366 non-null  object
 8   Owner_of_vehicle             11834 non-null  object
 9   Service_year_of_vehicle      8388 non-null   object
 10  Defect_of_vehicle            7889 non-null   object
 11  Area_accident_occured        12077 non-null  object
 12  Lanes_or_Medians             11931 non-null  object
 13  Road_allignment              12

In [70]:
# Parse the clock-time strings. The format carries no date, so every value is
# anchored on the same placeholder day (1900-01-01 in the preview below).
parsed_time = pd.to_datetime(df['Time'], format='%H:%M:%S')
df['Time'] = parsed_time

# Break the timestamp into hour / minute / second feature columns.
for new_column, component in (('h', 'hour'), ('m', 'minute'), ('s', 'second')):
    df[new_column] = getattr(parsed_time.dt, component)


In [71]:
# Flag each record as 'Night' when the accident hour is before 05:00 or from
# 19:00 onwards (h < 5 or h > 18); every other hour is 'Day'.
# Vectorised with np.where instead of appending to a list row by row; the
# resulting labels are the same.
is_night = (df['h'] < 5) | (df['h'] > 18)
df['Day'] = np.where(is_night, 'Night', 'Day')

df.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity,h,m,s,Day
0,1900-01-01 17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,na,,,Not a Pedestrian,Moving Backward,Slight Injury,17,2,0,Day
1,1900-01-01 17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,na,,,Not a Pedestrian,Overtaking,Slight Injury,17,2,0,Day
2,1900-01-01 17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury,17,2,0,Day
3,1900-01-01 01:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury,1,6,0,Night
4,1900-01-01 01:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,na,,,Not a Pedestrian,Overtaking,Slight Injury,1,6,0,Night


In [72]:
# Separate categorical and numerical columns
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include='number').columns

In [73]:
# Fill missing categorical with most frequent and numerical with median
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

imputer_num = SimpleImputer(strategy='median')
df[numerical_cols] = imputer_num.fit_transform(df[numerical_cols])


In [74]:
# Re-inspect non-null counts to confirm the imputation left no gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Time                         12316 non-null  datetime64[ns]
 1   Day_of_week                  12316 non-null  object        
 2   Age_band_of_driver           12316 non-null  object        
 3   Sex_of_driver                12316 non-null  object        
 4   Educational_level            12316 non-null  object        
 5   Vehicle_driver_relation      12316 non-null  object        
 6   Driving_experience           12316 non-null  object        
 7   Type_of_vehicle              12316 non-null  object        
 8   Owner_of_vehicle             12316 non-null  object        
 9   Service_year_of_vehicle      12316 non-null  object        
 10  Defect_of_vehicle            12316 non-null  object        
 11  Area_accident_occured        12316 non-nu

In [75]:
# List every column label, including the derived 'h', 'm', 's' and 'Day'.
df.columns

Index(['Time', 'Day_of_week', 'Age_band_of_driver', 'Sex_of_driver',
       'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
       'Type_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle',
       'Defect_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians',
       'Road_allignment', 'Types_of_Junction', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Type_of_collision', 'Number_of_vehicles_involved',
       'Number_of_casualties', 'Vehicle_movement', 'Casualty_class',
       'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity',
       'Work_of_casuality', 'Fitness_of_casuality', 'Pedestrian_movement',
       'Cause_of_accident', 'Accident_severity', 'h', 'm', 's', 'Day'],
      dtype='object')

In [76]:
# Materialise the labels as a plain list and make sure "Time" is not among
# them (it is a datetime column, not a category to encode).
categorical_cols = [label for label in categorical_cols if label != "Time"]


In [77]:
# Display the categorical column labels that the next cell label-encodes.
categorical_cols

['Day_of_week',
 'Age_band_of_driver',
 'Sex_of_driver',
 'Educational_level',
 'Vehicle_driver_relation',
 'Driving_experience',
 'Type_of_vehicle',
 'Owner_of_vehicle',
 'Service_year_of_vehicle',
 'Defect_of_vehicle',
 'Area_accident_occured',
 'Lanes_or_Medians',
 'Road_allignment',
 'Types_of_Junction',
 'Road_surface_type',
 'Road_surface_conditions',
 'Light_conditions',
 'Weather_conditions',
 'Type_of_collision',
 'Vehicle_movement',
 'Casualty_class',
 'Sex_of_casualty',
 'Age_band_of_casualty',
 'Casualty_severity',
 'Work_of_casuality',
 'Fitness_of_casuality',
 'Pedestrian_movement',
 'Cause_of_accident',
 'Accident_severity',
 'Day']

In [78]:
# Integer-encode every categorical column (the target 'Accident_severity'
# included) with its own LabelEncoder. LabelEncoder comes from the import cell
# at the top of the notebook, so it is not re-imported here.
# NOTE(review): this cell is not re-runnable. A second run would encode the
# integer codes themselves (as strings) and lose the original labels.
label_encoders = {}  # column name -> fitted encoder, kept for decoding later
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str so every value in the column is a string before fitting
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

    # Print the label -> integer code mapping; codes follow le.classes_ order
    mapping = {label: code for code, label in enumerate(le.classes_)}
    print(f"{col}: {mapping}")

Day_of_week: {'Friday': 0, 'Monday': 1, 'Saturday': 2, 'Sunday': 3, 'Thursday': 4, 'Tuesday': 5, 'Wednesday': 6}
Age_band_of_driver: {'18-30': 0, '31-50': 1, 'Over 51': 2, 'Under 18': 3, 'Unknown': 4}
Sex_of_driver: {'Female': 0, 'Male': 1, 'Unknown': 2}
Educational_level: {'Above high school': 0, 'Elementary school': 1, 'High school': 2, 'Illiterate': 3, 'Junior high school': 4, 'Unknown': 5, 'Writing & reading': 6}
Vehicle_driver_relation: {'Employee': 0, 'Other': 1, 'Owner': 2, 'Unknown': 3}
Driving_experience: {'1-2yr': 0, '2-5yr': 1, '5-10yr': 2, 'Above 10yr': 3, 'Below 1yr': 4, 'No Licence': 5, 'unknown': 6}
Type_of_vehicle: {'Automobile': 0, 'Bajaj': 1, 'Bicycle': 2, 'Long lorry': 3, 'Lorry (11?40Q)': 4, 'Lorry (41?100Q)': 5, 'Motorcycle': 6, 'Other': 7, 'Pick up upto 10Q': 8, 'Public (12 seats)': 9, 'Public (13?45 seats)': 10, 'Public (> 45 seats)': 11, 'Ridden horse': 12, 'Special vehicle': 13, 'Stationwagen': 14, 'Taxi': 15, 'Turbo': 16}
Owner_of_vehicle: {'Governmental': 0, 

In [79]:
# Separate the label-encoded target from the predictor columns.
target_name = 'Accident_severity'
y = df[target_name]
X = df.drop(columns=[target_name])


In [80]:
# encoder = LabelEncoder()
# for col in X.select_dtypes(include='object').columns:
#     X[col] = encoder.fit_transform(X[col])

# y = encoder.fit_transform(y)  # Also encode target


In [81]:
# Inspect predictor dtypes; 'Time' is still datetime64 here and is dropped in the next cell.
X.dtypes

Time                           datetime64[ns]
Day_of_week                             int32
Age_band_of_driver                      int32
Sex_of_driver                           int32
Educational_level                       int32
Vehicle_driver_relation                 int32
Driving_experience                      int32
Type_of_vehicle                         int32
Owner_of_vehicle                        int32
Service_year_of_vehicle                 int32
Defect_of_vehicle                       int32
Area_accident_occured                   int32
Lanes_or_Medians                        int32
Road_allignment                         int32
Types_of_Junction                       int32
Road_surface_type                       int32
Road_surface_conditions                 int32
Light_conditions                        int32
Weather_conditions                      int32
Type_of_collision                       int32
Number_of_vehicles_involved           float64
Number_of_casualties              

In [82]:
# Remove the raw datetime column; its hour/minute/second parts already exist
# as 'h', 'm' and 's'.
# errors='ignore' keeps the cell safe to re-run once 'Time' is already gone
# (a plain drop would raise KeyError on the second run).
X = X.drop(columns='Time', errors='ignore')

In [88]:
# NOTE(review): this cell was executed as In [88], i.e. after the
# feature-selection cells that follow it in the file. The 12-column output
# reflects that later state, not X at this position on a fresh top-to-bottom run.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age_band_of_driver           12316 non-null  int32  
 1   Driving_experience           12316 non-null  int32  
 2   Owner_of_vehicle             12316 non-null  int32  
 3   Area_accident_occured        12316 non-null  int32  
 4   Light_conditions             12316 non-null  int32  
 5   Type_of_collision            12316 non-null  int32  
 6   Number_of_vehicles_involved  12316 non-null  float64
 7   Number_of_casualties         12316 non-null  float64
 8   Age_band_of_casualty         12316 non-null  int32  
 9   h                            12316 non-null  float64
 10  m                            12316 non-null  float64
 11  Day                          12316 non-null  int32  
dtypes: float64(4), int32(8)
memory usage: 769.9 KB


In [85]:
# Score every predictor against the target with the ANOVA F-test (f_classif)
# and keep the 12 highest-scoring ones. SelectKBest and f_classif come from the
# import cell at the top of the notebook.
# NOTE(review): the selector is fitted on the full data set, before the
# train/test split, so test rows influence which features are chosen.
selector = SelectKBest(score_func=f_classif, k=12)
X_new = selector.fit_transform(X, y)

# Boolean mask over X.columns marking the retained features
mask = selector.get_support()

# Labels of the retained features, in their original column order
selected_features = X.columns[mask]
print("Selected features:\n", selected_features.tolist())


Selected features:
 ['Age_band_of_driver', 'Driving_experience', 'Owner_of_vehicle', 'Area_accident_occured', 'Light_conditions', 'Type_of_collision', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Age_band_of_casualty', 'h', 'm', 'Day']


In [87]:
# Rebuild X from df, keeping all rows and only the selected feature columns.
X = df.loc[:, selected_features]

In [89]:
# Confirm which columns X holds after the selection step.
X.columns


Index(['Age_band_of_driver', 'Driving_experience', 'Owner_of_vehicle',
       'Area_accident_occured', 'Light_conditions', 'Type_of_collision',
       'Number_of_vehicles_involved', 'Number_of_casualties',
       'Age_band_of_casualty', 'h', 'm', 'Day'],
      dtype='object')

In [90]:
# Hand-prune six of the statistically selected features.
# errors='ignore' keeps the cell safe to re-run once these columns are already
# gone (a plain drop would raise KeyError on the second run).
columns_to_discard = [
    'Driving_experience',
    'Owner_of_vehicle',
    'Area_accident_occured',
    'm',
    'Number_of_vehicles_involved',
    'Age_band_of_casualty',
]
X = X.drop(columns=columns_to_discard, errors='ignore')

In [91]:
# Append four hand-picked predictors from the encoded frame.
# 'Sex_of_driver' is exposed under the shorter name 'Gender'.
X = X.assign(
    Type_of_vehicle=df['Type_of_vehicle'],
    Gender=df['Sex_of_driver'],
    Road_surface_type=df['Road_surface_type'],
    Cause_of_accident=df['Cause_of_accident'],
)

In [93]:
# Append the encoded weather column as an additional predictor.
X = X.assign(Weather_conditions=df['Weather_conditions'])

In [95]:
# Append the encoded pedestrian-movement column as an additional predictor.
X = X.assign(Pedestrian_movement=df['Pedestrian_movement'])

In [96]:
# Final predictor set that is passed to train_test_split in the next cell.
X.columns

Index(['Age_band_of_driver', 'Light_conditions', 'Type_of_collision',
       'Number_of_casualties', 'h', 'Day', 'Type_of_vehicle', 'Gender',
       'Road_surface_type', 'Cause_of_accident', 'Weather_conditions',
       'Pedestrian_movement'],
      dtype='object')

In [97]:
# Hold out 20 % of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [98]:
# Oversample the minority classes of the TRAINING split only, so the test
# split keeps its real class distribution.
# The resampled pair is bound to X_train_res / y_train_res, the names the
# model-fitting cells consume.
# NOTE(review): most predictors are integer codes of categories. SMOTE
# interpolates numerically, so synthetic rows can receive in-between codes —
# consider SMOTENC; confirm against the imbalanced-learn docs.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [103]:
# Candidate classifiers, keyed by the display name used in the printed report.
# All use default hyper-parameters apart from the seeds and max_iter.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    # k-nearest-neighbours classifier (not k-means clustering)
    "KNN": KNeighborsClassifier(),
}

# Fit each model on the SMOTE-resampled training data and score it on the
# untouched test split.
# NOTE(review): no feature scaling is applied, which may matter for the
# distance- and margin-based models (SVM, KNN, logistic regression).
for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    print(f"----- {name} -----")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print()


----- Decision Tree -----
[[   6    3   22]
 [   8   88  253]
 [  39  358 1687]]
              precision    recall  f1-score   support

           0       0.11      0.19      0.14        31
           1       0.20      0.25      0.22       349
           2       0.86      0.81      0.83      2084

    accuracy                           0.72      2464
   macro avg       0.39      0.42      0.40      2464
weighted avg       0.76      0.72      0.74      2464

Accuracy: 0.7228084415584416

----- Random Forest -----
[[   2    3   26]
 [   2   61  286]
 [  19  206 1859]]
              precision    recall  f1-score   support

           0       0.09      0.06      0.07        31
           1       0.23      0.17      0.20       349
           2       0.86      0.89      0.87      2084

    accuracy                           0.78      2464
   macro avg       0.39      0.38      0.38      2464
weighted avg       0.76      0.78      0.77      2464

Accuracy: 0.7800324675324676

----- Logistic R

In [104]:
# Final model: a random forest of 100 trees with no depth limit.
# NOTE(review): class_weight='balanced' is set even though the model is fitted
# on the SMOTE-resampled training set in the next cell. The metrics printed
# below match the default Random Forest from the comparison loop.
rf_clf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=None,
    n_estimators=100,
    random_state=42,
)


In [105]:
# Train the forest on the SMOTE-resampled training data.
rf_clf.fit(X_train_res, y_train_res)

In [106]:
# Predict on the untouched (non-resampled) test split. This overwrites the
# y_pred left behind by the model-comparison loop.
y_pred = rf_clf.predict(X_test)


In [107]:
# Evaluate the forest on the held-out split: overall accuracy, per-class
# precision / recall / F1, and the confusion matrix.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", per_class_report)
print("\nConfusion Matrix:\n", confusion)


Accuracy: 0.7800324675324676

Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.06      0.07        31
           1       0.23      0.17      0.20       349
           2       0.86      0.89      0.87      2084

    accuracy                           0.78      2464
   macro avg       0.39      0.38      0.38      2464
weighted avg       0.76      0.78      0.77      2464


Confusion Matrix:
 [[   2    3   26]
 [   2   61  286]
 [  19  206 1859]]


In [108]:
# Serialise the fitted forest next to the notebook.
# NOTE(review): only the model is saved. The fitted label_encoders are not
# persisted, so a consumer of this file cannot reproduce the integer encoding
# from raw category strings.
with open('random_forest_model.pkl', mode='wb') as model_file:
    pickle.dump(rf_clf, model_file)


In [113]:
# Pull one encoded record whose target is class 0 (the rarest class in the
# test report), e.g. as a manual input for trying the saved model.
# The seed is fixed so the same row is shown on every run.
df[df['Accident_severity'] == 0].sample(n=1, random_state=42)

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity,h,m,s,Day
11292,1900-01-01 00:10:00,3,0,1,6,0,2,8,3,3,...,2,3,2,5,16,0,0.0,10.0,0.0,1
