**Import Necessary Libraries**

In [4]:
# Data manipulation
import pandas as pd
import numpy as np
import io

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Google Colab specific imports
from google.colab import files

# Data preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Model selection
from sklearn.model_selection import train_test_split

# Model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

**Upload and Load the Dataset**

In [5]:
# Prompt the user to upload the .xlsx dataset file
print("Please upload your .xlsx dataset file:")
uploaded = files.upload()

# Load the uploaded .xlsx file into a Pandas DataFrame
filename = next(iter(uploaded))
df = pd.read_excel(io.BytesIO(uploaded[filename]), engine='openpyxl')

# Display the first few rows to confirm successful loading
print("Dataset loaded successfully.")

Please upload your .xlsx dataset file:


Saving RTA Data 2020 to July 2023.xlsx to RTA Data 2020 to July 2023 (1).xlsx
Dataset loaded successfully.


**Initial Data Inspection**

In [6]:
def initial_data_inspection(df):
    # Basic Overview
    print("First 5 rows:")
    display(df.head())

    print("\nData Summary and Types:")
    df.info()  # Provides an overview of the data types and missing values

    # Summary statistics for numerical columns
    print("\nSummary Statistics for Numerical Columns:")
    numeric_columns = df.select_dtypes(include=['number'])
    display(numeric_columns.describe())

    # Summary statistics for categorical columns
    print("\nSummary Statistics for Categorical Columns:")
    categorical_columns = df.select_dtypes(include=['object', 'category'])
    display(categorical_columns.describe())

    # Check for missing values
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    print("\nMissing Values and Percentage:")
    missing_summary = pd.DataFrame({
        'Missing Values': missing_values,
        'Percentage': missing_percentage
    }).sort_values(by='Percentage', ascending=False)
    display(missing_summary)

    # Cardinality of categorical features (number of unique values)
    print("\nCardinality of Categorical Columns (Unique Values in Each Column):")
    unique_values = df.nunique()
    display(unique_values.sort_values(ascending=False))

    # Distribution and visualization of categorical features with few unique values
    for col in categorical_columns:
        if df[col].nunique() < 10:  # Only plot for columns with fewer unique values for better visualization
            print(f"\nValue Counts for {col}:")
            display(df[col].value_counts())

    # Check for duplicate rows
    num_duplicates = df.duplicated().sum()
    if num_duplicates > 0:
        print(f"\nNumber of Duplicate Rows in Dataset: {num_duplicates}")
    else:
        print("\nNo Duplicate Rows Found in Dataset.")

    # Inspect data types and convert columns if necessary
    print("\nColumns with Potential Data Type Issues:")
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                pd.to_numeric(df[col])
                print(f"Column '{col}' can potentially be converted to a numeric type.")
            except ValueError:
                continue

# Call the function for DataFrame
initial_data_inspection(df)

First 5 rows:


Unnamed: 0,EcYear,EcNumber,CallTime,EmergencyArea,TotalPatientsInEmergency,Gender,Age,HospitalName,Reason,responsetime,...,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,TractorInvovled,TrainsInvovled,TrucksInvolved,VansInvolved,OthersInvolved
0,2020,31486,2020-12-31 22:41:47,NEAR APS SCHOOL FORT ROAD RWP,1,Male,27.0,BBH,Bike Slip,10.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020,31485,2020-12-31 22:25:00,"Infront of Daig.com, Near Dha gate 2, gt road...",1,Male,20.0,,Car hit Footpath,12.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020,31483,2020-12-31 21:54:59,Muhammadi chowk arshad bakery khyaban e sirsye...,1,Male,48.0,BBH,Rickshaw hit with Car,10.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2020,31482,2020-12-31 21:24:22,"Gulzar e quaid, T/W Katcheri Near Attock Pump,...",1,Male,45.0,,Car hit Car and runaway,5.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020,31479,2020-12-31 21:03:49,Taaj Company Gawalmandi Chowk Liaqat Baag Road...,1,Male,22.0,,Unknown Bike hit Bike and runaway,5.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Data Summary and Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46189 entries, 0 to 46188
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   EcYear                    46189 non-null  object        
 1   EcNumber                  38978 non-null  object        
 2   CallTime                  40233 non-null  datetime64[ns]
 3   EmergencyArea             46188 non-null  object        
 4   TotalPatientsInEmergency  46189 non-null  object        
 5   Gender                    46188 non-null  object        
 6   Age                       46188 non-null  float64       
 7   HospitalName              24239 non-null  object        
 8   Reason                    46188 non-null  object        
 9   responsetime              46184 non-null  float64       
 10  EducationTitle            46188 non-null  object        
 11  InjuryType                46187 non-null  object       

Unnamed: 0,Age,responsetime,BicycleInvovled,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,TractorInvovled,TrainsInvovled,TrucksInvolved,VansInvolved,OthersInvolved
count,46188.0,46184.0,46187.0,46187.0,46187.0,46187.0,46187.0,46187.0,46187.0,46187.0,46187.0,46187.0,46187.0
mean,32.447129,6.256106,0.008725,0.967242,0.009202,0.234049,0.00197,0.087297,0.00983,0.002208,0.044948,0.117392,0.009656
std,15.176229,4.901342,0.093467,0.578776,0.100349,0.448054,0.044344,0.289319,0.098657,0.046942,0.220947,0.334686,0.098893
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,42.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,118.0,101.0,2.0,10.0,2.0,5.0,1.0,2.0,1.0,1.0,3.0,4.0,2.0



Summary Statistics for Categorical Columns:


Unnamed: 0,EcYear,EcNumber,EmergencyArea,TotalPatientsInEmergency,Gender,HospitalName,Reason,EducationTitle,InjuryType,Cause,PatientStatus
count,46189,38978,46188,46189,46188,24239,46188,46188,46187,46187,46187
unique,5,34800,37520,13,4,162,23940,9,5,7,3
top,2022,35,Railway Track Underpass Nazar Chowk Near Kach...,1,Male,BBH,same,Matric,Minor,Over Speed,Alive & unstable
freq,12902,3,42,36151,39794,6953,2328,15697,34788,29586,23788



Missing Values and Percentage:


Unnamed: 0,Missing Values,Percentage
HospitalName,21950,47.522137
EcNumber,7211,15.611942
CallTime,5956,12.894845
responsetime,5,0.010825
BicycleInvovled,2,0.00433
PatientStatus,2,0.00433
Cause,2,0.00433
InjuryType,2,0.00433
BikesInvolved,2,0.00433
TrainsInvovled,2,0.00433



Cardinality of Categorical Columns (Unique Values in Each Column):


Unnamed: 0,0
CallTime,40170
EmergencyArea,37520
EcNumber,34800
Reason,23940
HospitalName,162
Age,100
responsetime,68
TotalPatientsInEmergency,13
EducationTitle,9
Cause,7



Value Counts for EcYear:


Unnamed: 0_level_0,count
EcYear,Unnamed: 1_level_1
2022,12902
2021,12677
2020,11382
2023,9227
Hospital,1



Value Counts for Gender:


Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,39794
Female,6392
Other,1
0,1



Value Counts for EducationTitle:


Unnamed: 0_level_0,count
EducationTitle,Unnamed: 1_level_1
Matric,15697
Primary,11717
Illetrate,10845
Intermediate,5675
Graduation,1686
Masters,439
Middle,119
PHD,9
0,1



Value Counts for InjuryType:


Unnamed: 0_level_0,count
InjuryType,Unnamed: 1_level_1
Minor,34788
Single Fracture,6737
Head Injury,3503
Multiple Fractures,778
Spinal Injury,381



Value Counts for Cause:


Unnamed: 0_level_0,count
Cause,Unnamed: 1_level_1
Over Speed,29586
Carelessness,14177
U Turn,934
Wrong Turn,841
Tyre Burst,481
Others,104
One Wheeling,64



Value Counts for PatientStatus:


Unnamed: 0_level_0,count
PatientStatus,Unnamed: 1_level_1
Alive & unstable,23788
Alive & stable,21812
Dead,587



Number of Duplicate Rows in Dataset: 8

Columns with Potential Data Type Issues:


**Split the Dataset into Train, Validation, and Test Sets**

In [7]:
# Split the dataset into 60% training and 40% temporary data
train_set, temp_set = train_test_split(df, test_size=0.4, random_state=42)

# Split the temporary set into 20% validation and 20% test
validation_set, test_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Confirm the sizes of each set
print(f"Training Set Size: {len(train_set)}")
print(f"Validation Set Size: {len(validation_set)}")
print(f"Test Set Size: {len(test_set)}")

Training Set Size: 27713
Validation Set Size: 9238
Test Set Size: 9238


**Initial Data Preprocessing**

**Handle Missing Values**

In [8]:
# Identify numerical and categorical columns
numeric_cols = train_set.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train_set.select_dtypes(include=['object', 'category']).columns.tolist()

# Identify columns not included in imputation
all_cols = train_set.columns.tolist()
imputed_cols = numeric_cols + categorical_cols
remaining_cols = list(set(all_cols) - set(imputed_cols))
print("\nColumns not included in imputation:", remaining_cols)

# Handle datetime columns separately if any
datetime_cols = [col for col in remaining_cols if np.issubdtype(train_set[col].dtype, np.datetime64)]
other_cols = list(set(remaining_cols) - set(datetime_cols))

# Impute datetime columns without using inplace=True
for col in datetime_cols:
    mode_datetime = train_set[col].mode()[0]
    train_set[col] = train_set[col].fillna(mode_datetime)
    validation_set[col] = validation_set[col].fillna(mode_datetime)
    test_set[col] = test_set[col].fillna(mode_datetime)

# Add other remaining columns to categorical_cols
categorical_cols.extend(other_cols)

# Initialize imputers
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Fit and transform the training set
train_set[numeric_cols] = numeric_imputer.fit_transform(train_set[numeric_cols])
train_set[categorical_cols] = categorical_imputer.fit_transform(train_set[categorical_cols])

# Apply transformations to validation and test sets
validation_set[numeric_cols] = numeric_imputer.transform(validation_set[numeric_cols])
validation_set[categorical_cols] = categorical_imputer.transform(validation_set[categorical_cols])
test_set[numeric_cols] = numeric_imputer.transform(test_set[numeric_cols])
test_set[categorical_cols] = categorical_imputer.transform(test_set[categorical_cols])

# Verify missing values
print("\nTotal missing values after imputation:")
print("Training Set:", train_set.isnull().sum().sum())
print("Validation Set:", validation_set.isnull().sum().sum())
print("Test Set:", test_set.isnull().sum().sum())


Columns not included in imputation: ['CallTime']

Total missing values after imputation:
Training Set: 0
Validation Set: 0
Test Set: 0


**Handle Outliers**

In [9]:
# Define a function to handle outliers using the IQR method
def handle_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Clip the outliers to the lower and upper bounds
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Apply to numerical columns in each set
handle_outliers(train_set, numeric_cols)
handle_outliers(validation_set, numeric_cols)
handle_outliers(test_set, numeric_cols)

**Feature Engineering**

In [10]:
# Extracting features from 'CallTime' column if it exists in the dataset
if 'CallTime' in train_set.columns:
    for dataset in [train_set, validation_set, test_set]:
        dataset['Year'] = pd.to_datetime(dataset['CallTime']).dt.year
        dataset['Month'] = pd.to_datetime(dataset['CallTime']).dt.month
        dataset['Day'] = pd.to_datetime(dataset['CallTime']).dt.day
        dataset['Hour'] = pd.to_datetime(dataset['CallTime']).dt.hour
    # Drop the original 'CallTime' column to prevent redundancy
    train_set.drop(columns=['CallTime'], inplace=True)
    validation_set.drop(columns=['CallTime'], inplace=True)
    test_set.drop(columns=['CallTime'], inplace=True)

**Encode Categorical Variables, Excluding Target Variables**

In [11]:
# Exclude target variables from categorical columns
categorical_cols = [col for col in categorical_cols if col not in ['InjuryType', 'PatientStatus']]

# Convert all categorical columns to strings to avoid mixed data type issues
for col in categorical_cols:
    train_set[col] = train_set[col].astype(str)
    validation_set[col] = validation_set[col].astype(str)
    test_set[col] = test_set[col].astype(str)

# One-hot encoding of categorical columns
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Fit and transform training data, then apply to validation and test sets
train_encoded = pd.DataFrame(onehot_encoder.fit_transform(train_set[categorical_cols]))
validation_encoded = pd.DataFrame(onehot_encoder.transform(validation_set[categorical_cols]))
test_encoded = pd.DataFrame(onehot_encoder.transform(test_set[categorical_cols]))

# Assign column names to encoded features
train_encoded.columns = onehot_encoder.get_feature_names_out(categorical_cols)
validation_encoded.columns = onehot_encoder.get_feature_names_out(categorical_cols)
test_encoded.columns = onehot_encoder.get_feature_names_out(categorical_cols)

# Reset the index to prepare for concatenation
train_encoded.reset_index(drop=True, inplace=True)
validation_encoded.reset_index(drop=True, inplace=True)
test_encoded.reset_index(drop=True, inplace=True)

# Drop the original categorical columns (excluding target variables) and concatenate the new encoded features
train_set = pd.concat([train_set.drop(columns=categorical_cols).reset_index(drop=True), train_encoded], axis=1)
validation_set = pd.concat([validation_set.drop(columns=categorical_cols).reset_index(drop=True), validation_encoded], axis=1)
test_set = pd.concat([test_set.drop(columns=categorical_cols).reset_index(drop=True), test_encoded], axis=1)

**Feature Scaling**

In [12]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training set only
train_set[numeric_cols] = scaler.fit_transform(train_set[numeric_cols])

# Apply the scaler to the validation and test sets
validation_set[numeric_cols] = scaler.transform(validation_set[numeric_cols])
test_set[numeric_cols] = scaler.transform(test_set[numeric_cols])

**Final Data Inspection**

In [13]:
def final_data_inspection(df, name):
    print(f"{name} Inspection:")

    # Total Missing Values
    total_missing = df.isnull().sum().sum()
    print(f"\nTotal Missing Values in {name}: {total_missing}")
    if total_missing > 0:
        # Identify columns with missing values
        missing_values = df.isnull().sum()
        missing_cols = missing_values[missing_values > 0].index.tolist()
        print(f"Columns with missing values in {name}: {missing_cols}")
        # Display missing values per column
        missing_percentage = (missing_values[missing_values > 0] / len(df)) * 100
        missing_summary = pd.DataFrame({
            'Missing Values': missing_values[missing_values > 0],
            'Percentage': missing_percentage
        }).sort_values(by='Percentage', ascending=False)
        display(missing_summary)
    else:
        print(f"No Missing Values Found in {name}.")

    # Display the first few rows to confirm changes
    print(f"\nFirst 5 rows of {name}:")
    display(df.head())

    # Data Summary and Types
    print("\nData Summary and Types:")
    df.info()

    # Summary statistics for numerical columns
    numeric_columns = df.select_dtypes(include=['number'])
    if not numeric_columns.empty:
        print("\nSummary Statistics for Numerical Columns:")
        display(numeric_columns.describe())
    else:
        print("\nNo Numerical Columns in the Dataset.")

    # Summary statistics for categorical columns
    categorical_columns = df.select_dtypes(include=['object', 'category'])
    if not categorical_columns.empty:
        print("\nSummary Statistics for Categorical Columns:")
        display(categorical_columns.describe())

    # Cardinality of columns (number of unique values)
    print("\nCardinality of Columns (Unique Values in Each Column):")
    unique_values = df.nunique()
    display(unique_values.sort_values(ascending=False))

    # Check for duplicate rows
    num_duplicates = df.duplicated().sum()
    if num_duplicates > 0:
        print(f"\nNumber of Duplicate Rows in {name}: {num_duplicates}")
    else:
        print(f"\nNo Duplicate Rows Found in {name}.")

    # Verify columns with potential data type issues
    print("\nColumns with Potential Data Type Issues:")
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                pd.to_numeric(df[col])
                print(f"Column '{col}' in {name} can potentially be converted to a numeric type.")
            except ValueError:
                pass  # Cannot convert to numeric type

    # Value counts for categorical columns with few unique values
    if not categorical_columns.empty:
        for col in categorical_columns:
            if df[col].nunique() < 10:
                print(f"\nValue Counts for '{col}' in {name}:")
                display(df[col].value_counts())

# Call the function for each dataset
final_data_inspection(train_set, 'Training Set')
final_data_inspection(validation_set, 'Validation Set')
final_data_inspection(test_set, 'Test Set')

Training Set Inspection:

Total Missing Values in Training Set: 0
No Missing Values Found in Training Set.

First 5 rows of Training Set:


Unnamed: 0,Age,responsetime,InjuryType,PatientStatus,BicycleInvovled,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,...,EducationTitle_Matric,EducationTitle_Middle,EducationTitle_PHD,EducationTitle_Primary,Cause_One Wheeling,Cause_Others,Cause_Over Speed,Cause_Tyre Burst,Cause_U Turn,Cause_Wrong Turn
0,1.174777,2.523807,Minor,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.962896,0.273245,Minor,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.029698,0.802789,Minor,Alive & stable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.562082,2.523807,Minor,Alive & stable,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.508788,-0.785842,Multiple Fractures,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0



Data Summary and Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27713 entries, 0 to 27712
Columns: 60709 entries, Age to Cause_Wrong Turn
dtypes: float64(60703), int32(4), object(2)
memory usage: 12.5+ GB

Summary Statistics for Numerical Columns:


Unnamed: 0,Age,responsetime,BicycleInvovled,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,TractorInvovled,TrainsInvovled,...,EducationTitle_Matric,EducationTitle_Middle,EducationTitle_PHD,EducationTitle_Primary,Cause_One Wheeling,Cause_Others,Cause_Over Speed,Cause_Tyre Burst,Cause_U Turn,Cause_Wrong Turn
count,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,...,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0,27713.0
mean,-2.615212e-17,-5.4098990000000003e-17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.341717,0.002815,0.000253,0.255151,0.001191,0.002454,0.63988,0.010573,0.020388,0.018222
std,1.000018,1.000018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.474294,0.052979,0.015891,0.435954,0.034488,0.049475,0.480044,0.10228,0.141324,0.133758
min,-2.165336,-1.315386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.762489,-0.7858424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.2280709,-0.2562984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.6403585,0.5380174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
max,2.74463,2.523807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0



Summary Statistics for Categorical Columns:


Unnamed: 0,InjuryType,PatientStatus
count,27713,27713
unique,5,3
top,Minor,Alive & unstable
freq,20894,14275



Cardinality of Columns (Unique Values in Each Column):


Unnamed: 0,0
Age,75
Day,31
Hour,24
responsetime,17
Month,12
...,...
TrainsInvovled,1
TrucksInvolved,1
VansInvolved,1
OthersInvolved,1



Number of Duplicate Rows in Training Set: 4

Columns with Potential Data Type Issues:

Value Counts for 'InjuryType' in Training Set:


Unnamed: 0_level_0,count
InjuryType,Unnamed: 1_level_1
Minor,20894
Single Fracture,4062
Head Injury,2085
Multiple Fractures,449
Spinal Injury,223



Value Counts for 'PatientStatus' in Training Set:


Unnamed: 0_level_0,count
PatientStatus,Unnamed: 1_level_1
Alive & unstable,14275
Alive & stable,13100
Dead,338


Validation Set Inspection:

Total Missing Values in Validation Set: 0
No Missing Values Found in Validation Set.

First 5 rows of Validation Set:


Unnamed: 0,Age,responsetime,InjuryType,PatientStatus,BicycleInvovled,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,...,EducationTitle_Matric,EducationTitle_Middle,EducationTitle_PHD,EducationTitle_Primary,Cause_One Wheeling,Cause_Others,Cause_Over Speed,Cause_Tyre Burst,Cause_U Turn,Cause_Wrong Turn
0,1.57559,-0.785842,Minor,Alive & stable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-1.029698,1.067561,Minor,Alive & stable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.842799,-0.52107,Minor,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.428478,-0.256298,Minor,Alive & stable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.174777,-0.256298,Minor,Alive & stable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0



Data Summary and Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9238 entries, 0 to 9237
Columns: 60709 entries, Age to Cause_Wrong Turn
dtypes: float64(60703), int32(4), object(2)
memory usage: 4.2+ GB

Summary Statistics for Numerical Columns:


Unnamed: 0,Age,responsetime,BicycleInvovled,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,TractorInvovled,TrainsInvovled,...,EducationTitle_Matric,EducationTitle_Middle,EducationTitle_PHD,EducationTitle_Primary,Cause_One Wheeling,Cause_Others,Cause_Over Speed,Cause_Tyre Burst,Cause_U Turn,Cause_Wrong Turn
count,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,...,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0
mean,0.016114,-0.005914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.331565,0.001732,0.000108,0.249946,0.00184,0.002381,0.639641,0.0105,0.020892,0.019268
std,1.012464,1.000729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.470801,0.041583,0.010404,0.433005,0.042861,0.048745,0.480131,0.101936,0.14303,0.137474
min,-2.098534,-1.315386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.762489,-0.785842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.228071,-0.256298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.640358,0.538017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,2.74463,2.523807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0



Summary Statistics for Categorical Columns:


Unnamed: 0,InjuryType,PatientStatus
count,9238,9238
unique,5,3
top,Minor,Alive & unstable
freq,6962,4768



Cardinality of Columns (Unique Values in Each Column):


Unnamed: 0,0
Age,74
Day,31
Hour,24
responsetime,16
Month,12
...,...
"EmergencyArea_ Chur Chowk, Peshawar Road, Rawalpindi",1
"EmergencyArea_ Chur bazar Peshawar Road, Rawalpindi, Punjab",1
"EmergencyArea_ City Bakers Kallar Chowk, Kahuta",1
"EmergencyArea_ City Center, Bank Road, Saddar Rwp.",1



Number of Duplicate Rows in Validation Set: 1

Columns with Potential Data Type Issues:

Value Counts for 'InjuryType' in Validation Set:


Unnamed: 0_level_0,count
InjuryType,Unnamed: 1_level_1
Minor,6962
Single Fracture,1317
Head Injury,712
Multiple Fractures,165
Spinal Injury,82



Value Counts for 'PatientStatus' in Validation Set:


Unnamed: 0_level_0,count
PatientStatus,Unnamed: 1_level_1
Alive & unstable,4768
Alive & stable,4334
Dead,136


Test Set Inspection:

Total Missing Values in Test Set: 0
No Missing Values Found in Test Set.

First 5 rows of Test Set:


Unnamed: 0,Age,responsetime,InjuryType,PatientStatus,BicycleInvovled,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,...,EducationTitle_Matric,EducationTitle_Middle,EducationTitle_PHD,EducationTitle_Primary,Cause_One Wheeling,Cause_Others,Cause_Over Speed,Cause_Tyre Burst,Cause_U Turn,Cause_Wrong Turn
0,0.172743,-0.256298,Minor,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.373149,-1.050614,Minor,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.174777,-0.785842,Spinal Injury,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.310415,-0.256298,Minor,Alive & unstable,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.640358,-0.52107,Minor,Alive & stable,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0



Data Summary and Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9238 entries, 0 to 9237
Columns: 60709 entries, Age to Cause_Wrong Turn
dtypes: float64(60703), int32(4), object(2)
memory usage: 4.2+ GB

Summary Statistics for Numerical Columns:


Unnamed: 0,Age,responsetime,BicycleInvovled,BikesInvolved,BusesInvolved,CarsInvolved,CartInvovled,RickshawsInvolved,TractorInvovled,TrainsInvovled,...,EducationTitle_Matric,EducationTitle_Middle,EducationTitle_PHD,EducationTitle_Primary,Cause_One Wheeling,Cause_Others,Cause_Over Speed,Cause_Tyre Burst,Cause_U Turn,Cause_Wrong Turn
count,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,...,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0,9238.0
mean,-0.026023,0.019764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.342607,0.002706,0.000108,0.252977,0.001515,0.001515,0.643646,0.009851,0.019052,0.017103
std,0.992857,1.006249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.474607,0.051954,0.010404,0.434741,0.038902,0.038902,0.478948,0.098766,0.136714,0.129663
min,-2.165336,-1.315386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.829291,-0.785842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.294873,-0.256298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.506754,0.538017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
max,2.510822,2.523807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0



Summary Statistics for Categorical Columns:


Unnamed: 0,InjuryType,PatientStatus
count,9238,9238
unique,5,3
top,Minor,Alive & unstable
freq,6934,4747



Cardinality of Columns (Unique Values in Each Column):


Unnamed: 0,0
Age,71
Day,31
Hour,24
responsetime,17
Month,12
...,...
"EmergencyArea_ Chour Chowk Stop, Peshawar Road, Rwp",1
"EmergencyArea_ Chour Chowk Stop, Peshawar Road, towards saddar Rwp",1
"EmergencyArea_ Chour Chowk Stop, Peshawer Road, Rawalpindi",1
"EmergencyArea_ Chour Chowk Stop, Peshwer Road Rawalpindi",1



Number of Duplicate Rows in Test Set: 1

Columns with Potential Data Type Issues:

Value Counts for 'InjuryType' in Test Set:


Unnamed: 0_level_0,count
InjuryType,Unnamed: 1_level_1
Minor,6934
Single Fracture,1358
Head Injury,706
Multiple Fractures,164
Spinal Injury,76



Value Counts for 'PatientStatus' in Test Set:


Unnamed: 0_level_0,count
PatientStatus,Unnamed: 1_level_1
Alive & unstable,4747
Alive & stable,4378
Dead,113


**Model Training and Evaluation**

**Encode Target Variables**

In [14]:
# Create copies to avoid modifying the original data
train_set_encoded = train_set.copy()
validation_set_encoded = validation_set.copy()
test_set_encoded = test_set.copy()

# Initialize LabelEncoders
label_encoder_injury = LabelEncoder()
label_encoder_status = LabelEncoder()

# Encode 'InjuryType'
train_set_encoded['InjuryType'] = label_encoder_injury.fit_transform(train_set_encoded['InjuryType'])
validation_set_encoded['InjuryType'] = label_encoder_injury.transform(validation_set_encoded['InjuryType'])
test_set_encoded['InjuryType'] = label_encoder_injury.transform(test_set_encoded['InjuryType'])

# Encode 'PatientStatus'
train_set_encoded['PatientStatus'] = label_encoder_status.fit_transform(train_set_encoded['PatientStatus'])
validation_set_encoded['PatientStatus'] = label_encoder_status.transform(validation_set_encoded['PatientStatus'])
test_set_encoded['PatientStatus'] = label_encoder_status.transform(test_set_encoded['PatientStatus'])

# View the mapping
print("InjuryType Classes:", label_encoder_injury.classes_)
print("PatientStatus Classes:", label_encoder_status.classes_)

InjuryType Classes: ['Head Injury' 'Minor' 'Multiple Fractures' 'Single Fracture'
 'Spinal Injury']
PatientStatus Classes: ['Alive & stable' 'Alive & unstable' 'Dead']


**Separate Features and Target Variables**

In [15]:
# Define target columns
target_cols = ['InjuryType', 'PatientStatus']

# For InjuryType model:
# Features (X) and target (y)
X_train_injury = train_set_encoded.drop(columns=target_cols)
y_train_injury = train_set_encoded['InjuryType']

X_validation_injury = validation_set_encoded.drop(columns=target_cols)
y_validation_injury = validation_set_encoded['InjuryType']

X_test_injury = test_set_encoded.drop(columns=target_cols)
y_test_injury = test_set_encoded['InjuryType']

# For PatientStatus model:
# Features (X) and target (y)
X_train_status = train_set_encoded.drop(columns=target_cols)
y_train_status = train_set_encoded['PatientStatus']

X_validation_status = validation_set_encoded.drop(columns=target_cols)
y_validation_status = validation_set_encoded['PatientStatus']

X_test_status = test_set_encoded.drop(columns=target_cols)
y_test_status = test_set_encoded['PatientStatus']

**Train Logistic Regression Models on the Training Set**

**Train and Evaluate Models for InjuryType**

In [None]:
# Define solvers and max_iter values
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
max_iters = [50, 100, 150, 200, 250, 300]

# Initialize a list to store results
results_injury = []

# Loop over solvers and max_iter values
for solver in solvers:
    for max_iter in max_iters:
        try:
            # Initialize the model
            model_injury = LogisticRegression(solver=solver, max_iter=max_iter, random_state=42)

            # Fit the model
            model_injury.fit(X_train_injury, y_train_injury)

            # Predict on the test set
            y_pred_injury = model_injury.predict(X_test_injury)

            # Evaluate the model
            accuracy = accuracy_score(y_test_injury, y_pred_injury)
            precision = precision_score(y_test_injury, y_pred_injury, average='weighted', zero_division=0)
            recall = recall_score(y_test_injury, y_pred_injury, average='weighted', zero_division=0)
            f1 = f1_score(y_test_injury, y_pred_injury, average='weighted', zero_division=0)
            cm = confusion_matrix(y_test_injury, y_pred_injury)

            # Store the results
            results_injury.append({
                'Solver': solver,
                'Max_iter': max_iter,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'Confusion Matrix': cm
            })

            print(f"InjuryType - Solver: {solver}, Max_iter: {max_iter}, Accuracy: {accuracy:.4f}")

        except Exception as e:
            print(f"InjuryType - Solver: {solver}, Max_iter: {max_iter} encountered an error: {e}")

InjuryType - Solver: lbfgs, Max_iter: 50, Accuracy: 0.7506
InjuryType - Solver: lbfgs, Max_iter: 100, Accuracy: 0.7503
InjuryType - Solver: lbfgs, Max_iter: 150, Accuracy: 0.7486
InjuryType - Solver: lbfgs, Max_iter: 200, Accuracy: 0.7498
InjuryType - Solver: lbfgs, Max_iter: 250, Accuracy: 0.7488
InjuryType - Solver: lbfgs, Max_iter: 300, Accuracy: 0.7491
InjuryType - Solver: liblinear, Max_iter: 50, Accuracy: 0.7512
InjuryType - Solver: liblinear, Max_iter: 100, Accuracy: 0.7512
InjuryType - Solver: liblinear, Max_iter: 150, Accuracy: 0.7512
InjuryType - Solver: liblinear, Max_iter: 200, Accuracy: 0.7512
InjuryType - Solver: liblinear, Max_iter: 250, Accuracy: 0.7512
InjuryType - Solver: liblinear, Max_iter: 300, Accuracy: 0.7512
InjuryType - Solver: newton-cg, Max_iter: 50, Accuracy: 0.7477
InjuryType - Solver: newton-cg, Max_iter: 100, Accuracy: 0.7476
InjuryType - Solver: newton-cg, Max_iter: 150, Accuracy: 0.7476
InjuryType - Solver: newton-cg, Max_iter: 200, Accuracy: 0.7476


**Train and Evaluate Models for PatientStatus**

In [None]:
# Define solvers and max_iter values (if not already defined)
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
max_iters = [50, 100, 150, 200, 250, 300]

# Initialize a list to store results for PatientStatus
results_status = []

# Loop over solvers and max_iter values
for solver in solvers:
    for max_iter in max_iters:
        try:
            # Initialize the model
            model_status = LogisticRegression(solver=solver, max_iter=max_iter, random_state=42)

            # Fit the model
            model_status.fit(X_train_status, y_train_status)

            # Predict on the test set
            y_pred_status = model_status.predict(X_test_status)

            # Evaluate the model
            accuracy = accuracy_score(y_test_status, y_pred_status)
            precision = precision_score(y_test_status, y_pred_status, average='weighted', zero_division=0)
            recall = recall_score(y_test_status, y_pred_status, average='weighted', zero_division=0)
            f1 = f1_score(y_test_status, y_pred_status, average='weighted', zero_division=0)
            cm = confusion_matrix(y_test_status, y_pred_status)

            # Store the results
            results_status.append({
                'Solver': solver,
                'Max_iter': max_iter,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'Confusion Matrix': cm
            })

            print(f"PatientStatus - Solver: {solver}, Max_iter: {max_iter}, Accuracy: {accuracy:.4f}")

        except Exception as e:
            print(f"PatientStatus - Solver: {solver}, Max_iter: {max_iter} encountered an error: {e}")

**Evaluate the Performance on the Test Set Using the Specified Metrics**

**For InjuryType Model**

In [None]:
# Convert the results list to a DataFrame for InjuryType
df_results_injury = pd.DataFrame(results_injury)

# Display evaluation metrics for InjuryType
print("Evaluation Metrics for InjuryType:")
for result in results_injury:
    print(f"Solver: {result['Solver']}, Max_iter: {result['Max_iter']}")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print(f"Precision: {result['Precision']:.4f}")
    print(f"Recall: {result['Recall']:.4f}")
    print(f"F1 Score: {result['F1 Score']:.4f}")
    print(f"Confusion Matrix:\n{result['Confusion Matrix']}\n")

**For PatientStatus Model**

In [None]:
# Convert the results list to a DataFrame for PatientStatus
df_results_status = pd.DataFrame(results_status)

# Display evaluation metrics for PatientStatus
print("Evaluation Metrics for PatientStatus:")
for result in results_status:
    print(f"Solver: {result['Solver']}, Max_iter: {result['Max_iter']}")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print(f"Precision: {result['Precision']:.4f}")
    print(f"Recall: {result['Recall']:.4f}")
    print(f"F1 Score: {result['F1 Score']:.4f}")
    print(f"Confusion Matrix:\n{result['Confusion Matrix']}\n")

**Plot the Learning Curves**

**Accuracy vs Solver for InjuryType**

In [None]:
# Plot Accuracy vs Solver for InjuryType
plt.figure(figsize=(10, 6))
sns.barplot(x='Solver', y='Accuracy', data=df_results_injury.groupby('Solver')['Accuracy'].mean().reset_index())
plt.title('Accuracy vs Solver for InjuryType')
plt.ylabel('Accuracy')
plt.xlabel('Solver')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

**Accuracy vs Max_iter for InjuryType**

In [None]:
# Plot Accuracy vs Max_iter for InjuryType
plt.figure(figsize=(10, 6))
sns.lineplot(x='Max_iter', y='Accuracy', hue='Solver', data=df_results_injury)
plt.title('Accuracy vs Max_iter for InjuryType')
plt.ylabel('Accuracy')
plt.xlabel('Max_iter')
plt.legend(title='Solver', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

**Accuracy vs Solver for PatientStatus**

In [None]:
# Plot Accuracy vs Solver for PatientStatus
plt.figure(figsize=(10, 6))
sns.barplot(x='Solver', y='Accuracy', data=df_results_status.groupby('Solver')['Accuracy'].mean().reset_index())
plt.title('Accuracy vs Solver for PatientStatus')
plt.ylabel('Accuracy')
plt.xlabel('Solver')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

**Accuracy vs Max_iter for PatientStatus**

In [None]:
# Plot Accuracy vs Max_iter for PatientStatus
plt.figure(figsize=(10, 6))
sns.lineplot(x='Max_iter', y='Accuracy', hue='Solver', data=df_results_status)
plt.title('Accuracy vs Max_iter for PatientStatus')
plt.ylabel('Accuracy')
plt.xlabel('Max_iter')
plt.legend(title='Solver', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()