# Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE

# Loading Datasets

In [2]:
train_data = pd.read_csv('/content/train_data_v3.csv')
test_data = pd.read_csv('/content/test_data_v3.csv')
validate_data = pd.read_csv('/content/validate_data_v3.csv')

train_data.info()
test_data.info()
validate_data.info()

train_data.head()
test_data.head()
validate_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 42 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   ID                                                8000 non-null   object 
 1   End_Time                                          8000 non-null   int64  
 2   Distance(mi)                                      8000 non-null   float64
 3   Description                                       8000 non-null   object 
 4   Zipcode                                           8000 non-null   object 
 5   Weather_Timestamp                                 8000 non-null   object 
 6   Temperature(F)                                    8000 non-null   float64
 7   Humidity(%)                                       8000 non-null   float64
 8   Pressure(in)                                      8000 non-null   float64
 9   Visibility(mi)     

Unnamed: 0,ID,End_Time,Distance(mi),Description,Zipcode,Weather_Timestamp,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),...,City_Los Angeles,City_Miami,City_Orlando,City_Others,Weather_Condition_Encoded,Weather_Condition_Fog / Low Visibility,Weather_Condition_Others,Weather_Condition_Snow,Weather_Condition_Thunderstorms / Severe Weather,Severity
0,A-6440727,1376,0.013,Incident on E HOLT BLVD near CORONA AVE Drive ...,91761-2110,2021-01-29 20:53:00,46.0,83.0,29.02,10.0,...,0,0,0,1,0,0,0,0,0,2
1,A-3749509,1152,0.086,Incident on BISCAYNE BAY DR near HOUSE 12700 D...,33181,2022-12-05 16:53:00,79.0,62.0,30.05,10.0,...,0,1,0,0,0,0,0,0,0,2
2,A-7118981,928,0.7,At MD-5/Branch Ave/Exit 7 - Accident.,20748,2020-03-17 14:56:00,60.0,55.0,29.8,10.0,...,0,0,0,1,0,0,0,0,0,3
3,A-2308799,327,0.0,Accident on I-94 Westbound at Exit 208 Schaefe...,48120,2019-02-27 04:53:00,18.0,84.0,31.08,1.2,...,0,0,0,1,4,0,0,1,0,3
4,A-5403565,656,0.386,Crash blocking the left lane on SR-260 Westbou...,85541,2022-07-30 09:15:00,72.0,69.0,25.02,10.0,...,0,0,0,1,0,0,0,0,0,2


# Data Cleaning

In [3]:
#Drop useless columns
drop_columns = ['ID', 'Description', 'Zipcode', 'Weather_Timestamp', 'Wind_Direction']
train_data_clean = train_data.drop(columns=drop_columns)
test_data_clean = test_data.drop(columns=drop_columns)
validate_data_clean = validate_data.drop(columns=drop_columns)

# Building Model

## Separate Target and Features

In [4]:
X_train = train_data_clean.drop(columns=['Severity'])
y_train = train_data_clean['Severity']
X_test = test_data_clean.drop(columns=['Severity'])
y_test = test_data_clean['Severity']
X_validate = validate_data_clean.drop(columns=['Severity'])
y_validate = validate_data_clean['Severity']

## Scale Data

In [5]:
# Scale the data before feature selection and model training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

##SVM Base Model (kernel=rbf)

In [6]:
svm_pipeline = Pipeline([
    ('svm', SVC(kernel='rbf', class_weight='balanced'))
])

# Define hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 5],  # Regularization parameter
    'svm__gamma': ['scale', 'auto', 0.1, 0.01]  # Kernel coefficient
}

In [7]:
# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

In [8]:
# Get the best model from Grid Search
best_svm = grid_search.best_estimator_

# Predict on validation set (for hyperparameter tuning)
y_validate_pred = best_svm.predict(X_validate_scaled)

# Evaluate performance on validation set
validate_accuracy = accuracy_score(y_validate, y_validate_pred)
validate_report = classification_report(y_validate, y_validate_pred)

# Print results on validation set
print("Validation Accuracy ", validate_accuracy)
print("Classification Report (Validation Set):\n", validate_report)

# Evaluate the best model on the test set
y_test_pred_balanced = best_svm.predict(X_test_scaled)
test_accuracy_balanced = accuracy_score(y_test, y_test_pred_balanced)
test_report_balanced = classification_report(y_test, y_test_pred_balanced)

# Print results
print("Best SVM Model:", grid_search.best_params_)
print("Test Accuracy After Balancing:", test_accuracy_balanced)
print("Classification Report:\n", test_report_balanced)

Validation Accuracy  0.351
Classification Report (Validation Set):
               precision    recall  f1-score   support

           1       0.06      0.79      0.10        14
           2       0.93      0.29      0.45       828
           3       0.24      0.62      0.34       124
           4       0.09      0.56      0.15        34

    accuracy                           0.35      1000
   macro avg       0.33      0.57      0.26      1000
weighted avg       0.81      0.35      0.42      1000

Best SVM Model: {'svm__C': 5, 'svm__gamma': 'auto'}
Test Accuracy After Balancing: 0.362
Classification Report:
               precision    recall  f1-score   support

           1       0.03      0.83      0.05         6
           2       0.93      0.33      0.49       857
           3       0.22      0.55      0.32       119
           4       0.04      0.50      0.07        18

    accuracy                           0.36      1000
   macro avg       0.31      0.55      0.23      1000
weig

## SVM 2nd Model (kernel = poly)

In [9]:
svm_pipeline = Pipeline([
    ('svm', SVC(kernel='poly', class_weight='balanced'))
])

# Define hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10],  # Regularization parameter
    'svm__gamma': ['scale', 'auto', 0.1, 0.01]  # Kernel coefficient
}

In [10]:
# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

In [11]:
# Get the best model from Grid Search
best_svm = grid_search.best_estimator_

# Predict on validation set (for hyperparameter tuning)
y_validate_pred = best_svm.predict(X_validate_scaled)

# Evaluate performance on validation set
validate_accuracy = accuracy_score(y_validate, y_validate_pred)
validate_report = classification_report(y_validate, y_validate_pred)

# Print results on validation set
print("Validation Accuracy ", validate_accuracy)
print("Classification Report (Validation Set):\n", validate_report)

# Evaluate the best model on the test set
y_test_pred_balanced = best_svm.predict(X_test_scaled)
test_accuracy_balanced = accuracy_score(y_test, y_test_pred_balanced)
test_report_balanced = classification_report(y_test, y_test_pred_balanced)

# Print results
print("Best SVM Model:", grid_search.best_params_)
print("Test Accuracy After Balancing:", test_accuracy_balanced)
print("Classification Report:\n", test_report_balanced)

Validation Accuracy  0.379
Classification Report (Validation Set):
               precision    recall  f1-score   support

           1       0.06      0.79      0.11        14
           2       0.93      0.34      0.50       828
           3       0.22      0.52      0.31       124
           4       0.09      0.56      0.16        34

    accuracy                           0.38      1000
   macro avg       0.32      0.55      0.27      1000
weighted avg       0.80      0.38      0.46      1000

Best SVM Model: {'svm__C': 10, 'svm__gamma': 0.1}
Test Accuracy After Balancing: 0.382
Classification Report:
               precision    recall  f1-score   support

           1       0.03      0.83      0.05         6
           2       0.92      0.36      0.52       857
           3       0.24      0.50      0.32       119
           4       0.02      0.28      0.04        18

    accuracy                           0.38      1000
   macro avg       0.30      0.49      0.23      1000
weight