# Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE

# Loading Datasets

In [2]:
train_data = pd.read_csv('/content/train_data_v2.csv')
test_data = pd.read_csv('/content/test_data_v2.csv')
validate_data = pd.read_csv('/content/validate_data_v2.csv')

train_data.info()
test_data.info()
validate_data.info()

train_data.head()
test_data.head()
validate_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 47 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   ID                                                8000 non-null   object 
 1   Severity                                          8000 non-null   int64  
 2   Start_Time                                        8000 non-null   int64  
 3   End_Time                                          8000 non-null   int64  
 4   Distance(mi)                                      8000 non-null   float64
 5   Description                                       8000 non-null   object 
 6   Zipcode                                           8000 non-null   object 
 7   Weather_Timestamp                                 8000 non-null   object 
 8   Temperature(F)                                    8000 non-null   float64
 9   Humidity(%)        

Unnamed: 0,ID,Severity,Start_Time,End_Time,Distance(mi),Description,Zipcode,Weather_Timestamp,Temperature(F),Humidity(%),...,City_Miami,City_Orlando,City_Others,Weather_Condition_Encoded,Weather_Condition_Clear / Cloudy,Weather_Condition_Fog / Low Visibility,Weather_Condition_Others,Weather_Condition_Rain,Weather_Condition_Snow,Weather_Condition_Thunderstorms / Severe Weather
0,A-6171552,2,36,232,0.111,Incident on N SILVERBELL RD near N SAN JOSE AV...,85745-2626,2021-08-21 00:58:00,79.0,57.0,...,0,0,1,0,1,0,0,0,0,0
1,A-4808036,2,836,865,0.575,Stationary traffic on US-40 E from MD-43 (US-4...,21162,2022-07-09 13:54:00,70.0,94.0,...,0,0,1,3,0,0,0,1,0,0
2,A-4252512,2,1420,14,0.767,Slow traffic on I-635 W - LBJ Fwy W from New C...,75238,2022-03-15 23:53:00,53.0,80.0,...,0,0,1,0,1,0,0,0,0,0
3,A-5501729,2,472,592,0.591,Accident at exit [29A].,55369,2021-03-01 07:53:00,13.0,67.0,...,0,0,1,0,1,0,0,0,0,0
4,A-4721213,2,1273,1408,0.013,Accident on Merrydale Rd from San Rafael to Wi...,94903,2023-02-06 06:40:00,36.0,100.0,...,0,0,1,0,1,0,0,0,0,0


# Data Cleaning

In [3]:
#Drop useless columns
drop_columns = ['ID', 'Description', 'Zipcode', 'Weather_Timestamp', 'Wind_Direction']
train_data_clean = train_data.drop(columns=drop_columns)
test_data_clean = test_data.drop(columns=drop_columns)
validate_data_clean = validate_data.drop(columns=drop_columns)

# Building Model

## Separate Target and Features

In [4]:
X_train = train_data_clean.drop(columns=['Severity'])
y_train = train_data_clean['Severity']
X_test = test_data_clean.drop(columns=['Severity'])
y_test = test_data_clean['Severity']
X_validate = validate_data_clean.drop(columns=['Severity'])
y_validate = validate_data_clean['Severity']

## Scale Data

In [5]:
# Scale the data before feature selection and model training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

##SVM Base Model

In [6]:
svm_pipeline = Pipeline([
    ('svm', SVC(kernel='rbf', class_weight='balanced'))
])

# Define hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10],  # Regularization parameter
    'svm__gamma': ['scale', 'auto', 0.1, 0.01]  # Kernel coefficient
}

In [7]:
# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

In [12]:
# Get the best model from Grid Search
best_svm = grid_search.best_estimator_

# Predict on validation set (for hyperparameter tuning)
y_validate_pred = best_svm.predict(X_validate_scaled)

# Evaluate performance on validation set
validate_accuracy = accuracy_score(y_validate, y_validate_pred)
validate_report = classification_report(y_validate, y_validate_pred)

# Print results on validation set
print("Validation Accuracy ", validate_accuracy)
print("Classification Report (Validation Set):\n", validate_report)

# Evaluate the best model on the test set
y_test_pred_balanced = best_svm.predict(X_test)
test_accuracy_balanced = accuracy_score(y_test, y_test_pred_balanced)
test_report_balanced = classification_report(y_test, y_test_pred_balanced)

# Print results
print("Best SVM Model:", grid_search.best_params_)
print("Test Accuracy After Balancing:", test_accuracy_balanced)
print("Classification Report:\n", test_report_balanced)

Validation Accuracy  0.453
Classification Report (Validation Set):
               precision    recall  f1-score   support

           1       0.08      0.86      0.15        14
           2       0.95      0.40      0.57       838
           3       0.28      0.72      0.41       124
           4       0.07      0.54      0.13        24

    accuracy                           0.45      1000
   macro avg       0.35      0.63      0.31      1000
weighted avg       0.84      0.45      0.53      1000





Best SVM Model: {'svm__C': 10, 'svm__gamma': 'scale'}
Test Accuracy After Balancing: 0.847
Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00         6
           2       0.85      1.00      0.92       847
           3       0.00      0.00      0.00       119
           4       0.00      0.00      0.00        28

    accuracy                           0.85      1000
   macro avg       0.21      0.25      0.23      1000
weighted avg       0.72      0.85      0.78      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Check if everything is predicted as class 2

In [9]:
class_counts = pd.Series(y_test_pred_balanced).value_counts()
print(class_counts)

2    1000
Name: count, dtype: int64


## SVM with RFE (Recursive feature elimination)

In [10]:
# SVM with RFE (Recursive Feature Elimination)
linear_svm = LinearSVC(C=1, max_iter=5000, class_weight='balanced')
rfe = RFE(estimator=linear_svm, n_features_to_select=20, step=1)  # Keep top 20 features
rfe.fit(X_train_scaled, y_train)

# Get the ranking of features and selected features
selected_features = rfe.support_
X_train_rfe = X_train_scaled[:, selected_features]
X_validate_rfe = X_validate_scaled[:, selected_features]
X_test_rfe = X_test_scaled[:, selected_features]

# Train SVM on selected features
svm_rfe = SVC(kernel='rbf', C=10, gamma=0.1, class_weight='balanced')
svm_rfe.fit(X_train_rfe, y_train)

# Predict on validation set (for hyperparameter tuning)
y_validate_pred_rfe = svm_rfe.predict(X_validate_rfe)

# Evaluate performance on validation set
validate_accuracy_rfe = accuracy_score(y_validate, y_validate_pred_rfe)
validate_report_rfe = classification_report(y_validate, y_validate_pred_rfe)

# Print RFE results on validation set
print("Validation Accuracy After RFE:", validate_accuracy_rfe)
print("Classification Report After RFE (Validation Set):\n", validate_report_rfe)

# Hyperparameter tuning on selected features (optional)
# Use GridSearchCV to tune hyperparameters after feature selection
param_grid_rfe = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.1, 0.01]  # Kernel coefficient
}

grid_search_rfe = GridSearchCV(svm_rfe, param_grid_rfe, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rfe.fit(X_train_rfe, y_train)

# Get the best SVM model after feature selection
best_svm_rfe = grid_search_rfe.best_estimator_

# Evaluate the final model on the test set
y_test_pred_rfe = best_svm_rfe.predict(X_test_rfe)

# Evaluate performance on the test set
accuracy_rfe = accuracy_score(y_test, y_test_pred_rfe)
report_rfe = classification_report(y_test, y_test_pred_rfe)

# Print final results
print("Best SVM Model After RFE and Hyperparameter Tuning:", grid_search_rfe.best_params_)
print("Test Accuracy After RFE and Hyperparameter Tuning:", accuracy_rfe)
print("Classification Report After RFE (Test Set):\n", report_rfe)

Validation Accuracy After RFE: 0.418
Classification Report After RFE (Validation Set):
               precision    recall  f1-score   support

           1       0.07      0.86      0.12        14
           2       0.97      0.37      0.53       838
           3       0.27      0.69      0.39       124
           4       0.07      0.54      0.12        24

    accuracy                           0.42      1000
   macro avg       0.34      0.61      0.29      1000
weighted avg       0.85      0.42      0.50      1000

Best SVM Model After RFE and Hyperparameter Tuning: {'C': 10, 'gamma': 0.1}
Test Accuracy After RFE and Hyperparameter Tuning: 0.39
Classification Report After RFE (Test Set):
               precision    recall  f1-score   support

           1       0.03      0.83      0.06         6
           2       0.97      0.34      0.50       847
           3       0.28      0.66      0.39       119
           4       0.07      0.68      0.13        28

    accuracy                

In [11]:
class_counts = pd.Series(y_test_pred_rfe).value_counts()
print(class_counts)

2    295
3    285
4    254
1    166
Name: count, dtype: int64
