# SVM & Decision Tree
## Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Import Train and Test Datasets

In [2]:
X_train = pd.read_csv('Data/X_train_engineered.csv')
X_test = pd.read_csv('Data/X_test_engineered.csv')
Y_train = pd.read_csv('Data/y_train_undersampled_data.csv')
Y_test = pd.read_csv('Data/y_test.csv')

print("Train Set :", Y_train.shape, X_train.shape)
print("Test Set  :", Y_test.shape, X_test.shape)

Train Set : (20392, 1) (20392, 99)
Test Set  : (8176, 1) (8176, 99)


In [3]:
# helper function for calculating evaluation scores
def evaluate(Y_train, Y_train_pred, Y_test, Y_test_pred, version, train_metrics, test_metrics):
    train_metric = {
    "Version": version,
    "Accuracy": accuracy_score(Y_train, Y_train_pred),
    "Precision": precision_score(Y_train, Y_train_pred),
    "Recall": recall_score(Y_train, Y_train_pred),
    "F1 Score": f1_score(Y_train, Y_train_pred)
    }

    test_metric = {
        "Version": version,
        "Accuracy": accuracy_score(Y_test, Y_test_pred),
        "Precision": precision_score(Y_test, Y_test_pred),
        "Recall": recall_score(Y_test, Y_test_pred),
        "F1 Score": f1_score(Y_test, Y_test_pred)
    }

    # Save to overall metrics dataframe for comparison later
    if len(train_metrics)==0:
        train_metrics = pd.DataFrame.from_records([train_metric])
        test_metrics = pd.DataFrame.from_records([test_metric])
    else:
        train_metrics = pd.concat([train_metrics, pd.DataFrame.from_records([train_metric])], ignore_index = True)
        test_metrics = pd.concat([test_metrics, pd.DataFrame.from_records([test_metric])], ignore_index = True)

    # Calculate general metrics for the train set
    print("**Training Set Metrics**")
    print("Accuracy \t:", train_metric["Accuracy"])
    print("Precision \t:", train_metric["Precision"])
    print("Recall \t\t:", train_metric["Recall"])
    print("F1 Score \t:", train_metric["F1 Score"])

    print() # New Line

    # Calculate general metrics for the test set
    print("**Test Set Metrics**")
    print("Accuracy \t:", test_metric["Accuracy"])
    print("Precision \t:", test_metric["Precision"])
    print("Recall \t\t:", test_metric["Recall"])
    print("F1 Score \t:", test_metric["F1 Score"])
    
    return train_metrics, test_metrics

In [11]:
# Set up a dataframe to store the results from different versions
train_metrics = pd.DataFrame(columns=['Version', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
test_metrics = pd.DataFrame(columns=['Version', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# SVM

## Feature Selection for SVM

We are applying three techniques for SVM feature selection:
1. Forward Feature Selection using SVM <br>
Forward Feature Selection involves iteratively adding features to the model, starting with one and gradually increasing the feature set. With each iteration, the model's performance is evaluated using metrics like accuracy or R-squared value. This method helps identify the most relevant features by considering them one at a time.

 
2. Backward Feature Selection using SVM <br>
Backward Feature Selection begins with all features included and progressively removes one feature at a time, evaluating performance at each step. By iteratively eliminating less relevant features, this method aims to identify the most important subset of features for the model. 


3. Recursive Feature selection using SVM <br>
Recursive Feature Selection prioritizes feature importance and iteratively removes the least important features to identify the most informative subset. It typically involves using feature ranking techniques in conjunction with the model to select the most relevant features.

#### Forward Feature Selection using SVM

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score

# Initialize SVM classifier
svm = SVC(kernel='linear')

# Initialize forward feature selector
selector = SequentialFeatureSelector(svm, direction='forward', scoring='f1', cv=5, n_features_to_select='auto', tol=0.01)
# Perform forward feature selection
selector.fit(X_train, Y_train.values.ravel())

# Get selected feature indices
selected_indices_forward = selector.get_support(indices=True)

# Get selected features
selected_features_forward = X_train.columns[selected_indices_forward]

# Print selected features
print("Selected features:", selected_features_forward)

#### Backward Feature Selection using SVM

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score

# Initialize SVM classifier
svm = SVC(kernel='linear')

# Initialize forward feature selector
selector = SequentialFeatureSelector(svm, direction='backward', scoring='f1', cv=5, n_features_to_select='auto', tol=0.01)
# Perform forward feature selection
selector.fit(X_train, Y_train.values.ravel())

# Get selected feature indices
selected_indices_backward = selector.get_support(indices=True)

# Get selected features
selected_features_backward = X_train.columns[selected_indices_backward]

# Print selected features
print("Selected features:", selected_features_backward)

#### Recursive Feature selection using SVM

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# Initialize SVM classifier
svm = SVC(kernel='linear')

# Initialize RFE for SVM
rfe = RFE(estimator=svm, scoring='f1')

# Fit RFE
rfe.fit(X_train, Y_train.values.ravel())

# Get selected feature indices
selected_indices_recursive = rfe.get_support(indices=True)

# Get selected features
selected_features_recursive = X_train.columns[selected_indices_recursive]

# Print selected features
print("Selected features:", selected_features_recursive)

## SVM with Selected Features

#### SVM with Forward Feature Selection

In [None]:
# Subset training and test data with selected features
X_train_selected = X_train[selected_features_forward]
X_test_selected = X_test[selected_features_forward]

# Initialize SVM classifier
svm = SVC(kernel='linear')

# Train SVM classifier on the selected features
svm.fit(X_train_selected, Y_train.values.ravel())

# Predict using the trained SVM classifier
Y_train_pred = svm.predict(X_train_selected)
Y_test_pred = svm.predict(X_test_selected)

In [None]:
train_metrics_forward, test_metrics_forward = evaluate(Y_train, Y_train_pred, Y_test, Y_test_pred, "Feature Selected", train_metrics, test_metrics)

#### SVM with Backward Feature Selection

In [None]:
# Subset training and test data with selected features
X_train_selected = X_train[selected_features_backward]
X_test_selected = X_test[selected_features_backward]

# Initialize SVM classifier
svm = SVC(kernel='linear')

# Train SVM classifier on the selected features
svm.fit(X_train_selected, Y_train.values.ravel())

# Predict using the trained SVM classifier
Y_train_pred = svm.predict(X_train_selected)
Y_test_pred = svm.predict(X_test_selected)

In [None]:
train_metrics_backward, test_metrics_backward = evaluate(Y_train, Y_train_pred, Y_test, Y_test_pred, "Feature Selected", train_metrics, test_metrics)

#### SVM with Recursive Feature Selection

In [None]:
# Subset training and test data with selected features
X_train_selected = X_train[selected_features_recursive]
X_test_selected = X_test[selected_features_backward_recursive]

# Initialize SVM classifier
svm = SVC(kernel='linear')

# Train SVM classifier on the selected features
svm.fit(X_train_selected, Y_train.values.ravel())

# Predict using the trained SVM classifier
Y_train_pred = svm.predict(X_train_selected)
Y_test_pred = svm.predict(X_test_selected)

In [None]:
train_metrics_recursive, test_metrics_recursive = evaluate(Y_train, Y_train_pred, Y_test, Y_test_pred, "Feature Selected", train_metrics, test_metrics)

## Hyperparameter-tuning for SVM

# Decision Tree

## Feature Selection for Decision Tree

Decision tree inherently performs feature selection as part of its model training process. In scikit-learn, the feature importance provided by decision tree-based models is based on the Gini importance. Gini importance measures the total decrease in node impurity (often quantified using the Gini index) weighted by the probability of reaching that node during the construction of the decision tree.

In [12]:
from sklearn.tree import DecisionTreeClassifier
# Initialize the decision tree classifier
tree_clf = DecisionTreeClassifier()

# Train the decision tree classifier on the training data
tree_clf.fit(X_train, Y_train)

# Make predictions on the testing data
Y_train_pred = tree_clf.predict(X_train)
Y_test_pred = tree_clf.predict(X_test)

train_metrics_dt, test_metrics_dt = evaluate(Y_train, Y_train_pred, Y_test, Y_test_pred, "Baseline", train_metrics, test_metrics)

# Get feature importances
feature_importances = tree_clf.feature_importances_

# Print feature importances
print("Feature Importances:")
for i, importance in enumerate(feature_importances):
    print(f"Feature {i+1}: {importance}")

**Training Set Metrics**
Accuracy 	: 0.9928893683797567
Precision 	: 0.9960517224360873
Recall 		: 0.9897018438603374
F1 Score 	: 0.9928666305898558

**Test Set Metrics**
Accuracy 	: 0.7602739726027398
Precision 	: 0.4141975308641975
Recall 		: 0.3989298454221165
F1 Score 	: 0.406420351302241
Feature Importances:
Feature 1: 0.0004658081540141596
Feature 2: 0.0
Feature 3: 0.0012121299316346908
Feature 4: 0.0
Feature 5: 0.005349597189101101
Feature 6: 0.0017287745260927106
Feature 7: 0.0025633193750964445
Feature 8: 0.0
Feature 9: 0.0
Feature 10: 0.0
Feature 11: 0.00022789569858413906
Feature 12: 0.000366470959372675
Feature 13: 0.0006341490784579505
Feature 14: 0.0004232746726651288
Feature 15: 0.0006420467228193327
Feature 16: 0.0016703049825390237
Feature 17: 0.0
Feature 18: 0.00018762110442647385
Feature 19: 0.0008201175863536985
Feature 20: 0.000874035546156173
Feature 21: 0.00041831140677244644
Feature 22: 0.001100999332487604
Feature 23: 0.0
Feature 24: 0.0034832237052165554
Featu

In [13]:
# Get feature importances
feature_importances = tree_clf.feature_importances_

# Sort feature importances and select top 20 indices
top_20_indices = (-feature_importances).argsort()[:20]

# Extract top 20 features
X_train_top_20 = X_train.iloc[:, top_20_indices]
X_test_top_20 = X_test.iloc[:, top_20_indices]

# Train the decision tree classifier on the top 20 features
tree_clf_top_20 = DecisionTreeClassifier()
tree_clf_top_20.fit(X_train_top_20, Y_train)

# Make predictions on the testing data using the model with top 20 features
Y_train_pred_top_20 = tree_clf_top_20.predict(X_train_top_20)
Y_test_pred_top_20 = tree_clf_top_20.predict(X_test_top_20)

# Evaluate the performance of the model with top 20 features
train_metrics_dt_top_20, test_metrics_dt_top_20 = evaluate(Y_train, Y_train_pred_top_20, Y_test, Y_test_pred_top_20, "Top 20 Features", train_metrics_dt, test_metrics_dt)

**Training Set Metrics**
Accuracy 	: 0.9922518634758729
Precision 	: 0.9960466495354813
Recall 		: 0.9884268340525696
F1 Score 	: 0.9922221128285911

**Test Set Metrics**
Accuracy 	: 0.7738502935420744
Precision 	: 0.4484249536751081
Recall 		: 0.43162901307966706
F1 Score 	: 0.4398667070584672


#### Permutation Importance

It assesses the impact of shuffling or permuting the values of individual features on the model's performance. 

In [17]:
from sklearn.inspection import permutation_importance

# Initialize the decision tree classifier
tree_clf = DecisionTreeClassifier()

# Train the decision tree classifier on the training data
tree_clf.fit(X_train, Y_train)

# Perform permutation importance
perm_importance = permutation_importance(tree_clf, X_test, Y_test, n_repeats=10, random_state=42,scoring='f1')

# Get feature importances from permutation importance
feature_importances_perm = perm_importance.importances_mean

# Sort feature importances
top_feature_indices_perm = (-feature_importances_perm).argsort()[:20]

# Extract top 20 features
X_train_top_20_perm = X_train.iloc[:, top_feature_indices_perm]
X_test_top_20_perm = X_test.iloc[:, top_feature_indices_perm]

# Train the decision tree classifier on the top 20 features
tree_clf_top_20_perm = DecisionTreeClassifier()
tree_clf_top_20_perm.fit(X_train_top_20_perm, Y_train)

# Make predictions on the testing data using the model with top 20 features
Y_train_pred_top_20_perm = tree_clf_top_20_perm.predict(X_train_top_20_perm)
Y_test_pred_top_20_perm = tree_clf_top_20_perm.predict(X_test_top_20_perm)

# Evaluate the performance of the model with top 20 features
train_metrics_dt_top_20_perm, test_metrics_dt_top_20_perm = evaluate(Y_train, Y_train_pred_top_20_perm, Y_test, Y_test_pred_top_20_perm, "Top 20 Features (Permutation Importance)", train_metrics_dt, test_metrics_dt)

**Training Set Metrics**
Accuracy 	: 0.9919576304433111
Precision 	: 0.9957501482506425
Recall 		: 0.9881326010200079
F1 Score 	: 0.9919267500246136

**Test Set Metrics**
Accuracy 	: 0.7712818003913894
Precision 	: 0.4454123112659698
Recall 		: 0.45600475624256837
F1 Score 	: 0.45064629847238535
