In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the cleaned dataset (CSV) that every later cell works from.
DATA_PATH = r"C:\Users\ramak\OneDrive\Desktop\CleanedData.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Target: encode the 'Attrition' column as 1 ('Yes') / 0 ('No').
# Any value other than 'Yes'/'No' becomes NaN under Series.map.
attrition_codes = {'Yes': 1, 'No': 0}
y = df['Attrition'].map(attrition_codes)

# Features: every column except the target.
X = df.drop(columns='Attrition')

In [5]:
# One-hot encode every object-dtype column; drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear.
categorical_cols = X.select_dtypes(include='object').columns
X = pd.get_dummies(data=X, columns=categorical_cols, drop_first=True)


In [13]:
# Stratified 80/20 hold-out split so both parts keep the same class ratio as y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics on the training part only, then apply the same
# transformation to both parts so no test-set information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [14]:


import numpy as np

# Count how many samples of each class ended up in the training part.
unique, counts = np.unique(y_train, return_counts=True)
train_distribution = dict(zip(unique, counts))
print("Training set distribution:", train_distribution)

# Same count for the held-out test part.
unique_test, counts_test = np.unique(y_test, return_counts=True)
test_distribution = dict(zip(unique_test, counts_test))
print("Test set distribution:", test_distribution)


Training set distribution: {np.int64(0): np.int64(5798), np.int64(1): np.int64(1402)}
Test set distribution: {np.int64(0): np.int64(1449), np.int64(1): np.int64(351)}


We can clearly see a roughly 4:1 class imbalance (No : Yes), so let's use `class_weight='balanced'` to give more weight to the minority class.

In [11]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Linear SVM baseline. class_weight='balanced' is the imbalance countermeasure:
# it weights each class inversely to its frequency in y_train.
linear_svc_params = {
    'random_state': 42,
    'max_iter': 10000,
    'class_weight': 'balanced',
}
linear_clf = LinearSVC(**linear_svc_params).fit(X_train_scaled, y_train)

y_pred_linear = linear_clf.predict(X_test_scaled)

# Hold-out metrics. zero_division=0 reports 0 (instead of warning) for a class
# that is never predicted.
linear_accuracy = accuracy_score(y_test, y_pred_linear)
linear_report = classification_report(y_test, y_pred_linear, zero_division=0)
linear_confusion = confusion_matrix(y_test, y_pred_linear)

print("Accuracy:", linear_accuracy)
print("Classification Report:\n", linear_report)
print("Confusion Matrix:\n", linear_confusion)



Accuracy: 0.805
Classification Report:
               precision    recall  f1-score   support

           0       0.81      1.00      0.89      1449
           1       0.00      0.00      0.00       351

    accuracy                           0.81      1800
   macro avg       0.40      0.50      0.45      1800
weighted avg       0.65      0.81      0.72      1800

Confusion Matrix:
 [[1449    0]
 [ 351    0]]


In [15]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# RBF-kernel SVM with default-strength regularisation and balanced class weights.
rbf_clf = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)

y_pred_rbf = rbf_clf.predict(X_test_scaled)

# Hold-out metrics for the RBF model.
rbf_accuracy = accuracy_score(y_test, y_pred_rbf)
rbf_report = classification_report(y_test, y_pred_rbf, zero_division=0)
rbf_confusion = confusion_matrix(y_test, y_pred_rbf)

print("RBF Kernel SVM")
print("Accuracy:", rbf_accuracy)
print("Classification Report:\n", rbf_report)
print("Confusion Matrix:\n", rbf_confusion)



RBF Kernel SVM
Accuracy: 0.805
Classification Report:
               precision    recall  f1-score   support

           0       0.81      1.00      0.89      1449
           1       0.00      0.00      0.00       351

    accuracy                           0.81      1800
   macro avg       0.40      0.50      0.45      1800
weighted avg       0.65      0.81      0.72      1800

Confusion Matrix:
 [[1449    0]
 [ 351    0]]


It takes too long to run because the dataset is large and kernel SVM training complexity is roughly O(n^2)–O(n^3) in the number of samples.
We need to tune C and gamma; otherwise it may again predict mostly one class.

In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np


# Search space: 20 log-spaced candidates each for C (1e-2 .. 1e2) and
# gamma (1e-3 .. 1e1); the randomized search samples 30 (C, gamma) pairs.
param_dist = {
    'C': np.logspace(-2, 2, 20),
    'gamma': np.logspace(-3, 1, 20)
}

# scoring='f1' ranks candidates by F1 on the positive (attrition = 1) class.
# The default scorer is accuracy, which on ~4:1 imbalanced labels rewards a
# model that always predicts the majority class -- exactly the failure mode
# this search is meant to escape.
# NOTE(review): max_iter=1000 is a hard cap on solver iterations kept from the
# original to bound runtime; fits that hit it stop before convergence, so
# confirm the cap is acceptable for the selected parameters.
random_search = RandomizedSearchCV(
    SVC(kernel='rbf', class_weight='balanced', random_state=42, max_iter=1000),
    param_distributions=param_dist,
    n_iter=30,
    cv=3,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_scaled, y_train)

print("Best Params:", random_search.best_params_)

# best_estimator_ is the winning configuration refit on the whole training set.
best_rbf = random_search.best_estimator_
y_pred_best_rbf = best_rbf.predict(X_test_scaled)

print("Best RBF SVM Results")
print("Accuracy:", accuracy_score(y_test, y_pred_best_rbf))
print("Classification Report:\n", classification_report(y_test, y_pred_best_rbf, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best_rbf))


Fitting 3 folds for each of 30 candidates, totalling 90 fits


KeyboardInterrupt: 

The polynomial kernel can be very slow compared to RBF ...


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Polynomial-kernel SVM: cubic kernel (the library default degree) with
# balanced class weights to handle the class imbalance.
poly_svc_params = {
    'kernel': 'poly',
    'degree': 3,
    'C': 1.0,
    'gamma': 'scale',
    'class_weight': 'balanced',
    'random_state': 42,
}
poly_clf = SVC(**poly_svc_params).fit(X_train_scaled, y_train)

y_pred_poly = poly_clf.predict(X_test_scaled)

# Hold-out metrics for the polynomial model.
poly_accuracy = accuracy_score(y_test, y_pred_poly)
poly_report = classification_report(y_test, y_pred_poly, zero_division=0)
poly_confusion = confusion_matrix(y_test, y_pred_poly)

print("Polynomial Kernel SVM")
print("Accuracy:", poly_accuracy)
print("Classification Report:\n", poly_report)
print("Confusion Matrix:\n", poly_confusion)


The SVM takes a long time to train because it has to compare the data points with each other, which gets slower as the number of rows grows. Non-linear kernels such as RBF and polynomial are even slower because they perform more complex calculations to find the best separation. Also, setting probability=True (if enabled) makes it do extra work to calculate probabilities. If the dataset has a lot of features, especially after converting categories into numbers, training becomes even slower.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Plot one confusion matrix per kernel, built directly from the test labels and
# the predictions stored by the training cells. The matrices are computed here
# because no cell ever assigns them to a variable (the earlier cells only print
# them), so referencing precomputed matrices would raise NameError.
kernel_results = [
    ("Linear Kernel SVM", y_pred_linear, plt.cm.Blues),
    ("Polynomial Kernel SVM", y_pred_poly, plt.cm.Oranges),
    ("RBF Kernel SVM", y_pred_rbf, plt.cm.Greens),
]

for plot_title, y_pred, color_map in kernel_results:
    ConfusionMatrixDisplay.from_predictions(
        y_test,
        y_pred,
        display_labels=[0, 1],
        cmap=color_map,
    )
    plt.title(plot_title)
    plt.show()
