# Support Vector Classifier for Binary Classification

This code was run in Google Colab.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split;
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss;
from sklearn.model_selection import GridSearchCV;

# Select seaborn's 'whitegrid' style for any figures drawn later in the notebook.
sns.set_style('whitegrid')

In [6]:
# Read the pre-cleaned credit-card client records into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust DATA_CSV_PATH
# when running the notebook anywhere else.
DATA_CSV_PATH = "/mnt/d/Code/College/Machine Learning/Team Assignment/Default Credit Scoring/credit_card_clients.csv"
dataframe = pd.read_csv(DATA_CSV_PATH)

# Remove rows whose categorical codes are invalid:
#   * EDUCATION outside 1..4 (the data contains 0, 5 and 6)
#   * MARRIAGE equal to 0
# Vectorized boolean masks are used instead of per-row Python loops. Missing
# values compare as False in every test below, so such rows are kept.
education_codes = dataframe['EDUCATION']
invalid_education_mask = (education_codes > 4) | (education_codes < 1)
invalid_marriage_mask = dataframe['MARRIAGE'] == 0

# Index labels (not positions) of the offending rows, so the drop below is
# correct even if the frame does not carry a default 0..n-1 index.
invalid_education_data = dataframe.index[invalid_education_mask].tolist()
invalid_marriage_data = dataframe.index[invalid_marriage_mask].tolist()

# Union of both conditions; OR-ing the masks de-duplicates rows that fail both.
final_invalid_data_index = dataframe.index[
    invalid_education_mask | invalid_marriage_mask
].tolist()

dataframe = dataframe.drop(index=final_invalid_data_index)

# One-hot encode SEX, EDUCATION and MARRIAGE so that every category code gets
# its own indicator column instead of being used as a single numeric value.
hot_encoded_sex = pd.get_dummies(dataframe['SEX'], prefix="sex")
hot_encoded_education = pd.get_dummies(dataframe['EDUCATION'], prefix="education")
hot_encoded_marital_status = pd.get_dummies(dataframe['MARRIAGE'], prefix="marital_status")

# Append all indicator columns to the main dataframe, in the order
# sex, education, marital status. The raw category columns are still present
# after this step.
dataframe = pd.concat(
    [dataframe, hot_encoded_sex, hot_encoded_education, hot_encoded_marital_status],
    axis=1,
)

# OneClassSVM.predict returns +1 (inlier) or -1 (outlier), so the label is
# re-encoded to the same two values: rows with LABEL == 1 stay +1 and every
# other row becomes -1. This makes labels and predictions directly comparable.
# NOTE(review): this lines the LABEL == 1 class up with the model's "inlier"
# output; confirm that this is the intended polarity for this dataset.
is_positive_label = dataframe['LABEL'].eq(1)
dataframe['LABEL'] = is_positive_label.map({True: 1, False: -1})

# Report the shape and the final column list of the cleaned frame.
print("Data after cleaning: ", dataframe.shape)
print(dataframe.columns.tolist())

Data after cleaning:  (29601, 34)
['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'LABEL', 'sex_1', 'sex_2', 'education_1', 'education_2', 'education_3', 'education_4', 'marital_status_1', 'marital_status_2', 'marital_status_3']


In [7]:
# Target vector: the re-encoded LABEL column.
label = dataframe['LABEL']

# Feature matrix: every column except the row identifier, the target and the
# raw categorical columns (their one-hot indicator columns stay in).
NON_FEATURE_COLUMNS = ["ID", "SEX", "EDUCATION", "MARRIAGE", "LABEL"]
features = dataframe.drop(columns=NON_FEATURE_COLUMNS)

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only and
# then applied to both partitions, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
feature_train_scaled = scaler.transform(X_train)
feature_test_scaled = scaler.transform(X_test)

In [8]:
# Search for the best OneClassSVM hyper-parameters (nu and gamma).
# OneClassSVM is an unsupervised estimator: its fit() does not use y_train.
# The labels are passed so that GridSearchCV can score each candidate's
# +1 / -1 predictions with accuracy on the 5 cross-validation folds.
param_grid = {'nu': [0.01, 0.05, 0.1], 'gamma': ["scale", "auto"]}
grid_search = GridSearchCV(OneClassSVM(), param_grid, cv=5, scoring="accuracy")
grid_search.fit(feature_train_scaled, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set (+1 = inlier, -1 = outlier).
prediction = best_model.predict(feature_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, prediction)
print(f"Accuracy: {accuracy}")

# Log loss (cross-entropy) of the predictions.
# log_loss expects probabilities of the positive class, not class labels, and
# OneClassSVM has no predict_proba. The hard +1 / -1 predictions are therefore
# converted explicitly into probabilities 1.0 / 0.0 instead of handing the raw
# -1 values to log_loss. With such all-or-nothing probabilities the result is
# only a rescaled error rate, so read it together with the accuracy above.
positive_class_probability = np.where(prediction == 1, 1.0, 0.0)
loss = log_loss(y_test, positive_class_probability, labels=[-1, 1])
print(f"Log Loss: {loss}")

Accuracy: 0.5219006868595879
Log Loss: 17.232445928407998


In [15]:
# Show the hyper-parameter combination stored in best_params.
# NOTE(review): the output saved below lists gamma = 1.0, which is not one of
# the gamma values in the param_grid defined above, and this cell's execution
# count is out of sequence -- re-run the notebook top to bottom to refresh it.
print(best_params)

{'gamma': 1.0, 'nu': 0.05}
