In [41]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
sb.set()
b = pd.read_csv('data 2.csv')
b = b.drop(columns=['id', 'Unnamed: 32'])
b['diagnosis'] = b['diagnosis'].replace(["M", "B"], [1, 0])
y = b["diagnosis"] #Predictors
X = b.drop("diagnosis", axis=1) #Response

# Core Analysis using Support Vector Machine

Support Vector Machines is considered to be a classification approach, it but can be employed in both types of classification and regression problems. It can easily handle multiple continuous and categorical variables. SVM constructs a hyperplane in multidimensional space to separate different classes. SVM generates optimal hyperplane in an iterative manner, which is used to minimize an error. The core idea of SVM is to find a maximum marginal hyperplane(MMH) that best divides the dataset into classes.

The main function of the kernel is to transform the given dataset input data into the required form. There are various types of functions such as linear, polynomial, and radial basis function (RBF). Polynomial and RBF are useful for non-linear hyperplane. Polynomial and RBF kernels compute the separation line in the higher dimension. In some of the applications, it is suggested to use a more complex kernel to separate the classes that are curved or nonlinear. This transformation can lead to more accurate classifiers.

We have tried to predict using linear polynomial and gaussian kernels to predict which one would be the best.

## Linear Support Vector Machine (SVM)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 69) #Ensure same random state in every model to ensure comparison is fair

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[58  2]
 [ 5 49]]
0.9385964912280702


## Polynomial Support Vector Machine (SVM)

In [45]:
svclassifier = SVC(kernel='poly', degree=8)
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[58  2]
 [13 41]]
0.868421052631579


## Gaussian Support Vector Machine (SVM)

In [47]:
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[34  1]
 [ 4 17]]
0.9107142857142857


## Linear and Gaussian SVM using Stratified K-Fold

In [50]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Assuming X and y are already defined and preprocessed

# Performing PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Defining the number of folds for stratified k-fold cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

# Initializing the linear SVM classifier
svm = SVC(kernel='linear', random_state=42)
svmC = SVC(kernel='linear', random_state=42, C=12)
Gsvm = SVC(kernel='rbf', random_state=42, C=12)

accuracy_scores_svm = []
accuracy_scores_Gsvm = []
accuracy_scores_svmC = []

# Performing stratified k-fold cross-validation
for train_index, test_index in skf.split(X_pca, y):
    X_train, X_test = X_pca[train_index], X_pca[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Linear SVM
    svm.fit(X_train, y_train)
    y_pred_svm = svm.predict(X_test)
    accuracy_svm = accuracy_score(y_test, y_pred_svm)
    accuracy_scores_svm.append(accuracy_svm)
    
    # Linear SVM with Regularisation
    svmC.fit(X_train, y_train)
    y_pred_svmC = svmC.predict(X_test)
    accuracy_svmC = accuracy_score(y_test, y_pred_svmC)
    accuracy_scores_svmC.append(accuracy_svmC)
    
    #Gaussian SVM
    Gsvm.fit(X_train, y_train)
    y_pred_Gsvm = Gsvm.predict(X_test)
    accuracy_Gsvm = accuracy_score(y_test, y_pred_Gsvm)
    accuracy_scores_Gsvm.append(accuracy_Gsvm)
    

# Printing the accuracy scores for each fold
print("Linear SVM accuracy scores for each fold:", accuracy_scores_svm)
print("Gaussian SVM accuracy scores for each fold:", accuracy_scores_Gsvm)
print("Linear SVM with Regularisation accuracy scores for each fold:", accuracy_scores_svmC)

# Calculating the average accuracy score
average_accuracy_svm = np.mean(accuracy_scores_svm)
print("Average accuracy score for Linear SVM:", average_accuracy_svm)
average_accuracy_Gsvm = np.mean(accuracy_scores_Gsvm)
print("Average accuracy score for Gaussian SVM:", average_accuracy_Gsvm)
average_accuracy_svmC = np.mean(accuracy_scores_svmC)
print("Average accuracy score for Linear SVM with regularisation:", average_accuracy_svmC)

Linear SVM accuracy scores for each fold: [0.9298245614035088, 0.9298245614035088, 0.9649122807017544, 0.8421052631578947, 0.9298245614035088, 0.8771929824561403, 0.9473684210526315, 0.9824561403508771, 0.9473684210526315, 0.9464285714285714]
Gaussian SVM accuracy scores for each fold: [0.9298245614035088, 0.9122807017543859, 0.9649122807017544, 0.8421052631578947, 0.9298245614035088, 0.8771929824561403, 0.9473684210526315, 0.9824561403508771, 0.9473684210526315, 0.9285714285714286]
Linear SVM with Regularisation accuracy scores for each fold: [0.9298245614035088, 0.9298245614035088, 0.9649122807017544, 0.8421052631578947, 0.9298245614035088, 0.8771929824561403, 0.9649122807017544, 0.9824561403508771, 0.9473684210526315, 0.9464285714285714]
Average accuracy score for Linear SVM: 0.9297305764411027
Average accuracy score for Gaussian SVM: 0.9261904761904761
Average accuracy score for Linear SVM with regularisation: 0.931484962406015


Advantages:

SVM Classifiers offer good accuracy and perform faster prediction  They also use less memory because they use a subset of training points in the decision phase. SVM works well with a clear margin of separation and with high dimensional space.

Disadvantages:

SVM is not suitable for large datasets because of its high training time and it also takes more time in training compared to Naïve Bayes. It works poorly with overlapping classes and is also sensitive to the type of kernel used.

### Why did we add a regularization parameter (C=)?

Here C is the penalty parameter, which represents misclassification or error term. The misclassification or error term tells the SVM optimization how much error is bearable. This is how you can control the trade-off between decision boundary and misclassification term. A smaller value of C creates a small-margin hyperplane and a larger value of C creates a larger-margin hyperplane.


Through trial and error, we have found that C=12 gives the highest prediction accuracy.
