# **SVM Classifier on Wisconsin Breast Cancer Dataset**




In [0]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [0]:
# Load the dataset from the working directory; row 0 of the file supplies the column names.
data = pd.read_csv(filepath_or_buffer="Breast_Cancer.csv", header=0)

In [0]:
# Preview the first ten rows to get a feel for the columns and their values.
data.head(n=10)

# **We wish to predict the diagnosis, so get rid of the unnamed and id columns**

In [0]:
# Drop the columns that are not used as features.
# Rebinding `data` (rather than inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: on a second execution the columns are
# already gone and the call is a no-op instead of raising KeyError.
data = data.drop(columns=["Unnamed: 32", "id"], errors="ignore")

# **Check if we have missing data**

In [0]:
# Count NaN entries per column, keep the tally as a one-column frame,
# and display it with the columns that have the most gaps first.
Missing_data = data.isna().sum().to_frame(name='Missing Values')
Missing_data.sort_values(by='Missing Values', ascending=False)

# **Convert Malignant Class 'M' to 1 and Benign Class 'B' to 0**
# **Visualize the class distribution**

In [0]:
# Encode the target: malignant 'M' -> 1, benign 'B' -> 0.
diagnosis_codes = {'M': 1, 'B': 0}

# Only map while the column still holds the letter labels. Re-running .map on
# already-encoded values would find no matching keys and silently turn the
# whole target column into NaN.
if data['diagnosis'].isin(list(diagnosis_codes)).any():
    data['diagnosis'] = data['diagnosis'].map(diagnosis_codes)

# Target vector used by the train/test split further down.
Output = data['diagnosis']


In [0]:
# Bar chart of how many samples fall into each diagnosis class.
plt.figure(figsize=(12,4), dpi=100)
# Pass the series as `x=` explicitly: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer
# interpreted as the variable to count. The keyword form works on old and new
# seaborn versions alike.
sns.countplot(x=data['diagnosis'], label="Count")


In [0]:
# Remove the target from the feature frame (it is already held in `Output`).
# errors="ignore" plus rebinding keeps the cell idempotent: re-running it after
# the column is gone is a no-op rather than a KeyError.
data = data.drop(columns=["diagnosis"], errors="ignore")


In [0]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    Output,
    test_size=0.25,
    random_state=2,
)

In [0]:
# Show (rows, columns) of the training split, then of the test split, one per line.
print(X_train.shape, X_test.shape, sep="\n")

# **Always scale the data for models that calculate weights**

In [0]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits so that no
# test-set statistics influence the scaling.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [0]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import metrics


## **Train SVM  Model**

In [0]:
# Baseline classifier: support vector machine with a linear kernel and a fixed seed.
model_svm = SVC(
    kernel='linear',
    random_state=2,
)


In [0]:
# Fit the baseline SVM on the scaled training features and their labels.
model_svm.fit(X=X_train, y=Y_train)


In [0]:
# Predict labels for the very samples the model was trained on (used for the training metrics).
prediction_svm_train = model_svm.predict(X=X_train)

# **Get training accuracy**

In [0]:
# Fraction of training samples classified correctly.
# Arguments follow the sklearn signature accuracy_score(y_true, y_pred), the
# same order used for confusion_matrix in this notebook. Accuracy is
# symmetric, so the value is unchanged; the order matters for readability and
# for anyone swapping in an asymmetric metric later.
metrics.accuracy_score(Y_train, prediction_svm_train)


In [0]:
# Training confusion matrix: rows are the true classes, columns the predicted classes.
cm_train = confusion_matrix(y_true=Y_train, y_pred=prediction_svm_train)
cm_train

# **Get Cross-Validation Accuracy (10-fold, on the training set)**

In [0]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the baseline model on the training split;
# `accu_svm` holds one score per fold.
accu_svm = cross_val_score(
    estimator=model_svm,
    X=X_train,
    y=Y_train,
    cv=10,
)
# Summarise the fold scores: central value and spread.
print("The mean accuracy is:", accu_svm.mean())
print("The standard deviation of the accuracy is:", accu_svm.std())

# **Get the testing confusion matrix**

In [0]:
# Predict on the held-out test split and tabulate true classes (rows)
# against predicted classes (columns).
prediction_svm = model_svm.predict(X_test)
cm_test = confusion_matrix(Y_test, prediction_svm)
# End the cell with the matrix so it is actually displayed; previously it was
# computed but never shown.
cm_test

In [0]:
# Accuracy of the baseline model on the held-out test split.
# Arguments are passed as (y_true, y_pred) to match the sklearn signature and
# the confusion_matrix call in this notebook; the printed value is unchanged
# because accuracy is symmetric in its two arguments.
print("The testing accuracy is:",metrics.accuracy_score(Y_test, prediction_svm))


# **Let's tune the C parameter (the trade-off parameter) of the SVM. Lower values give a larger margin width, and vice versa. Also tune the kernel.**

In [0]:
from sklearn.model_selection import GridSearchCV


In [0]:
# Search grid: every combination of the three C values with the three kernels.
# The candidate order is kept as-is because it determines which combination
# the grid search reports when several tie on score.
parameters_SVM = [
    {
        'C': [1, 10, 0.1],
        'kernel': ['linear', 'poly', 'rbf'],
    },
]


In [0]:
# Exhaustive search over `parameters_SVM`, scoring each candidate by its
# 5-fold cross-validated accuracy.
grid_search_svm = GridSearchCV(
    estimator=model_svm,
    param_grid=parameters_SVM,
    scoring='accuracy',
    cv=5,
)

In [0]:
# Run the grid search on the scaled training split.
grid_search_svm.fit(X=X_train, y=Y_train)

In [0]:
# best_score_ is the mean cross-validated score of the winning parameter combination.
print(
    "The best validation score achived was: ",
    grid_search_svm.best_score_,
)

In [0]:
# best_params_ is the parameter combination that produced best_score_.
print(
    "The above validation score was achieved under the parameters:",
    grid_search_svm.best_params_,
)

In [0]:
# Test-set predictions from the grid search object.
# NOTE: despite its name, `grid_search_svm_tuned` holds the predicted labels,
# not a model; the name is kept so any later cells that use it still work.
grid_search_svm_tuned = grid_search_svm.predict(X_test)
# accuracy_score takes (y_true, y_pred); pass them in that order for
# consistency with the sklearn signature. The value is the same either way.
print("Testing accuracy after tuning" ,metrics.accuracy_score(Y_test, grid_search_svm_tuned))