# 3/4/22 **Support Vector Machine**

SVMs are supervised learning models, with associated learning algorithms, that analyze data for classification and regression.

The main goal of an SVM is to define a hyperplane that separates the points into two different classes. SVM differs from other classification algorithms in that it chooses the decision boundary that maximizes the distance to the nearest data points of all the classes.

https://jakevdp.github.io/PythonDataScienceHandbook/06.00-figure-code.html#Classification-Example-Figure-2




## **Import package**

In [9]:
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve,roc_auc_score

from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## **Plot clustering points**

In [None]:
# NOTE(review): in the visible notebook X is first assigned in the next cell
# (make_blobs), so on a fresh kernel this line raises NameError unless that
# cell has been run first.
X.shape

In [None]:
# make_blobs generates the synthetic clustered samples
from sklearn.datasets import make_blobs

# X: 500 samples with 2 features each, drawn around 3 cluster centres;
# Y: the cluster index of every sample. random_state pins the draw.
X, Y = make_blobs(
    n_samples=500,
    centers=3,
    n_features=2,
    random_state=0,
    cluster_std=0.40,
)

# X.T unpacks into the two feature columns (x then y coordinate)
plt.scatter(*X.T, c=Y, s=50, cmap='spring')
# alternative colour map: cmap='BuPu'
plt.show()

## **What is SVM?** 

In [19]:
def format_plot(ax, title):
    """Hide the tick labels of ``ax`` and apply gray axis labels and title.

    Parameters
    ----------
    ax : axes-like object
        Must expose ``xaxis``/``yaxis`` plus ``set_xlabel``, ``set_ylabel``
        and ``set_title``.
    title : str
        Text forwarded to ``ax.set_title``.
    """
    # A NullFormatter renders every major tick label as an empty string
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter(plt.NullFormatter())

    gray = {'color': 'gray'}
    ax.set_xlabel('feature 1', **gray)
    ax.set_ylabel('feature 2', **gray)
    ax.set_title(title, **gray)

In [20]:
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

# Draw 500 points around two cluster centres; the fixed random_state makes
# the draw repeatable
X, y = make_blobs(
    n_samples=500,
    centers=2,
    random_state=0,
    cluster_std=0.60,
)

# Linear-kernel support vector classifier trained on every sampled point
clf = SVC(kernel='linear')
clf.fit(X, y)



SVC(kernel='linear')

In [None]:
# Scatter the training points, coloured by their class label
fig, ax = plt.subplots(figsize=(8, 6))
point_style = {'cmap': 'Paired', 's': 50}  # reused by the later scatter plots
ax.scatter(*X.T, c=y, **point_style)

# Hide tick labels, set the title, and pin the view to [xmin, xmax, ymin, ymax]
format_plot(ax, 'Input Data')
ax.axis([-1, 4, -2, 7])

#fig.savefig('figures/05.01-classification-1.png')

In [None]:
## Get contours describing the model
# Evaluate the decision function on a 10x10 grid covering the plotted window
xx = np.linspace(-1, 4, 10)
yy = np.linspace(-2, 7, 10)
xy1, xy2 = np.meshgrid(xx, yy)
# Score all grid points in a single vectorised call (one row per grid point,
# in the same row-major order as xy1.flat / xy2.flat) instead of calling
# decision_function once per point.
grid_points = np.column_stack([xy1.ravel(), xy2.ravel()])
Z = clf.decision_function(grid_points).reshape(xy1.shape)

# plot points and model
fig, ax = plt.subplots(figsize=(8, 6))
# Contour levels: 0 is the decision boundary (solid), -1 / +1 are drawn dashed
line_style = dict(levels = [-1.0, 0.0, 1.0],
                  linestyles = ['dashed', 'solid', 'dashed'],
                  colors = 'gray', linewidths=1)
ax.scatter(X[:, 0], X[:, 1], c=y, **point_style)
ax.contour(xy1, xy2, Z, **line_style)

# format plot
format_plot(ax, 'Model Learned from Input Data')
ax.axis([-1, 4, -2, 7])

#fig.savefig('figures/05.01-classification-2.png')

In [23]:
# Generate 80 fresh blob points and keep only those from index 50 onward
# (the last 30) as the unseen points to classify
new_points, _ = make_blobs(
    n_samples=80,
    centers=2,
    random_state=0,
    cluster_std=0.80,
)
X2 = new_points[50:]

# Labels assigned to the new points by the fitted classifier
y2 = clf.predict(X2)


In [None]:
## Get contours describing the model
# Evaluate the decision function on a 10x10 grid covering the plotted window
xx = np.linspace(-1, 4, 10)
yy = np.linspace(-2, 7, 10)
xy1, xy2 = np.meshgrid(xx, yy)
# Score all grid points in a single vectorised call (one row per grid point,
# in the same row-major order as xy1.flat / xy2.flat) instead of calling
# decision_function once per point.
grid_points = np.column_stack([xy1.ravel(), xy2.ravel()])
Z = clf.decision_function(grid_points).reshape(xy1.shape)

# plot points and model
fig, ax = plt.subplots(figsize=(8, 6))
# Contour levels: 0 is the decision boundary (solid), -1 / +1 are drawn dashed
line_style = dict(levels = [-1.0, 0.0, 1.0],
                  linestyles = ['dashed', 'solid', 'dashed'],
                  colors = 'gray', linewidths=1)
ax.scatter(X[:, 0], X[:, 1], c=y, **point_style)
ax.contour(xy1, xy2, Z, **line_style)
# Overlay the new points, coloured by the label the classifier predicted
ax.scatter(X2[:, 0], X2[:, 1], c=y2, **point_style)

# format plot
format_plot(ax, 'Model Learned from Input Data')
ax.axis([-1, 4, -2, 7])

#fig.savefig('figures/05.01-classification-2.png')

## **Application of SVM**

In [None]:
# Load seaborn's 'iris' example dataset and preview its first five rows
df = sns.load_dataset(name='iris')
df.head(n=5)

In [26]:
# Target / response variable y: the species column
target = df['species']

# Attributes / predictors X: every remaining column.
# drop() returns a new frame, so df itself is left untouched.
df1 = df.drop(columns=['species'])
X = df1

In [None]:
y = target
# Hold out 20% of the rows for testing (80:20 split); random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shape of each predictor split
for split_label, split_frame in (("Training split input- ", X_train),
                                 ("Testing split input- ", X_test)):
    print(split_label, split_frame.shape)

In [None]:
# Other kernels to try (swap one in for the active line below):
#   SVC(kernel='rbf')              Gaussian kernel
#   SVC(kernel='poly', degree=8)   polynomial kernel
#   SVC(kernel='sigmoid')          sigmoid kernel
svclassifier = SVC(kernel='linear')  # linear kernel
svclassifier.fit(X_train, y_train)

In [None]:
# Predict the held-out rows, then tabulate true labels against predictions
y_pred = svclassifier.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
#print(classification_report(y_test,y_pred))

## **LinearSVC**
Similar to SVC with parameter `kernel='linear'`, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

In [None]:

# Sweep 100 values of the regularisation parameter C over [0.05, 3] and fit
# one LinearSVC per value. random_state is pinned so that repeated runs of
# this cell select the same model.
# NOTE(review): the "best" C is picked by its score on the test split, so the
# reported test accuracy is optimistic; consider choosing C by
# cross-validation on the training split instead.
c_seed = np.linspace(.05, 3, num=100)
lvc_df = pd.DataFrame(c_seed, columns=['c'])
lvc_df['model'] = lvc_df['c'].apply(
    lambda c: LinearSVC(C=c, random_state=0).fit(X_train, y_train))
lvc_df['score'] = lvc_df['model'].apply(lambda model: model.score(X_test, y_test))

# idxmax gives the first row holding the highest score; this also avoids
# binding a variable named `filter`, which would shadow the builtin.
best_row = lvc_df['score'].idxmax()
bestmodel = lvc_df.loc[best_row, 'model']
train_score = bestmodel.score(X_train, y_train)
test_score = bestmodel.score(X_test, y_test)
bestmodel, train_score, test_score


In [None]:
# Confusion matrix of the selected LinearSVC on the held-out rows
y_pred = bestmodel.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))


In [None]:
# Confusion matrix of the selected LinearSVC over all of X and y (training
# and test rows together, so the accuracy shown is not a held-out estimate).
y_pred = bestmodel.predict(X)
cnf_matrix = confusion_matrix(y, y_pred)

plt.figure(figsize=(5, 5))
sns.heatmap(data=cnf_matrix, linewidths=.5, annot=True, square=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Round to two decimals, matching the heatmap title used for the second dataset
all_sample_title = 'Accuracy Score: {0}'.format(round(bestmodel.score(X, y), 2))
plt.title(all_sample_title, size=15)
plt.show()

## **Second Data Application**

In [43]:
# Column names to assign, since the CSV is read without a header row
col_names = [
    'pregnant', 'glucose', 'bp', 'skin', 'insulin',
    'bmi', 'pedigree', 'age', 'label',
]
# load dataset from the remote copy; a local file would be read the same way:
#dat = pd.read_csv("pima-indians-diabetes.csv", header=None, names=col_names)
filein = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
dataset = pd.read_csv(filein, names=col_names, header=None)

In [None]:
# Split the dataset into features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = dataset[feature_cols] # Features
y = dataset.label # Target variable
# 70% training and 30% test; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Exactly one kernel is active at a time. The alternatives stay commented out
# so no classifier is constructed only to be overwritten on the next line.
#svclassifier = SVC(kernel='rbf') #Gaussian Kernel
#svclassifier = SVC(kernel='poly', degree=8) #Polynomial Kernel
#svclassifier = SVC(kernel='linear') #Linear
svclassifier = SVC(kernel='sigmoid') #Sigmoid Kernel
svclassifier.fit(X_train, y_train)

# Confusion matrix on the held-out 30%; `cm` is reused by the heatmap cell
y_pred = svclassifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Title text: test-split accuracy of the current classifier, two decimals
test_accuracy = round(svclassifier.score(X_test, y_test), 2)
all_sample_title = 'Accuracy Score: {0}'.format(test_accuracy)

# Annotated heat-map of the confusion matrix `cm`
plt.figure(figsize=(5, 5))
sns.heatmap(data=cm, linewidths=.5, annot=True, square=True, cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.title(all_sample_title, size=15)
plt.show()

In [None]:
# Fit one SVC per kernel on the training split and record its test accuracy.
# After the loop, svclassifier / y_pred / cm hold the values from the last
# kernel in the list.
models = ['linear', 'sigmoid', 'rbf']
test_scores = []
for kernel_name in models:
    svclassifier = SVC(kernel=kernel_name)  # kernel under evaluation
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    score = round(svclassifier.score(X_test, y_test), 3)
    test_scores.append(score)

test_scores

In [None]:
# Position and value of the highest test score; on ties the earliest entry
# wins, and the kernel name at that position is displayed.
max_index, max_value = max(enumerate(test_scores), key=lambda pair: pair[1])
models[max_index]

## **HW**
Plot the accuracy and the ROC curve for all models.

The true positive rate (TPR) is the proportion of observations that were correctly predicted to be positive out of all positive observations (TP/(TP + FN)). Similarly, the false positive rate (FPR) is the proportion of observations that are incorrectly predicted to be positive out of all negative observations (FP/(TN + FP)).

In a medical test, the TPR measures the proportion of all people who have the disease who are correctly identified as having it.
The FPR measures the proportion of all people who do not have the disease who are incorrectly identified as having it.

Final decision: a trade-off between TPR and 1 - FPR.
Which model is the best performer?

In [54]:
# Refit whichever classifier `svclassifier` currently refers to.
svclassifier.fit(X_train, y_train)

# ROC analysis needs a continuous score per sample rather than the predicted
# class labels, so the signed decision-function values are collected here.
y_train_pred, y_test_pred = (
    svclassifier.decision_function(split) for split in (X_train, X_test)
)

In [None]:
# ROC points (false/true positive rate per threshold) for both splits,
# computed from the decision scores
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
# Area under each ROC curve, shown in the legend
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test, y_test_pred)

print(svclassifier.score(X_test,y_test))

# Every curve carries a label so plt.legend() has entries to display
plt.plot(train_fpr, train_tpr, color='blue',
         label='train (AUC = {0:.3f})'.format(train_auc))
plt.plot(test_fpr, test_tpr, color='red',
         label='test (AUC = {0:.3f})'.format(test_auc))
plt.plot([0,1],[0,1],'g--', label='chance')
plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC(ROC curve)")
plt.grid(color='black', linestyle='-', linewidth=0.5)
plt.show()

In [None]:
# (test accuracy, training accuracy) of the current classifier
tuple(
    svclassifier.score(features, labels)
    for features, labels in ((X_test, y_test), (X_train, y_train))
)

(0.7662337662337663, 0.7635009310986964)