#### Importing Packages

In [32]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns 

#### Reading page-blocks.data into a dataframe

In [3]:
df = pd.read_csv('page-blocks.data', header = None, delim_whitespace=True)
df.columns = ['Height', 'Length', 'Area', 'Eccen', 'p_black', 'p_and', 'mean_tr', 'blackpix', 'blackand', 'wb_trans', 'Type']
df

Unnamed: 0,Height,Length,Area,Eccen,p_black,p_and,mean_tr,blackpix,blackand,wb_trans,Type
0,5,7,35,1.400,0.400,0.657,2.33,14,23,6,1
1,6,7,42,1.167,0.429,0.881,3.60,18,37,5,1
2,6,18,108,3.000,0.287,0.741,4.43,31,80,7,1
3,5,7,35,1.400,0.371,0.743,4.33,13,26,3,1
4,6,3,18,0.500,0.500,0.944,2.25,9,17,4,1
...,...,...,...,...,...,...,...,...,...,...,...
5468,4,524,2096,131.000,0.542,0.603,40.57,1136,1264,28,2
5469,7,4,28,0.571,0.714,0.929,10.00,20,26,2,1
5470,6,95,570,15.833,0.300,0.911,1.64,171,519,104,1
5471,7,41,287,5.857,0.213,0.801,1.36,61,230,45,1


#### Separating our data into features and classification

In [4]:
features = df.iloc[:, :-1]
target = df.iloc[:, 10]

#### Normalizing our data features

In [19]:
scaler = MinMaxScaler() 
df_feat = scaler.fit_transform(features)

#### Spliting our data into a training and testing data set and testing with C=1

In [25]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df_feat, np.ravel(target), test_size=0.3, random_state=109) 
clf = svm.SVC(kernel='linear', C=1)
clf.fit(Xtrain, ytrain)
clf.score(Xtest, ytest)

0.9232643118148599

#### With C=4 

In [26]:
clf = svm.SVC(kernel='linear', C=2**2)
clf.fit(Xtrain, ytrain)
clf.score(Xtest, ytest)

0.940925700365408

#### With C=8

In [27]:
clf = svm.SVC(kernel='linear', C=2**3)
clf.fit(Xtrain, ytrain)
clf.score(Xtest, ytest)

0.9457978075517661

#### With C=16

In [28]:
clf = svm.SVC(kernel='linear', C=2**4)
clf.fit(Xtrain, ytrain)
clf.score(Xtest, ytest)

0.9494518879415347

#### With C=32

In [29]:
clf = svm.SVC(kernel='linear', C=2**5)
clf.fit(Xtrain, ytrain)
clf.score(Xtest, ytest)

0.9555420219244823

#### Finding the right value for C can be tricky as it could be over classifying or under classifying

#### 10 fold Cross Validation 

In [51]:
kfold = KFold(n_splits=10, shuffle=True)
crossvalScore = []
for train, test in kfold.split(features, target):

    clf = svm.SVC(kernel='linear', C=2**2)

    Xtrain, Xtest = features.iloc[train], features.iloc[test]
    ytrain, ytest = target[train], target[test]

    sc = MinMaxScaler()
    Xtrain = sc.fit_transform(Xtrain)
    Xtest = sc.fit_transform(Xtest)

    model = clf.fit(Xtrain, ytrain)
    prediction = clf.predict(Xtest)
    crossvalScore.append(model.score(Xtest, ytest))
print("Score from each iteration: ")
for n, i in enumerate(crossvalScore):
    print('\t'*3,n+1,': ', i, sep='')

print("\nAverage K Fold Score: ", np.mean(crossvalScore))

Score from each iteration: 
			1: 0.9543795620437956
			2: 0.9543795620437956
			3: 0.9562043795620438
			4: 0.946983546617916
			5: 0.9177330895795247
			6: 0.9488117001828154
			7: 0.9488117001828154
			8: 0.923217550274223
			9: 0.9360146252285192
			10: 0.9561243144424132

Average K Fold Score:  0.9442660030157862
