# SVM

In [1]:
import pandas
cars = pandas.read_csv('automobile.csv')

In [2]:
variables = ['bore', 'city_mpg', 'compression_ratio', 'curb_weight', 'engine_size',
             'horsepower', 'peak_rpm', 'city_mpg', 'price']
X = cars[variables]
y = cars['doors']

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [4]:
from sklearn.svm import SVC
from sklearn import metrics

### Linear SVM

In [5]:
svc = SVC(kernel='linear')

In [6]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
y_svc = svc.predict(X_test)

In [8]:
metrics.confusion_matrix(y_test, y_svc)

array([[29,  9],
       [12, 14]])

In [9]:
metrics.accuracy_score(y_test, y_svc)

0.671875

In [10]:
metrics.precision_score(y_test, y_svc, pos_label='four')

0.70731707317073167

In [11]:
metrics.recall_score(y_test, y_svc, pos_label='four')

0.76315789473684215

In [12]:
metrics.f1_score(y_test, y_svc, pos_label='four')

0.73417721518987333

#### 패널티 조정

In [13]:
svc2 = SVC(kernel='linear', C=0.1)
svc2.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
y_svc2 = svc2.predict(X_test)

In [15]:
metrics.confusion_matrix(y_test, y_svc2)

array([[31,  7],
       [12, 14]])

In [16]:
metrics.accuracy_score(y_test, y_svc2)

0.703125

In [17]:
metrics.precision_score(y_test, y_svc2, pos_label='four')

0.72093023255813948

In [18]:
metrics.recall_score(y_test, y_svc2, pos_label='four')

0.81578947368421051

In [19]:
metrics.f1_score(y_test, y_svc2, pos_label='four')

0.76543209876543195

### 전체 과정을 함수로 만들기

In [20]:
def run_svm_model(kernel, penalty):
    model = SVC(kernel=kernel, C=penalty)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label='four')))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label='four')))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label='four')))
    return model

In [21]:
svc3 = run_svm_model('linear', 10)

confusion matrix
[[28 10]
 [12 14]]
accuracy : 0.65625
precision : 0.7
recall : 0.7368421052631579
F1 : 0.717948717948718


In [22]:
svc4 = run_svm_model('linear', 0.01)

confusion matrix
[[29  9]
 [11 15]]
accuracy : 0.6875
precision : 0.725
recall : 0.7631578947368421
F1 : 0.7435897435897436


### SVM + RBF 커널

In [23]:
def run_svc_model(kernel, penalty, gamma='auto'):
    model = SVC(kernel=kernel, C=penalty, gamma=gamma)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label='four')))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label='four')))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label='four')))
    return model

In [24]:
svc_rbf1 = run_svc_model('rbf', 1)

confusion matrix
[[38  0]
 [26  0]]
accuracy : 0.59375
precision : 0.59375
recall : 1.0
F1 : 0.7450980392156863


In [25]:
svc_rbf2 = run_svc_model('rbf', 0.01, gamma=0.001)

confusion matrix
[[38  0]
 [26  0]]
accuracy : 0.59375
precision : 0.59375
recall : 1.0
F1 : 0.7450980392156863


### Custom Kernel

In [26]:
cate_var = cars.columns[cars.dtypes == 'object']
cate_var

Index(['maker', 'fuel', 'aspiration', 'doors', 'body', 'wheels',
       'engine_location', 'engine_type', 'cylinders', 'fuel_system'],
      dtype='object')

In [27]:
data = cars[cate_var]
data.head()

Unnamed: 0,maker,fuel,aspiration,doors,body,wheels,engine_location,engine_type,cylinders,fuel_system
0,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
1,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
2,audi,gas,std,four,sedan,fwd,front,ohc,five,mpfi
3,audi,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi
4,bmw,gas,std,two,sedan,rwd,front,ohc,four,mpfi


In [28]:
X = cars[cate_var.difference(['doors'])]
y = cars['doors']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
row_train, _ = X_train.shape
row_train

95

In [40]:
row_test, _ = X_test.shape
row_test

64

In [29]:
# 커널 만들기

In [30]:
# 두 데이터 포인트의 9가지 변수 중에 같은 변수의 갯수를 센다
same = 0
for a, b in zip(data.ix[0,:], data.ix[1,:]):
    if a == b:
        same = same + 1
same

8

In [31]:
def hand_made_kernel(d1, d2):
    same = 0
    for a, b in zip(d1, d2):
        if a == b:
            same = same + 1
    return same

In [33]:
# 커널 적용

In [35]:
import numpy
P_train = numpy.zeros((row_train, row_train))
P_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [36]:
for i in range(row_train):
    for j in range(row_train):
        P_train[i, j] = hand_made_kernel(X_train.iloc[i,:], X_train.iloc[j,:])

In [37]:
P_train

array([[ 9.,  5.,  3., ...,  4.,  6.,  4.],
       [ 5.,  9.,  4., ...,  4.,  6.,  3.],
       [ 3.,  4.,  9., ...,  4.,  5.,  3.],
       ..., 
       [ 4.,  4.,  4., ...,  9.,  6.,  3.],
       [ 6.,  6.,  5., ...,  6.,  9.,  4.],
       [ 4.,  3.,  3., ...,  3.,  4.,  9.]])

In [None]:
# modeling

In [44]:
model = SVC(kernel='precomputed', C=1)
model.fit(P_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto',
  kernel='precomputed', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [45]:
P_test = numpy.zeros((row_test, row_train))
for i in range(row_test):
    for j in range(row_train):
        P_test[i, j] = hand_made_kernel(X_test.iloc[i,:], X_train.iloc[j,:])
P_test

array([[ 5.,  6.,  7., ...,  5.,  7.,  5.],
       [ 4.,  3.,  4., ...,  4.,  4.,  6.],
       [ 5.,  5.,  5., ...,  4.,  6.,  6.],
       ..., 
       [ 7.,  5.,  4., ...,  5.,  7.,  4.],
       [ 6.,  5.,  4., ...,  5.,  7.,  5.],
       [ 7.,  5.,  4., ...,  5.,  7.,  4.]])

In [47]:
y_c_pred = model.predict(P_test)

In [48]:
print('confusion matrix')
print(metrics.confusion_matrix(y_test, y_c_pred))
print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_c_pred)))
print('precision : {}'.format(metrics.precision_score(y_test, y_c_pred, pos_label='four')))
print('recall : {}'.format(metrics.recall_score(y_test, y_c_pred, pos_label='four')))
print('F1 : {}'.format(metrics.f1_score(y_test, y_c_pred, pos_label='four')))

confusion matrix
[[33  5]
 [ 5 21]]
accuracy : 0.84375
precision : 0.868421052631579
recall : 0.868421052631579
F1 : 0.868421052631579
