# SVM

In [1]:
import pandas
cars = pandas.read_csv('automobile.csv')

In [2]:
variables = ['bore', 'city_mpg', 'compression_ratio', 'curb_weight', 'engine_size',
             'horsepower', 'peak_rpm', 'city_mpg', 'price']
X = cars[variables]
y = cars['doors']

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [4]:
from sklearn.svm import SVC
from sklearn import metrics

### Linear SVM

In [5]:
svc = SVC(kernel='linear')

In [6]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
y_svc = svc.predict(X_test)

In [9]:
metrics.confusion_matrix(y_test, y_svc)

array([[29, 10],
       [ 7, 18]])

In [10]:
metrics.accuracy_score(y_test, y_svc)

0.734375

In [11]:
metrics.precision_score(y_test, y_svc, pos_label='four')

0.80555555555555558

In [12]:
metrics.recall_score(y_test, y_svc, pos_label='four')

0.74358974358974361

In [13]:
metrics.f1_score(y_test, y_svc, pos_label='four')

0.77333333333333343

#### 패널티 조정

In [14]:
svc2 = SVC(kernel='linear', C=0.1)
svc2.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
y_svc2 = svc2.predict(X_test)

In [16]:
metrics.confusion_matrix(y_test, y_svc2)

array([[30,  9],
       [ 8, 17]])

In [17]:
metrics.accuracy_score(y_test, y_svc2)

0.734375

In [18]:
metrics.precision_score(y_test, y_svc2, pos_label='four')

0.78947368421052633

In [19]:
metrics.recall_score(y_test, y_svc2, pos_label='four')

0.76923076923076927

In [20]:
metrics.f1_score(y_test, y_svc2, pos_label='four')

0.77922077922077926

### 전체 과정을 함수로 만들기

In [24]:
def run_svm_model(kernel, penalty):
    model = SVC(kernel=kernel, C=penalty)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label='four')))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label='four')))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label='four')))
    return model

In [25]:
svc3 = run_svm_model('linear', 10)

confusion matrix
[[28 11]
 [ 7 18]]
accuracy : 0.71875
precision : 0.8
recall : 0.717948717948718
F1 : 0.7567567567567569


In [27]:
svc4 = run_svm_model('linear', 0.01)

confusion matrix
[[27 12]
 [ 5 20]]
accuracy : 0.734375
precision : 0.84375
recall : 0.6923076923076923
F1 : 0.7605633802816902


### SVM + RBF 커널

In [28]:
def run_svc_model(kernel, penalty, gamma='auto'):
    model = SVC(kernel=kernel, C=penalty, gamma=gamma)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label='four')))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label='four')))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label='four')))
    return model

In [29]:
svc_rbf1 = run_svc_model('rbf', 1)

confusion matrix
[[39  0]
 [25  0]]
accuracy : 0.609375
precision : 0.609375
recall : 1.0
F1 : 0.7572815533980582


In [32]:
svc_rbf2 = run_svc_model('rbf', 0.01, gamma=0.001)

confusion matrix
[[39  0]
 [25  0]]
accuracy : 0.609375
precision : 0.609375
recall : 1.0
F1 : 0.7572815533980582


### Custom Kernel

In [34]:
cate_var = cars.columns[cars.dtypes == 'object']
cate_var

Index(['maker', 'fuel', 'aspiration', 'doors', 'body', 'wheels',
       'engine_location', 'engine_type', 'cylinders', 'fuel_system'],
      dtype='object')

In [35]:
data = cars[cate_var]
data.head()

Unnamed: 0,maker,fuel,aspiration,doors,body,wheels,engine_location,engine_type,cylinders,fuel_system
0,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
1,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
2,audi,gas,std,four,sedan,fwd,front,ohc,five,mpfi
3,audi,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi
4,bmw,gas,std,two,sedan,rwd,front,ohc,four,mpfi


In [39]:
X = cars[cate_var.difference(['doors'])]
y = cars['doors']

In [40]:
# 커널 만들기

In [43]:
# 두 데이터 포인트의 9가지 변수 중에 같은 변수의 갯수를 센다
same = 0
for a, b in zip(data.ix[0,:], data.ix[1,:]):
    if a == b:
        same = same + 1
same

8

In [44]:
def hand_made_kernel(d1, d2):
    same = 0
    for a, b in zip(d1, d2):
        if a == b:
            same = same + 1
    return same