## Analisis Klasifikasi SVM 10 Fold Cross Validation

Memproses data yang sudah bersih (di preprocessing) :

In [1]:
import pandas as pd

df = pd.read_csv('Data/CleanDatasetStopwordStemmer.csv', header=0, names=['ID','label','tweet'])
df.head()

Unnamed: 0,ID,label,tweet
0,1,terlambat,posindonesia kau sungguh sangat lot sekali cum...
1,2,terlambat,min resi sampai kapan yah koo lama bgt ga sampaj
2,3,terlambat,blm pernah pake next day telat minggu koq pali...
3,4,terlambat,barang ambil aja gak takut lama klu anter gpp ...
4,5,terlambat,paket no blm jg padahal pake pos express udah ...


## Proses Klasifikasi dengan SVM

Mengimport library yang diperlukan untuk klasifikasi dengan metode SVM, fold yang disarankan adalah 10 fold yaitu membagi data jadi 10 partisi yang bergantian menjadi data uji dan training, bagus untuk data berjumlah kecil, tapi akan diuji untuk data ini agar semua data bisa digunakan sbg training dan testing.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics
# from sklearn.multiclass import OneVsRestClassifier
#SVM
from sklearn import svm
from sklearn.svm import LinearSVC
import time

# 10-fold cross validation (Linear)

In [3]:
from sklearn.model_selection import cross_val_score

start_time = time.time()

df=pd.read_csv('data/CleanDatasetStopwordStemmer.csv',sep=',', header=0,names=['id','label','tweet',],encoding="ISO-8859-1")

df.loc[df["label"]=='terlambat',"label"]=0
df.loc[df["label"]=='sistem',"label"]=1
df.loc[df["label"]=='gagal',"label"]=2
df.loc[df["label"]=='jaminan',"label"]=3
df.loc[df["label"]=='pelayanan',"label"]=4
df.loc[df["label"]=='respon',"label"]=5

df_x=df["tweet"]
df_y=df["label"]

cv=CountVectorizer()

x_traincv=cv.fit_transform(df_x)
z=x_traincv.toarray()
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(df_x)
idf = vectorizer._tfidf.idf_
cv1=TfidfVectorizer(min_df=1,stop_words=[""])
x_traincv1 = cv1.fit_transform(df_x)
a=x_traincv1.toarray() 

metode = svm.SVC(kernel='linear', C=1) 

y_train =df_y.astype('int')
metode.fit(x_traincv1,y_train)
scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
print(scores)

[ 0.69369369  0.77927928  0.78828829  0.82882883  0.83783784  0.88738739
  0.81981982  0.84234234  0.80630631  0.84234234]


In [5]:
print(scores.mean())

0.812612612613


# 10-fold cross validation (RBF)

In [14]:
from sklearn.model_selection import cross_val_score

start_time = time.time()

df=pd.read_csv('data/CleanDatasetStopwordStemmer.csv',sep=',', header=0,names=['id','label','tweet',],encoding="ISO-8859-1")

df.loc[df["label"]=='terlambat',"label"]=0
df.loc[df["label"]=='sistem',"label"]=1
df.loc[df["label"]=='gagal',"label"]=2
df.loc[df["label"]=='jaminan',"label"]=3
df.loc[df["label"]=='pelayanan',"label"]=4
df.loc[df["label"]=='respon',"label"]=5

df_x=df["tweet"]
df_y=df["label"]

cv=CountVectorizer()

x_traincv=cv.fit_transform(df_x)
z=x_traincv.toarray()
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(df_x)
idf = vectorizer._tfidf.idf_
cv1=TfidfVectorizer(min_df=1,stop_words=[""])
x_traincv1 = cv1.fit_transform(df_x)
a=x_traincv1.toarray() 

# Kernel RBF
metode = svm.SVC(kernel='rbf', C=10, gamma=1) 
y_train =df_y.astype('int')
metode.fit(x_traincv1,y_train)

scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
print(scores)

[ 0.67117117  0.77027027  0.78828829  0.83783784  0.85585586  0.88288288
  0.86486486  0.84684685  0.79279279  0.83333333]


In [15]:
print(scores.mean())

0.814414414414


# 10-fold cross validation (Polynomial)

In [4]:
from sklearn.model_selection import cross_val_score

start_time = time.time()

df=pd.read_csv('data/CleanDatasetStopwordStemmer.csv',sep=',', header=0,names=['id','label','tweet',],encoding="ISO-8859-1")

df.loc[df["label"]=='terlambat',"label"]=0
df.loc[df["label"]=='sistem',"label"]=1
df.loc[df["label"]=='gagal',"label"]=2
df.loc[df["label"]=='jaminan',"label"]=3
df.loc[df["label"]=='pelayanan',"label"]=4
df.loc[df["label"]=='respon',"label"]=5

df_x=df["tweet"]
df_y=df["label"]

cv=CountVectorizer()

x_traincv=cv.fit_transform(df_x)
z=x_traincv.toarray()
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(df_x)
idf = vectorizer._tfidf.idf_
cv1=TfidfVectorizer(min_df=1,stop_words=[""])
x_traincv1 = cv1.fit_transform(df_x)
a=x_traincv1.toarray() 

# Kernel Poly
metode = svm.SVC(kernel='poly', degree=5) 
y_train =df_y.astype('int')
metode.fit(x_traincv1,y_train)

scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
print(scores)

[ 0.56756757  0.65315315  0.64414414  0.67117117  0.67567568  0.77477477
  0.64864865  0.68918919  0.63063063  0.75675676]


In [5]:
print(scores.mean())

0.671171171171


## Range nilai C, gamma, degree dengan cross validation 10 fold

## Linear parameter C

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
svm_model= SVC()

In [10]:
C_range=list(range(1,26))
acc_score=[]
for c in C_range:
    metode = svm.SVC(kernel='linear', C=c)
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)

[0.81261261261261253, 0.81396396396396398, 0.8112612612612613, 0.80675675675675684, 0.80945945945945952, 0.80900900900900896, 0.80945945945945952, 0.80900900900900896, 0.80900900900900896, 0.80855855855855852, 0.80855855855855852, 0.80855855855855852, 0.80810810810810807, 0.80765765765765773, 0.80810810810810829, 0.80810810810810829, 0.80855855855855852, 0.80855855855855852, 0.80855855855855852, 0.80810810810810829, 0.80810810810810829, 0.80810810810810829, 0.80810810810810829, 0.80810810810810829, 0.80810810810810829]


In [13]:
highest = max(acc_score)
print("Tertinggi : ",highest)

('Tertinggi : ', 0.81396396396396398)


In [8]:
C_range=[0.001,0.01,0.1,1,10,100,1000]
acc_score=[]
for c in C_range:
    metode = svm.SVC(kernel='linear', C=c)
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)

[0.44909909909909917, 0.44909909909909917, 0.72927927927927938, 0.81261261261261253, 0.80855855855855852, 0.80720720720720729, 0.80720720720720729]


In [9]:
highest = max(acc_score)
print("Tertinggi : ",highest)

('Tertinggi : ', 0.81261261261261253)


## Polynomial c,d, dan gamma

In [14]:
degree=[2,3,4,5,6]
acc_score=[]
for d in degree:
    metode = svm.SVC(kernel='poly', degree=d)
    scores = cross_val_score(metode, x_traincv1, y_train,  scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)   

[0.27521345105924172, 0.29761488475099807, 0.19690678089576594, 0.62758618177579628, 0.60821531602412804]


In [15]:
C_range=[0.001,0.01,0.1,1,10,100,1000]
acc_score=[]
for c in C_range:
    metode = svm.SVC(kernel='poly', C=c)
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)

[0.37297297297297299, 0.36036036036036034, 0.36126126126126124, 0.36081081081081079, 0.36081081081081079, 0.36081081081081079, 0.36081081081081079]


In [17]:
gamma_range=[0.0001,0.001,0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,2,3,4,5,6,7,8,10,100,1000]
acc_score=[]
for g in gamma_range:
    metode = svm.SVC(kernel='poly', gamma=g)
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)   

[0.3603603603603604, 0.36081081081081079, 0.36081081081081079, 0.36081081081081079, 0.36081081081081079, 0.36081081081081079, 0.36081081081081079, 0.36081081081081079, 0.36396396396396391, 0.37972972972972974, 0.38378378378378375, 0.4635135135135135, 0.65405405405405403, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153, 0.67162162162162153]


In [9]:
C_range=[0.001,0.01,0.1,1,10,100,1000]
acc_score=[]
for c in C_range:
    metode = svm.SVC(kernel='poly', C=c, degree=5, )
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)

[0.67117117117117109, 0.67117117117117109, 0.67117117117117109, 0.67117117117117109, 0.67117117117117109, 0.20765765765765765, 0.17882882882882883]


In [10]:
C_range=[0.001,0.01,0.1,1,10,100,1000]
acc_score=[]
for c in C_range:
    metode = svm.SVC(kernel='poly', C=c, gamma='auto', degree=5, )
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)

[0.67117117117117109, 0.67117117117117109, 0.67117117117117109, 0.67117117117117109, 0.67117117117117109, 0.20765765765765765, 0.17882882882882883]


In [11]:
highest = max(acc_score)
print("Tertinggi : ",highest)

('Tertinggi : ', 0.67117117117117109)


## RBF c dan gamma

In [23]:
gamma_range=[0.0001,0.001,0.01,0.1,1,10,100,1000]
acc_score=[]
for g in gamma_range:
    metode = svm.SVC(kernel='rbf', gamma=g)
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)    

[0.44909909909909917, 0.44909909909909917, 0.44864864864864867, 0.77252252252252251, 0.80045045045045049, 0.17972972972972973, 0.16936936936936936, 0.1707207207207207]


In [25]:
C_range=[0.001,0.01,0.1,1,10,100,1000]
acc_score=[]
for c in C_range:
    metode = svm.SVC(kernel='rbf', C=c)
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)

[0.44909909909909917, 0.44909909909909917, 0.44909909909909917, 0.44909909909909917, 0.44909909909909917, 0.53198198198198199, 0.80585585585585595]


In [12]:
C_range=[0.001,0.01,0.1,1,10,100,1000]
acc_score=[]
for c in C_range:
    metode = svm.SVC(kernel='rbf', C=c, gamma=1)
    scores = cross_val_score(metode, x_traincv1, y_train, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)

[0.37522522522522522, 0.37522522522522522, 0.40225225225225225, 0.80045045045045049, 0.81441441441441442, 0.81441441441441442, 0.81441441441441442]


In [13]:
highest = max(acc_score)
print("Tertinggi : ",highest)

('Tertinggi : ', 0.81441441441441442)


## GRIDSEARCH CV 10

Metode Grid Search untuk menemukan parameter dari kernel dengan akurasi terbaik

In [27]:
from sklearn.svm import SVC
svm_model= SVC()

In [28]:
tuned_parameters = {
 'C': (np.arange(0.1,1,0.1)) , 'kernel': ['linear'],
 'C': (np.arange(0.1,1,0.1)) , 'gamma': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1], 'kernel': ['rbf']
                   }

In [30]:
from sklearn.model_selection import GridSearchCV

model_svm = GridSearchCV(svm_model, tuned_parameters,cv=10,scoring='accuracy')

In [33]:
model_svm.fit( x_traincv1, y_train)
print(model_svm.best_score_)

0.800900900901


In [34]:
print(model_svm.grid_scores_)

[mean: 0.44144, std: 0.09014, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.1}, mean: 0.43694, std: 0.08679, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.2}, mean: 0.44279, std: 0.09245, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.3}, mean: 0.44009, std: 0.09445, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.4}, mean: 0.44009, std: 0.09097, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.5}, mean: 0.43604, std: 0.09227, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.6}, mean: 0.42523, std: 0.08728, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.7}, mean: 0.41712, std: 0.08606, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.8}, mean: 0.41126, std: 0.08599, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.9}, mean: 0.40225, std: 0.08578, params: {'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 1}, mean: 0.45405, std: 0.09284, params: {'ke



In [32]:
print(model_svm.best_params_)

{'kernel': 'rbf', 'C': 0.90000000000000002, 'gamma': 0.6}
