In [161]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import model_selection # Library untuk proses prediksi
from sklearn.model_selection import KFold, RepeatedKFold # Library untuk penerapan KFold Validation
from sklearn.model_selection import cross_val_score # Library untuk menampilkan nilai pengujian Cross Validation (nilai return nya berupa array)
from sklearn.model_selection import cross_validate # ...
from sklearn.model_selection import train_test_split # Library untuk memisahkan data menjadi data training:testing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # library untuk menerapkan gridsearch
from sklearn.metrics import roc_auc_score , roc_curve , make_scorer , accuracy_score , precision_score , recall_score , f1_score, confusion_matrix, classification_report # Menampilkan hasil tes

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [162]:
passenger = pd.read_csv('data/jumlahpenumpangkereta.csv')
passenger

Unnamed: 0,nama_variabel,nama_variabel_turunan,nama_tahun,nama_turunan_tahun,data_content,nama_item_vertical_variabel
0,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,126739481,Jabodetabek
1,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,20781674,Non Jabodetabek (Jawa)
2,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,147521155,Jawa (Jabodetabek+Non Jabodetabek)
3,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,2242116,Sumatera
4,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,149763271,Total
5,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,154592886,Jabodetabek
6,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,28804410,Non Jabodetabek (Jawa)
7,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,183397296,Jawa (Jabodetabek+Non Jabodetabek)
8,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,2733488,Sumatera
9,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,186130784,Total


Preparing dataset

In [163]:
passenger1 = passenger.copy()
passenger1

Unnamed: 0,nama_variabel,nama_variabel_turunan,nama_tahun,nama_turunan_tahun,data_content,nama_item_vertical_variabel
0,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,126739481,Jabodetabek
1,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,20781674,Non Jabodetabek (Jawa)
2,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,147521155,Jawa (Jabodetabek+Non Jabodetabek)
3,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,2242116,Sumatera
4,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2021,Tahun,149763271,Total
5,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,154592886,Jabodetabek
6,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,28804410,Non Jabodetabek (Jawa)
7,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,183397296,Jawa (Jabodetabek+Non Jabodetabek)
8,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,2733488,Sumatera
9,Jumlah Penumpang berdasarkan Moda Transportasi...,Tidak Ada,2020,Tahun,186130784,Total



Menghapus data row pertama

In [164]:

passenger1 = passenger1.drop(columns=['nama_variabel', 'nama_variabel_turunan', 'nama_turunan_tahun'])
passenger1

Unnamed: 0,nama_tahun,data_content,nama_item_vertical_variabel
0,2021,126739481,Jabodetabek
1,2021,20781674,Non Jabodetabek (Jawa)
2,2021,147521155,Jawa (Jabodetabek+Non Jabodetabek)
3,2021,2242116,Sumatera
4,2021,149763271,Total
5,2020,154592886,Jabodetabek
6,2020,28804410,Non Jabodetabek (Jawa)
7,2020,183397296,Jawa (Jabodetabek+Non Jabodetabek)
8,2020,2733488,Sumatera
9,2020,186130784,Total


> Mengganti nama kolom

In [165]:
passenger1 = passenger1.rename(columns={'nama_tahun': 'Tahun', 'data_content': 'Jumlah Penumpang', 'nama_item_vertical_variabel': 'Keterangan'})
passenger1

Unnamed: 0,Tahun,Jumlah Penumpang,Keterangan
0,2021,126739481,Jabodetabek
1,2021,20781674,Non Jabodetabek (Jawa)
2,2021,147521155,Jawa (Jabodetabek+Non Jabodetabek)
3,2021,2242116,Sumatera
4,2021,149763271,Total
5,2020,154592886,Jabodetabek
6,2020,28804410,Non Jabodetabek (Jawa)
7,2020,183397296,Jawa (Jabodetabek+Non Jabodetabek)
8,2020,2733488,Sumatera
9,2020,186130784,Total


> Menerapkan one-hot encoding untuk merubah data kategorikal menjadi numerik

In [166]:
passenger1 = passenger1.join(pd.get_dummies(passenger1['Keterangan']))
passenger1

Unnamed: 0,Tahun,Jumlah Penumpang,Keterangan,Jabodetabek,Jawa (Jabodetabek+Non Jabodetabek),Non Jabodetabek (Jawa),Sumatera,Total
0,2021,126739481,Jabodetabek,1,0,0,0,0
1,2021,20781674,Non Jabodetabek (Jawa),0,0,1,0,0
2,2021,147521155,Jawa (Jabodetabek+Non Jabodetabek),0,1,0,0,0
3,2021,2242116,Sumatera,0,0,0,1,0
4,2021,149763271,Total,0,0,0,0,1
5,2020,154592886,Jabodetabek,1,0,0,0,0
6,2020,28804410,Non Jabodetabek (Jawa),0,0,1,0,0
7,2020,183397296,Jawa (Jabodetabek+Non Jabodetabek),0,1,0,0,0
8,2020,2733488,Sumatera,0,0,0,1,0
9,2020,186130784,Total,0,0,0,0,1


> Menghapus kolom keterangan

In [167]:
passenger1 = passenger1.drop(columns='Keterangan')
passenger1

Unnamed: 0,Tahun,Jumlah Penumpang,Jabodetabek,Jawa (Jabodetabek+Non Jabodetabek),Non Jabodetabek (Jawa),Sumatera,Total
0,2021,126739481,1,0,0,0,0
1,2021,20781674,0,0,1,0,0
2,2021,147521155,0,1,0,0,0
3,2021,2242116,0,0,0,1,0
4,2021,149763271,0,0,0,0,1
5,2020,154592886,1,0,0,0,0
6,2020,28804410,0,0,1,0,0
7,2020,183397296,0,1,0,0,0
8,2020,2733488,0,0,0,1,0
9,2020,186130784,0,0,0,0,1


In [168]:
passenger1.columns = map(str.lower, passenger1.columns)
passenger1

Unnamed: 0,tahun,jumlah penumpang,jabodetabek,jawa (jabodetabek+non jabodetabek),non jabodetabek (jawa),sumatera,total
0,2021,126739481,1,0,0,0,0
1,2021,20781674,0,0,1,0,0
2,2021,147521155,0,1,0,0,0
3,2021,2242116,0,0,0,1,0
4,2021,149763271,0,0,0,0,1
5,2020,154592886,1,0,0,0,0
6,2020,28804410,0,0,1,0,0
7,2020,183397296,0,1,0,0,0
8,2020,2733488,0,0,0,1,0
9,2020,186130784,0,0,0,0,1


In [169]:
passenger1.isnull().sum()

tahun                                 0
jumlah penumpang                      0
jabodetabek                           0
jawa (jabodetabek+non jabodetabek)    0
non jabodetabek (jawa)                0
sumatera                              0
total                                 0
dtype: int64

In [183]:
# mengambil fitur/variabel masukan
X = passenger1.drop(columns = ['jabodetabek'])
y = passenger1['jabodetabek']

# membagi data menjadi data training:testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)

In [180]:
# define kfold for cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=20)

In [172]:
# Klasifikasi SVM menggunakan Cross Validation
svm = SVC()

In [184]:
cv_score_svm = cross_val_score(svm, X_train, y_train, cv=kfold, scoring="accuracy")

print('Nilai akurasi tiap fold: {}'.format(cv_score_svm))
print('Nilai akurasi rata-rata: {}'.format(cv_score_svm.mean()))

Nilai akurasi tiap fold: [1. 1. 0. 1. 1. 1. 0. 1. 1. 1.]
Nilai akurasi rata-rata: 0.8


In [186]:
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
print(cross_validate(svm, X_train, y_train, cv=kfold, scoring=scoring))

{'fit_time': array([0.01258016, 0.00370455, 0.00281429, 0.00308132, 0.00324702,
       0.00309634, 0.002841  , 0.00214195, 0.00426745, 0.00306821]), 'score_time': array([0.00946164, 0.00990605, 0.01455998, 0.008847  , 0.01000857,
       0.00821733, 0.0066185 , 0.00710654, 0.00800395, 0.00948739]), 'test_accuracy': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_precision': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_recall': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_f1': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_roc_auc': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
Traceback (most recent call last):
  File "/home/iqbalf/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/iqbalf/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/iqbalf/.local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 381, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/home/iqbalf/.local/lib/python3.8/site-packages/sklearn/metrics/_ranking.py", line 567, in roc_auc_score
    return _average_binary_score(
  File "/home/iqbalf/.local/lib/python3.8/site-packages/sklearn/metrics/_base.py", line 75, in _average_binary_score
    retu

In [187]:
params = {
        'kernel': ['linear', 'poly', 'rbf']
        }

svm_classifierCV = GridSearchCV(SVC(), params, cv= kfold, scoring= 'accuracy')
svm_classifierCV.fit(X_train,y_train)

# print parameter terbaik
print('Parameter terbaik: {0} \ndengan nilai akurasi pada data training: {1}'.format(svm_classifierCV.best_params_, svm_classifierCV.best_score_))

In [None]:
pred = svm_classifierCV.predict(X_test)
print(classification_report(y_test, pred))