In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw_indo = stopwords.words("indonesian") + list(punctuation)

# Import Data

In [4]:
df = pd.read_csv("data_text/df_cleaning1.csv")
df.head()

Unnamed: 0,label,fitur
0,1,pemakaian masker menyebabkan penyakit legionna...
1,1,instruksi gubernur jateng tentang penilangan b...
2,1,foto jim rohn jokowi adalah presiden terbaik d...
3,1,ini bukan politik tapi kenyataan pak jokowi be...
4,1,foto kadrun kalo lihat foto ini panas dingin k...


In [5]:
df.label.value_counts()

1    3465
0     766
Name: label, dtype: int64

# Dataset Splitting

In [6]:
X = df.fitur
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3384,), (847,), (3384,), (847,))

# Training

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo', KNeighborsClassifier())
])

model = RandomizedSearchCV(pipeline, rsp.knn_params, cv=3, scoring='f1', n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.2min finished
  'stop_words.' % sorted(inconsistent))


ValueError: Metric 'minkowski' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. Metric can also be a callable function.

# Prediksi

In [None]:
y_pred = model.predict(X_test)

# Evaluasi

In [None]:
from sklearn.metrics import classification_report, f1_score

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
print(f1_score(y_test, y_pred))

In [None]:
from jcopml.plot import plot_confusion_matrix

In [None]:
plot_confusion_matrix(X_train, y_train, X_test, y_test, model)