In [127]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

## Import Data

In [128]:
df = pd.read_csv("data_training.csv", delimiter=";")
df.head()

Unnamed: 0,Label,Comment
0,Negatif,Jujur statement dia ini kek sok idealis gitu y...
1,Positif,baru nonton film nya second lead yg cewek suka...
2,Negatif,lebih banyak penonton masterpiece merahputih o...
3,Negatif,kok bisa nggak ngomenin perannya si ardhito yg...
4,Positif,Part sin hari saat tanya sama temo masalah ken...


## Preprocessing

In [129]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [130]:
df["Comment"] = df["Comment"].apply(preprocess)
df["Comment"]

Unnamed: 0,Comment
0,jujur statement dia ini kek sok idealis gitu y...
1,baru nonton film nya second lead yg cewek suka...
2,lebih banyak penonton masterpiece merahputih o...
3,kok bisa nggak ngomenin perannya si ardhito yg...
4,part sin hari saat tanya sama temo masalah ken...
...,...
255,aku dah nonton drakor nya emang seru dan lucu ...
256,saya kasian sama pemilik film dan aktor lain n...
257,star syndrom nih anak karna dpt peran utama di...
258,emang tolol sih si abidzar kalo bukan gara2 di...


## Train Test Split Data

In [131]:
vectorizer = TfidfVectorizer()

X = df["Comment"]
y = df["Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.95, stratify=y, random_state=42
)

X_train_transform = vectorizer.fit_transform(X_train)
X_test_transform = vectorizer.transform(X_test)

## Logistic Regression

In [54]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw_indo = stopwords.words('indonesian') + list(punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [55]:
from scipy import stats
class Real:
    def __init__(self, low, high, prior='uniform'):
        if low > high:
            low, high = high, low
            warnings.warn("'low' is higher than 'high' so I flipped them for you")
        self.low = low
        self.high = high
        self.prior = prior

    def rvs(self, random_state):
        uniform = self._uniform_inclusive(loc=self.low, scale=self.high - self.low)
        if self.prior == 'uniform':
            return uniform.rvs(random_state=random_state)
        elif self.prior == 'log-uniform':
            return np.power(10, uniform.rvs(random_state=random_state))
        else:
            raise Exception("Supported prior {'uniform', 'log-uniform'}")

    @staticmethod
    def _uniform_inclusive(loc, scale):
        return stats.uniform(loc=loc, scale=np.nextafter(scale, scale + 1.))

    def __repr__(self):
        return f"Real(low={self.low}, high={self.high}, prior='{self.prior}')"

class Integer:
    def __init__(self, low, high):
        if low > high:
            low, high = high, low
            warnings.warn("'low' is higher than 'high' so I flipped them for you")
        self.low = low
        self.high = high

    def rvs(self, random_state):
        rand_int = stats.randint(low=self.low, high=self.high+1)
        return rand_int.rvs(random_state=random_state)

    def __repr__(self):
        return f"Integer(low={self.low}, high={self.high})"

In [56]:
from sklearn.pipeline import Pipeline

# Preprocessor
from sklearn.linear_model import LogisticRegression
# Pipeline
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo', LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42))
])

from sklearn.model_selection import RandomizedSearchCV
# Parameter Tuning
parameter = {
    'algo__fit_intercept': [True, False],
    'algo__C': Real(low=-3, high=3, prior='log-uniform')
}
model = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

# Evaluation
print(model.best_params_);
print("Akurasi Training Model = ", model.score(X_train, y_train))
print("Akurasi Testing Model = ", model.score(X_test, y_test))


Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'algo__C': np.float64(695.8780103230367), 'algo__fit_intercept': True}
Akurasi Training Model =  1.0
Akurasi Testing Model =  0.47368421052631576


## SVM

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Preprocessor
from sklearn.linear_model import LogisticRegression
# Pipeline
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo', SVC(random_state=42))
])

from sklearn.model_selection import RandomizedSearchCV
# Parameter Tuning
parameter = {
    'algo__kernel': ['linear', 'rbf', 'sigmoid'],

    'algo__C': [0.1, 1, 10, 100, 300],
    'algo__class_weight': [None, 'balanced'],

    'algo__gamma': ['scale', 'auto', 0.01, 0.001, 0.0001],
}

model = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

# Evaluation
print(model.best_params_);
print("Akurasi Training Model = ", model.score(X_train, y_train))
print("Akurasi Testing Model = ", model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'algo__kernel': 'linear', 'algo__gamma': 'auto', 'algo__class_weight': 'balanced', 'algo__C': 10}
Akurasi Training Model =  1.0
Akurasi Testing Model =  0.46963562753036436


## Random Forest

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Preprocessor
from sklearn.linear_model import LogisticRegression
# Pipeline
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ('algo', RandomForestClassifier(random_state=42, n_jobs=-1))
])

from sklearn.model_selection import RandomizedSearchCV
# Parameter Tuning
parameter = {
    'algo__n_estimators': [100, 200, 300, 500, 800],
    'algo__max_depth': [None, 10, 20, 30, 50, 70],
    'algo__min_samples_split': [2, 5, 10],
    'algo__min_samples_leaf': [1, 2, 4],
    'algo__max_features': ['sqrt', 'log2', None],
    'algo__bootstrap': [True, False],
    'algo__class_weight': [None, 'balanced']
}

model = RandomizedSearchCV(pipeline, parameter, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

# Evaluation
print(model.best_params_);
print("Akurasi Training Model = ", model.score(X_train, y_train))
print("Akurasi Testing Model = ", model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'algo__n_estimators': 300, 'algo__min_samples_split': 2, 'algo__min_samples_leaf': 2, 'algo__max_features': 'sqrt', 'algo__max_depth': 50, 'algo__class_weight': 'balanced', 'algo__bootstrap': False}
Akurasi Training Model =  0.7692307692307693
Akurasi Testing Model =  0.42105263157894735


## LSTM

sumber : https://medium.com/@gagangupta_82781/text-classification-using-lstm-7e4cc30f6232

In [59]:
!pip install Sastrawi -qq

In [132]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

factory = StemmerFactory()
stemmer = factory.create_stemmer()

sw_indo = stopwords.words('indonesian') + list(punctuation)

def data_preprocess(data):

  # tokenize word
  data["tokenized"] = data.map(word_tokenize)

  # stopwords
  data["selected"] = data["tokenized"].map(
      lambda tokens: [w for w in tokens if w not in sw_indo]
  )

  # join for make sentence
  def normalize(text):
      return " ".join(text)

  data["stemmed"] = data['selected'].map(lambda xs:[stemmer.stem(x) for x in xs])
  data["normalized"] = data['stemmed'].apply(normalize)

  return data

def embeddings(data, tokenizer = None):
    ## Tokenizer object for text to vector conversion
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=1000)
        tokenizer.fit_on_texts(data['normalized'])
        tokenized_train = tokenizer.texts_to_sequences(data['normalized'])
    else:
    ## text to vector/sequence conversion
        tokenized_train = tokenizer.texts_to_sequences(data['normalized'])

    ## adding padding if required
    train_padded = pad_sequences(tokenized_train, maxlen=15, padding="pre")

    return tokenizer, train_padded

In [133]:
ticket_data = data_preprocess(X_train)
tokenizer, train_padded = embeddings(ticket_data)

In [134]:
def transform_x(data, tokenizer):
    output_shape = [data.shape[0],
                    data.shape[1],
                    tokenizer.word_index.keys().__len__()]
    results = np.zeros(output_shape)

    for i in range(data.shape[0]):
        for ii in range(data.shape[1]):
            results[i, ii, data[i, ii]-1] = 1
    return results

In [135]:
from sklearn.preprocessing import OneHotEncoder as OHE

xtr_transformed = transform_x(train_padded, tokenizer)

## doing one hot encoding on output variable
y_encoder = OHE().fit(np.array(y_train).reshape(-1, 1))
ytr_encoded = y_encoder.transform(np.array(y_train).reshape(-1, 1)).toarray()

In [136]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy as CC
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.initializers import he_uniform, glorot_uniform
from tensorflow.keras.metrics import AUC
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2

class LSTMModel(object):

    def build_model(self, input_dim, output_shape, steps, dropout_rate, kernel_regularizer, bias_regularizer):

        ## ADDING INPUT LAYER
        input_layer = Input(shape=(steps, input_dim))
        # ADDING LSTM LAYER
        lstm = LSTM(units=steps)(input_layer)

        # ADDING DENSE LAYER
        dense_1 = Dense(output_shape, kernel_initializer=he_uniform(),
                        bias_initializer="zeros",
                        kernel_regularizer=l2(l2=kernel_regularizer),
                        bias_regularizer=l2(l2=bias_regularizer))(lstm)

        # DOING NORMALIZATION
        x = BatchNormalization()(dense_1)
        x = relu(x)

        # ADDING DROPOUT LAYER TO AVOID OVERFITTING
        x = Dropout(rate=dropout_rate)(x)

        # ADDING DENSE LAYER
        o = Dense(output_shape, kernel_initializer=glorot_uniform(),
                  bias_initializer="zeros",
                  kernel_regularizer=l2(l2=kernel_regularizer),
                  bias_regularizer=l2(l2=bias_regularizer))(dense_1)
        o = BatchNormalization()(o)

        ## ADDING OUTPUT LAYER
        output = softmax(o, axis=1)

        # DEFINING LOSS
        loss = CC()

        # DEFINING METRIC
        metrics = AUC()

        # DEFINING OPTIMIZER
        optimizer = Adam()

        # ASSEMBLING INTO A MODEL
        self.model = Model(inputs=[input_layer], outputs=[output])

        # COMPILING A MODEL
        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

    def train(self, x, y, validation_split, epochs):
        ## MODEL TRAINING
        self.model.fit(x, y, batch_size=8, validation_split=validation_split, epochs=epochs)
        self.model.save(f"ticket_model.h5")

    def predict(self, x):
        return self.model.predict(x)

In [137]:
steps = xtr_transformed.shape[1]
dim = xtr_transformed.shape[2]
output_shape = ytr_encoded.shape[1]

model = LSTMModel()
model.build_model(input_dim=dim,
                  output_shape=output_shape,
                  steps=steps,
                  dropout_rate=0.5,
                  bias_regularizer=0.3,
                  kernel_regularizer=0.3)

In [138]:
model.train(xtr_transformed, ytr_encoded, 0.1, 30)

Epoch 1/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 573ms/step - auc_3: 0.5072 - loss: 4.1073 - val_auc_3: 0.4375 - val_loss: 3.9349
Epoch 2/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - auc_3: 0.5090 - loss: 4.0872 - val_auc_3: 0.4375 - val_loss: 3.9137
Epoch 3/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - auc_3: 0.6978 - loss: 3.7730 - val_auc_3: 0.4375 - val_loss: 3.8920
Epoch 4/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - auc_3: 0.6257 - loss: 3.8465 - val_auc_3: 0.4375 - val_loss: 3.8715
Epoch 5/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - auc_3: 0.8184 - loss: 3.6100 - val_auc_3: 0.4375 - val_loss: 3.8511
Epoch 6/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - auc_3: 0.7658 - loss: 3.6402 - val_auc_3: 0.4375 - val_loss: 3.8306
Epoch 7/30
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/ste



In [139]:
test_data = X_test
test_data = data_preprocess(test_data)
tokenizer, test_padded = embeddings(test_data, tokenizer)
test_transformed = transform_x(test_padded, tokenizer)

preds = []

for x in test_transformed:
    preds.append(y_encoder.inverse_transform(model.predict(np.array([x])))[0][0])

from sklearn.metrics import classification_report
print(classification_report(y_test.to_list(), preds))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3

## Kesimpulan

<p>Model Logistic Regression Menghasilkan akurasi    = 0.47</p>
<p>Model Support Vector Machine Menghasilkan akurasi = 0.46</p>
<p>Model Random Forest Menghasilkan akurasi          = 0.42</p>
<p>Model LSTM Menghasilkan akurasi           		     = 0.42</p>