In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, GRU, SimpleRNN
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils.data_utils import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the spam dataset from the Colab working directory and preview its last rows.
dataset_path = '/content/dataset_spam.csv'
data = pd.read_csv(dataset_path)
data.tail(5)

In [3]:
# Separate inputs from targets: X is the `content` column, Y the `label` column.
X = data['content']
# Encode the labels as integers and shape them as a single column (n_samples, 1).
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(data['label']).reshape(-1, 1)

In [4]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state pins the shuffle so the split is identical on every
# Restart & Run All; stratify keeps the class ratio the same in both
# partitions, which matters because the two classes are imbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)
# Original author notes, translated:
# - roughly 6000 rows in the dataset
# - NOTE(review): "80< means spam" -- meaning is unclear from the code; confirm with the author

In [5]:
# Convert the training texts into fixed-length integer sequences.
max_words, max_len = 1000, 150  # tokenizer num_words cap, padded sequence length
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)  # the tokenizer is fitted on the training texts only
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = pad_sequences(sequences=sequences, maxlen=max_len)

In [6]:
# Model definition for the spam classifier.
def RNN():
    """Assemble the (uncompiled) Keras classifier used in this notebook.

    Reads the notebook-level ``max_len`` (input length) and ``max_words``
    (first argument of the embedding layer).

    Returns:
        A ``Model`` wiring Input -> Embedding(50) -> LSTM(64) ->
        Dense(256, 'FC1') -> ReLU -> Dropout(0.5) -> Dense(1, 'out_layer')
        -> sigmoid.
    """
    token_ids = Input(name='inputs', shape=[max_len])
    embedded = Embedding(max_words, 50, input_length=max_len)(token_ids)
    encoded = LSTM(64)(embedded)
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    output = Dense(1, name='out_layer')(hidden)
    output = Activation('sigmoid')(output)
    return Model(inputs=token_ids, outputs=output)

In [None]:
# Instantiate the network, print its layer summary, then compile it with
# binary cross-entropy loss, the RMSprop optimizer and accuracy as the metric.
model = RNN()
model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for up to 10 epochs in batches of 128, holding out 20% of the training
# matrix for validation. Early stopping monitors val_loss with a minimum
# improvement of 0.0001.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001)
model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [9]:
# Encode the held-out texts with the already-fitted tokenizer and pad them to
# the same length used for the training matrix.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = pad_sequences(sequences=test_sequences, maxlen=max_len)

In [10]:
# Evaluate on the held-out test matrix; the result is read in the next cell
# as accr[0] (loss) and accr[1] (accuracy).
accr = model.evaluate(x=test_sequences_matrix, y=Y_test)



In [11]:
# Report the test loss and accuracy rounded to three decimals.
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

Test set
  Loss: 0.144
  Accuracy: 0.959


# Metrik NLP Dasar

* Akurasi, Presisi, Recall, F1-Score

In [50]:
# Raw sigmoid outputs of the classifier for every test sequence.
y_prediction = model.predict(x=test_sequences_matrix)



In [51]:
# Round the model outputs to 0/1 and build the confusion matrix.
# The result gets its own name: assigning it to `confusion_matrix` would
# overwrite the imported sklearn function, so re-running this cell (or calling
# the function again later) would fail with "'numpy.ndarray' object is not
# callable".
conf_matrix = confusion_matrix(Y_test, np.rint(y_prediction))
conf_matrix

array([[1713,   12],
       [  70,  206]])

In [56]:
# Per-class precision, recall and F1 on the test set, using rounded predictions.
y_pred_labels = np.rint(y_prediction)
print(classification_report(Y_test, y_pred_labels))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1725
           1       0.94      0.75      0.83       276

    accuracy                           0.96      2001
   macro avg       0.95      0.87      0.91      2001
weighted avg       0.96      0.96      0.96      2001



# Metrik NLP Lanjut

* BLEU

In [1]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Sentence-level BLEU for a four-token candidate identical to its reference.
reference = [['budi', 'pergi', 'ke', 'sekolah']]
candidate = ['budi', 'pergi', 'ke', 'sekolah']
score = sentence_bleu(references=reference, hypothesis=candidate)
print(score)

In [None]:
# BLEU with default weights; the candidate differs from the reference in two
# tokens ("aku" vs "saya", "tempat" vs "restoran").
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]]
candidate = ["aku", "suka", "makan", "nasi", "goreng", "di", "tempat", "ini"]
score = sentence_bleu(references=reference, hypothesis=candidate)
print(score)

In [None]:
# Individual 1-gram BLEU: all of the weight is placed on unigram precision.
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]]
candidate = ["aku", "suka", "makan", "nasi", "goreng", "di", "tempat", "ini"]
unigram_only = (1, 0, 0, 0)
score = sentence_bleu(references=reference, hypothesis=candidate, weights=unigram_only)
print(score)

In [None]:
# Individual n-gram BLEU: each run puts the whole weight on one n-gram order.
# Uses the `reference` / `candidate` pair defined in the preceding cell.
individual_cases = [
    ('Individual 1-gram: %f', (1, 0, 0, 0)),
    ('Individual 2-gram: %f', (0, 1, 0, 0)),
    ('Individual 3-gram: %f', (0, 0, 1, 0)),
    ('Individual 4-gram: %f', (0, 0, 0, 1)),
]
for template, ngram_weights in individual_cases:
    print(template % sentence_bleu(reference, candidate, weights=ngram_weights))

In [None]:
# Cumulative 4-gram BLEU: equal weight on n-gram orders 1 through 4.
equal_weights = (0.25, 0.25, 0.25, 0.25)
score = sentence_bleu(references=reference, hypothesis=candidate, weights=equal_weights)
print(score)

In [None]:
# Cumulative BLEU scores: the weight is spread evenly over n-gram orders 1..n.
# The 3-gram case uses exact thirds; the previous (0.33, 0.33, 0.33, 0) summed
# to 0.99 and therefore slightly mis-weighted the score.
print('Cumulative 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(reference, candidate, weights=(1/3, 1/3, 1/3, 0)))
print('Cumulative 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
# Candidate is token-for-token identical to the reference.
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]]
candidate = ["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]
score = sentence_bleu(references=reference, hypothesis=candidate)
print(score)

In [None]:
# Candidate differs from the reference by one word ("tempat" vs "restoran").
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]]
candidate = ["saya", "suka", "makan", "nasi", "goreng", "di", "tempat", "ini"]
score = sentence_bleu(references=reference, hypothesis=candidate)
print(score)

In [None]:
# Candidate differs from the reference by two words ("beli" vs "makan",
# "tempat" vs "restoran"). The original note said three, but only two tokens
# actually differ.
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]]
candidate = ["saya", "suka", "beli", "nasi", "goreng", "di", "tempat", "ini"]
score = sentence_bleu(references=reference, hypothesis=candidate)
print(score)

In [None]:
# Candidate shares no token with the reference.
reference = [["saya", "suka", "makan", "nasi", "goreng", "di", "restoran", "ini"]]
candidate = ["q", "w", "e", "r", "t", "y", "u", "i"]
score = sentence_bleu(references=reference, hypothesis=candidate)
print(score)

* ROUGE

In [None]:
!pip install evaluate
!pip install rouge-score

In [None]:
import evaluate

# Load the ROUGE metric and score a prediction that lacks the reference's
# last two words ("di rumah").
rouge = evaluate.load('rouge')
predictions = ["Hari ini saya belajar NLP"]
references = [["Hari ini saya belajar NLP di rumah"]]
results = rouge.compute(predictions=predictions, references=references)
print(results)

In [None]:
# ROUGE for a prediction that is identical to its reference.
rouge = evaluate.load('rouge')
predictions = ["Hari ini saya belajar NLP di rumah"]
references = [["Hari ini saya belajar NLP di rumah"]]
results = rouge.compute(predictions=predictions, references=references)
print(results)

In [None]:
# ROUGE for a prediction that omits one word ("NLP") from the middle of the
# reference.
rouge = evaluate.load('rouge')
predictions = ["Hari ini saya belajar di rumah"]
references = [["Hari ini saya belajar NLP di rumah"]]
results = rouge.compute(predictions=predictions, references=references)
print(results)

* Perplexity

In [None]:
!pip install transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [23]:
import torch

In [None]:
# Perplexity of GPT-2 on an Indonesian sentence: exp() of the loss the model
# returns when the input ids are also passed as the labels.
# NOTE(review): "Learing" looks like a typo for "Learning"; it is left as-is
# because changing the text changes the tokens and therefore the score.
inputs = tokenizer("Machine Learing adalah topik yang cukup hangat dalam pembicaraan", return_tensors = "pt")
# Inference only: no_grad skips building the autograd graph, so the printed
# tensor is a plain value with no grad_fn attached.
with torch.no_grad():
    loss = model(input_ids = inputs["input_ids"], labels = inputs["input_ids"]).loss
ppl = torch.exp(loss)
print(ppl)

In [None]:
# Perplexity of GPT-2 on an English sentence, computed the same way: exp() of
# the loss returned when the input ids double as the labels.
inputs_wiki_text = tokenizer("Generative Pretrained Transformer is an opensource artificial intelligence created by OpenAI in February 2019", return_tensors = "pt")
# Inference only: no_grad skips building the autograd graph, so the printed
# tensor is a plain value with no grad_fn attached.
with torch.no_grad():
    loss = model(input_ids = inputs_wiki_text["input_ids"], labels = inputs_wiki_text["input_ids"]).loss
ppl = torch.exp(loss)
print(ppl)