# Sarcasm Text Classification

## Menyiapkan Pustaka

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


## Load Dataset

In [None]:
# Read the training data from CSV and preview its last rows.
train_df = pd.read_csv(filepath_or_buffer='/content/train.csv')
train_df.tail()

Unnamed: 0,Y,text
20028,1,tollbooth attendant wishes just one high speed...
20029,0,emotional intelligence needs moral rudder
20030,1,man putting off starting family to focus on tr...
20031,0,the 8 most important lessons from my first yea...
20032,1,priscilla chan leaves mark zuckerberg for onio...


In [None]:
# Count how many rows carry each label value in column Y.
train_df['Y'].value_counts()

0    10479
1     9554
Name: Y, dtype: int64

In [None]:
# Per-column count of missing values (isnull is an alias of isna).
train_df.isnull().sum()

Y       0
text    0
dtype: int64

In [None]:
# Number of rows that repeat an earlier row (the first occurrence is not counted).
train_df.duplicated(keep='first').sum()

69

In [None]:
# Keep only the first occurrence of every duplicated row.
train_df = train_df.drop_duplicates(keep='first')

In [None]:
# Sanity check: this should now be 0 after the duplicates were dropped.
train_df.duplicated(keep='first').sum()

0

## Preprocessing Data

In [None]:
# Fetch the NLTK resources used by the preprocessing helpers below:
# the stop-word corpus (for stopwords.words) and the punkt tokenizer data
# (for word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def _english_stop_words():
  """Return the NLTK English stop-word set, building it only once."""
  cached = getattr(_english_stop_words, '_cache', None)
  if cached is None:
    cached = frozenset(stopwords.words('english'))
    # Stored on the function so repeated calls (one per row) reuse the same set.
    _english_stop_words._cache = cached
  return cached

def Tahapan_Stopword(text):
  """Remove English stop words from ``text``.

  Args:
    text: Input string to clean.

  Returns:
    The remaining tokens joined by single spaces.
  """
  stop_words = _english_stop_words()

  # Tokenize the text, then compare case-insensitively against the stop-word
  # set while keeping each surviving token's original casing.
  words = [word for word in word_tokenize(text) if word.lower() not in stop_words]

  return ' '.join(words)

def Tahapan_Stemming(text):
  """Reduce every token of ``text`` to its Porter stem.

  Args:
    text: Input string to stem.

  Returns:
    The stemmed tokens joined by single spaces.
  """
  porter = PorterStemmer()
  # Tokenize first, then stem token by token and glue the result back together.
  return ' '.join(porter.stem(token) for token in word_tokenize(text))

In [None]:
# Step 1: strip stop words from the raw text.
train_df['Stopwords'] = train_df['text'].map(Tahapan_Stopword)
# Step 2: stem the stop-word-free text.
train_df['Stemming'] = train_df['Stopwords'].map(Tahapan_Stemming)
train_df.tail()

Unnamed: 0,Y,text,Stopwords,Stemming
20028,1,tollbooth attendant wishes just one high speed...,tollbooth attendant wishes one high speed chas...,tollbooth attend wish one high speed chase wou...
20029,0,emotional intelligence needs moral rudder,emotional intelligence needs moral rudder,emot intellig need moral rudder
20030,1,man putting off starting family to focus on tr...,man putting starting family focus treading wat...,man put start famili focu tread water career year
20031,0,the 8 most important lessons from my first yea...,8 important lessons first year college,8 import lesson first year colleg
20032,1,priscilla chan leaves mark zuckerberg for onio...,priscilla chan leaves mark zuckerberg onion so...,priscilla chan leav mark zuckerberg onion soci...


In [None]:
# Pull the model inputs (stemmed text) and targets (Y) out as NumPy arrays.
text = train_df['Stemming'].to_numpy()
label = train_df['Y'].to_numpy()

In [None]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# split (and therefore every downstream result) reproducible across runs.
pesan_latih, pesan_test, label_latih, label_test = train_test_split(
    text, label, test_size=0.2, random_state=42)

In [None]:
# Size of the training split.
np.shape(pesan_latih)

(15971,)

In [None]:
# Size of the held-out split.
np.shape(pesan_test)

(3993,)

In [None]:
# Build the vocabulary from the training texts ONLY. Fitting on the held-out
# texts as well would leak test vocabulary into the features and make the
# validation score optimistic; unseen test words are mapped to the OOV token.
tokenizer = Tokenizer(num_words=10000, oov_token='x')
tokenizer.fit_on_texts(pesan_latih)

# Convert each text to a sequence of word indices.
sekuens_latih = tokenizer.texts_to_sequences(pesan_latih)
sekuens_test = tokenizer.texts_to_sequences(pesan_test)

# Pad/truncate every sequence to a fixed length of 30 tokens.
padded_latih = pad_sequences(sekuens_latih, maxlen=30)
padded_test = pad_sequences(sekuens_test, maxlen=30)

## Modeling

In [None]:
# Small bag-of-embeddings classifier: embed tokens, average over the sequence,
# then a dense hidden layer and a sigmoid output for the binary label.
model = tf.keras.Sequential([
  # input_dim is 10000 to match Tokenizer(num_words=10000): the tokenizer never
  # emits an index >= 10000, so a larger table only adds rows that are never used.
  tf.keras.layers.Embedding(10000, 16, input_length=30),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(24, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Callbacks
class myCallback(tf.keras.callbacks.Callback):
  """Stop training once the training accuracy exceeds 90%."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's accuracy and request a stop when it is above 0.9.

    Args:
      epoch: Index of the epoch that just finished (unused).
      logs: Metrics dict for the epoch; may be None or lack 'accuracy'.
    """
    # Avoid a shared mutable default and a None > float comparison when the
    # 'accuracy' metric is not present in the logs.
    accuracy = (logs or {}).get('accuracy')
    if accuracy is not None and accuracy > 0.9:
      print('\nAkurasi Model Sudah > 90%')
      self.model.stop_training = True

In [None]:
# Train for at most num_epochs; myCallback may end training earlier.
num_epochs = 30
history = model.fit(
    padded_latih, label_latih,
    epochs=num_epochs,
    validation_data=(padded_test, label_test),
    verbose=2,
    # Model.fit documents `callbacks` as a list of Callback instances.
    callbacks=[myCallback()],
)

Epoch 1/30
500/500 - 27s - loss: 0.6645 - accuracy: 0.6008 - val_loss: 0.5749 - val_accuracy: 0.7576 - 27s/epoch - 54ms/step
Epoch 2/30
500/500 - 3s - loss: 0.4555 - accuracy: 0.8025 - val_loss: 0.4492 - val_accuracy: 0.7886 - 3s/epoch - 6ms/step
Epoch 3/30
500/500 - 4s - loss: 0.3434 - accuracy: 0.8534 - val_loss: 0.4443 - val_accuracy: 0.7901 - 4s/epoch - 7ms/step
Epoch 4/30
500/500 - 5s - loss: 0.2864 - accuracy: 0.8792 - val_loss: 0.4686 - val_accuracy: 0.7879 - 5s/epoch - 10ms/step
Epoch 5/30
500/500 - 4s - loss: 0.2482 - accuracy: 0.8986 - val_loss: 0.4860 - val_accuracy: 0.7911 - 4s/epoch - 8ms/step
Epoch 6/30

Akurasi Model Sudah > 90%
500/500 - 3s - loss: 0.2203 - accuracy: 0.9118 - val_loss: 0.5248 - val_accuracy: 0.7864 - 3s/epoch - 6ms/step


In [None]:
# Persist the trained model to disk.
model.save(filepath='/tmp/saved_model')

In [None]:
# Restore the saved model from disk for inference.
load_model = tf.keras.models.load_model(filepath='/tmp/saved_model')

## Test Model

In [None]:
new_teks = input('Masukkan Teks : ')
max_len = load_model.input_shape[1] # Sequence length the model expects as input

# Apply the same cleaning that was applied to the training texts
# (stop-word removal followed by stemming).
teks_bersih = Tahapan_Stemming(Tahapan_Stopword(new_teks))

# Reuse the tokenizer fitted earlier in the notebook. Fitting a fresh Tokenizer
# on the new sentence would assign word indices unrelated to the vocabulary the
# model was trained on, making the prediction meaningless.
sekuens_teks = tokenizer.texts_to_sequences([teks_bersih])
padding_teks = pad_sequences(sekuens_teks, maxlen = max_len)

# Run the prediction
prediction = load_model.predict(padding_teks)

# The sigmoid output is the score for label 1 (sarcasm); threshold at 0.5.
if prediction[0][0] > 0.5 :
  sentiment = 'Sarcasm'
else :
  sentiment = 'Not Sarcasm'

print('Teks     :',new_teks)
print('Prediksi :  {:.2f} {}'.format(prediction[0][0] * 100,'%'), '-',sentiment)

Masukkan Teks :  congress repairs to parlor to hear rep carolyn maloney play the recorder
Teks     :  congress repairs to parlor to hear rep carolyn maloney play the recorder
Prediksi :  83.69 % - Sarcasm
