# Multi-Class Text Classification
Dataset yang digunakan merupakan ["Emotion Dataset - Kaggle"](https://www.kaggle.com/datasets/abdallahwagih/emotion-dataset)

In [25]:
!gdown 1ENOmx-9mXrwvWX_qBz0TbC_025bRzhr6

Downloading...
From: https://drive.google.com/uc?id=1ENOmx-9mXrwvWX_qBz0TbC_025bRzhr6
To: /content/Emotion_classify_Data.csv
  0% 0.00/614k [00:00<?, ?B/s]100% 614k/614k [00:00<00:00, 86.2MB/s]


In [26]:
import pandas as pd
import tensorflow as tf
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional


In [27]:
df = pd.read_csv('/content/Emotion_classify_Data.csv')
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


## Data Understanding

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5937 entries, 0 to 5936
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  5937 non-null   object
 1   Emotion  5937 non-null   object
dtypes: object(2)
memory usage: 92.9+ KB


In [29]:
print('Total NA value:\n', df.isna().sum())

print('\nTotal Duplicated value:\n', df.duplicated().sum())

Total NA value:
 Comment    0
Emotion    0
dtype: int64

Total Duplicated value:
 0


In [30]:
# memeriksa keseimbangan data
df.groupby(['Emotion']).count()

Unnamed: 0_level_0,Comment
Emotion,Unnamed: 1_level_1
anger,2000
fear,1937
joy,2000


In [31]:
df['Emotion'].unique() # mengetahui kategori emosi tiap kalimat

array(['fear', 'anger', 'joy'], dtype=object)

In [32]:
print('kata terpanjang: ', df.Comment.str.len().max(), ' kata')

kata terpanjang:  298  kata


In [33]:
# melihat data terpanjang
print(max(df['Comment'], key=len))

i said in the words of a devotee that i feel relieved when i hear the your title as deen bandhu as i am the most fallen person but i become afraid at your title of uplifter of devotees as i don t consider myself to be a true devotee and hence unworthy to benefit from the aspect of your personality


## Persiapan dataset (Preprocessing data)

In [34]:
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def remove_stopwords(text):
  Text=[i for i in str(text).split() if i not in stopwords]
  return " ".join(Text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# menguji fungsi penghapus stopword
print('Sebelum dikenakan remove_stopwords:')
print(df['Comment'][1])

print('\nSetelah dikenakan remove_stopwords:')
print(remove_stopwords(df['Comment'][1]))

Sebelum dikenakan remove_stopwords:
im so full of life i feel appalled

Setelah dikenakan remove_stopwords:
im full life feel appalled


In [36]:
# Menerapkan remove_stopwords ke dataset
df['Comment'] = df.Comment.apply(lambda text : remove_stopwords(text))
df.head()

Unnamed: 0,Comment,Emotion
0,seriously hate one subject death feel reluctan...,fear
1,im full life feel appalled,anger
2,sit write start dig feelings think afraid acce...,fear
3,ive really angry r feel like idiot trusting fi...,joy
4,feel suspicious one outside like rapture happe...,fear


In [37]:
# one hot encoding pada kolom Emotion
kategori = pd.get_dummies(df.Emotion)
df_new = pd.concat([df, kategori], axis=1)
df_new = df_new.drop(columns='Emotion')
df_new

Unnamed: 0,Comment,anger,fear,joy
0,seriously hate one subject death feel reluctan...,0,1,0
1,im full life feel appalled,1,0,0
2,sit write start dig feelings think afraid acce...,0,1,0
3,ive really angry r feel like idiot trusting fi...,0,0,1
4,feel suspicious one outside like rapture happe...,0,1,0
...,...,...,...,...
5932,begun feel distressed,0,1,0
5933,left feeling annoyed angry thinking center stu...,1,0,0
5934,ever get married everything ready offer got to...,0,0,1
5935,feel reluctant applying want able find company...,0,1,0


In [38]:
# merubah dataset ke list
teks = df_new.Comment.values
label = df_new[['fear', 'anger', 'joy']].values

In [39]:
# membagi dataset menjadi data latih dan data validasi
data_latih, data_valid, label_latih, label_valid = train_test_split(
    teks, label, test_size=0.2
)

print('jumlah data latih: ',len(data_latih))
print('jumlah data Validasi: ',len(data_valid))
print('jumlah label latih: ',len(label_latih))
print('jumlah label Validasi: ',len(label_valid))

jumlah data latih:  4749
jumlah data Validasi:  1188
jumlah label latih:  4749
jumlah label Validasi:  1188


In [40]:
print(df_new.Comment.str.len().max()+1)

228


In [41]:
padding='post'
maxlen=df_new.Comment.str.len().max()+1
truncating='post'

# tokenizer dan padding
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(data_latih)
tokenizer.fit_on_texts(data_valid)
word_index = tokenizer.word_index

# mengubah kalimat menjadi sekuens
squens_latih = tokenizer.texts_to_sequences(data_latih)
squens_valid = tokenizer.texts_to_sequences(data_valid)

# padding skuens
train_pad =  np.array(tf.keras.utils.pad_sequences(squens_latih, padding=padding,
                          truncating=truncating,  maxlen=maxlen))
valid_pad =  np.array(tf.keras.utils.pad_sequences(squens_valid, padding=padding,
                          truncating=truncating,  maxlen=maxlen))

## Modelling

In [42]:
#menambahkan fitur callback
class callbacks(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if ((logs.get('acc') > 0.90) and (logs.get('val_acc') > 0.90)):
      self.model.stop_training = True
callbacks = callbacks()

In [45]:
model = tf.keras.Sequential([
    Embedding(len(word_index) + 1, 300),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32, dropout=0.2,recurrent_dropout=0.2)),
    Dense(3, activation='softmax')
])

model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['acc'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 300)         2650800   
                                                                 
 bidirectional_6 (Bidirecti  (None, None, 128)         186880    
 onal)                                                           
                                                                 
 bidirectional_7 (Bidirecti  (None, 64)                41216     
 onal)                                                           
                                                                 
 dense_3 (Dense)             (None, 3)                 195       
                                                                 
Total params: 2879091 (10.98 MB)
Trainable params: 2879091 (10.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [50]:
history = model.fit(train_pad, label_latih, epochs=10,
                    validation_data=(valid_pad, label_valid),
                    verbose = 1,  batch_size=100, callbacks = [callbacks]
)

Epoch 1/10
Epoch 2/10
