<a href="https://colab.research.google.com/github/vuducanh0802/ureka_ntu/blob/main/ethos/ETHOS_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow as tf

## **Preprocessing**

In [None]:
# Read ETHOS.csv with its first column as the index, then rename the columns
# headed '0' and '0.1' to the names used by the rest of the notebook.
df = (
    pd.read_csv("ETHOS.csv", index_col=0)
    .rename(columns={'0': 'text', '0.1': 'isHate'})
)
df

Unnamed: 0,text,isHate
0,rt mayasolov a woman you should not complain a...,0
1,rt mleew17 boy dat cold tyga dwn bad for cuffi...,0
2,rt urkindofbrand dawg rt 80sbaby4lif you ever ...,0
3,rt anderson viva base she look like tranni,0
4,rt shenikarobert the shit you hear about me mi...,0
...,...,...
24778,you muthaf in lie 8220; lifeask 20 pearl corey...,0
24779,you have gone and broke the wrong heart babi a...,0
24780,young buck wanna eat dat nigguh like aint fuck...,0
24781,youu got wild bitch tellin you lie,0


In [None]:
# Count how many rows carry each isHate label.
df["isHate"].value_counts()

0    23353
1     1430
Name: isHate, dtype: int64

In [None]:
def get_sequences(texts, tokenizer, train=True, max_seq_length=None):
  """Convert texts to token-id sequences and pad them to a common length.

  Args:
    texts: Iterable of strings handed to ``tokenizer.texts_to_sequences``.
    tokenizer: Object exposing ``texts_to_sequences`` (the notebook passes a
      fitted Keras ``Tokenizer``).
    train: When truthy, the pad length is the length of the longest sequence
      produced from ``texts`` and any ``max_seq_length`` argument is ignored.
    max_seq_length: Pad length used when ``train`` is falsy; forwarded as
      ``maxlen`` to ``pad_sequences``.

  Returns:
    Whatever ``tf.keras.preprocessing.sequence.pad_sequences`` returns for the
    tokenized texts, padded with ``padding="post"``.
  """
  sequences = tokenizer.texts_to_sequences(texts)
  if train:
    # Built-in max() rather than np.max(): numpy is not imported by this
    # notebook, so the previous call raised NameError on a fresh kernel.
    # default=0 keeps an empty batch from raising inside max().
    max_seq_length = max(map(len, sequences), default=0)
  return tf.keras.preprocessing.sequence.pad_sequences(
      sequences, maxlen=max_seq_length, padding="post")

In [None]:
from sklearn.model_selection import train_test_split
def preprocess(df, num_words=15000, test_size=0.2, random_state=42):
  """Split the frame into train/test sets and turn the text into padded sequences.

  Args:
    df: DataFrame with a ``text`` column (inputs) and an ``isHate`` column
      (labels). It is only read, never modified.
    num_words: Vocabulary cap passed to the Keras ``Tokenizer``. Models built
      on the result need an Embedding ``input_dim`` that agrees with it.
    test_size: Fraction of rows held out, forwarded to ``train_test_split``.
    random_state: Seed forwarded to ``train_test_split``.

  Returns:
    ``(X_train, X_test, y_train, y_test)``: the two padded sequence matrices
    from ``get_sequences`` and the matching label splits.
  """
  texts = df.text
  labels = df.isHate

  X_train, X_test, y_train, y_test = train_test_split(
      texts, labels, test_size=test_size, random_state=random_state, shuffle=True)

  # Fit the vocabulary on the training texts only, so no test-set words leak in.
  tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
  tokenizer.fit_on_texts(X_train)

  X_train = get_sequences(X_train, tokenizer, train=True)
  # Pad the test set to the training width so both matrices have the same
  # number of columns.
  X_test = get_sequences(X_test, tokenizer, train=False, max_seq_length=X_train.shape[1])

  return X_train, X_test, y_train, y_test

In [None]:
# Build the padded train/test matrices and label splits used by the training
# and evaluation cells below.
X_train, X_test, y_train, y_test = preprocess(df)

In [None]:
# Shape of the padded training matrix; the second entry is the padded sequence length.
X_train.shape

(19826, 74)

In [None]:
# Shape of the padded test matrix; it is padded to the training sequence length.
X_test.shape

(4957, 74)

## **Training**

In [None]:
# Baseline classifier: embedding -> flatten -> one sigmoid unit.
# The input width is taken from the padded training matrix rather than a
# hard-coded 74, so the model still matches the data if the longest training
# sequence changes.
inputs = tf.keras.Input(shape=(X_train.shape[1],))

embedding = tf.keras.layers.Embedding(
    input_dim  = 15000,  # vocabulary cap; preprocess() builds its Tokenizer with num_words=15000
    output_dim = 64
)(inputs)

flatten = tf.keras.layers.Flatten()(embedding)

outputs = tf.keras.layers.Dense(1, activation="sigmoid")(flatten)

model = tf.keras.Model(inputs=inputs, outputs= outputs)

# Metric order matters: evaluate() returns [loss, accuracy, auc] in this order.
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy", tf.keras.metrics.AUC(name="auc")])

In [None]:
# Print the layer-by-layer summary of the baseline model defined above.
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 74)]              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 74, 64)            960000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 4736)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 4737      
Total params: 964,737
Trainable params: 964,737
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Stop once validation loss has gone 3 epochs without improving, and restore
# the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

# 20% of the training data is held out for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=32,
    callbacks=[early_stopping],
)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32


In [None]:
# Score the baseline on the held-out split. evaluate() returns the loss
# followed by the compiled metrics (accuracy, then AUC).
result = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy, auc = result

print("Loss: ", loss)
print("Accuracy: ", accuracy*100)
print("AUC: ", auc*100)

Loss:  0.17557618021965027
Accuracy:  94.0891683101654
AUC:  82.93139934539795


## **LSTM**

In [None]:
from keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
# LSTM classifier: embedding -> LSTM(100) -> one sigmoid unit.
# input_length is read from the padded training matrix instead of a hard-coded
# 74, so the layer keeps matching the data if the longest training sequence
# changes. Note that this rebinds `model`, replacing the baseline defined earlier.
model = Sequential([
    Embedding(input_dim=15000, output_dim=74, input_length=X_train.shape[1]),
    LSTM(units=100),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Re-check the training matrix shape before fitting the LSTM.
X_train.shape

(19826, 74)

In [None]:
# Binary cross-entropy with Adam; accuracy is the only metric tracked here.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Kept as the cell's last expression so the returned History object is displayed.
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8fff40fe10>

In [None]:
# Evaluate the LSTM on the held-out split. The numbers are read from `scores`
# (this model's evaluation), not `result`, which still holds the baseline
# model's figures from the earlier evaluation cell.
scores = model.evaluate(X_test, y_test, verbose=0)

# The LSTM is compiled with accuracy only, so evaluate() returns no AUC here.
# Compute it from the predicted probabilities instead; both arrays are
# flattened to 1-D so labels and predictions have the same shape.
lstm_auc = tf.keras.metrics.AUC(name="auc")
lstm_auc.update_state(y_test.to_numpy(), model.predict(X_test, verbose=0).ravel())

print("Loss: ", scores[0])
print("Accuracy: ", scores[1]*100)
print("AUC: ", float(lstm_auc.result())*100)

Loss:  0.17557618021965027
Accuracy:  94.0891683101654
AUC:  82.93139934539795
