## DATA PREPARATION

In [2]:
import os
#Ignore warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import numpy as np
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
from googletrans import Translator
from iso639 import Lang

In [3]:
#set gpu memory so it doesn't crash.
gpu = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpu, True)

In [4]:
df = pd.read_csv('train.csv')

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
X = df['comment_text']
y = df.iloc[:,2:].values

In [7]:
vector = TextVectorization(max_tokens=200000, output_sequence_length=2000, output_mode='int')
vector.adapt(X.values)

In [8]:
dataset = tf.data.Dataset.from_tensor_slices((vector(X.values), y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

## MODEL BUILDING

In [9]:
model = Sequential()
model.add(Embedding(200000+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [10]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
model.fit(train, epochs=20, validation_data=val)

In [23]:
precision = Precision()
recall = Recall()
acc = CategoricalAccuracy()

for batch in test.as_numpy_iterator(): 
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    recall.update_state(y_true, yhat)
    precision.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

2023-11-11 14:50:37.825429: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2297822400 exceeds 10% of free system memory.




In [24]:
#test result_v3
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.9493491649627686, Recall:0.9493491649627686, Accuracy:0.41624873876571655


In [None]:
#Saving model
model.save('toxicity_v3.h5')

## TESTING

In [12]:
model = tf.keras.models.load_model('toxicity_v3.h5')

In [14]:
translator = Translator()

en = translator.translate("saya benci dia").text

input_str = vector([en])

results = model.predict(input_str)



In [21]:
language_code = Translator().detect(en).lang
language_name = Lang(language_code).name

'en'

In [35]:
text = []
for idx, col in enumerate(df.columns[2:]):
    if results[0][idx] > 0.4:
        text.append(col)
          

In [36]:
text

['toxic', 'obscene']