In [47]:
!pip install tensorflow==2.10 tensorflow-gpu==2.10 pandas matplotlib scikit-learn



You should consider upgrading via the 'E:\TFlow\venv_tf_gpu\Scripts\python.exe -m pip install --upgrade pip' command.


In [48]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [49]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Load data

In [50]:
df_train = pd.read_csv(os.path.join('data_comment_toxicity_kaggle','train.csv'))
df_test = pd.read_csv(os.path.join('data_comment_toxicity_kaggle','test.csv'))
df_test_labels = pd.read_csv(os.path.join('data_comment_toxicity_kaggle','test_labels.csv'))

In [51]:
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [52]:
df_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [53]:
df_test_labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [54]:
X_train = df_train['comment_text'].values
y_train = df_train.iloc[:,2:].values

X_test = df_test['comment_text'].values
y_test = df_test_labels.iloc[:,1:].values

# Preprocess

In [55]:
from tensorflow.keras.layers import TextVectorization

In [56]:
max_words = 100000
vectorizer = TextVectorization(max_tokens=max_words, output_sequence_length=1000, output_mode='int')

In [57]:
vectorizer.adapt(X_train)

In [58]:
vectorized_texts_train = vectorizer(X_train)
vectorized_texts_train

<tf.Tensor: shape=(159571, 1000), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [59]:
train_dataset = tf.data.Dataset.from_tensor_slices((vectorized_texts_train, y_train))
train_dataset = train_dataset.batch(32)

In [60]:
train_data = train_dataset.take(int(len(train_dataset)*.7))
valid_data = train_dataset.skip(int(len(train_dataset)*.7)).take(int(len(train_dataset)*.3))

# RNN model

In [61]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [62]:
rnn_model = Sequential([
    Embedding(max_words + 1, 32),
    Bidirectional(LSTM(32, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(6, activation='sigmoid')
])

In [63]:
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [64]:
rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          3200032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [67]:
history = rnn_model.fit(train_data, epochs=1, validation_data=valid_data, verbose=1)



# Evaluate

In [112]:
rnn_model.evaluate(vectorizer(X_test), np.where(y_test == -1, 1, y_test))



[3.6116771697998047, 0.9990010857582092]

# Test

In [101]:
text = "I hate you so much, you freaking suck!"
vecto_text = vectorizer(text)

In [107]:
test_predict= (rnn_model.predict(np.expand_dims(vecto_text, 0)) > 0.5).astype(int).flatten()
toxicity_names = df_test_labels.columns[1:].values
result = toxicity_names[test_predict == 1]

print(f"Predict toxicity of '{text}': {result}")

Predict toxicity of 'I hate you so much, you freaking suck!': ['toxic' 'obscene' 'insult']


In [113]:
from tensorflow.keras.models import load_model

rnn_model.save('cmt_toxicity.keras')