<a href="https://colab.research.google.com/github/uadR1/nlphatespeech/blob/main/multi_class_with_Distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing Libraries
!pip install transformers
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, TFTrainer, TFTrainingArguments, DataCollatorWithPadding
import tensorflow as tf
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from matplotlib import pyplot as plt

In [None]:

# Load the full dataset from the Colab working directory
df = pd.read_csv('/content/df_all.csv')
# Discard the first column (presumably a saved index -- TODO confirm against the CSV)
df = df.drop(columns=df.columns[0])
df.head()

In [None]:
# Display the (rows, columns) shape of the loaded frame, before missing values are dropped
df.shape

In [None]:
# Remove every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Count rows per raw class label to see how the classes are distributed
df['class'].value_counts()

In [None]:
# Encode the string class labels as integer ids for multi-class classification

# Single source of truth for the label -> id encoding
CLASS_TO_ID = {
    'not_hate': 0,
    'offensive': 1,
    'implicit_hate': 2,
    'explicit_hate': 3,
}

# Copy the dataframe first so the string labels in `df` stay untouched
df_multiclass = df.copy()

# Fail loudly on labels outside the mapping instead of silently leaving
# strings mixed in with the integer ids
unknown_labels = set(df_multiclass['class'].unique()) - set(CLASS_TO_ID)
if unknown_labels:
    raise ValueError(f"Unexpected class labels: {sorted(unknown_labels, key=str)}")

# One vectorised lookup; the explicit cast gives a proper integer column
# rather than an object-dtype one
df_multiclass['class'] = df_multiclass['class'].map(CLASS_TO_ID).astype('int64')

# Show head
df_multiclass.head()

In [None]:
# Show the class distribution again, now keyed by the encoded class values
df_multiclass['class'].value_counts()

In [None]:

# Count whitespace-separated tokens in each comment and report the maximum
def _count_words(comment):
    """Return the number of whitespace-separated tokens in ``str(comment)``."""
    return len(str(comment).split())

num_words = df_multiclass['text'].apply(_count_words)
longest_comment = num_words.max()
print('The comment with the most words consist of', longest_comment, 'words')

In [None]:
# Histogram of the per-comment word counts (15 bins)
fig, ax = plt.subplots()
ax.hist(num_words, bins=15)
plt.show()

In [None]:

# Create X: comment texts, coerced to str so the tokenizer only ever receives
# strings (consistent with the str() used for the word counts)
X = df_multiclass['text'].astype(str).tolist()

# Create y: one class value per comment
y = df_multiclass['class'].tolist()

# 80/20 train-test split with a fixed seed. `stratify=y` keeps the class
# proportions the same in both parts, so no class is under-represented in the
# evaluation set just by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
    stratify=y,
)

In [None]:

# Distinct label values present in y, in ascending order
categories = sorted(set(y))

In [None]:
# Number of distinct labels; passed as num_labels when the classifier is created
num_categories = len(categories)
num_categories

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification

In [None]:
from transformers import DistilBertTokenizerFast

# Pretrained uncased DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits are encoded with the same options: truncation and padding
# enabled, TensorFlow tensors returned
encode_options = dict(truncation=True, padding=True, return_tensors='tf')

# Tokenize X Train & Test Set
train_input = tokenizer(X_train, **encode_options)
test_input = tokenizer(X_test, **encode_options)

In [None]:
def _to_dataset(encodings, labels):
    """Pair tokenizer encodings (as a plain dict) with labels in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


# One (features, label) dataset per split
train_dataset = _to_dataset(train_input, y_train)
test_dataset = _to_dataset(test_input, y_test)

In [None]:
# Training hyper-parameters, collected in one place and handed to
# TFTrainingArguments unchanged
training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_steps': 10,
    'eval_steps': 10,
}

training_args = TFTrainingArguments(**training_config)

In [None]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

In [None]:
# Build the classifier inside the distribution-strategy scope exposed by the
# training arguments, with one output label per category
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=num_categories,
    )

# NOTE(review): TFTrainer is deprecated in recent transformers releases and the
# install at the top of the notebook is unpinned -- confirm the installed
# version still provides it.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer = TFTrainer(**trainer_options)

# Fine-tune on the training split
trainer.train()

In [None]:
# Run the trained model on the test split; the first element of the result
# holds the per-class scores (presumably logits -- TODO confirm)
prediction_result = trainer.predict(test_dataset)
class_scores = prediction_result[0]

# Predicted class = index of the highest score along the last axis
output = np.argmax(class_scores, axis=-1)

In [None]:
# Confusion matrix of true test labels against predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=output)
cm

In [None]:
# Per-class precision / recall / F1 summary for the test split
report_text = classification_report(y_true=y_test, y_pred=output)
print(report_text)

In [None]:
from sklearn import metrics
plt.rcParams['figure.figsize'] = (10, 10)

# Bind the matrix to its own name: assigning it to `confusion_matrix` would
# shadow the sklearn function imported at the top of the notebook and make the
# earlier `confusion_matrix(y_test, output)` cell fail when it is re-run.
# `labels` pins the row/column order (and the 4x4 shape) to the encoded class
# ids, so the matrix always lines up with the four display labels below even
# if a class is missing from the test split or the predictions.
conf_matrix = metrics.confusion_matrix(y_test, output, labels=[0, 1, 2, 3])

cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=["not_hate", "offensive", "implicit", "explicit"],
)

cm_display.plot()