# Hate speech classification by k-fold cross validation on movies dataset



The class labels depict the following:

- 0: Normal speech
- 1: Offensive speech
- 2: Hate speech

#### To run this notebook, the following folders need to be created in the same directory as the notebook:
classification_reports/   : This will contain all the classification reports generated by the model

movies/       : contains all_movies.csv file

movies/for_training/:    contains 6 movies used for cross validation training and testing


In [1]:
! pip install transformers==2.6.0

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 9.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 59.4 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 65.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled P

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os
import glob

In [3]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Load the pretrained "bert-base-uncased" tokenizer (the recorded output below shows the
# download progress bars).
# NOTE(review): train_bert() creates its own tokenizer, so this module-level instance
# looks unused by the cells visible in this notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…





---
### Cross validation


#### 6-fold cross validation on movies 

Helper functions that convert the movie DataFrames into the `tf.data.Dataset` format the BERT model needs for training and testing.

In [10]:
def convert_data_to_examples_cv(train, DATA_COLUMN, LABEL_COLUMN):
    """Turn every row of a DataFrame into a transformers ``InputExample``.

    Args:
        train: DataFrame holding the text and label columns. Despite the name,
            it is used for the held-out fold as well as the training folds.
        DATA_COLUMN: name of the column that contains the input text.
        LABEL_COLUMN: name of the column that contains the class label.

    Returns:
        The result of ``train.apply(..., axis=1)``: one ``InputExample`` per
        row, indexed like ``train``.
    """
    def row_to_example(row):
        # Single-sentence task: there is no second segment, and no guid is
        # assigned because nothing downstream uses it for bookkeeping.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    return train.apply(row_to_example, axis=1)


def convert_examples_to_tf_dataset_cv(examples, tokenizer, max_length=128):
    """Tokenize ``InputExample`` objects and wrap them in an unbatched tf.data.Dataset.

    Args:
        examples: iterable of objects exposing ``text_a`` (the text) and
            ``label`` attributes.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        A ``tf.data.Dataset`` built with ``from_generator``. Each element is
        ``(inputs, label)`` where ``inputs`` is a dict with int32 vectors
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` and ``label``
        is an int64 scalar. The caller is responsible for batching.
    """
    features = []  # InputFeatures, tokenized eagerly so the generator below stays cheap

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`;
            # it belongs to the same tokenizer API as `truncation=True` below.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Re-invoked by tf.data on every pass over the dataset.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [11]:
def train_bert(df_train, df_test, epochs=6, batch_size=32, learning_rate=3e-6):
    """Fine-tune a fresh bert-base-uncased classifier on one fold and score it.

    A new 3-label model (normal / offensive / hate) is created on every call,
    so folds do not share weights.

    Args:
        df_train: DataFrame with ``text`` and ``majority_answer`` columns used
            for training.
        df_test: DataFrame with the same columns, used as the held-out fold.
        epochs: number of training epochs (default matches the original run).
        batch_size: batch size for training and prediction.
        learning_rate: Adam learning rate.

    Returns:
        ``sklearn.metrics.classification_report(..., output_dict=True)`` for
        the held-out fold.

    Note:
        The held-out fold is also passed to ``fit`` as ``validation_data``.
        No callbacks are used, so it is only monitored and does not influence
        which weights are kept.
    """
    # initialize model with 3 labels, for hate, offensive and normal class classification
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                            trainable=True,
                                                            num_labels=3)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    DATA_COLUMN = 'DATA_COLUMN'
    LABEL_COLUMN = 'LABEL_COLUMN'
    column_names = {'text': DATA_COLUMN, 'majority_answer': LABEL_COLUMN}

    # rename() returns new frames, so the caller's DataFrames are left untouched
    train = df_train[['text', 'majority_answer']].rename(columns=column_names)
    test = df_test[['text', 'majority_answer']].rename(columns=column_names)

    train_InputExamples = convert_data_to_examples_cv(train, DATA_COLUMN, LABEL_COLUMN)
    test_InputExamples = convert_data_to_examples_cv(test, DATA_COLUMN, LABEL_COLUMN)

    train_data = convert_examples_to_tf_dataset_cv(list(train_InputExamples), tokenizer)
    train_data = train_data.batch(batch_size)

    # Tokenize the held-out fold once; the same dataset serves as validation data
    # during fit() and as the prediction input afterwards (it can be iterated repeatedly).
    test_data = convert_examples_to_tf_dataset_cv(list(test_InputExamples), tokenizer)
    test_data = test_data.batch(batch_size)

    # compile and fit
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1.0),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
    print('train data type',type(train_data))
    model.fit(train_data, epochs=epochs, validation_data=test_data)

    print('predicting')
    preds = model.predict(test_data)

    # preds[0] is taken as the per-class scores; argmax picks the predicted label.
    # y_true is passed as a 1-D array instead of a single-column DataFrame.
    return classification_report(test[LABEL_COLUMN].to_numpy(), np.argmax(preds[0], axis=1), output_dict=True)

In [6]:
def load_movies_to_df(path):
    """Read every ``*.csv`` file in a folder into its own DataFrame.

    Args:
        path: folder containing the CSV files, with or without a trailing
            path separator.

    Returns:
        A list with one DataFrame per CSV file, ordered by file name so that
        the fold order is the same on every run and platform. The list is
        empty when the folder holds no CSV files.
    """
    # os.path.join copes with a missing trailing separator; plain string
    # concatenation would turn "dir" into the pattern "dir*.csv" and match nothing.
    pattern = os.path.join(path, '*.csv')

    # glob returns files in arbitrary order, so sort for reproducible folds.
    return [pd.read_csv(filename) for filename in sorted(glob.glob(pattern))]

In [7]:
# One DataFrame per CSV in the training folder; each one is held out once in the loop below.
df_movies = load_movies_to_df('movies/for_training/')
# Per-fold classification_report dicts, appended to by the cross-validation loop.
classification_reports = []
# Accumulates the per-fold report tables produced by the cross-validation loop.
df_main = pd.DataFrame()

In [12]:
# perform cross folding: every movie is held out once while the remaining movies form the training set
os.makedirs('classification_reports', exist_ok=True)  # to_csv does not create missing folders

fold_frames = []  # per-fold report tables, concatenated once after the loop
for i in range(len(df_movies)):
    df_train = pd.concat(df_movies[0:i] + df_movies[i + 1:])
    df_test = df_movies[i]

    train_movies = df_train['movie_name'].unique()
    test_movie = df_test['movie_name'].unique()
    train_names = ','.join(train_movies)
    test_name = str(test_movie[0])  # each file is expected to hold a single movie
    print(train_names)
    print(test_name)

    report = train_bert(df_train, df_test)
    classification_reports.append(report)

    # Use `report` directly: indexing classification_reports[i] would show stale
    # results if this cell is re-run without resetting the list first.
    print('Train movies: ', train_names)
    print('Test movie: ', test_name)
    print('Classification report: \n', report)
    print('------------------------------------------------')

    df_cr = pd.DataFrame(report).transpose()
    df_cr['movie_train'] = train_names
    df_cr['movie_test'] = test_name
    df_cr.to_csv('classification_reports/'+'bert_cv_testmovie_'+test_name+'.csv')
    fold_frames.append(df_cr)

# DataFrame.append was removed in pandas 2.0; concatenate all folds once instead.
# Rows already in df_main are kept, matching the old append-based accumulation.
frames_to_join = ([] if df_main.empty else [df_main]) + fold_frames
if frames_to_join:
    df_main = pd.concat(frames_to_join)

Pulp_Fiction,AmerricanHistoryX,TheWolfofWallStreet,Django_Unchained,South_Park
BlacKkKlansman


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train data type <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
predicting
Train movies:  Pulp_Fiction,AmerricanHistoryX,TheWolfofWallStreet,Django_Unchained,South_Park
Test movie:  BlacKkKlansman
Classification report: 
 {'0': {'precision': 0.9631120053655265, 'recall': 0.9822161422708618, 'f1-score': 0.9725702675245513, 'support': 1462}, '1': {'precision': 0.6382978723404256, 'recall': 0.6185567010309279, 'f1-score': 0.6282722513089005, 'support': 97}, '2': {'precision': 0.85, 'recall': 0.5930232558139535, 'f1-score': 0.6986301369863014, 'support': 86}, 'accuracy': 0.9404255319148936, 'macro avg': {'precision': 0.8171366259019841, 'recall': 0.7312653663719145, 'f1-score': 0.7664908852732512, 'support': 1645}, 'weighted avg': {'precision': 0.9380453771801952, 'recall': 0.9404255319148936, 'f1-score': 0.9379467059444859, 'support': 1645}}
------------------------------------------------
BlacKkKlansman,AmerricanHi

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train data type <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
predicting
Train movies:  BlacKkKlansman,AmerricanHistoryX,TheWolfofWallStreet,Django_Unchained,South_Park
Test movie:  Pulp_Fiction
Classification report: 
 {'0': {'precision': 0.9706103993971364, 'recall': 0.9662415603900976, 'f1-score': 0.968421052631579, 'support': 1333}, '1': {'precision': 0.8302583025830258, 'recall': 0.8490566037735849, 'f1-score': 0.8395522388059701, 'support': 265}, '2': {'precision': 0.8333333333333334, 'recall': 0.8333333333333334, 'f1-score': 0.8333333333333334, 'support': 24}, 'accuracy': 0.9451294697903823, 'macro avg': {'precision': 0.8780673451044986, 'recall': 0.8828771658323387, 'f1-score': 0.8804355415902941, 'support': 1622}, 'weighted avg': {'precision': 0.9456486514062173, 'recall': 0.9451294697903823, 'f1-score': 0.9453678214805653, 'support': 1622}}
------------------------------------------------
BlacKkKlansm

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train data type <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
predicting
Train movies:  BlacKkKlansman,Pulp_Fiction,TheWolfofWallStreet,Django_Unchained,South_Park
Test movie:  AmerricanHistoryX
Classification report: 
 {'0': {'precision': 0.9567809239940388, 'recall': 0.9846625766871165, 'f1-score': 0.9705215419501135, 'support': 1304}, '1': {'precision': 0.8177083333333334, 'recall': 0.7584541062801933, 'f1-score': 0.786967418546366, 'support': 207}, '2': {'precision': 1.0, 'recall': 0.5740740740740741, 'f1-score': 0.7294117647058824, 'support': 54}, 'accuracy': 0.9405750798722045, 'macro avg': {'precision': 0.9248297524424575, 'recall': 0.7723969190137946, 'f1-score': 0.8289669084007874, 'support': 1565}, 'weighted avg': {'precision': 0.9398772842736272, 'recall': 0.9405750798722045, 'f1-score': 0.9379236943362067, 'support': 1565}}
------------------------------------------------
BlacKkKlansman,Pulp_Fiction

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train data type <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
predicting
Train movies:  BlacKkKlansman,Pulp_Fiction,AmerricanHistoryX,Django_Unchained,South_Park
Test movie:  TheWolfofWallStreet
Classification report: 
 {'0': {'precision': 0.9787319422150883, 'recall': 0.9858528698464026, 'f1-score': 0.9822795006041081, 'support': 2474}, '1': {'precision': 0.941696113074205, 'recall': 0.9080068143100511, 'f1-score': 0.9245446660884649, 'support': 587}, '2': {'precision': 0.4, 'recall': 1.0, 'f1-score': 0.5714285714285715, 'support': 2}, 'accuracy': 0.9709435194253999, 'macro avg': {'precision': 0.7734760184297644, 'recall': 0.9646198947188179, 'f1-score': 0.8260842460403816, 'support': 3063}, 'weighted avg': {'precision': 0.9712564294530482, 'recall': 0.9709435194253999, 'f1-score': 0.9709468039932581, 'support': 3063}}
------------------------------------------------
BlacKkKlansman,Pulp_Fiction,AmerricanHistor

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train data type <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
predicting
Train movies:  BlacKkKlansman,Pulp_Fiction,AmerricanHistoryX,TheWolfofWallStreet,South_Park
Test movie:  Django_Unchained
Classification report: 
 {'0': {'precision': 0.9788867562380038, 'recall': 0.9864603481624759, 'f1-score': 0.9826589595375722, 'support': 1551}, '1': {'precision': 0.7222222222222222, 'recall': 0.6582278481012658, 'f1-score': 0.6887417218543045, 'support': 79}, '2': {'precision': 0.9732142857142857, 'recall': 0.9316239316239316, 'f1-score': 0.9519650655021833, 'support': 117}, 'accuracy': 0.9679450486548369, 'macro avg': {'precision': 0.8914410880581706, 'recall': 0.8587707092958912, 'f1-score': 0.8744552489646867, 'support': 1747}, 'weighted avg': {'precision': 0.9669003926212197, 'recall': 0.9679450486548369, 'f1-score': 0.9673122810148942, 'support': 1747}}
------------------------------------------------
BlacKkKlans

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train data type <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
predicting
Train movies:  BlacKkKlansman,Pulp_Fiction,AmerricanHistoryX,TheWolfofWallStreet,Django_Unchained
Test movie:  South_Park
Classification report: 
 {'0': {'precision': 0.9497267759562842, 'recall': 0.9764044943820225, 'f1-score': 0.9628808864265929, 'support': 890}, '1': {'precision': 0.8359375, 'recall': 0.7379310344827587, 'f1-score': 0.783882783882784, 'support': 145}, '2': {'precision': 0.6666666666666666, 'recall': 0.18181818181818182, 'f1-score': 0.28571428571428575, 'support': 11}, 'accuracy': 0.9349904397705545, 'macro avg': {'precision': 0.8174436475409835, 'recall': 0.632051236894321, 'f1-score': 0.6774926520078876, 'support': 1046}, 'weighted avg': {'precision': 0.9309761964000252, 'recall': 0.9349904397705545, 'f1-score': 0.9309463190492623, 'support': 1046}}
------------------------------------------------


In [13]:
# Persist the combined per-fold reports. The index (class labels and summary row names)
# is written as the first, unnamed CSV column, which a later cell renames to 'label'.
df_main.to_csv('classification_reports/bert_crossvalid_movies.csv')

In [14]:
 # Plain-text dump of the accumulated report table (the recorded output below is truncated).
 print(df_main)

              precision  ...           movie_test
0              0.963112  ...       BlacKkKlansman
1              0.638298  ...       BlacKkKlansman
2              0.850000  ...       BlacKkKlansman
accuracy       0.940426  ...       BlacKkKlansman
macro avg      0.817137  ...       BlacKkKlansman
weighted avg   0.938045  ...       BlacKkKlansman
0              0.970610  ...         Pulp_Fiction
1              0.830258  ...         Pulp_Fiction
2              0.833333  ...         Pulp_Fiction
accuracy       0.945129  ...         Pulp_Fiction
macro avg      0.878067  ...         Pulp_Fiction
weighted avg   0.945649  ...         Pulp_Fiction
0              0.956781  ...    AmerricanHistoryX
1              0.817708  ...    AmerricanHistoryX
2              1.000000  ...    AmerricanHistoryX
accuracy       0.940575  ...    AmerricanHistoryX
macro avg      0.924830  ...    AmerricanHistoryX
weighted avg   0.939877  ...    AmerricanHistoryX
0              0.978732  ...  TheWolfofWallStreet


In [15]:
# Number of top-level entries in the first fold's report dict (the recorded run shows 6).
len(classification_reports[0])

6

In [16]:
# Preview the first rows of the combined report table.
df_main.head()

Unnamed: 0,precision,recall,f1-score,support,movie_train,movie_test
0,0.963112,0.982216,0.97257,1462.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
1,0.638298,0.618557,0.628272,97.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
2,0.85,0.593023,0.69863,86.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
accuracy,0.940426,0.940426,0.940426,0.940426,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
macro avg,0.817137,0.731265,0.766491,1645.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman


In [17]:
def get_precision_recall_f1(category, result_df):
    """Average one class's precision, recall and F1 over all rows carrying its label.

    NOTE(review): a later cell (In [21]) defines this function again; that later
    definition is the one the aggregation cells actually call.

    Args:
        category: label to select, e.g. ``'0'``. It is compared as a string, so
            ``0`` and ``'0'`` select the same rows whether the ``label`` column
            was read as text or as integers.
        result_df: DataFrame with ``label``, ``precision``, ``recall`` and
            ``f1-score`` columns (one row per label per fold).

    Returns:
        Dict with keys ``label`` (the ``category`` argument as given),
        ``precision``, ``recall`` and ``f1`` holding the column means. The means
        are NaN when no row matches.
    """
    # Build the row selection once instead of recomputing the mask per metric.
    rows = result_df[result_df['label'].astype(str) == str(category)]

    return {
        'label': category,
        'precision': rows['precision'].mean(),
        'recall': rows['recall'].mean(),
        'f1': rows['f1-score'].mean(),
    }

In [18]:
# Reload the combined report from disk; the index saved by to_csv comes back as an
# 'Unnamed: 0' column holding the label names.
df_cv= pd.read_csv('classification_reports/bert_crossvalid_movies.csv')

In [19]:
# NOTE(review): repeats the check from cell In [15] (entry count of the first fold's report).
len(classification_reports[0])

6

In [20]:
# NOTE(review): repeats the preview from cell In [16].
df_main.head()

Unnamed: 0,precision,recall,f1-score,support,movie_train,movie_test
0,0.963112,0.982216,0.97257,1462.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
1,0.638298,0.618557,0.628272,97.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
2,0.85,0.593023,0.69863,86.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
accuracy,0.940426,0.940426,0.940426,0.940426,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
macro avg,0.817137,0.731265,0.766491,1645.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman


In [21]:
def get_precision_recall_f1(category, result_df):
    """Average one class's precision, recall and F1 over all rows carrying its label.

    NOTE(review): this cell repeats the definition from an earlier cell (In [17]);
    being the later one, it is the definition in effect for the cells below.

    Args:
        category: label to select, e.g. ``'0'``. It is compared as a string, so
            ``0`` and ``'0'`` select the same rows whether the ``label`` column
            was read as text or as integers.
        result_df: DataFrame with ``label``, ``precision``, ``recall`` and
            ``f1-score`` columns (one row per label per fold).

    Returns:
        Dict with keys ``label`` (the ``category`` argument as given),
        ``precision``, ``recall`` and ``f1`` holding the column means. The means
        are NaN when no row matches.
    """
    # Build the row selection once instead of recomputing the mask per metric.
    rows = result_df[result_df['label'].astype(str) == str(category)]

    return {
        'label': category,
        'precision': rows['precision'].mean(),
        'recall': rows['recall'].mean(),
        'f1': rows['f1-score'].mean(),
    }

In [25]:
# NOTE(review): repeats the load from cell In [18]; df_cv is re-read from the saved CSV.
df_cv= pd.read_csv('classification_reports/bert_crossvalid_movies.csv')

In [26]:
# The label names were saved as the unnamed index column of the CSV; give that column
# a proper name. (The former 'b' -> 'Y' mapping matched no column and was dropped.)
df_cv = df_cv.rename(columns={'Unnamed: 0': 'label'})
df_cv.head()

Unnamed: 0,label,precision,recall,f1-score,support,movie_train,movie_test
0,0,0.963112,0.982216,0.97257,1462.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
1,1,0.638298,0.618557,0.628272,97.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
2,2,0.85,0.593023,0.69863,86.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
3,accuracy,0.940426,0.940426,0.940426,0.940426,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman
4,macro avg,0.817137,0.731265,0.766491,1645.0,"Pulp_Fiction,AmerricanHistoryX,TheWolfofWallSt...",BlacKkKlansman


In [27]:
# Mean precision/recall/F1 across folds for normal (0), offensive (1) and hate (2) speech.
# The labels are passed as strings because the label column read from the CSV is text.
normal_dict, offensive_dict, hate_dict = (
    get_precision_recall_f1(class_label, df_cv) for class_label in ('0', '1', '2')
)

#### Aggregated classification results for all 6 folds

In [28]:
# One row per class with its fold-averaged precision, recall and F1.
df_result = pd.DataFrame([normal_dict, offensive_dict, hate_dict])
df_result

Unnamed: 0,label,precision,recall,f1
0,0,0.966308,0.980306,0.973222
1,1,0.797687,0.755039,0.775327
2,2,0.787202,0.685645,0.678414


In [29]:
# Dump the raw per-fold report dicts kept in memory by the cross-validation loop.
for cr in classification_reports:
  print(cr)

{'0': {'precision': 0.9631120053655265, 'recall': 0.9822161422708618, 'f1-score': 0.9725702675245513, 'support': 1462}, '1': {'precision': 0.6382978723404256, 'recall': 0.6185567010309279, 'f1-score': 0.6282722513089005, 'support': 97}, '2': {'precision': 0.85, 'recall': 0.5930232558139535, 'f1-score': 0.6986301369863014, 'support': 86}, 'accuracy': 0.9404255319148936, 'macro avg': {'precision': 0.8171366259019841, 'recall': 0.7312653663719145, 'f1-score': 0.7664908852732512, 'support': 1645}, 'weighted avg': {'precision': 0.9380453771801952, 'recall': 0.9404255319148936, 'f1-score': 0.9379467059444859, 'support': 1645}}
{'0': {'precision': 0.9706103993971364, 'recall': 0.9662415603900976, 'f1-score': 0.968421052631579, 'support': 1333}, '1': {'precision': 0.8302583025830258, 'recall': 0.8490566037735849, 'f1-score': 0.8395522388059701, 'support': 265}, '2': {'precision': 0.8333333333333334, 'recall': 0.8333333333333334, 'f1-score': 0.8333333333333334, 'support': 24}, 'accuracy': 0.945