In [1]:
import torch
import pandas as pd
import numpy as np
import sklearn
from collections import Counter

In [2]:
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
from itertools import combinations

In [4]:
import re
import os

In [5]:
import torch.nn as nn

In [6]:
import matplotlib.pyplot as plt

# Data Loading

In [7]:
path = r'E:\github\movie_hatespeech_detection\data\movies_for_training\all_movies.csv'
df = pd.read_csv(path, index_col=0)
df = df.rename(columns={'majority_answer': 'label'})
df.head()

Unnamed: 0,movie_id,batch_id,label,text,movie_name
0,AmericanHistoryX(1998)_1,1566624979,0,Derek.,AmerricanHistoryX
1,AmericanHistoryX(1998)_2,1566624979,1,What the fuck are you thinking?,AmerricanHistoryX
2,AmericanHistoryX(1998)_3,1566624979,0,There's a black guy outside breaking into your...,AmerricanHistoryX
3,AmericanHistoryX(1998)_4,1566624979,0,How long has he been there?,AmerricanHistoryX
4,AmericanHistoryX(1998)_5,1566624979,0,I don't know.,AmerricanHistoryX


In [8]:
df.shape[0]

10688

In [9]:
df[df.label==2].shape[0] / df.shape[0]

0.02750748502994012

In [10]:
print(df.label.value_counts())
df.label.value_counts().plot(kind='pie', subplots=True, autopct='%1.0f%%', title='Hate Speech Distribution')

0    9014
1    1380
2     294
Name: label, dtype: int64


array([<AxesSubplot:ylabel='label'>], dtype=object)

In [11]:
movie_names = df.movie_name.unique()

## Data Splitting

In [12]:
def split_dataset(df, test_movie, seed):
    test = df[df.movie_name == test_movie]
    train = df[df.movie_name != test_movie]
    train = train.sample(frac=1, random_state=seed)
    return train.text.values, train.label.values, test.text.values, test.label.values

In [13]:
categories = [0,1,2]
seed = 11
movie_index = 5
test_movie = movie_names[movie_index]

In [14]:
train, train_targets, test, test_targets = split_dataset(df, test_movie, seed)

In [15]:
train_size = len(train)
test_size = len(test)
print(train_size)
print(test_size)

7625
3063


In [16]:
def calculate_dataset_class_distribution(targets, categories):
    df = pd.DataFrame({'category':targets})
    s = df.category.value_counts(normalize=True)
    s = s.reindex(categories)
    return [s.index[0], s[0]], [s.index[1], s[1]], [s.index[2], s[2]]

In [17]:
train_class_distribution = calculate_dataset_class_distribution(train_targets, categories)
test_class_distribution = calculate_dataset_class_distribution(test_targets, categories)
print(train_class_distribution)
print(test_class_distribution)

([0, 0.8577049180327869], [1, 0.104], [2, 0.03829508196721312])
([0, 0.8077048645119165], [1, 0.19164218086842966], [2, 0.000652954619653934])


In [18]:
train_ds = Bunch(data=train, target=train_targets)
test_ds = Bunch(data=test, target=test_targets)

## Buidling the Model

In [19]:
# Getting all the vocabularies and indexing to a unique position
vocab = Counter()
#Indexing words from the training data
for text in train_ds.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

#Indexing words from the training data
for text in test_ds.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

        
total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word.lower()] = i

    return word2index

word2index = get_word_2_index(vocab)

In [20]:
print(len(word2index))
print(word2index["the"]) # Showing the index of 'the'
print (total_words)

12836
3
12836


In [21]:
# define the network
class News_20_Net(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
        super(News_20_Net, self).__init__()
        self.layer_1 = nn.Linear(input_size,hidden_size, bias=True).cuda()
        self.relu = nn.ReLU().cuda()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True).cuda()
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True).cuda()
    # accept input and return an output
     def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out

In [22]:
def get_batch(df,i,batch_size):
    batches = []
    results = []
    # Split into different batchs, get the next batch 
    texts = df.data[i*batch_size:i*batch_size+batch_size]
    # get the targets 
    categories = df.target[i*batch_size:i*batch_size+batch_size]
    #print(categories)
    for text in texts:
        # Dimension, 196609
        layer = np.zeros(total_words,dtype=float)

        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1
        batches.append(layer)

    # We have 5 categories
    for category in categories:
        #print(category)
        index_y = -1
        if category == 0:
            index_y = 0
        elif category == 1:
            index_y = 1
        elif category == 2:
            index_y = 2
        results.append(index_y)

    # the training and the targets
    return np.array(batches),np.array(results)

In [23]:
# Parameters
learning_rate = 0.001
num_epochs = 8
batch_size = 32
display_step = 10 # ADDED will multiplied by 10

# Network Parameters
hidden_size = 100      # 1st layer and 2nd layer number of features
input_size = total_words # Words in vocab
num_classes = len(categories)         # Categories: "graphics","space","baseball","guns", "christian"

## Training

In [24]:
results = []

In [25]:
news_net = News_20_Net(input_size, hidden_size, num_classes)
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  # This includes the Softmax loss function
optimizer = torch.optim.Adam(news_net.parameters(), lr=learning_rate)  

# Train the Model
for epoch in range(num_epochs):
    # determine the number of min-batches based on the batch size and size of training data
    total_batch = int(len(train_ds.data)/batch_size)
    # Loop over all batches
    for i in range(total_batch):
        batch_x,batch_y = get_batch(train_ds,i,batch_size)
        
        articles = torch.cuda.FloatTensor(batch_x, device='cuda')
        labels = torch.cuda.LongTensor(batch_y, device='cuda')

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = news_net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i+1) % display_step == 0:
            result = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f'%(epoch+1, num_epochs, i+1, len(train_ds.data)/batch_size, loss.data)
            results.append({'Epoch': epoch+1, 'Step': i+1, 'Loss': loss.data.item()})
            if (i+1) % (display_step*10) == 0:
                print({'Epoch': epoch+1, 'Step': i+1, 'Loss': loss.data.item()})

{'Epoch': 1, 'Step': 100, 'Loss': 0.2353789508342743}
{'Epoch': 1, 'Step': 200, 'Loss': 0.24079222977161407}
{'Epoch': 2, 'Step': 100, 'Loss': 0.11936569213867188}
{'Epoch': 2, 'Step': 200, 'Loss': 0.11606612801551819}
{'Epoch': 3, 'Step': 100, 'Loss': 0.0728338211774826}
{'Epoch': 3, 'Step': 200, 'Loss': 0.012335972860455513}
{'Epoch': 4, 'Step': 100, 'Loss': 0.06808780133724213}
{'Epoch': 4, 'Step': 200, 'Loss': 0.0015068287029862404}
{'Epoch': 5, 'Step': 100, 'Loss': 0.081565722823143}
{'Epoch': 5, 'Step': 200, 'Loss': 0.00020875992777291685}
{'Epoch': 6, 'Step': 100, 'Loss': 0.10151759535074234}
{'Epoch': 6, 'Step': 200, 'Loss': 8.279733447125182e-05}
{'Epoch': 7, 'Step': 100, 'Loss': 0.09458906948566437}
{'Epoch': 7, 'Step': 200, 'Loss': 3.831751746474765e-05}
{'Epoch': 8, 'Step': 100, 'Loss': 0.09315795451402664}
{'Epoch': 8, 'Step': 200, 'Loss': 2.2156653358251788e-05}


## Validation

In [26]:
# Test the Model
correct = 0
total = 0
total_test_data = len(test_ds.target)

In [27]:
iterates = total_test_data/batch_size # ignore last (<batch_size) batch

In [28]:
all_total = []
all_correct = []
labels_all = []
predicted_all = []

In [29]:
for i in range(int(iterates)):
    batch_x_test,batch_y_test = get_batch(test_ds,i,batch_size)
    articles = torch.FloatTensor(batch_x_test).to('cuda')
    labels = torch.LongTensor(batch_y_test).to('cuda')
    outputs = news_net(articles)
    _, predicted = torch.max(outputs.data, 1)
    
    labels_all.extend([x.item() for x in labels])
    predicted_all.extend([x.item() for x in predicted])

In [30]:
categories = ['normal', 'offensive', 'hate']

In [31]:
report = classification_report(labels_all, predicted_all, target_names=categories, output_dict=True)

In [32]:
df_report = pd.DataFrame(report).transpose()

In [33]:
df_report.to_csv(movie_names[movie_index] + '.csv')

In [34]:
df_results = pd.DataFrame(results)

In [35]:
df_results['learning_rate'] = learning_rate
df_results['num_epochs'] = num_epochs
df_results['batch_size'] = batch_size
df_results['num_classes'] = num_classes
df_results['test_size'] = test_size
df_results['train_size'] = train_size
df_results['seed'] = seed
df_results['test_size'] = test_size
df_results['train_class_proportion_' + categories[0]] = round(train_class_distribution[0][1], 2)
df_results['train_class_proportion_' + categories[1]] = round(train_class_distribution[1][1], 2)
df_results['train_class_proportion_' + categories[2]] = round(train_class_distribution[2][1], 2)
df_results['test_class_proportion_' + categories[0]] = round(test_class_distribution[0][1], 2)
df_results['test_class_proportion_' + categories[1]] = round(test_class_distribution[1][1], 2)
df_results['test_class_proportion_' + categories[2]] = round(test_class_distribution[2][1], 2)

In [36]:
df_results.head()

Unnamed: 0,Epoch,Step,Loss,learning_rate,num_epochs,batch_size,num_classes,test_size,train_size,seed,train_class_proportion_normal,train_class_proportion_offensive,train_class_proportion_hate,test_class_proportion_normal,test_class_proportion_offensive,test_class_proportion_hate
0,1,10,0.977281,0.001,8,32,3,3063,7625,11,0.86,0.1,0.04,0.81,0.19,0.0
1,1,20,0.842035,0.001,8,32,3,3063,7625,11,0.86,0.1,0.04,0.81,0.19,0.0
2,1,30,0.661018,0.001,8,32,3,3063,7625,11,0.86,0.1,0.04,0.81,0.19,0.0
3,1,40,0.685303,0.001,8,32,3,3063,7625,11,0.86,0.1,0.04,0.81,0.19,0.0
4,1,50,0.562943,0.001,8,32,3,3063,7625,11,0.86,0.1,0.04,0.81,0.19,0.0


In [37]:
def plot_loss(df, ax, title):
    df.groupby('Epoch').Loss.plot(kind='line', legend=True, title=title, ax=ax, figsize=(15,8))
    ax.legend(loc='upper right')
    plt.xlabel('Step') 
    plt.ylabel('Loss')
    return ax

In [38]:
fig, axs = plt.subplots(1,1)
plot_loss(df_results, axs, title='Loss Word Embeddings, Twitter Dataset')

<AxesSubplot:title={'center':'Loss Word Embeddings, Twitter Dataset'}, xlabel='Step', ylabel='Loss'>

----

# Avarage Results

In [39]:
def load_df(path):
    name = path.split('.')[0]
    df = pd.read_csv(path)
    df['movie_name'] = name
    df = df.rename(columns={'Unnamed: 0': 'label'})
    return df

In [40]:
pathes = ['TheWolfofWallStreet.csv', 'South_Park.csv', 'Pulp_Fiction.csv', 'Django_Unchained.csv', 'AmerricanHistoryX.csv', 'BlacKkKlansman.csv']

In [54]:
dataframes = []
for path in pathes:
    df = load_df(path)
    dataframes.append(df)
result_df = pd.concat(dataframes)

In [55]:
result_df.head()

Unnamed: 0,label,precision,recall,f1-score,support,movie_name
0,normal,0.927231,0.986553,0.955972,2454.0,TheWolfofWallStreet
1,offensive,0.923445,0.660959,0.770459,584.0,TheWolfofWallStreet
2,hate,0.0,0.0,0.0,2.0,TheWolfofWallStreet
3,accuracy,0.923355,0.923355,0.923355,0.923355,TheWolfofWallStreet
4,macro avg,0.616892,0.54917,0.575477,3040.0,TheWolfofWallStreet


## macro avg

In [56]:
result_df[result_df.label=='macro avg'].groupby('movie_name')['f1-score'].mean().values.mean().round(2)

0.64

### Accuracy

In [57]:
result_df[result_df.label=='accuracy'].precision.mean()

0.9042221354763885

In [58]:
def get_precision_recall_f1(category):
    precision = result_df[result_df.label==category].precision.mean()
    recall = result_df[result_df.label==category].recall.mean()
    f1 = result_df[result_df.label==category]['f1-score'].mean()
    macro_avg = result_df[result_df.label==category]['f1-score'].mean()
    
    return {'label': category, 'precision': precision, 'recall': recall, 'f1': f1}

In [59]:
normal_dict = get_precision_recall_f1('normal')
offensive_dict = get_precision_recall_f1('offensive')
hate_dict = get_precision_recall_f1('hate')

In [60]:
df_result = pd.DataFrame([normal_dict, offensive_dict, hate_dict])

In [61]:
df_result

Unnamed: 0,label,precision,recall,f1
0,normal,0.928156,0.971211,0.949039
1,offensive,0.652766,0.560201,0.591007
2,hate,0.564356,0.280216,0.372885
