In [1]:
import torch
import pandas as pd
import numpy as np
import sklearn
from collections import Counter

In [2]:
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
from itertools import combinations

In [4]:
import re
import os

In [5]:
import torch.nn as nn

In [6]:
import matplotlib.pyplot as plt

# Data Loading

In [7]:
# Load the labelled Twitter corpus; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot
# run on another machine without editing it.
path = r"E:\github\movie_hatespeech_detection\data\twitter\twitter.csv"
df = pd.read_csv(path, index_col=0)
# Later cells read the class column as `df.label`, hence the rename.
df = df.rename(columns={'class': 'label'})
# df['label'] = df['label'].replace({0: 2, 2: 0})
df.head()

Unnamed: 0,tweet,label
0,As a woman you shouldn't complain about cleani...,0
1,boy dats cold...tyga dwn bad for cuffin dat ho...,1
2,Dawg : You ever fuck a bitch and she start to...,1
3,she look like a tranny,1
4,The shit you hear about me might be true or it...,1


In [8]:
df.label.value_counts(normalize=True)

1    0.775049
0    0.167498
2    0.057453
Name: label, dtype: float64

In [9]:
df.duplicated(subset='tweet').value_counts()

False    24472
dtype: int64

In [10]:
# Load the annotated movie utterances; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path, and `path` is rebound here —
# it no longer points at the Twitter CSV after this cell runs.
path = r'E:\github\movie_hatespeech_detection\data\movies_for_training\all_movies.csv'
movie_data = pd.read_csv(path, index_col=0)

In [11]:
movie_data.head()

Unnamed: 0,movie_id,batch_id,majority_answer,text,movie_name
0,AmericanHistoryX(1998)_1,1566624979,0,Derek.,AmerricanHistoryX
1,AmericanHistoryX(1998)_2,1566624979,1,What the fuck are you thinking?,AmerricanHistoryX
2,AmericanHistoryX(1998)_3,1566624979,0,There's a black guy outside breaking into your...,AmerricanHistoryX
3,AmericanHistoryX(1998)_4,1566624979,0,How long has he been there?,AmerricanHistoryX
4,AmericanHistoryX(1998)_5,1566624979,0,I don't know.,AmerricanHistoryX


In [12]:
print(df.label.value_counts())
df.label.value_counts().plot(kind='pie', subplots=True, autopct='%1.0f%%', title='Hate Speech Distribution')

1    18967
0     4099
2     1406
Name: label, dtype: int64


array([<AxesSubplot:ylabel='label'>], dtype=object)

## Data Splitting

In [13]:
def split_dataset(df, seed, test_size):
    """Shuffle and split a labelled tweet frame into train and test arrays.

    Parameters
    ----------
    df : DataFrame with ``tweet`` and ``label`` columns.
    seed : value forwarded to ``train_test_split`` as ``random_state``.
    test_size : value forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(train_texts, train_labels, test_texts, test_labels)`` as arrays
        taken from the ``tweet`` and ``label`` columns of each split.
    """
    train_part, test_part = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        shuffle=True,
    )
    train_texts, train_labels = train_part.tweet.values, train_part.label.values
    test_texts, test_labels = test_part.tweet.values, test_part.label.values
    return train_texts, train_labels, test_texts, test_labels

In [14]:
# Split configuration.
categories = [0,1,2]  # label ids that occur in df.label
seed = 11  # passed to train_test_split as random_state
test_size = 0.2  # passed to train_test_split as test_size

In [15]:
train, train_targets, test, test_targets = split_dataset(df, seed=seed, test_size=test_size)

In [16]:
# Number of samples in each split.
# NOTE(review): `test_size` held the split fraction (0.2) until this cell; from
# here on it holds the test-sample count, so re-running the split cell after
# this one passes a different value to split_dataset.
train_size = len(train)
test_size = len(test)

In [17]:
def calculate_dataset_class_distribution(targets, categories):
    """Return the relative frequency of every requested category in ``targets``.

    Parameters
    ----------
    targets : iterable of labels (e.g. the label array of one split).
    categories : sequence of label values to report, in the desired order.
        Any number of categories is supported.

    Returns
    -------
    tuple of ``[category, proportion]`` lists, one per entry of ``categories``
    and in the same order. A category that never occurs in ``targets`` is
    reported with proportion ``0.0``.
    """
    proportions = (
        pd.Series(list(targets))
        .value_counts(normalize=True)
        # Align to the requested order; categories absent from the data get 0.0.
        .reindex(categories, fill_value=0.0)
    )
    # Pair positionally so the result does not depend on the labels being 0..n-1.
    return tuple(
        [category, float(share)]
        for category, share in zip(categories, proportions.to_numpy())
    )

In [18]:
train_class_distribution = calculate_dataset_class_distribution(train_targets, categories)
test_class_distribution = calculate_dataset_class_distribution(test_targets, categories)
print(train_class_distribution)
print(test_class_distribution)

([0, 0.16779894774480258], [1, 0.7750421412882464], [2, 0.05715891096695101])
([0, 0.1662921348314607], [1, 0.7750766087844739], [2, 0.05863125638406537])


In [19]:
# Wrap each split so later cells can read the texts as `.data` and the labels as `.target`.
train_ds = Bunch(data=train, target=train_targets)
test_ds = Bunch(data=test, target=test_targets)

## Building the Model

In [20]:
# Count every token of every corpus the model will later vectorise.
# Tokens come from splitting on single spaces and lower-casing. The test and
# movie texts are counted too: the vectorising code indexes `word2index`
# directly, so each token it meets must already be present here.
vocab = Counter()
for corpus in (train_ds.data, test_ds.data, movie_data.text.values):
    for text in corpus:
        vocab.update(token.lower() for token in text.split(' '))

# Number of distinct tokens, i.e. the width of one bag-of-words vector.
total_words = len(vocab)

def get_word_2_index(vocab):
    """Map each lower-cased word of ``vocab`` to its iteration position.

    If two entries lower-case to the same string, the later position wins.
    """
    return {token.lower(): position for position, token in enumerate(vocab)}

word2index = get_word_2_index(vocab)

In [21]:
print(len(word2index))
print(word2index["the"]) # Showing the index of 'the'
print (total_words)

38658
96
38658


In [22]:
# define the network
class News_20_Net(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers and a linear output layer.

    Parameters
    ----------
    input_size : width of the input vectors.
    hidden_size : width of both hidden layers.
    num_classes : number of output logits.
    device : optional device (string or ``torch.device``) to place the
        parameters on. Defaults to CUDA when it is available and to the CPU
        otherwise, so the model can also be built on machines without a GPU.
    """

    def __init__(self, input_size, hidden_size, num_classes, device=None):
        super().__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)
        # Move all sub-modules in one step instead of calling .cuda() per layer.
        self.to(device)

    def forward(self, x):
        """Return the raw class logits for a batch ``x`` (no softmax applied)."""
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out

In [23]:
def get_batch(df,i,batch_size):
    """Vectorise the ``i``-th mini-batch of a dataset as bag-of-words counts.

    Parameters
    ----------
    df : object exposing ``data`` (sequence of texts) and ``target``
        (sequence of labels), sliced as ``[i*batch_size : (i+1)*batch_size]``.
    i : zero-based batch number.
    batch_size : number of samples per batch.

    Returns
    -------
    (features, labels)
        ``features`` is a float array of shape ``(n_texts, total_words)`` that
        holds per-text token counts (also for an empty slice, where
        ``n_texts`` is 0). ``labels`` is an array with one entry per target:
        0, 1 or 2 for those label values and -1 for anything else.

    Notes
    -----
    Reads the module-level ``total_words`` and ``word2index``. Tokens are
    obtained by splitting on single spaces and lower-casing; tokens missing
    from ``word2index`` are ignored instead of raising ``KeyError``.
    """
    start = i * batch_size
    stop = start + batch_size
    texts = df.data[start:stop]
    targets = df.target[start:stop]

    # One row per text, one column per vocabulary entry.
    features = np.zeros((len(texts), total_words), dtype=float)
    for row, text in enumerate(texts):
        for word in text.split(' '):
            column = word2index.get(word.lower())
            if column is not None:
                features[row, column] += 1

    # Known label values map to themselves; anything unexpected becomes -1.
    label_index = {0: 0, 1: 1, 2: 2}
    results = [label_index.get(target, -1) for target in targets]

    return features, np.array(results)

In [24]:
# Parameters
learning_rate = 0.001
num_epochs = 8
batch_size = 32
display_step = 10 # loss is recorded every display_step steps and printed every display_step*10 steps

# Network Parameters
hidden_size = 100      # width of both hidden layers
input_size = total_words # one input feature per vocabulary entry
num_classes = len(categories)         # one output logit per label id in `categories`

## Training

In [25]:
results = []

In [26]:
news_net = News_20_Net(input_size, hidden_size, num_classes)
# Create the batch tensors on whatever device the model parameters live on.
device = next(news_net.parameters()).device

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  # expects raw logits; applies log-softmax internally
optimizer = torch.optim.Adam(news_net.parameters(), lr=learning_rate)

# Only full batches are used: the trailing len(train) % batch_size samples are
# skipped in every epoch.
total_batch = len(train_ds.data) // batch_size

# Train the Model
for epoch in range(num_epochs):
    for i in range(total_batch):
        batch_x, batch_y = get_batch(train_ds, i, batch_size)

        articles = torch.tensor(batch_x, dtype=torch.float32, device=device)
        labels = torch.tensor(batch_y, dtype=torch.long, device=device)

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = news_net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i+1) % display_step == 0:
            record = {'Epoch': epoch+1, 'Step': i+1, 'Loss': loss.item()}
            results.append(record)
            # Print only every tenth recorded step to keep the cell output short.
            if (i+1) % (display_step*10) == 0:
                print(record)

{'Epoch': 1, 'Step': 100, 'Loss': 0.5339282751083374}
{'Epoch': 1, 'Step': 200, 'Loss': 0.6157453656196594}
{'Epoch': 1, 'Step': 300, 'Loss': 0.4290543794631958}
{'Epoch': 1, 'Step': 400, 'Loss': 0.20704731345176697}
{'Epoch': 1, 'Step': 500, 'Loss': 0.2698601484298706}
{'Epoch': 1, 'Step': 600, 'Loss': 0.3692278265953064}
{'Epoch': 2, 'Step': 100, 'Loss': 0.333543062210083}
{'Epoch': 2, 'Step': 200, 'Loss': 0.26942670345306396}
{'Epoch': 2, 'Step': 300, 'Loss': 0.26908552646636963}
{'Epoch': 2, 'Step': 400, 'Loss': 0.11285869032144547}
{'Epoch': 2, 'Step': 500, 'Loss': 0.1551561802625656}
{'Epoch': 2, 'Step': 600, 'Loss': 0.07202954590320587}
{'Epoch': 3, 'Step': 100, 'Loss': 0.1383640021085739}
{'Epoch': 3, 'Step': 200, 'Loss': 0.11787940561771393}
{'Epoch': 3, 'Step': 300, 'Loss': 0.21497097611427307}
{'Epoch': 3, 'Step': 400, 'Loss': 0.038727086037397385}
{'Epoch': 3, 'Step': 500, 'Loss': 0.054312046617269516}
{'Epoch': 3, 'Step': 600, 'Loss': 0.01229896117001772}
{'Epoch': 4, 'Ste

## Validation

In [27]:
# Test the Model
# NOTE(review): `correct` and `total` are initialised here but no later cell
# visible in this notebook reads them.
correct = 0
total = 0
total_test_data = len(test_ds.target)  # number of held-out samples

In [28]:
# Float quotient; the evaluation loop truncates it with int(), so the final
# partial batch (fewer than batch_size samples) is not evaluated.
iterates = total_test_data/batch_size # ignore last (<batch_size) batch

In [29]:
# Accumulators for the evaluation loop.
# NOTE(review): `all_total` and `all_correct` are not read by any later cell
# visible in this notebook.
all_total = []
all_correct = []
labels_all = []  # true labels, in evaluation order
predicted_all = []  # predicted labels, in the same order

In [30]:
# Evaluate on the held-out split, one full batch at a time.
device = next(news_net.parameters()).device  # follow the model instead of assuming 'cuda'

# Start from empty accumulators so re-running this cell does not duplicate entries.
labels_all.clear()
predicted_all.clear()

news_net.eval()
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(int(iterates)):
        batch_x_test, batch_y_test = get_batch(test_ds, i, batch_size)
        articles = torch.tensor(batch_x_test, dtype=torch.float32, device=device)
        outputs = news_net(articles)
        # Index of the largest logit per row is the predicted class.
        _, predicted = torch.max(outputs, 1)

        labels_all.extend(batch_y_test.tolist())
        predicted_all.extend(predicted.tolist())

In [31]:
report = classification_report(labels_all, predicted_all, output_dict=True)

In [32]:
df_report = pd.DataFrame(report).transpose()

In [33]:
df_report.round(2)

Unnamed: 0,precision,recall,f1-score,support
0,0.79,0.78,0.78,808.0
1,0.9,0.95,0.93,3773.0
2,0.43,0.18,0.26,283.0
accuracy,0.87,0.87,0.87,0.87
macro avg,0.71,0.64,0.66,4864.0
weighted avg,0.86,0.87,0.86,4864.0


----

## Classification of Movies

### Load Movies

In [34]:
def annotate_df(movie_df, batch_size=256):
    """Predict a label for every utterance in ``movie_df.text``.

    Parameters
    ----------
    movie_df : DataFrame with a ``text`` column of strings.
    batch_size : number of utterances vectorised and classified at once.
        Bounds memory use; the whole corpus is never held as one dense matrix.

    Returns
    -------
    DataFrame
        A copy of ``movie_df`` with two extra columns: ``index`` (the row
        position, 0..n-1) and ``label_bow_twitter`` (the predicted class).
        Predictions are attached by position, so the result has one row per
        input row whatever index ``movie_df`` carries.

    Notes
    -----
    Reads the module-level ``news_net``, ``word2index`` and ``total_words``.
    Tokens are obtained by splitting on single spaces and lower-casing; tokens
    missing from ``word2index`` are ignored instead of raising ``KeyError``.
    """
    utterances = movie_df.text.values
    device = next(news_net.parameters()).device  # follow the model's device
    predictions = []

    with torch.no_grad():  # inference only
        for start in range(0, len(utterances), batch_size):
            chunk = utterances[start:start + batch_size]
            # One row per utterance, one column per vocabulary entry.
            features = np.zeros((len(chunk), total_words), dtype=np.float32)
            for row, text in enumerate(chunk):
                for word in text.split(' '):
                    column = word2index.get(word.lower())
                    if column is not None:
                        features[row, column] += 1

            outputs = news_net(torch.from_numpy(features).to(device))
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.tolist())

    # `index` is kept so the output has the same columns as before.
    return movie_df.assign(
        index=range(len(predictions)),
        label_bow_twitter=predictions,
    )

In [35]:
result_df = annotate_df(movie_data)

In [36]:
result_df.label_bow_twitter.unique()

array([0, 1, 2], dtype=int64)

In [37]:
result_df.label_bow_twitter.value_counts()

1    5951
0    4267
2     470
Name: label_bow_twitter, dtype: int64

In [38]:
result_df.majority_answer.value_counts()

0    9014
1    1380
2     294
Name: majority_answer, dtype: int64

In [39]:
def get_classifications_results(df):
    """Build a per-class metrics table for the movie predictions.

    Compares the ``majority_answer`` column (reference labels) with the
    ``label_bow_twitter`` column (predictions) via ``classification_report``
    and returns the report as a DataFrame with one row per report entry.
    """
    frame = df.copy()
    report = classification_report(
        frame.majority_answer.values,
        frame.label_bow_twitter.values,
        output_dict=True,
    )
    return pd.DataFrame(report).transpose()

In [40]:
get_classifications_results(result_df).round(2)

Unnamed: 0,precision,recall,f1-score,support
0,0.95,0.45,0.61,9014.0
1,0.19,0.83,0.31,1380.0
2,0.12,0.19,0.15,294.0
accuracy,0.49,0.49,0.49,0.49
macro avg,0.42,0.49,0.36,10688.0
weighted avg,0.83,0.49,0.56,10688.0
