In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('labeled_data.txt')
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
x_train, x_test, y_train, y_test = train_test_split(data['tweet'], data['class'], test_size=0.2)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(19826,)
(4957,)
(19826,)
(4957,)


In [4]:
class Net(nn.Module):
  def __init__(self, num_words, emb_dim, num_y):
    super().__init__()
    self.emb = nn.Embedding(num_words, emb_dim)
    self.linear = nn.Linear(emb_dim, num_y)
    self.sigmoid = nn.Sigmoid()

  def forward(self, text):
    embeds = torch.mean(self.emb(text), dim=0)
    return self.sigmoid(self.linear(embeds))

In [5]:
def load_vocab(text):
  word_to_ix = {}
  for sent , label in text:
    for word in sent.split():
      word_to_ix.setdefault(word, len(word_to_ix))
  return word_to_ix

In [6]:
train_data = list(zip(x_train, y_train))
tok_to_ix = load_vocab(train_data)

In [7]:
emb_dim = 30
num_classes = 3
learning_rate = 0.001
model = Net(len(tok_to_ix), emb_dim, num_classes)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

In [8]:
n_epochs = 2
for epoch in range(n_epochs):
  model.train()
  for text, label in train_data:
    x = [tok_to_ix[tok] for tok in text.split()]
    x_train_tensor = torch.LongTensor(x)
    y_train_tensor = torch.Tensor([label])
    
    labels = [0, 0, 0]
    labels[int(y_train_tensor[0])] = 1
    y_train_tensor = torch.Tensor(labels)
    
    
    pred_y = model(x_train_tensor)
    loss = loss_fn(pred_y, y_train_tensor)
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  print("\nEpoch:", epoch)
  print("Training loss:", loss.item())


Epoch: 0
Training loss: 0.02260926365852356

Epoch: 1
Training loss: 0.0061704800464212894


In [9]:
test_data = list(zip(x_test, y_test))
tok_to_ix = load_vocab(test_data)

with torch.no_grad():
    scores = []
    model.eval()
    num_correct = 0
    for text, label in test_data:
        x = [tok_to_ix[tok] for tok in text.split()]
        x_test_tensor = torch.LongTensor(x)
        y_test_tensor = torch.Tensor([label])
    
        labels = [0, 0, 0]
        labels[int(y_test_tensor[0])] = 1
        y_test_tensor = torch.Tensor(labels)
    
        pred_y_test = model(x_test_tensor)
        pred_y_test = list(np.array(pred_y_test))
        idx_of_pred = pred_y_test.index(np.max(pred_y_test))
        if idx_of_pred == list(np.array(y_test_tensor)).index(1):
            num_correct += 1
    accuracy = num_correct/len(test_data)
    print('Test Accuracy:', accuracy)

Test Accuracy: 0.7359289893080492
