In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import spacy
import string
from nltk.util import ngrams
from happyfuntokenizing import Tokenizer as potts
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn import preprocessing
import torch
import gc

import warnings
warnings.filterwarnings('ignore')

def potter_tokenizer(data):
  tokenizer= potts()
  return tokenizer.tokenize(data)

# These 2 methods inspired by DonDuminda on Medium "Sentiment Analysis on Customer Tweets (NLP)"
def remove_url(data):
  tweet = re.sub('https?://[A-Za-z0-9./]+', '', data)
  return tweet

def remove_punc(data):
  text = "".join([char for char in data if char not in string.punctuation])
  return text

In [3]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')

In [4]:
train.head()

Unnamed: 0,id,text,target
0,86426,@USER She should ask a few native Americans wh...,1
1,16820,Amazon is investigating Chinese employees who ...,0
2,62688,"@USER Someone should'veTaken"" this piece of sh...",1
3,43605,@USER @USER Obama wanted liberals &amp; illega...,0
4,97670,@USER Liberals are all Kookoo !!!,1


In [5]:
bow = []
for i in train['text']:
    desc = remove_url(i)
    desc = remove_punc(i)
    tokens = potter_tokenizer(desc)
    tokens = [token for token in tokens]
    description =' '.join(tokens)
    bow.append(description)
bow[0]

'user she should ask a few native americans what their take on this is'

In [6]:
train['text'] = bow

In [7]:
import torch
import torch.nn as nn

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from datetime import datetime
from pathlib import Path
import pandas as pd

import torchtext.data as ttd

In [8]:
df = train

df.head()

Unnamed: 0,id,text,target
0,86426,user she should ask a few native americans wha...,1
1,16820,amazon is investigating chinese employees who ...,0
2,62688,user someone shouldvetaken this piece of shit ...,1
3,43605,user user obama wanted liberals amp illegals t...,0
4,97670,user liberals are all kookoo,1


In [9]:
# drop unnecessary columns
df = df.drop(["id"], axis=1)

In [10]:
df.head()

Unnamed: 0,text,target
0,user she should ask a few native americans wha...,1
1,amazon is investigating chinese employees who ...,0
2,user someone shouldvetaken this piece of shit ...,1
3,user user obama wanted liberals amp illegals t...,0
4,user liberals are all kookoo,1


In [11]:
df['target'].value_counts()

0    6220
1    3126
Name: target, dtype: int64

In [12]:
df.columns = ['data', 'labels']

In [13]:
df=df.sample(frac=1)

In [14]:
df.head()

Unnamed: 0,data,labels
6516,user wish to suck angel cookie,0
8832,user user its god not god,0
1133,user boycott the nfl,0
6836,user can it kid,0
7269,user user user user user user user user user u...,0


In [15]:
df.to_csv('preprocessed.csv', index=False)

In [16]:
TEXT = ttd.Field(
    sequential=True,
    batch_first=True,
    lower=False,
    tokenize='spacy',
    pad_first=True)

LABEL = ttd.LabelField()

dataset = ttd.TabularDataset(
    path= 'preprocessed.csv',
    format='csv',
    skip_header=True,
    fields=[('data', TEXT), ('label', LABEL)]
)

In [17]:
import random
train_dataset, test_dataset = dataset.split(split_ratio=0.95) # default is 0.7

In [18]:
train_dataset, valid_dataset = train_dataset.split() # default is 0.7

In [19]:
print(f'Number of training examples: {len(train_dataset)}')
print(f'Number of validation examples: {len(valid_dataset)}')
print(f'Number of testing examples: {len(test_dataset)}')

Number of training examples: 6215
Number of validation examples: 2664
Number of testing examples: 467


In [20]:
MAX_VOCAB_SIZE = 30000

TEXT.build_vocab(train_dataset, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.300d', 
                 unk_init = torch.Tensor.normal_)

.vector_cache/glove.6B.zip: 862MB [06:35, 2.18MB/s]                          
100%|█████████▉| 399316/400000 [00:51<00:00, 7496.05it/s]

In [21]:
LABEL.build_vocab(train_dataset)

In [22]:
vocab_text = TEXT.vocab
vocab_label = LABEL.vocab

In [23]:
len(vocab_text)

13692

In [24]:
vocab_label.stoi

defaultdict(<function torchtext.vocab._default_unk_index>, {'0': 0, '1': 1})

In [25]:
vocab_label.itos

['0', '1']

In [26]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [27]:
train_iter, valid_iter, test_iter = ttd.BucketIterator.splits((train_dataset,valid_dataset,test_dataset), 
                              sort_key=lambda x: len(x.data),
                              batch_sizes=(64,64,64), 
                              device=device)

In [28]:
for batch in train_iter:
  print("inputs:", batch.data, batch.data.shape)
  print("targets:",batch.label, "shape:", batch.label.shape)
  break

inputs: tensor([[    1,     1,     1,  ...,  4310,  1291, 13625],
        [    1,     1,     1,  ...,  1999,   175,   471],
        [    1,     1,     1,  ...,   497,  4303,    19],
        ...,
        [    1,     1,     1,  ...,  5784,  4848,   120],
        [    1,     1,     1,  ...,   131,     3,   134],
        [    1,     1,     1,  ...,    65,   234, 11235]], device='cuda:0') torch.Size([64, 61])
targets: tensor([0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0], device='cuda:0') shape: torch.Size([64])


In [29]:
for batch in valid_iter:
  print("inputs:", batch.data, batch.data.shape)
  print("targets:",batch.label, "shape:", batch.label.shape)
  break

inputs: tensor([[   2,    2,  607, 1466],
        [   2, 9518,  224,  328],
        [   2,   38,    0,  180],
        [   2, 1398,    8,  220],
        [   2,   16,    4,  320],
        [   2, 4246,    3,  678],
        [   2,   13,    4,   19],
        [   2,    0,  110,    0],
        [   2,  322,   37,    0],
        [   2,    2,    2,  467],
        [   2,   95,  183,  305],
        [   2,   16,    4,  406],
        [   2,   13,    4, 1918],
        [   2,   21,   54,   32],
        [   2,  180,   70, 2262],
        [   2,  149,   13,    4],
        [   2,    2,  205,  179],
        [   2,    6,  957,   18],
        [   2,  228,  110,   19],
        [   1,    2,  274,  222],
        [   1,    2,  670,  806],
        [   1,    2,   21, 9343],
        [   1,    2,    2,    0],
        [   1,    2,  257,  110],
        [   1,    2,  712,  665],
        [   1,    2,   53,    0],
        [   1,    2,  294,   53],
        [   1,    2,    8,  669],
        [   1,   47,   41,   19],
      

In [30]:
for batch in test_iter:
  print("inputs:", batch.data[0], batch.data[0].shape)
  print("targets:",batch.label, "shape:", batch.label.shape)
  break

inputs: tensor([   2,  101,   15,   49, 1037,    8,  898,  133], device='cuda:0') torch.Size([8])
targets: tensor([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1], device='cuda:0') shape: torch.Size([64])


In [31]:
# Defining the model
class RNN(nn.Module):
  def __init__(self, n_vocab, embed_dim, n_hidden, n_rnnlayers, n_outputs, bidirectional, dropout_rate):
    super(RNN, self).__init__()
    self.V = n_vocab
    self.D = embed_dim
    self.M = n_hidden
    self.K = n_outputs
    self.L = n_rnnlayers
    self.num_diections= bidirectional
    self.dropout_rate=dropout_rate
    
    self.embed = nn.Embedding(self.V, self.D)
    
    self.rnn = nn.LSTM(
        input_size=self.D,
        hidden_size=self.M,
        num_layers=self.L,
        bidirectional=self.num_diections,
        dropout= self.dropout_rate,
        batch_first=True)
    
    self.fc = nn.Linear(self.M *2 , self.K)

    self.dropout= nn.Dropout(self.dropout_rate)
  
  def forward(self, X):
    h0 = torch.zeros(self.L*2, X.size(0), self.M).to(device)
    c0 = torch.zeros(self.L*2, X.size(0), self.M).to(device)

    embedding = self.embed(X)   
    embedding= self.dropout(embedding) 

    # get RNN unit output
    output, (hidden,cell) = self.rnn(embedding, (h0, c0))

    output, _ = torch.max(output, 1)
    output= self.dropout(output)
    output = self.fc(output)
    return output

In [32]:
n_vocab = len(TEXT.vocab)
embed_dim = 200
n_hidden = 256 
n_rnnlayers = 2
n_outputs =2
bidirectional = True 
dropout_rate = 0.5 


In [33]:
model = RNN(n_vocab, embed_dim, n_hidden, n_rnnlayers, n_outputs, bidirectional, dropout_rate)
model.to(device)

RNN(
  (embed): Embedding(13692, 200)
  (rnn): LSTM(200, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [34]:
for name, param in model.named_parameters():
  print(name, param.shape)

embed.weight torch.Size([13692, 200])
rnn.weight_ih_l0 torch.Size([1024, 200])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l0_reverse torch.Size([1024, 200])
rnn.weight_hh_l0_reverse torch.Size([1024, 256])
rnn.bias_ih_l0_reverse torch.Size([1024])
rnn.bias_hh_l0_reverse torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 512])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
rnn.weight_ih_l1_reverse torch.Size([1024, 512])
rnn.weight_hh_l1_reverse torch.Size([1024, 256])
rnn.bias_ih_l1_reverse torch.Size([1024])
rnn.bias_hh_l1_reverse torch.Size([1024])
fc.weight torch.Size([2, 512])
fc.bias torch.Size([2])


In [35]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([13692, 300])


In [38]:
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

model.embed.weight.data[unk_idx] = torch.zeros(embed_dim)
model.embed.weight.data[pad_idx] = torch.zeros(embed_dim)

print(model.embed.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.9839, -0.0575, -0.9533,  ..., -0.5240, -0.2401,  0.7062],
        ...,
        [-0.3671,  1.2739,  1.0778,  ..., -0.4097, -0.6820,  0.9246],
        [ 0.1456, -1.0601,  0.1977,  ...,  1.4106, -0.4894, -0.2281],
        [-2.2618, -0.3834, -0.5456,  ...,  0.7737, -0.8125, -0.6338]],
       device='cuda:0')


## Training Loop

In [39]:
learning_rate = 0.005
epochs = 100
# STEP 5: INSTANTIATE LOSS CLASS
criterion = nn.CrossEntropyLoss()

# STEP 6: INSTANTIATE OPTIMIZER CLASS

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Freeze embedding Layer

#freeze embeddings
model.embed.weight.requires_grad  = False

# STEP 7: TRAIN THE MODEL

train_losses= np.zeros(epochs)
valid_losses= np.zeros(epochs)


for epoch in range(epochs):
  
  t0= datetime.now()
  train_loss=[]
  
  model.train()
  for batch in train_iter:
   
    # forward pass
    output= model(batch.data)
    loss=criterion(output,batch.label)

    # set gradients to zero 
    optimizer.zero_grad()

    # backward pass
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())
  
  train_loss=np.mean(train_loss)
      
  valid_loss=[]
  model.eval()
  with torch.no_grad():
    for batch in valid_iter:
 
      # forward pass
      output= model(batch.data)
      loss=criterion(output,batch.label)
      
      valid_loss.append(loss.item())

    valid_loss=np.mean(valid_loss)
  
  # save Losses
  train_losses[epoch]= train_loss
  valid_losses[epoch]= valid_loss
  dt= datetime.now()-t0
  print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}    Valid Loss: {valid_loss:.4f}, Duration: {dt}')

Epoch 1/100, Train Loss: 0.6426    Valid Loss: 0.6058, Duration: 0:00:03.297271
Epoch 2/100, Train Loss: 0.6013    Valid Loss: 0.5448, Duration: 0:00:03.150546
Epoch 3/100, Train Loss: 0.5777    Valid Loss: 0.5416, Duration: 0:00:03.180831
Epoch 4/100, Train Loss: 0.5605    Valid Loss: 0.5354, Duration: 0:00:03.180491
Epoch 5/100, Train Loss: 0.5451    Valid Loss: 0.5491, Duration: 0:00:03.136622
Epoch 6/100, Train Loss: 0.5361    Valid Loss: 0.5449, Duration: 0:00:03.149949
Epoch 7/100, Train Loss: 0.5246    Valid Loss: 0.5521, Duration: 0:00:03.178646
Epoch 8/100, Train Loss: 0.5160    Valid Loss: 0.5641, Duration: 0:00:03.192994
Epoch 9/100, Train Loss: 0.5158    Valid Loss: 0.5712, Duration: 0:00:03.206887
Epoch 10/100, Train Loss: 0.4946    Valid Loss: 0.6089, Duration: 0:00:03.144724
Epoch 11/100, Train Loss: 0.4981    Valid Loss: 0.5665, Duration: 0:00:03.179806
Epoch 12/100, Train Loss: 0.4880    Valid Loss: 0.5586, Duration: 0:00:03.161591
Epoch 13/100, Train Loss: 0.4899    V

In [40]:
# Accuracy- write a function to get accuracy
# use this function to get accuracy and print accuracy
def get_accuracy(data_iter, model):
  model.eval()
  with torch.no_grad():
    correct =0 
    total =0
    
    for batch in data_iter:

      output=model(batch.data)
      _,indices = torch.max(output,dim=1)
      correct+= (batch.label==indices).sum().item()
      total += batch.label.shape[0]
    
    acc= correct/total

    return acc

In [41]:
train_acc = get_accuracy(train_iter, model)
valid_acc = get_accuracy(valid_iter, model)
test_acc = get_accuracy(test_iter ,model)
print(f'Train acc: {train_acc:.4f},\t Valid acc: {valid_acc:.4f},\t Test acc: {test_acc:.4f}')

Train acc: 0.7947,	 Valid acc: 0.7511,	 Test acc: 0.7238


In [42]:
def get_predictions(test_iter, model):
  model.eval()
  with torch.no_grad():
    predictions= np.array([])
    y_test= np.array([])

    for batch in test_iter:
      
      output=model(batch.data)
      _,indices = torch.max(output,dim=1)
      predictions=np.concatenate((predictions,indices.cpu().numpy())) 
      y_test = np.concatenate((y_test,batch.label.cpu().numpy())) 
      
  return y_test, predictions

In [43]:
y_test, predictions=get_predictions(test_iter, model)

In [44]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_test, predictions)

array([[289,   4],
       [125,  49]])

In [45]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.70      0.99      0.82       293
         1.0       0.92      0.28      0.43       174

    accuracy                           0.72       467
   macro avg       0.81      0.63      0.62       467
weighted avg       0.78      0.72      0.67       467



In [46]:
cm=confusion_matrix(y_test,predictions)
cm

array([[289,   4],
       [125,  49]])