#Sentiment Classification

In [0]:
# SWITCH TO GPU

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import torch 
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

SEED = 15
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Get the data from github 

In [0]:
df = pd.read_csv("https://raw.githubusercontent.com/w-is-h/tmp/master/dataset.csv", encoding='cp1252')
x = df['SentimentText'].values
y = df['Sentiment'].values
print(y)
print(x[0])

[1 0 1 ... 0 0 1]
first think another Disney movie, might good, it's kids movie. watch it, can't help enjoy it. ages love movie. first saw movie 10 8 years later still love it! Danny Glover superb could play part better. Christopher Lloyd hilarious perfect part. Tony Danza believable Mel Clark. can't help, enjoy movie! give 10/10!


In [0]:
# Remove mails and https links
pat_1 = r"(?:\@|https?\://)\S+"
# Remove tags
pat_2 = r'#\w+ ?'
# Combine into one regex
combined_pat = r'|'.join((pat_1, pat_2))
# Remove websites
www_pat = r'www.[^ ]+'
# Remove HTML tags
html_tag = r'<[^>]+>'
def data_cleaner(text):
  cleantags = ""
  try:
    stripped = re.sub(combined_pat, '', text)
    stripped = re.sub(www_pat, '', stripped)
    cleantags = re.sub(html_tag, '', stripped)
  except Exception as e:
    print(e)
    cleantags = "None"
  return cleantags

x_original = x
x = [data_cleaner(review) for review in x]
print(x[0])

first think another Disney movie, might good, it's kids movie. watch it, can't help enjoy it. ages love movie. first saw movie 10 8 years later still love it! Danny Glover superb could play part better. Christopher Lloyd hilarious perfect part. Tony Danza believable Mel Clark. can't help, enjoy movie! give 10/10!


# SpaCy

We can use spacy to tokenize the text and further clean it.

In [0]:
import spacy
from spacy.attrs import LOWER
# Load the english model for spacy, the disable part is used to make it faster
nlp = spacy.load('en', disable=['ner', 'parser'])

tok_snts = []
for snt in x:
  tkns = [tkn.lemma_.lower() for tkn in nlp.tokenizer(snt) if not tkn.is_punct]
  tok_snts.append(tkns)
# Save back
x = tok_snts
# Print the first sentence
print(x[0])

['\ufeff1', 'think', 'another', 'disney', 'movie', 'may', 'good', '-pron-', 'have', 'kid', 'movie', 'watch', 'it', 'can', 'not', 'help', 'enjoy', 'it', 'age', 'love', 'movie', '\ufeff1', 'see', 'movie', '10', '8', 'year', 'late', 'still', 'love', 'it', 'danny', 'glover', 'superb', 'can', 'play', 'part', 'well', 'christopher', 'lloyd', 'hilarious', 'perfect', 'part', 'tony', 'danza', 'believable', 'mel', 'clark', 'can', 'not', 'help', 'enjoy', 'movie', 'give', '10/10']


# Train word2vec

In [0]:
from gensim.models import Word2Vec
w2v = Word2Vec(x, size=300, window=6, min_count=4, workers=4)
w2v.wv.most_similar("bad")

  if np.issubdtype(vec.dtype, np.int):


[('awful', 0.7379966974258423),
 ('terrible', 0.7341678142547607),
 ('horrible', 0.7156305313110352),
 ('suck', 0.690376877784729),
 ('lousy', 0.6557600498199463),
 ('lame', 0.6505146026611328),
 ('ritchie', 0.6240953207015991),
 ('crappy', 0.6187602877616882),
 ('alright', 0.6176861524581909),
 ('stupid', 0.6132094860076904)]

# Convert each sentence into the average sum of the vector representations of its tokens

Save the results into a new variable x_emb

In [0]:
# x_emb - embedded sentences
x_emb = np.zeros((len(x), 300))
# Loop over sentences
for i_snt, snt in enumerate(x):
  cnt = 0
  # Loop over the words of a sentence
  for i_word, word in enumerate(snt):
    if word in w2v.wv:
      x_emb[i_snt] += w2v.wv.get_vector(word)
      cnt += 1
  if cnt > 0:
    x_emb[i_snt] = x_emb[i_snt] / cnt
# Save the originals, will be needd later
x_or = x_emb
y_or = y

# Split the dataset into train/test/dev

In [0]:
# TODO: Find the indices where y_or == 1 and y_or == 1
inds_z = np.where(y_or == 0)[0]
inds_o = np.where(y_or == 1)[0]
print(inds_z)
print(inds_o)

[    1     3     6 ... 24995 24997 24998]
[    0     2     4 ... 24993 24996 24999]


In [0]:
# TODO: Create new x_emb and y, so that the new x_emb has 12500 negative examples and 1000 positive examples
x_emb = x_or[np.concatenate((inds_z, inds_o[0:1000])), :]
y = y_or[np.concatenate((inds_z, inds_o[0:1000]))]

# TODO: Create x_one that has 1000 positive examples and x_zero that has 12500 negative examples
x_one = x_or[inds_o[0:1000], :]
x_zero = x_or[inds_z, :]

In [0]:
print(y.shape)
print(x_emb.shape)
print("Number of positive examples in y: " + str(np.sum(y)))

(13500,)
(13500, 300)
Number of positive examples in y: 1000


In [0]:
# Get torch stuff
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import sklearn.metrics

In [0]:
from sklearn.model_selection import train_test_split
np.random.seed(SEED)
y = y.reshape(-1)
x_train, x_test, y_train, y_test = train_test_split(x_emb, y, test_size=0.2, random_state=SEED)
x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)

x_test = torch.tensor(x_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

x_one = torch.tensor(x_one, dtype=torch.float32)
x_zero = torch.tensor(x_zero, dtype=torch.float32)

#Build the network

In [0]:
device = torch.device('cuda')
class Net(nn.Module):
    def __init__(self):
      super(Net, self).__init__()
      self.fc1 = nn.Linear(300, 100)
      self.fc4 = nn.Linear(100, 2)
      
      self.d1 = nn.Dropout(0.5)
      
    def forward(self, x):
      x = self.d1(torch.relu(self.fc1(x)))
      x = torch.sigmoid(self.fc4(x))
      return x
# Create the network and get CE loss
net = Net()
#criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(weight=torch.tensor([0.1, 0.9]).to(device))
# Make a SGD optimizer with lr=0.002 and momentum=0.99
optimizer = optim.SGD(net.parameters(), lr=0.02, momentum=0.99)
# Move the net to the device
net.to(device)

Net(
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc4): Linear(in_features=100, out_features=2, bias=True)
  (d1): Dropout(p=0.5)
)

# Train

In [0]:
# Move data to the right device
x_train = x_train.to(device)
y_train = y_train.to(device)
x_test = x_test.to(device)
y_test = y_test.to(device)
net.train()
losses = []
accs = []
accs_dev = []
for epoch in range(10000): 
  optimizer.zero_grad()
  outputs = net(x_train)
  loss = criterion(outputs, y_train)
  loss.backward()
  optimizer.step()

  if epoch % 500 == 0:
      net.eval()
      acc = sklearn.metrics.accuracy_score(torch.max(outputs, 1)[1].cpu().detach().numpy(), y_train.cpu().numpy())
      
      outputs_dev = net(x_test)
      acc_dev = sklearn.metrics.accuracy_score(torch.max(outputs_dev, 1)[1].cpu().detach().numpy(), y_test.cpu().numpy())
      accs_dev.append(acc_dev)
      
      # TODO: calculate the f1_score, precision and recall
      f1_dev = f1_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      p_dev = precision_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      r_dev = recall_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      
      print("Epoch: {:4} Loss: {:.5f} Acc: {:.3f} Acc Dev: {:.3f} F1 Dev: {:.3f} p Dev: {:.3f} r Dev: {:.3f}".format(epoch, loss.item(), acc, acc_dev, f1_dev, p_dev, r_dev))
      net.train()
print('Finished Training')

Epoch:    0 Loss: 0.69230 Acc: 0.475 Acc Dev: 0.496 F1 Dev: 0.140 p Dev: 0.079 r Dev: 0.627
Epoch:  500 Loss: 0.45132 Acc: 0.859 Acc Dev: 0.856 F1 Dev: 0.411 p Dev: 0.280 r Dev: 0.768
Epoch: 1000 Loss: 0.43840 Acc: 0.877 Acc Dev: 0.861 F1 Dev: 0.417 p Dev: 0.288 r Dev: 0.757
Epoch: 1500 Loss: 0.43202 Acc: 0.884 Acc Dev: 0.868 F1 Dev: 0.429 p Dev: 0.300 r Dev: 0.757
Epoch: 2000 Loss: 0.42726 Acc: 0.888 Acc Dev: 0.873 F1 Dev: 0.439 p Dev: 0.309 r Dev: 0.757
Epoch: 2500 Loss: 0.42536 Acc: 0.893 Acc Dev: 0.876 F1 Dev: 0.443 p Dev: 0.314 r Dev: 0.751
Epoch: 3000 Loss: 0.42206 Acc: 0.896 Acc Dev: 0.879 F1 Dev: 0.442 p Dev: 0.316 r Dev: 0.734
Epoch: 3500 Loss: 0.41778 Acc: 0.902 Acc Dev: 0.881 F1 Dev: 0.445 p Dev: 0.320 r Dev: 0.729
Epoch: 4000 Loss: 0.41463 Acc: 0.907 Acc Dev: 0.883 F1 Dev: 0.450 p Dev: 0.326 r Dev: 0.729
Epoch: 4500 Loss: 0.41024 Acc: 0.910 Acc Dev: 0.884 F1 Dev: 0.445 p Dev: 0.324 r Dev: 0.712
Epoch: 5000 Loss: 0.40788 Acc: 0.910 Acc Dev: 0.886 F1 Dev: 0.445 p Dev: 0.326 r

In [0]:
np.sum(torch.max(outputs_dev, 1)[1].cpu().detach().numpy())

0

In [0]:
# Create the network and get BCE loss
net = Net()
criterion = nn.CrossEntropyLoss()
#criterion = nn.CrossEntropyLoss(weight=[0.8, 0.2])
# TODO: Make a SGD optimizer with lr=0.002 and momentum=0.99
optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.99)
# TODO: Move the net to the device
net.to(device)

# Let's do the same but with BATCHES
x_train = x_train.to(device)
y_train = y_train.to(device)
x_test = x_test.to(device)
y_test = y_test.to(device)
x_zero = x_zero.to(device)
x_one = x_one.to(device)
net.train()
losses = []
accs = []
accs_dev = []

batch_size = 1000
num_batches = int(np.ceil(len(x_train) / batch_size))
for epoch in range(10000): 
  for i in range(num_batches):
    start = i * batch_size
    end = (i+1) * batch_size
    
    x_train_batch = torch.cat((x_one[torch.randperm(len(x_one))[0:500]], x_zero[torch.randperm(len(x_zero))[0:500]]), 0)
    y_train_batch = torch.zeros_like(x_train_batch[:, 0], dtype=torch.long) 
    y_train_batch[0:500] = 1                  
    y_train.to(device)
    optimizer.zero_grad()
    outputs = net(x_train_batch)
    loss = criterion(outputs, y_train_batch)
    loss.backward()
    optimizer.step()

  if epoch % 500 == 0:
      net.eval()
      outputs = net(x_train)
      acc = sklearn.metrics.accuracy_score([1 if x > 0.5 else 0 for x in torch.max(outputs, 1)[1].cpu().detach().numpy()], y_train.cpu().numpy())

      outputs_dev = net(x_test)
      f1_dev = f1_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      p_dev = precision_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      r_dev = recall_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      
      print("Epoch: {:4} Loss: {:.5f} Acc: {:.3f} Acc Dev: {:.3f} F1 Dev: {:.3f} p Dev: {:.3f} r Dev: {:.3f}".format(epoch, loss.item(), acc, acc_dev, f1_dev, p_dev, r_dev))
      net.train()
print('Finished Training')

Epoch:    0 Loss: 0.69016 Acc: 0.924 Acc Dev: 0.934 F1 Dev: 0.011 p Dev: 1.000 r Dev: 0.006
Epoch:  500 Loss: 0.38121 Acc: 0.929 Acc Dev: 0.934 F1 Dev: 0.644 p Dev: 0.494 r Dev: 0.927
Epoch: 1000 Loss: 0.36795 Acc: 0.951 Acc Dev: 0.934 F1 Dev: 0.743 p Dev: 0.618 r Dev: 0.932
Epoch: 1500 Loss: 0.37364 Acc: 0.964 Acc Dev: 0.934 F1 Dev: 0.793 p Dev: 0.690 r Dev: 0.932
Epoch: 2000 Loss: 0.35429 Acc: 0.970 Acc Dev: 0.934 F1 Dev: 0.815 p Dev: 0.724 r Dev: 0.932
Epoch: 2500 Loss: 0.35130 Acc: 0.975 Acc Dev: 0.934 F1 Dev: 0.838 p Dev: 0.760 r Dev: 0.932
Epoch: 3000 Loss: 0.35295 Acc: 0.978 Acc Dev: 0.934 F1 Dev: 0.855 p Dev: 0.789 r Dev: 0.932
Epoch: 3500 Loss: 0.35595 Acc: 0.980 Acc Dev: 0.934 F1 Dev: 0.862 p Dev: 0.801 r Dev: 0.932
Epoch: 4000 Loss: 0.35849 Acc: 0.983 Acc Dev: 0.934 F1 Dev: 0.868 p Dev: 0.813 r Dev: 0.932
Epoch: 4500 Loss: 0.34939 Acc: 0.984 Acc Dev: 0.934 F1 Dev: 0.885 p Dev: 0.842 r Dev: 0.932
Epoch: 5000 Loss: 0.34665 Acc: 0.985 Acc Dev: 0.934 F1 Dev: 0.889 p Dev: 0.851 r