#Sentiment Classification

In [0]:
# SWITCH TO GPU

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import torch 

SEED = 15
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Find index of the maximum value

In [0]:
x = torch.tensor([[0.5, 0.7], [0.3, 0.1]])
print(x)

tensor([[0.5000, 0.7000],
        [0.3000, 0.1000]])


In [0]:
# TODO: Use torch max to calculate the maximum value in each row
#print(torch.max(x, 1))

# Get the data from github 

In [0]:
df = pd.read_csv("https://raw.githubusercontent.com/w-is-h/tmp/master/dataset.csv", encoding='cp1252')
x = df['SentimentText'].values[0:6000]
y = df['Sentiment'].values[0:6000]
print(y)
print(x[0])

[1 0 1 ... 1 1 0]
first think another Disney movie, might good, it's kids movie. watch it, can't help enjoy it. ages love movie. first saw movie 10 8 years later still love it! Danny Glover superb could play part better. Christopher Lloyd hilarious perfect part. Tony Danza believable Mel Clark. can't help, enjoy movie! give 10/10!


In [0]:
# Remove mails and https links
pat_1 = r"(?:\@|https?\://)\S+"
# Remove tags
pat_2 = r'#\w+ ?'
# Combine into one regex
combined_pat = r'|'.join((pat_1, pat_2))
# Remove websites
www_pat = r'www.[^ ]+'
# Remove HTML tags
html_tag = r'<[^>]+>'
def data_cleaner(text):
  cleantags = ""
  try:
    stripped = re.sub(combined_pat, '', text)
    stripped = re.sub(www_pat, '', stripped)
    cleantags = re.sub(html_tag, '', stripped)
  except Exception as e:
    print(e)
    cleantags = "None"
  return cleantags

x_original = x
x = [data_cleaner(review) for review in x]
print(x[0])

first think another Disney movie, might good, it's kids movie. watch it, can't help enjoy it. ages love movie. first saw movie 10 8 years later still love it! Danny Glover superb could play part better. Christopher Lloyd hilarious perfect part. Tony Danza believable Mel Clark. can't help, enjoy movie! give 10/10!


# SpaCy

We can use spacy to tokenize the text and further clean it.

In [0]:
import spacy
from spacy.attrs import LOWER
# Load the english model for spacy, the disable part is used to make it faster
nlp = spacy.load('en', disable=['ner', 'parser'])

tok_snts = []
for snt in x:
  tkns = [tkn.lemma_.lower() for tkn in nlp.tokenizer(snt) if not tkn.is_punct]
  tok_snts.append(tkns)
# Save back
x = tok_snts
# Print the first sentence
print(x[0])

['\ufeff1', 'think', 'another', 'disney', 'movie', 'may', 'good', '-pron-', 'have', 'kid', 'movie', 'watch', 'it', 'can', 'not', 'help', 'enjoy', 'it', 'age', 'love', 'movie', '\ufeff1', 'see', 'movie', '10', '8', 'year', 'late', 'still', 'love', 'it', 'danny', 'glover', 'superb', 'can', 'play', 'part', 'well', 'christopher', 'lloyd', 'hilarious', 'perfect', 'part', 'tony', 'danza', 'believable', 'mel', 'clark', 'can', 'not', 'help', 'enjoy', 'movie', 'give', '10/10']


In [0]:
from gensim.models import Word2Vec
w2v = Word2Vec(x, size=300, window=6, min_count=4, workers=4)
w2v.wv.most_similar("bad")

  if np.issubdtype(vec.dtype, np.int):


[('funny', 0.8754518032073975),
 ('pretty', 0.8673598766326904),
 ("it's", 0.8553382158279419),
 ('stupid', 0.8540974855422974),
 ('plain', 0.8357093334197998),
 ('yes', 0.8318045139312744),
 ('cool', 0.8311686515808105),
 ('ok', 0.830740749835968),
 ('okay', 0.8268406391143799),
 ('horrible', 0.8229296207427979)]

# Convert each sentence into the average sum of the vector representations of its tokens

Save the results into a new variable x_emb

In [0]:
# x_emb - embedded sentences
x_emb = np.zeros((len(x), 300))
# Loop over sentences
for i_snt, snt in enumerate(x):
  cnt = 0
  # Loop over the words of a sentence
  for i_word, word in enumerate(snt):
    if word in w2v.wv:
      x_emb[i_snt] += w2v.wv.get_vector(word)
      cnt += 1
  if cnt > 0:
    x_emb[i_snt] = x_emb[i_snt] / cnt
# Save the originals, will be needd later
x_or = x_emb
y_or = y

# Split the dataset into train/test - USE SKLEARN

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_emb, y, test_size=0.2, random_state=SEED)
x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)

x_test = torch.tensor(x_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

#Build the network

In [0]:
# Get torch stuff
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import sklearn.metrics

device = torch.device('cuda')
class Net(nn.Module):
    def __init__(self):
      super(Net, self).__init__()
      self.fc1 = nn.Linear(300, 100)
      # TODO Switch from binary to multicalss with 2 classes
      self.fc2 = nn.Linear(100, 2)
      
      self.d1 = nn.Dropout(0.5)
      
    def forward(self, x):
      x = self.d1(torch.relu(self.fc1(x)))
      x = torch.sigmoid(self.fc2(x))
      return x
    
net = Net()
# TODO: Switch from Binary cross entropy to Cross Entropy
criterion = nn.CrossEntropyLoss() #nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.99)

# TODO LATER: Switch to Adam
#optimizer = optim.Adam(net.parameters(), lr=0.006)
net.to(device)

Net(
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=2, bias=True)
  (d1): Dropout(p=0.5)
)

# Train

In [0]:
# Move data to the right device
x_train = x_train.to(device)
y_train = y_train.to(device)
x_test = x_train.to(device)
y_test = y_train.to(device)

print("IGNORE THE ACC - WE ARE USING A SMALL SUBESET OF THE DATA")
net.train()
for epoch in range(5000): 
  optimizer.zero_grad()
  outputs = net(x_train)
  loss = criterion(outputs, y_train)
  loss.backward()
  optimizer.step()
  
  # print statistics
  if epoch % 500 == 0:
      net.eval()
      #TODO: the way we calculate the output is wrong, we can't use if x > 0.5, but have to use max index per row
      #acc = sklearn.metrics.accuracy_score([1 if x > 0.5 else 0 for x in outputs.cpu().detach().numpy()], y_train.cpu().numpy())
      outputs = torch.max(outputs, 1)[1]
      acc = sklearn.metrics.accuracy_score(outputs.cpu().detach().numpy(), y_train.cpu().numpy())
      
      outputs_dev = net(x_test)
      #TODO: Same for the dev accuracy
      #acc_dev = sklearn.metrics.accuracy_score([1 if x > 0.5 else 0 for x in outputs_dev.cpu().detach().numpy()], y_dev.cpu().numpy())
      outputs_dev = torch.max(outputs_dev, 1)[1]
      acc_dev = sklearn.metrics.accuracy_score(outputs_dev.cpu().detach().numpy(), y_test.cpu().numpy())
      
      print("Epoch: {:4} Loss: {:.5f} Acc: {:.3f} Acc Dev: {:.3f}".format(epoch, loss.item(), acc, acc_dev))
      net.train()
print('Finished Training')

IGNORE THE ACC - WE ARE USING A SMALL SUBESET OF THE DATA
Epoch:    0 Loss: 0.69322 Acc: 0.510 Acc Dev: 0.510
Epoch:  500 Loss: 0.55342 Acc: 0.746 Acc Dev: 0.753
Epoch: 1000 Loss: 0.53806 Acc: 0.767 Acc Dev: 0.777
Epoch: 1500 Loss: 0.52300 Acc: 0.781 Acc Dev: 0.794
Epoch: 2000 Loss: 0.51873 Acc: 0.789 Acc Dev: 0.800
Epoch: 2500 Loss: 0.52207 Acc: 0.782 Acc Dev: 0.802
Epoch: 3000 Loss: 0.51813 Acc: 0.790 Acc Dev: 0.805
Epoch: 3500 Loss: 0.51918 Acc: 0.782 Acc Dev: 0.805
Epoch: 4000 Loss: 0.52331 Acc: 0.781 Acc Dev: 0.800
Epoch: 4500 Loss: 0.52825 Acc: 0.773 Acc Dev: 0.792
Finished Training


In [0]:
print(outputs)

tensor([[2.2199e-01, 7.7617e-01],
        [2.4514e-03, 9.9758e-01],
        [3.5216e-02, 9.6457e-01],
        ...,
        [7.8378e-05, 9.9992e-01],
        [1.3793e-01, 8.6082e-01],
        [1.0000e+00, 1.7635e-07]], device='cuda:0', grad_fn=<SigmoidBackward>)
