In [6]:
# Importing necessary packages
import spacy
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
nlp = spacy.load('en_core_web_sm')

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import TensorDataset, DataLoader, random_split

import gc

ModuleNotFoundError: No module named 'pandas'

In [2]:
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a notebook-level global by the model and training code below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


# Functions for sentence preprocessing

In [3]:
def remPunc(text):
  """Strip HTML-style tags and every non-word character from `text`.

  The text is lower-cased before the non-word characters are removed, so the
  result contains only lower-case letters, digits and underscores.

  Parameters
  ----------
  text : str
      Text to clean. Subclasses of str are accepted as well.

  Returns
  -------
  str or None
      The cleaned text (possibly the empty string), or None when `text`
      is not a string.
  """
  if not isinstance(text, str):
    return None
  # Raw strings: the non-word escape is not a valid escape in a normal
  # Python string literal and triggers an invalid-escape warning.
  text = re.sub(r'<[^>]*>', '', text)
  return re.sub(r'\W+', '', text.lower())

def refineSentence(tempSentence):
  """Tokenise a sentence and return its cleaned, non-stop-word lemmas.

  The input is converted with str() and run through the notebook-level `nlp`
  pipeline. Tokens flagged as stop words are skipped; every remaining lemma is
  passed through remPunc and kept unless the result is the empty string.

  Returns
  -------
  list
      The cleaned lemmas, in token order.
  """
  cleanedLemmas = []
  for tok in nlp(str(tempSentence)):
    if tok.is_stop:
      continue
    lemma = remPunc(tok.lemma_)
    if lemma != '':
      cleanedLemmas.append(lemma)
  return cleanedLemmas

In [4]:
def posTo1Negto0(tempStr):
  """Encode a label: 1 when it equals "positive", 0 for anything else."""
  return 1 if tempStr == "positive" else 0


# Getting GloVe Embeddings

In [5]:
# Build the word -> vector lookup from the GloVe text file.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are its vector components (stored as a float32 array).
dim = 300
embeddingsDict = {}
with open("./Data/glove.6B.300d.txt", 'r', encoding="utf-8") as gloveFile:
  for row in gloveFile:
    fields = row.split()
    embeddingsDict[fields[0]] = np.asarray(fields[1:], 'float32')

Function to convert each refined (tokenised) sentence into a matrix of GloVe word vectors

In [6]:
def sentenceToWordEmbeddings(preProccessedList):
  """Look up the embedding of every known word of every sentence.

  Parameters
  ----------
  preProccessedList : iterable of iterables of str
      One collection of words per sentence. Words missing from the
      notebook-level `embeddingsDict` are skipped.

  Returns
  -------
  list of np.ndarray
      One 2-D array per sentence with one column per known word (the stacked
      word vectors, transposed). A sentence without any known word yields an
      array of shape (dim, 0).
  """
  preProccessedListVectors = []
  for preProcessedWordList in preProccessedList:
    knownVectors = [embeddingsDict[word] for word in preProcessedWordList if word in embeddingsDict]
    if knownVectors:
      sentenceMatrix = np.array(knownVectors).T
    else:
      # np.array([]).T is 1-D (shape (0,)); callers read .shape[1], so keep the
      # result 2-D with zero columns instead.
      sentenceMatrix = np.zeros((dim, 0), dtype=np.float32)
    preProccessedListVectors.append(sentenceMatrix)
  return preProccessedListVectors

Function to convert raw (review, label) rows into a padded TensorDataset

In [7]:
def findTensorDataset(tempData, n_samples=None):
  """Convert raw rows into a zero-padded TensorDataset.

  Column 0 of every row is preprocessed with refineSentence and embedded with
  sentenceToWordEmbeddings; column 1 is encoded with posTo1Negto0.

  Parameters
  ----------
  tempData : sequence of rows
      Each row holds the text in column 0 and the label in column 1.
  n_samples : int or None, optional
      Only the first `n_samples` rows are used. None (the default) uses
      every row.

  Returns
  -------
  TensorDataset
      X: float32 tensor of shape (rows, max_len, dim), where max_len is the
      largest number of embedded words in any row and shorter rows are
      zero-padded at the end. y: int64 tensor of shape (rows,).
  """
  # dtype=object keeps the Python strings as they are; without it numpy builds
  # a fixed-width unicode array sized by the longest text in the data.
  tempData = np.array(tempData, dtype=object)[:n_samples]

  # Splitting Data into X and y
  X_sentences = tempData[:,0]
  y = [posTo1Negto0(label) for label in tempData[:,1]]

  # PreProcessing the Data
  preProccessedList = [refineSentence(sentence) for sentence in X_sentences]

  # wordsVector is a list of 2-D arrays (one per sample), since the samples
  # have a variable number of words and cannot be stacked directly.
  wordsVector = sentenceToWordEmbeddings(preProccessedList)

  # Number of embedded words per sample. A sample without any known word may
  # arrive as a 1-D empty array, which counts as length 0.
  seqLens = [m.shape[1] if m.ndim == 2 else 0 for m in wordsVector]
  max_len = max(seqLens, default=0)

  # Adding padding: write every sample into a pre-zeroed float32 block instead
  # of concatenating float64 padding sample by sample.
  X = np.zeros((len(wordsVector), max_len, dim), dtype=np.float32)
  for row, (matrix, length) in enumerate(zip(wordsVector, seqLens)):
    if length:
      X[row, :length, :] = matrix.T

  X = torch.from_numpy(X)
  y = torch.tensor(y, dtype=torch.long)
  return TensorDataset(X, y)

HyperParameters

In [8]:
# HyperParameters
batchSize = 32      # mini-batch size given to the train/validation DataLoaders
epochs = 10         # number of passes over the training loader in SA_LSTM.fit
hidden_size = 512   # hidden_size of the nn.LSTM layer (and input width of the final Linear layer)
input_size = dim    # features per time step = length of one GloVe vector
# seq_size = dim
num_classes = 1     # output width of the final Linear layer: one unit for the binary label
num_layers = 1      # num_layers of the nn.LSTM layer
lr = 1e-2           # learning rate of the SGD optimizer

TRAIN DATA

In [None]:
# Load the training CSV as a list of rows.
trainPath = "./Data/Ass2/train.csv"
trainData = pd.read_csv(trainPath)
trainData = trainData.values.tolist()

n_samples = len(trainData)#//10
print(n_samples)
trainDatasetTensor = findTensorDataset(trainData, n_samples)

# 90% / 10% train / validation split.
train_size = int(n_samples*(0.9))
val_size = n_samples - train_size
# A seeded generator makes the split identical on every Restart & Run All.
splitGenerator = torch.Generator().manual_seed(0)
trainData, validData = random_split(trainDatasetTensor, [train_size,val_size], generator = splitGenerator)

# Reshuffle the training samples every epoch; validation order does not matter.
trainDataLoader = DataLoader(trainData, batch_size = batchSize, shuffle = True)
validDataLoader = DataLoader(validData, batch_size = batchSize)

40000


In [None]:
class LSTM(nn.Module):
  """Unidirectional LSTM followed by a Linear layer and a sigmoid.

  Parameters
  ----------
  input_size : int
      Number of features per time step.
  hidden_size : int
      Size of the LSTM hidden state.
  num_layers : int
      Number of stacked LSTM layers.
  num_classes : int
      Output width of the final Linear layer.
  """
  def __init__(self, input_size, hidden_size, num_layers, num_classes):
    super(LSTM, self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size

    self.LSTM = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, bias = True, batch_first = True, dropout = 0, bidirectional = False )

    self.fc = nn.Linear(hidden_size, num_classes)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return sigmoid outputs computed from the last time step.

    Parameters
    ----------
    x : torch.Tensor
        Batch-first input of shape (batch, seq_len, input_size).

    Returns
    -------
    torch.Tensor
        Tensor of shape (batch, num_classes) with values in (0, 1).
    """
    # Zero initial hidden/cell states, allocated on the input's own device so
    # the module does not depend on a notebook-level `device` global.
    stateShape = (self.num_layers, x.shape[0], self.hidden_size)
    h0 = ( torch.zeros(stateShape, device = x.device), torch.zeros(stateShape, device = x.device) )
    out, _ = self.LSTM(x, h0)
    # Keep only the output of the last time step.
    # NOTE(review): if the input is zero-padded at the end of the sequence,
    # this step is computed after the padding - confirm that is intended.
    out = out[:,-1]
    out = self.fc(out)
    out = self.sigmoid(out)
    return out

In [None]:
class SA_LSTM:
  """Training / evaluation wrapper around the binary `LSTM` classifier.

  Builds the network on the notebook-level `device`, trains it with SGD on a
  binary cross-entropy loss, and records one mean train and one mean
  validation loss per epoch for plotting.
  """

  def __init__(self, input_size, hidden_size, num_layers, num_classes, epochs,lr):
    """Create the network, the loss and the optimizer.

    Parameters
    ----------
    input_size, hidden_size, num_layers, num_classes
        Forwarded unchanged to `LSTM`.
    epochs : int
        Number of passes over the training loader performed by `fit`.
    lr : float
        Learning rate of the SGD optimizer.
    """
    self.input_size = input_size
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.num_classes = num_classes
    self.epochs = epochs

    self.NNobj = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

    # The network's forward pass already ends in a sigmoid, so its outputs are
    # probabilities: BCELoss is the matching loss. BCEWithLogitsLoss would
    # apply a second sigmoid on top of them.
    self.cross_entropy_loss = nn.BCELoss()
    self.optimizer = optim.SGD(self.NNobj.parameters(), lr = lr)

    # Per-batch losses of the current epoch.
    self.trainLossList = []
    self.validLossList = []
    # One mean loss (a Python float) per finished epoch.
    self.trainLossListPlotting = []
    self.validLossListPlotting = []

  @staticmethod
  def _mean(values):
    """Arithmetic mean of a list of floats; NaN for an empty list."""
    return sum(values) / len(values) if values else float('nan')

  @staticmethod
  def _countCorrect(probs, y):
    """Number of samples whose thresholded probability (>= 0.5) equals the label."""
    preds = (probs.detach().reshape(-1) >= 0.5).long()
    return preds.eq(y.reshape(-1)).sum().item()

  def _batchLoss(self, probs, y):
    """Binary cross-entropy between (batch, 1) probabilities and (batch, 1) float labels."""
    return self.cross_entropy_loss(probs.reshape(probs.shape[0],1), y.float().reshape(y.shape[0],1))

  def _evaluate(self, dataLoader, lossSink=None):
    """Run the network in eval mode over `dataLoader`.

    Appends every batch loss to `lossSink` when one is given and returns the
    sample-weighted accuracy in percent (NaN for an empty loader).
    """
    self.NNobj.eval()
    correct, seen = 0, 0
    with torch.no_grad():
      for x, y in dataLoader:
        x, y = x.to(device), y.to(device)
        forwardValue = self.NNobj(x)
        if lossSink is not None:
          lossSink.append(self._batchLoss(forwardValue, y).item())
        correct += self._countCorrect(forwardValue, y)
        seen += y.shape[0]
    return 100.0 * correct / seen if seen else float('nan')

  def fit(self, trainDataLoader, validDataLoader):
    """Train for `self.epochs` epochs and print loss/accuracy after each one.

    Parameters
    ----------
    trainDataLoader, validDataLoader
        Iterables yielding (x, y) batches; y holds 0/1 labels.
    """
    # Progress total comes from the loader itself rather than from notebook
    # globals, so it matches the data that is actually iterated.
    try:
      totalBatch = len(trainDataLoader)
    except TypeError:
      totalBatch = '?'

    for i in range(self.epochs):
      self.trainLossList = []
      self.validLossList = []
      correctTrain, seenTrain = 0, 0

      self.NNobj.train()
      for countBatch, (x, y) in enumerate(trainDataLoader):
        print('[',countBatch,'/',totalBatch,']')
        x, y = x.to(device), y.to(device)

        forwardValue = self.NNobj(x)
        costFunction_J = self._batchLoss(forwardValue, y)
        self.optimizer.zero_grad()
        costFunction_J.backward()

        # Clip the global gradient norm to 3 before the update.
        nn.utils.clip_grad_norm_(self.NNobj.parameters(), 3)

        self.optimizer.step()
        self.trainLossList.append(costFunction_J.item())
        correctTrain += self._countCorrect(forwardValue, y)
        seenTrain += y.shape[0]

      accuracyVal = self._evaluate(validDataLoader, self.validLossList)
      accuracyTrain = 100.0 * correctTrain / seenTrain if seenTrain else float('nan')

      trainLoss = self._mean(self.trainLossList)
      validLoss = self._mean(self.validLossList)
      self.trainLossListPlotting.append(trainLoss)
      self.validLossListPlotting.append(validLoss)

      print('At Epoch Number: ' + str(i+1) +'; Train Loss= ' + str("{:.2f}".format(trainLoss))+'; Validation Loss= ' + str("{:.2f}".format(validLoss)) + "; " + "Train Accuracy = ", "{:.2f}".format(accuracyTrain), "%" + "; " + "Validation Accuracy = ", "{:.2f}".format(accuracyVal), "%")

  def predict(self, testDataLoader):
    """Print and return the classification accuracy (in percent) on `testDataLoader`."""
    accuracy = self._evaluate(testDataLoader)
    print('\033[1m' + "Review Classification Accuracy on Test Data = ", "{:.2f}".format(accuracy), " %" + '\033[0m' )
    return accuracy

  def plotLossCurve(self, flag):
    """Plot the per-epoch loss: training loss when `flag` is 0, validation loss otherwise."""
    if(flag == 0):
      tempStr = "Loss curve for " + "Train Data"
      losses = self.trainLossListPlotting
    else:
      tempStr = "Loss curve for " + "Validation Data"
      losses = self.validLossListPlotting
    # One x value per recorded epoch, so the plot also works when fewer (or
    # more) than `self.epochs` epochs have been recorded.
    x = [(i+1) for i in range(len(losses))]
    plt.xlabel('#Epoch')
    plt.ylabel('Loss')
    plt.plot(x,losses)
    plt.title(tempStr)
    plt.show()


In [None]:
# Clear CUDA Memory
# Clear CUDA Memory
# Collect unreachable Python objects first: tensors they still hold keep their
# GPU blocks occupied, and empty_cache() only releases unoccupied cached memory.
gc.collect()
torch.cuda.empty_cache()

# Fitting the Model
obj = SA_LSTM(input_size, hidden_size, num_layers, num_classes, epochs,lr)
obj.fit(trainDataLoader, validDataLoader)
obj.plotLossCurve(0)
obj.plotLossCurve(1)

Loading Test Data & Finding Accuracy

In [None]:
# Load the test CSV (not the training file) as a list of rows.
testPath = "./Data/Ass2/test.csv"
testData = pd.read_csv(testPath)
testData = testData.values.tolist()

# findTensorDataset takes the number of rows to use; pass all of them.
testDatasetTensor = findTensorDataset(testData, len(testData))
# Evaluate in mini-batches instead of DataLoader's default batch size of 1.
testDataLoader = DataLoader(testDatasetTensor, batch_size = batchSize)

In [None]:
# Print the fitted model's classification accuracy on the batches served by testDataLoader.
obj.predict(testDataLoader)

In [None]:
# torch.cuda.memory_summary(device=None, abbreviated=False)