In [1]:
# Scrierea și comentarea codului.           Done
# Implementarea clasei Vocabular.
# Implementarea clasei Data set.            Done
# Implementarea rețelei neuronale.          Done
# Implementarea ciclului de învățare.       Done
# Prezentarea graficului cu Learning Curve pentru acuratețe și eroare.  Done
# Reantrenarea modelului cu cel mai bun rezultat după learning curve. Done

In [21]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from torch import nn as nn
from collections import Counter
import gensim
import os

In [3]:
# #Implementing Dataset class
# class Data(Dataset):
#     def __init__(self, path):
#         self.data = pd.read_csv(path, usecols = [2,3], names=['sentiment', 'content'])

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index):
#         return self.data.iloc[index].values

In [4]:
# Dataset wrapper that exposes a tabular object as tensor rows
class Data(Dataset):
    """Map-style dataset backed by a single tensor.

    The constructor converts ``data.values`` to a tensor with ``torch.tensor``,
    so ``data`` must expose a ``.values`` array that ``torch.tensor`` can
    convert. Each item is one row of that tensor.
    """

    def __init__(self, data):
        """Store the values of ``data`` as ``self.data``."""
        raw_values = data.values
        self.data = torch.tensor(raw_values)

    def __len__(self):
        """Return the number of rows held by the dataset."""
        row_count = len(self.data)
        return row_count

    def __getitem__(self, index):
        """Return the row stored at position ``index``."""
        row = self.data[index]
        return row

In [5]:
# Vocabulary class
class Vocabulary:
    """Text preprocessing pipeline for the train / test tweet CSV files.

    Both files are read with columns 2 and 3 only, named ``sentiment`` and
    ``content``. The preprocessing steps add and then progressively rewrite a
    ``tokens`` column (a list of strings per row) on both frames.

    Attributes:
        train, test: DataFrames holding ``sentiment``, ``content`` and, once
            ``text2tokens`` has run, ``tokens``.
        tokenizer: stored as given; not used by any method of this class.
        stemmer: object exposing ``stem(word)``; required by ``stemming``.
        hapaxes: list of words seen exactly once in the training tokens
            (set by ``remove_hapaxes``).
        model_train, model_test: Word2Vec models (set by ``word2vec``).
    """

    def __init__(self, path_to_train, path_to_test, tokenizer=None, stemmer=None):
        """Load both CSV files and remember the tokenizer / stemmer.

        ``tokenizer`` and ``stemmer`` default to ``None`` so that callers who
        pass only ``stemmer=`` by keyword keep working.
        """
        column_names = ['sentiment', 'content']
        self.train = pd.read_csv(path_to_train, usecols=[2, 3], names=column_names)
        self.test = pd.read_csv(path_to_test, usecols=[2, 3], names=column_names)
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.trained = False
        self.model_train = None
        self.model_test = None

    def _map_tokens(self, transform):
        """Replace every row's token list, in both splits, by ``transform(tokens)``."""
        self.train['tokens'] = self.train['tokens'].apply(transform)
        self.test['tokens'] = self.test['tokens'].apply(transform)

    def word2vec(self, window=5, min_count=1):
        """Train one Word2Vec model per split and return ``(model_train, model_test)``.

        Each row's token list is passed as one sentence. Word2Vec expects an
        iterable of token lists; concatenating all rows into one flat list
        would make it treat every word as a sentence of characters.

        The embedding size is left at gensim's default of 100, which avoids
        the ``size`` / ``vector_size`` keyword rename between gensim 3 and 4.
        """
        train_sentences = [list(tokens) for tokens in self.train['tokens']]
        test_sentences = [list(tokens) for tokens in self.test['tokens']]
        self.model_train = gensim.models.Word2Vec(
            train_sentences, min_count=min_count, window=window)
        self.model_test = gensim.models.Word2Vec(
            test_sentences, min_count=min_count, window=window)
        return self.model_train, self.model_test

    def tokenize(self,):
        """Placeholder kept for interface compatibility; does nothing and returns None."""

    def text2tokens(self):
        """Create the ``tokens`` column from ``content`` on both splits.

        Each text is split with NLTK's ``word_tokenize``; words of length two
        or less are dropped and the remainder lower-cased. Tokens are stored
        as plain lists, the same type every later step produces.
        """
        def to_tokens(text):
            return [word.lower() for word in word_tokenize(str(text)) if len(word) > 2]

        self.train['tokens'] = self.train['content'].apply(to_tokens)
        self.test['tokens'] = self.test['content'].apply(to_tokens)

    def remove_stop_words(self,):
        """Drop NLTK English stop words from every token list."""
        # Build the stop-word set once instead of reloading the list per word.
        stop_words = frozenset(stopwords.words('english'))
        self._map_tokens(lambda tokens: [word for word in tokens if word not in stop_words])

    def remove_hapaxes(self,):
        """Drop words that occur exactly once in the training tokens.

        The hapax list is computed from the training split only and is then
        removed from both splits. It is kept on ``self.hapaxes``.
        """
        counts = Counter()
        for tokens in self.train['tokens']:
            counts.update(tokens)
        self.hapaxes = [word for word, freq in counts.items() if freq == 1]
        # Set membership keeps the filtering linear in the number of tokens.
        hapax_set = frozenset(self.hapaxes)
        self._map_tokens(lambda tokens: [word for word in tokens if word not in hapax_set])

    def stemming(self,):
        """Replace every token by ``self.stemmer.stem(token)``.

        Raises:
            ValueError: if no stemmer was supplied.
        """
        if self.stemmer is None:
            raise ValueError('stemming() requires a stemmer exposing stem(word)')
        stem = self.stemmer.stem
        self._map_tokens(lambda tokens: [stem(word) for word in tokens])

    def preprocess(self):
        """Run the full pipeline and return the processed splits.

        Steps: tokenise, remove stop words, remove hapaxes, stem, then train
        the Word2Vec models (kept on ``self.model_train`` / ``self.model_test``).

        Returns:
            Tuple ``(train, test)`` of DataFrames with the columns
            ``tokens`` and ``sentiment``.
        """
        self.text2tokens()
        self.remove_stop_words()
        self.remove_hapaxes()
        self.stemming()

        self.word2vec()

        return self.train[['tokens', 'sentiment']], self.test[['tokens', 'sentiment']]
    

In [6]:
# Network class
class GRU(nn.Module):
    """GRU encoder followed by a ReLU and a linear output layer.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: size of the GRU hidden state.
        output_dim: number of values produced per sequence.
        n_layers: number of stacked GRU layers.
        drop_prob: dropout probability between stacked GRU layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob=0.2):
        super(GRU, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # nn.GRU applies dropout only between layers; with a single layer it
        # has no effect and only triggers a warning, so it is disabled there.
        inter_layer_dropout = drop_prob if n_layers > 1 else 0.0
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers,
                          batch_first=True, dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x, h=None):
        """Run the network on a batch-first input.

        Args:
            x: input of shape (batch, seq_len, input_dim), since the GRU is
                built with ``batch_first=True``.
            h: optional initial hidden state; when ``None`` the GRU starts
                from zeros.

        Returns:
            Tuple ``(out, h)``: ``out`` is the linear layer applied to the
            ReLU of the last time step; ``h`` is the final hidden state.
        """
        out, h = self.gru(x, h)
        out = self.fc(self.relu(out[:, -1]))
        return out, h

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (n_layers, batch_size, hidden_dim).

        The tensor is created with the dtype and device of the model's own
        parameters, so no global device variable is needed.
        """
        weight = next(self.parameters())
        return weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)

In [7]:
# Train loop
def _run_epoch(model, loader, loss_fn, device, optimizer=None):
    """Run one pass over ``loader``; update the model only when ``optimizer`` is given.

    Returns ``(mean_batch_loss, accuracy)`` as Python floats.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    batch_count = 0
    correct = 0
    total = 0

    with torch.set_grad_enabled(is_training):
        for example, labels in loader:
            # Move the batch to the device the model lives on.
            example = example.to(device).float()
            labels = labels.to(device).float()

            # Give the labels the same number of dimensions as the outputs.
            if labels.dim() == 1:
                labels = labels.unsqueeze(1)

            # Recurrent models here return (output, hidden); keep the output.
            output = model(example)
            if isinstance(output, tuple):
                output = output[0]

            loss = loss_fn(output, labels)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # .item() detaches the value so no autograd graph is kept alive.
            loss_sum += loss.item()
            batch_count += 1

            # Binary decision at probability 0.5. Outputs are treated as
            # logits only when the loss is BCEWithLogitsLoss.
            scores = output.detach()
            if isinstance(loss_fn, nn.BCEWithLogitsLoss):
                scores = torch.sigmoid(scores)
            predicted = (scores >= 0.5).float()

            correct += int((predicted == labels).sum())
            total += labels.numel()

    mean_loss = loss_sum / batch_count if batch_count else 0.0
    accuracy = correct / total if total else 0.0
    return mean_loss, accuracy


def _plot_learning_curves(train_loss, test_loss, train_accuracy, test_accuracy):
    """Draw loss and accuracy learning curves side by side."""
    epochs_range = range(1, len(train_loss) + 1)

    # With nrows=1 the axes come back as a 1-D array of two Axes.
    fig, (ax_loss, ax_acc) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

    ax_loss.plot(epochs_range, train_loss, 'g', label='Training loss')
    ax_loss.plot(epochs_range, test_loss, 'b', label='validation loss')
    ax_loss.set_title('Training and Validation loss')
    ax_loss.set_xlabel('Epochs')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()

    ax_acc.plot(epochs_range, train_accuracy, 'g', label='Training accuracy')
    ax_acc.plot(epochs_range, test_accuracy, 'b', label='validation accuracy')
    ax_acc.set_title('Training and Validation accuracy')
    ax_acc.set_xlabel('Epochs')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.legend()

    fig.tight_layout()
    plt.show()


def train(epochs, optimizer, model, loss_fn, train_loader, test_loader, l2=0.001, print_plot=True):
    """Train ``model`` and evaluate it after every epoch.

    Args:
        epochs: number of passes over ``train_loader``.
        optimizer: optimizer already bound to ``model``'s parameters.
        model: network called as ``model(batch)``; it may return a tensor or
            a tuple whose first element is the output.
        loss_fn: binary loss taking ``(output, labels)``. With
            ``nn.BCEWithLogitsLoss`` outputs are treated as logits, otherwise
            as probabilities.
        train_loader, test_loader: iterables yielding ``(example, labels)``.
        l2: currently unused; kept so existing calls remain valid.
        print_plot: when True, plot the loss and accuracy learning curves.

    Returns:
        Dict with the per-epoch arrays ``train_loss``, ``test_loss``,
        ``train_accuracy`` and ``test_accuracy``. Losses are averaged over
        batches so the two splits are comparable on one plot.
    """
    # Use the device the model is on rather than a global variable.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    train_accuracy = np.zeros(epochs)
    test_accuracy = np.zeros(epochs)
    train_loss = np.zeros(epochs)
    test_loss = np.zeros(epochs)

    for epoch in range(epochs):
        train_loss[epoch], train_accuracy[epoch] = _run_epoch(
            model, train_loader, loss_fn, device, optimizer=optimizer)
        test_loss[epoch], test_accuracy[epoch] = _run_epoch(
            model, test_loader, loss_fn, device)

        # Report progress every tenth epoch.
        if epoch % 10 == 0:
            print('Epoch {0}, Training loss - {1}, Validation loss {2} \n'.format(
                epoch, train_loss[epoch], test_loss[epoch]))

    if print_plot:
        _plot_learning_curves(train_loss, test_loss, train_accuracy, test_accuracy)

    return {
        'train_loss': train_loss,
        'test_loss': test_loss,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
    }

In [8]:
# Locate the training and validation CSV files inside the data folder
path_to_data = 'GRU/data'
path_to_train, path_to_test = (
    os.path.join(path_to_data, file_name)
    for file_name in ('twitter_training.csv', 'twitter_validation.csv')
)

# Build the vocabulary with a Porter stemmer and no tokenizer

ps = PorterStemmer()

vocabulary = Vocabulary(path_to_train, path_to_test, tokenizer=None, stemmer=ps)

In [9]:
# self.text2tokens()
#         self.remove_stop_words()
#         self.remove_hapaxes()
#         self.stemming()

In [10]:
# Step 1: build the 'tokens' column on both splits from the raw 'content' text.
vocabulary.text2tokens()

In [11]:
# Inspect the training frame after tokenisation.
vocabulary.train

Unnamed: 0,sentiment,content,tokens
0,Positive,im getting on borderlands and i will murder yo...,"[getting, borderlands, and, will, murder, you,..."
1,Positive,I am coming to the borders and I will kill you...,"[coming, the, borders, and, will, kill, you, all]"
2,Positive,im getting on borderlands and i will kill you ...,"[getting, borderlands, and, will, kill, you, all]"
3,Positive,im coming on borderlands and i will murder you...,"[coming, borderlands, and, will, murder, you, ..."
4,Positive,im getting on borderlands 2 and i will murder ...,"[getting, borderlands, and, will, murder, you,..."
...,...,...,...
74677,Positive,Just realized that the Windows partition of my...,"[just, realized, that, the, windows, partition..."
74678,Positive,Just realized that my Mac window partition is ...,"[just, realized, that, mac, window, partition,..."
74679,Positive,Just realized the windows partition of my Mac ...,"[just, realized, the, windows, partition, mac,..."
74680,Positive,Just realized between the windows partition of...,"[just, realized, between, the, windows, partit..."


In [12]:
# Step 2: drop NLTK English stop words from every token list (both splits).
vocabulary.remove_stop_words()

In [13]:
# vocabulary.train['content'].astype(str).sum()

In [14]:
# Step 3: drop words that appear only once in the training tokens (hapaxes) from both splits.
vocabulary.remove_hapaxes()

In [15]:
# Inspect the training frame after stop-word and hapax removal.
vocabulary.train

Unnamed: 0,sentiment,content,tokens
0,Positive,im getting on borderlands and i will murder yo...,"[getting, borderlands, murder]"
1,Positive,I am coming to the borders and I will kill you...,"[coming, borders, kill]"
2,Positive,im getting on borderlands and i will kill you ...,"[getting, borderlands, kill]"
3,Positive,im coming on borderlands and i will murder you...,"[coming, borderlands, murder]"
4,Positive,im getting on borderlands 2 and i will murder ...,"[getting, borderlands, murder]"
...,...,...,...
74677,Positive,Just realized that the Windows partition of my...,"[realized, windows, partition, mac, like, year..."
74678,Positive,Just realized that my Mac window partition is ...,"[realized, mac, window, partition, years, behi..."
74679,Positive,Just realized the windows partition of my Mac ...,"[realized, windows, partition, mac, years, beh..."
74680,Positive,Just realized between the windows partition of...,"[realized, windows, partition, mac, like, year..."


In [16]:
# Step 4: replace every token by its stem, using the stemmer given to Vocabulary.
vocabulary.stemming()

In [17]:
# Inspect the training frame after stemming.
vocabulary.train

Unnamed: 0,sentiment,content,tokens
0,Positive,im getting on borderlands and i will murder yo...,"[get, borderland, murder]"
1,Positive,I am coming to the borders and I will kill you...,"[come, border, kill]"
2,Positive,im getting on borderlands and i will kill you ...,"[get, borderland, kill]"
3,Positive,im coming on borderlands and i will murder you...,"[come, borderland, murder]"
4,Positive,im getting on borderlands 2 and i will murder ...,"[get, borderland, murder]"
...,...,...,...
74677,Positive,Just realized that the Windows partition of my...,"[realiz, window, partit, mac, like, year, behi..."
74678,Positive,Just realized that my Mac window partition is ...,"[realiz, mac, window, partit, year, behind, nv..."
74679,Positive,Just realized the windows partition of my Mac ...,"[realiz, window, partit, mac, year, behind, nv..."
74680,Positive,Just realized between the windows partition of...,"[realiz, window, partit, mac, like, year, behi..."


In [24]:
# Quick Word2Vec experiment on the first five training rows.
# Each row's token list is passed as one sentence; concatenating the rows
# with .sum() would make Word2Vec treat every word as a sentence of characters.
sample_sentences = vocabulary.train.head()['tokens'].tolist()
model_train = gensim.models.Word2Vec(sample_sentences, min_count=1, window=5)

In [25]:
# Display the Word2Vec model object created in the previous cell.
model_train

<gensim.models.word2vec.Word2Vec at 0x20769700fa0>

In [23]:
# Loading data
path_to_data = 'GRU/data'
path_to_train = os.path.join(path_to_data, 'twitter_training.csv')
path_to_test = os.path.join(path_to_data, 'twitter_validation.csv')

# Applying the vocabulary

ps = PorterStemmer()

# tokenizer is passed explicitly: it is a required argument of Vocabulary.
vocabulary = Vocabulary(path_to_train, path_to_test, tokenizer=None, stemmer=ps)
train_set, test_set = vocabulary.preprocess()


# NOTE(review): Data converts the frame with torch.tensor(data.values), which
# needs numeric values; the token lists and sentiment strings returned by
# preprocess() presumably have to be encoded as numbers first. Confirm.
train_dataset = Data(train_set)
test_dataset = Data(test_set)


train_loader = DataLoader(
    train_dataset, batch_size=32, shuffle=True
)
# Evaluation does not depend on sample order, so the test split is not shuffled.
test_loader = DataLoader(
    test_dataset, batch_size=32, shuffle=False
)

In [None]:
# Testing model

# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

learning_rate = 1e-3

# Network dimensions.
INPUT_DIM = 100   # matches size = 100 used for the Word2Vec vectors in Vocabulary
HIDDEN_DIM = 128  # NOTE(review): placeholder value; tune as needed
OUTPUT_DIM = 1    # one value per example, matching labels.unsqueeze(1) in train
N_LAYERS = 2      # NOTE(review): placeholder value; tune as needed

# Initializing the model with the number of input features
model = GRU(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(DEVICE)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# GRU.forward ends in a linear layer without a sigmoid, so its outputs are
# unbounded; BCELoss needs values in [0, 1], hence the logits variant.
loss_fn = nn.BCEWithLogitsLoss()

In [None]:
# Run the training loop on the data with the configured parameters.
# The function defined in this notebook is train(epochs, ...).
history = train(
    epochs=10,
    optimizer=optimizer,
    model=model,
    loss_fn=loss_fn,
    train_loader=train_loader,
    test_loader=test_loader,
    print_plot=True,
)

In [None]:
# Model retrain: start again from a freshly initialised network

# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

learning_rate = 1e-3

# Network dimensions.
INPUT_DIM = 100   # matches size = 100 used for the Word2Vec vectors in Vocabulary
HIDDEN_DIM = 128  # NOTE(review): placeholder value; tune as needed
OUTPUT_DIM = 1    # one value per example, matching labels.unsqueeze(1) in train
N_LAYERS = 2      # NOTE(review): placeholder value; tune as needed

# Initializing the model with the number of input features
model = GRU(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(DEVICE)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# GRU.forward ends in a linear layer without a sigmoid, so its outputs are
# unbounded; BCELoss needs values in [0, 1], hence the logits variant.
loss_fn = nn.BCEWithLogitsLoss()

# Run the training loop on the data with the configured parameters.
# The function defined in this notebook is train(epochs, ...).
history = train(
    epochs=10,
    optimizer=optimizer,
    model=model,
    loss_fn=loss_fn,
    train_loader=train_loader,
    test_loader=test_loader,
    print_plot=True,
)