In [None]:
import torch, torchvision
import torch.nn as nn
from torch.utils import data
import numpy as np
from gensim.test.utils import common_texts
import gensim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from gensim.scripts.glove2word2vec import glove2word2vec
#load python helper functions
from read_data import *
from model_RNN_pytorch import GRU, LSTM
from dataset_pytorch import *
import importlib
import collections

from datetime import datetime
from tensorboardX import SummaryWriter

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

import pickle

In [None]:
# Reload imported modules automatically before each cell is executed, so that
# edits to the project helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## Params

In [None]:
# --- Data --------------------------------------------------------------
data_path = './data/'        # directory handed to create_train_test_data
DATATYPE = 'OWN'             # 'GLOVE' or 'OWN'
DIMENSION_SIZE = 200         # first constructor argument of the model; also part of output file names
validation_set_size = 2500   # number of training samples split off for validation
batch_size = 250

# --- Model -------------------------------------------------------------
model_type = 'LSTM'          # 'LSTM' selects the LSTM class, anything else the GRU class
hidden_nodes = 200
number_of_layers = 3
dropout = 0.5
OUTPUT_NODES = 1

# --- Training ----------------------------------------------------------
learning_rate = 0.001
max_epochs = 100
max_iter = 2500              # training stops once the iteration count hits a multiple of this
eval_freq = 50               # validation runs every `eval_freq` training iterations
reached_max_iter = False     # flag set by the training loop when max_iter is reached

# Tag used in the names of saved models / pickles. Note that 'DROPOUT' is
# appended unconditionally, whatever the value of `dropout` is.
DATATYPE += model_type + 'DROPOUT'

## Load word embeddings

In [None]:
# Convert the GloVe-format text file to word2vec text format, load the
# converted file as KeyedVectors, and expose the vectors to the project's
# Dataset class through its `en_model` class attribute.
glove_txt_path = "./data/self_200.txt"
word2vec_txt_path = './data/word2vec2.txt'

glove2word2vec(glove_input_file=glove_txt_path, word2vec_output_file=word2vec_txt_path)
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_txt_path, binary=False)
Dataset.en_model = word2vec

In [None]:
# TensorBoard writer; each run logs into its own timestamped directory.
# The seconds field must be "%S": lowercase "%s" is not a standard strftime
# code (on glibc it yields seconds since the epoch, elsewhere it may fail).
run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
writer = SummaryWriter('./tensorboard/TSN-E/TWITTER/' + run_timestamp, filename_suffix='.' + str(DIMENSION_SIZE))
# writer.add_embedding(torch.tensor(word2vec2[all_words]), metadata=(list(all_words))) Used for t-SNE

## Read dataset and get train, test and validation data

In [None]:
# Load the train and test splits via the project helper imported from read_data.
train_data, train_labels, test_data, test_labels = create_train_test_data(
    path=data_path,
    store_dataframe=True,
    pretrained=True,
    tokenized=True,
)


In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

## Load model

In [None]:
# Pick the recurrent architecture from `model_type`: 'LSTM' selects the LSTM
# class, any other value falls back to the GRU class. Both take the same
# constructor arguments.
use_lstm = model_type == 'LSTM'
print("Init LSTM..." if use_lstm else "Init GRU...")
rnn_class = LSTM if use_lstm else GRU
model = rnn_class(DIMENSION_SIZE, hidden_nodes, OUTPUT_NODES, batch_size, number_of_layers, dropout)
model.to(device)

## helper functions

In [None]:
def collate_fn(data):
    """Build one padded mini-batch from a list of (sequence, label) samples.

    The samples are ordered by sequence length, longest first (the order
    required by ``pack_padded_sequence`` with its default
    ``enforce_sorted=True``), and the sequences are zero-padded to the length
    of the longest one.

    Args:
        data: list of (sequence, label) tuples.
            - sequence: numpy array whose first axis is the sequence length
              (presumably (seq_length, embedding_dim) -- confirm against the
              Dataset class).
            - label: label of the sequence.

    Returns:
        A tuple ``(padded_sequences, labels, seq_lengths)``:
            - padded_sequences: result of ``pad_sequence`` with its default
              ``batch_first=False``, i.e. shaped (max_len, batch, ...).
            - labels: ``torch.Tensor`` built from the labels, in sorted order.
            - seq_lengths: list with the unpadded length of every sequence,
              in descending order.
    """
    # sorted() rather than list.sort(): the caller's list is left untouched.
    ordered = sorted(data, key=lambda sample: len(sample[0]), reverse=True)
    # Unzip into parallel tuples of sequences and labels.
    sequences, labels = zip(*ordered)
    seq_lengths = [len(seq) for seq in sequences]

    tensors = [torch.from_numpy(seq) for seq in sequences]
    padded_sequences = pad_sequence(tensors)

    return padded_sequences, torch.Tensor(labels), seq_lengths

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with numpy.

    Works element-wise on numpy arrays as well as on plain scalars.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def accuracy(prediction, target):
    """Fraction of positions at which `prediction` equals `target`.

    The number of element-wise matches is divided by the size of the first
    axis of `target`.
    """
    n_matches = np.sum(target == prediction)
    n_total = target.shape[0]
    return n_matches / n_total

def calculate_data_accuracy(data_generator, model, test_data=False):
    """Evaluate `model` on every batch produced by `data_generator`.

    The model is switched to eval mode and gradients are disabled. Relies on
    the module-level `device`, `loss_function`, `sigmoid` and `accuracy`.

    Args:
        data_generator: iterable yielding (inputs, labels, seq_lengths)
            batches, as produced by `collate_fn`.
        model: network exposing `init_hidden()` and `forward(inputs, lengths)`.
        test_data: when True, additionally collect per-sample correctness,
            both overall and grouped by sequence length.

    Returns:
        (mean_accuracy, mean_loss), both averaged over batches. When
        `test_data` is True the tuple is extended with
        (length_acc, document_acc): a dict mapping sequence length to a list
        of per-sample correctness flags, and an array of all correctness flags.
    """
    accuracy_list, loss_list = [], []
    if test_data:
        length_acc = collections.defaultdict(list)
        document_acc = []

    data_counter = 0
    model = model.eval()
    with torch.no_grad():
        for data_X, data_y, seq_length in data_generator:
            # The batch arrives as (max_len, batch, ...); swap the first two axes.
            data_X = data_X.transpose(0, 1)
            data_X, data_y = data_X.to(device), data_y.to(device)

            # Start every batch from a fresh hidden state.
            model.hidden = model.init_hidden()

            data_out = model.forward(data_X, seq_length)
            data_out, data_y = data_out.view(data_out.numel()), data_y.view(data_y.numel())

            data_loss = loss_function(data_out, data_y)

            # The model emits logits: squash and round once, then reuse.
            predictions = np.round(sigmoid(data_out.cpu().numpy()))
            targets = data_y.cpu().numpy()
            data_acc = accuracy(predictions, targets)

            if test_data:
                for i, item in enumerate(predictions == targets):
                    # Use the lengths of THIS batch (`seq_length`), not the
                    # global `seq_lengths` left behind by the training loop.
                    length_acc[seq_length[i]].append(item)
                    document_acc.append(item)

            # Store a plain float: a list of (possibly CUDA) tensors cannot
            # be turned into a numpy array afterwards.
            loss_list.append(data_loss.item())
            accuracy_list.append(data_acc)
            data_counter += 1

    accuracy_list = np.array(accuracy_list)
    loss_list = np.array(loss_list)

    mean_accuracy = accuracy_list.sum() / data_counter
    mean_loss = loss_list.sum() / data_counter

    if test_data:
        return mean_accuracy, mean_loss, length_acc, np.array(document_acc)
    return mean_accuracy, mean_loss

#fixme
def get_dataloaders(train_data, train_labels, test_data, test_labels, val_size=500, batch_size=256):
    """Create the training, validation and test DataLoaders.

    `val_size` randomly chosen samples are split off the training data to
    form the validation set. Every loader batches with `collate_fn` and drops
    the last incomplete batch.

    Returns:
        (training_generator, validation_generator, test_generator)
    """
    # Options common to all three loaders.
    shared = {'batch_size': batch_size, 'collate_fn': collate_fn, 'drop_last': True}

    full_training_set = Dataset(train_data, train_labels)
    n_train = len(full_training_set) - val_size
    training_set, validation_set = torch.utils.data.random_split(full_training_set, [n_train, val_size])

    test_set = Dataset(test_data, test_labels)

    # Training and validation batches are shuffled; the test set is read in
    # order, with one worker process.
    training_generator = data.DataLoader(training_set, shuffle=True, num_workers=0, **shared)
    validation_generator = data.DataLoader(validation_set, shuffle=True, num_workers=0, **shared)
    test_generator = data.DataLoader(test_set, shuffle=False, num_workers=1, **shared)

    return training_generator, validation_generator, test_generator

## dataloader

In [None]:
# Get dataloaders
training_generator, validation_generator, test_generator = get_dataloaders(
    train_data,
    train_labels,
    test_data,
    test_labels,
    val_size=validation_set_size,
    batch_size=batch_size,
)

## Init Optimizer

In [None]:
import torch.optim as optim

# Binary cross-entropy computed directly on the raw model outputs (logits).
loss_function = nn.BCEWithLogitsLoss()
# Pass the configured step size explicitly; previously `learning_rate` was
# defined in the parameter cell but never handed to the optimizer.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

## TRAIN

In [None]:
# Loop over epochs
validation_accuracy, train_accuracy = [], []
validation_loss, train_loss = [], []

iterations = 0
train_acc = 0
train_counter = 0
best_acc = 0

print('Starting training')
for epoch in range(max_epochs):
    for sample in training_generator:
         # get the inputs
        train_X, train_y, seq_lengths = sample
        train_X = train_X.transpose(0,1)
        train_X, train_y = train_X.to(device), train_y.to(device)
        model.zero_grad()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()
        model = model.train()
        train_out = model.forward(train_X, seq_lengths)
        train_out, train_y = train_out.view(train_out.numel()), train_y.view(train_y.numel())

        loss = loss_function(train_out,train_y)
        loss.backward()
        optimizer.step()   
         
        writer.add_scalar('loss', loss.item(), iterations)
        writer.add_scalar('accuracy', accuracy(np.round(sigmoid(train_out.cpu().detach().numpy())), train_y.cpu().detach().numpy()), iterations)
        train_loss.append(loss.item())
        train_accuracy.append(accuracy(np.round(sigmoid(train_out.cpu().detach().numpy())), train_y.cpu().detach().numpy()))
         
        iterations += 1
        train_counter += 1
        train_acc += train_accuracy[-1]
        if iterations % eval_freq == 0:
            val_acc, val_loss = calculate_data_accuracy(validation_generator, model)
            validation_accuracy.append(val_acc)
            validation_loss.append(val_loss)
            
            # Not very nice to calculate train acc and loss like this, maybe fix it later.
            print("Iteration accuracy: %.2f, Train accuracy: %.2f, Validation_accuracy: %.2f, loss: %.3f " % (iterations, (train_acc / train_counter), val_acc, np.mean(np.asarray(train_loss))))
            train_acc = 0
            train_counter = 0
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(model.state_dict(),'./pickles/best_model_'+str(DIMENSION_SIZE)+'_'+str(DATATYPE)+'.pt')
            
        if iterations % max_iter == 0:
            print('Reached maximum number of iterations')
            reached_max_iter = True
            break

    if reached_max_iter == True:
        break

test_acc, test_loss, test_length_acc, test_document_acc = calculate_data_accuracy(test_generator, model, test_data=True)
print('Accuracy on the test set: %.3f' % test_acc)

## DUMP DATA

In [None]:
# Persist the training curves and the test-set statistics as pickles named
# './pickles/<stem>_<DIMENSION_SIZE>_<DATATYPE>.pkl'.
# Note: the 'val_length_acc' / 'val_document_acc' files hold the statistics
# computed on the TEST set (test_length_acc / test_document_acc).
file_suffix = '_' + str(DIMENSION_SIZE) + '_' + str(DATATYPE) + '.pkl'

# (file stem, object to store, convert to numpy array before dumping?)
outputs = [
    ('train_accuracy', train_accuracy, True),
    ('validation_accuracy', validation_accuracy, True),
    ('train_loss', train_loss, True),
    ('validation_loss', validation_loss, True),
    ('val_length_acc', test_length_acc, False),
    ('val_document_acc', test_document_acc, False),
]

for stem, payload, as_array in outputs:
    with open('./pickles/' + stem + file_suffix, 'wb') as f:
        pickle.dump(np.array(payload) if as_array else payload, f)