In [1]:
import os
import torch
import numpy as np
import csv

class Dictionary(object):
    """Vocabulary that maps words to consecutive integer ids and back.

    Optionally keeps a pretrained vector for each word that was first
    registered together with one.
    """

    def __init__(self):
        # word -> integer id
        self.word2idx = {}
        # integer id -> word; a word's position in this list is its id
        self.idx2word = []
        # Vectors of words first added with a vector, in insertion order;
        # should match index order of words in dict.
        self.pretrain_vec = []

    def add_word(self, word, vec=None):
        """Register ``word`` if it is unseen and return its integer id.

        Args:
            word: Hashable token to look up or insert.
            vec: Optional pretrained vector. It is stored only when ``word``
                is being inserted for the first time; it is ignored for a
                word that is already known.

        Returns:
            The id of ``word`` (newly assigned or pre-existing).
        """
        already_known = word in self.word2idx
        if not already_known:
            if vec is not None:
                self.pretrain_vec.append(vec)
            self.idx2word.append(word)
            # The new word sits at the end of the list, so its id is the last index.
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Builds a vocabulary and per-user token-id sequences from CSV files.

    On construction the pretrained vectors (if a language is given) are added
    to the dictionary first, then ``train.csv`` and ``test.csv`` are tokenized.
    Because the vectors are loaded first, words with a pretrained vector
    occupy the lowest dictionary ids.

    Attributes:
        dictionary: The shared ``Dictionary`` instance.
        pretrained: Return value of ``add_pretrained`` (currently always
            ``None``); ``None`` as well when no language is given.
        X_train, y_train, X_test, y_test: Outputs of ``tokenize``.
    """

    # A vector line is used only if it carries exactly this many components.
    EMBED_DIM = 300
    # A CSV row is used only if it has exactly this many columns.
    N_COLUMNS = 6
    # Only the first this-many rows of each user contribute token ids.
    MAX_TWEETS_PER_USER = 20

    def __init__(self, path, language):
        self.dictionary = Dictionary()
        # Always define the attribute so it exists even without a language.
        self.pretrained = None
        if language is not None:
            self.pretrained = self.add_pretrained(os.path.join('', 'wiki.' + language + '.vec'))
        # NOTE(review): `path` is accepted but not used; every file name is
        # joined with '' and therefore resolved against the working directory.
        # Left unchanged so existing runs keep finding their files.
        self.X_train, self.y_train = self.tokenize(os.path.join('', 'train.csv'), True)
        self.X_test, self.y_test = self.tokenize(os.path.join('', 'test.csv'), False)

    def add_pretrained(self, path):
        """Add every word of a whitespace-separated vector file to the dictionary.

        Each usable line is ``word v1 ... v300``. Lines with any other number
        of fields are skipped; that covers the two-field first line, blank
        lines, and the entry whose word is itself whitespace (the "space
        embedding").

        Args:
            path: Location of the vector file; must exist.

        Returns:
            None.

        Raises:
            AssertionError: If ``path`` does not exist.
            ValueError: If a component of a usable line is not a float.
        """
        assert os.path.exists(path)

        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if len(fields) != self.EMBED_DIM + 1:
                    continue
                self.dictionary.add_word(fields[0], list(map(float, fields[1:])))

    def tokenize(self, path, header):
        """Turn a CSV file into one flat token-id sequence per user.

        Rows are used only if they have ``N_COLUMNS`` columns and a label
        (column 1) made of digits. Column 0 is the whitespace-tokenized text.
        Consecutive rows sharing the same value in column 2 are treated as one
        user; the label of the user's first row becomes the user's label.

        Every word of every used row is added to the dictionary, but only the
        first ``MAX_TWEETS_PER_USER`` rows of a user contribute ids, and all of
        them are appended to a single inner list (flat structure, so each user
        has exactly one "tweet").

        Args:
            path: Location of the CSV file; must exist.
            header: If true, the first row of the file is skipped.

        Returns:
            ``(x, y)`` where ``x`` is a 1-D object array whose elements are
            ``[[id, id, ...]]`` and ``y`` is a 1-D int array of labels.

        Raises:
            AssertionError: If ``path`` does not exist.
        """
        assert os.path.exists(path)

        docs = []      # one [[ids...]] entry per user, in file order
        labels = []    # label of each user's first row
        prev_user = None
        tweet_idx = -1

        # newline='' is what the csv module expects of a file object.
        with open(path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            if header:
                next(reader, None)
            for row in reader:
                if len(row) != self.N_COLUMNS:
                    continue
                tweet, label = row[0], row[1]
                if not label.isdigit():
                    continue

                if row[2] != prev_user:  # new user
                    prev_user = row[2]
                    tweet_idx = -1
                    labels.append(int(label))
                    docs.append([])

                tweet_idx += 1
                within_limit = tweet_idx < self.MAX_TWEETS_PER_USER
                if tweet_idx == 0:
                    docs[-1].append([])
                for word in tweet.split():
                    # Words beyond the per-user limit still enter the
                    # vocabulary; they just do not appear in the sequence.
                    word_id = self.dictionary.add_word(word)
                    if within_limit:
                        docs[-1][0].append(word_id)

        # Fill element-wise: handing nested lists to np.array would let NumPy
        # try to build a multi-dimensional array instead of an array of lists.
        x = np.zeros(len(docs), dtype='object')
        for i, doc in enumerate(docs):
            x[i] = doc
        y = np.array(labels, dtype='int')
        return x, y


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

#attention functions

def batch_matmul_bias(seq, weight, bias, nonlinearity=''):
    """Apply ``seq[i] @ weight + bias`` to every slice of ``seq`` at once.

    Args:
        seq: 3-D tensor; each ``seq[i]`` is a 2-D matrix whose last dimension
            equals ``weight.size(0)``.
        weight: 2-D tensor of shape ``(in_features, out_features)``.
        bias: 2-D tensor of shape ``(out_features, 1)``; it is transposed and
            broadcast over the first two axes of the product.
        nonlinearity: ``'tanh'`` applies ``torch.tanh`` to the result; any
            other value leaves the result linear.

    Returns:
        The stacked results with every size-1 dimension removed.
    """
    # matmul broadcasts the 2-D weight over the leading axis, which replaces
    # the per-slice torch.mm loop and its repeated torch.cat re-copying.
    projected = torch.matmul(seq, weight) + bias.transpose(0, 1)
    if nonlinearity == 'tanh':
        projected = torch.tanh(projected)
    # NOTE(review): squeeze() without a dim drops *every* singleton axis, so a
    # leading or batch axis of size 1 disappears too. Kept for shape
    # compatibility with existing callers.
    return projected.squeeze()

def batch_matmul(seq, weight, nonlinearity=''):
    """Apply ``seq[i] @ weight`` to every slice of ``seq`` at once.

    Args:
        seq: 3-D tensor; each ``seq[i]`` is a 2-D matrix whose last dimension
            equals ``weight.size(0)``.
        weight: 2-D tensor of shape ``(in_features, out_features)``.
        nonlinearity: ``'tanh'`` applies ``torch.tanh`` to the result; any
            other value leaves the result linear.

    Returns:
        The stacked results with every size-1 dimension removed; with a
        single-column ``weight`` this is a 2-D tensor.
    """
    # One broadcast matmul replaces the per-slice torch.mm loop and the
    # repeated torch.cat that re-copied the accumulated result each step.
    projected = torch.matmul(seq, weight)
    if nonlinearity == 'tanh':
        projected = torch.tanh(projected)
    # NOTE(review): squeeze() without a dim also drops a leading or batch axis
    # of size 1. Kept for shape compatibility with existing callers.
    return projected.squeeze()

def attention_mul(rnn_outputs, att_weights):
    """Weight each slice of ``rnn_outputs`` and sum the slices together.

    Slice ``rnn_outputs[i]`` is scaled row-wise by ``att_weights[i]`` and the
    scaled slices are summed over the first axis.

    Args:
        rnn_outputs: 3-D tensor; ``rnn_outputs[i]`` has shape ``(rows, features)``.
        att_weights: 2-D tensor with one weight per ``(i, row)``; only the
            first ``rnn_outputs.size(0)`` entries along axis 0 are used.

    Returns:
        Tensor of shape ``(1, rows, features)``.
    """
    n_steps = rnn_outputs.size(0)
    # unsqueeze(-1) lets each scalar weight broadcast across the feature axis,
    # replacing the per-slice expand_as loop and its repeated torch.cat.
    weighted = rnn_outputs * att_weights[:n_steps].unsqueeze(-1)
    return torch.sum(weighted, 0).unsqueeze(0)

'''
def batch_matmul_bias(seq, weight, bias, nonlinearity=''):
    s = None
    bias_dim = bias.size()
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight) 
        _s_bias = _s + bias.expand(bias_dim[0], _s.size()[0]).transpose(0,1)
        if(nonlinearity=='tanh'):
            _s_bias = torch.tanh(_s_bias)
        _s_bias = _s_bias.unsqueeze(0)
        if(s is None):
            s = _s_bias
        else:
            s = torch.cat((s,_s_bias),0)
    return s.squeeze()
    
def batch_matmul(seq, weight, nonlinearity=''):
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        if(nonlinearity=='tanh'):
            _s = torch.tanh(_s)
        _s = _s.unsqueeze(0)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)
    return s.squeeze()

def attention_mul(self, rnn_outputs, att_weights):
    attn_vectors = None
    for i in range(rnn_outputs.size(0)):
        h_i = rnn_outputs[i]
        a_i = att_weights[i].unsqueeze(1).expand_as(h_i)
        h_i = a_i * h_i
        h_i = h_i.unsqueeze(0)
        if(attn_vectors is None):
            attn_vectors = h_i
        else:
            attn_vectors = torch.cat((attn_vectors,h_i),0)
    return torch.sum(attn_vectors, 0).unsqueeze(0)
'''
class AttentionWordRNN(nn.Module):
    """Embedding + GRU encoder with an attention layer and a linear classifier.

    The forward pass embeds token ids, runs a (bi)GRU over them, scores every
    GRU output with a learned projection, normalises the scores with a
    softmax, and classifies the attention-weighted sum of the GRU outputs.
    """

    def __init__(self, embeds, batch_size, num_tokens, embed_size, word_gru_hidden, dropout, n_classes, bidirectional= True):
        """Build the layers and initialise the embedding table.

        Args:
            embeds: Sequence of pretrained vectors for token ids
                ``0 .. len(embeds) - 1``; may be empty.
            batch_size: Batch dimension used by ``init_hidden``.
            num_tokens: Vocabulary size (rows of the embedding table).
            embed_size: Embedding dimensionality.
            word_gru_hidden: Hidden size of the GRU, per direction.
            dropout: Dropout probability used throughout ``forward``.
            n_classes: Number of output classes.
            bidirectional: Whether the GRU runs in both directions.
        """
        super(AttentionWordRNN, self).__init__()

        self.batch_size = batch_size
        self.num_tokens = num_tokens
        self.embed_size = embed_size
        self.word_gru_hidden = word_gru_hidden
        self.bidirectional = bidirectional
        self.drop = nn.Dropout(dropout)

        self.lookup = nn.Embedding(num_tokens, embed_size)

        initrange = 0.1

        # The first k indices are pretrained; the rest are unknown and get
        # small uniform random vectors.
        k = len(embeds)
        if k != 0:
            pretrained_rows = np.array(embeds)
            random_rows = np.random.uniform(-initrange, initrange, size=(num_tokens - k, embed_size))
            table = np.concatenate((pretrained_rows, random_rows), axis=0)
            self.lookup.weight.data.copy_(torch.from_numpy(table))
        else:
            self.lookup.weight.data.uniform_(-initrange, initrange)

        # A bidirectional GRU concatenates both directions, doubling the
        # feature width seen by the attention and classifier layers.
        self._num_directions = 2 if bidirectional else 1
        feature_size = self._num_directions * word_gru_hidden

        self.word_gru = nn.GRU(embed_size, word_gru_hidden, bidirectional=bool(bidirectional))
        self.weight_W_word = nn.Parameter(torch.Tensor(feature_size, feature_size))
        self.bias_word = nn.Parameter(torch.Tensor(feature_size, 1))
        self.weight_proj_word = nn.Parameter(torch.Tensor(feature_size, 1))
        self.final_linear = nn.Linear(feature_size, n_classes)

        # The softmax input in forward() is 2-D, for which the deprecated
        # implicit dim resolves to 1; state it explicitly.
        self.softmax_word = nn.Softmax(dim=1)
        self.weight_W_word.data.uniform_(-initrange, initrange)
        self.weight_proj_word.data.uniform_(-initrange, initrange)
        self.bias_word.data.uniform_(-initrange, initrange)

    def forward(self, embed, state_word):
        """Classify one batch of token-id sequences.

        Args:
            embed: Integer tensor of token ids laid out as the GRU expects with
                its default ``batch_first=False``, i.e. sequence axis first.
            state_word: Initial GRU hidden state, e.g. from ``init_hidden``.

        Returns:
            ``(log_probs, state_word, None)``: the log-softmax over classes,
            the final GRU hidden state, and a placeholder ``None``.
        """
        embedded = self.drop(self.lookup(embed))
        # word level gru
        output_word, state_word = self.word_gru(embedded, state_word)
        output_word = self.drop(output_word)
        word_squish = self.drop(batch_matmul_bias(output_word, self.weight_W_word, self.bias_word, nonlinearity='tanh'))
        word_attn = self.drop(batch_matmul(word_squish, self.weight_proj_word))
        # NOTE(review): dropout is also applied to the normalised attention
        # weights, so in training mode they no longer sum to one. Kept as is.
        word_attn_norm = self.drop(self.softmax_word(word_attn.transpose(1, 0)))
        word_attn_vectors = self.drop(attention_mul(output_word, word_attn_norm.transpose(1, 0)))

        final_map = self.final_linear(word_attn_vectors.squeeze(0))
        return F.log_softmax(final_map, dim=1), state_word, None

    def init_hidden(self):
        """Return a zero hidden state of shape ``(directions, batch_size, hidden)``."""
        return torch.zeros(self._num_directions, self.batch_size, self.word_gru_hidden)
'''
class AttentionSentRNN(nn.Module):
    
    
    def __init__(self, batch_size, sent_gru_hidden, word_gru_hidden, n_classes, dropout, bidirectional= True):        
        
        super(AttentionSentRNN, self).__init__()
        
        self.batch_size = batch_size
        self.sent_gru_hidden = sent_gru_hidden
        self.n_classes = n_classes
        self.word_gru_hidden = word_gru_hidden
        self.bidirectional = bidirectional
        
        self.drop = nn.Dropout(dropout)

        initrange = 0.1
        
        
        if bidirectional == True:
            self.sent_gru = nn.GRU(2 * word_gru_hidden, sent_gru_hidden, bidirectional= True)        
            self.weight_W_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden ,2* sent_gru_hidden))
            self.bias_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden,1))
            self.weight_proj_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden, 1))
            self.final_linear = nn.Linear(2* sent_gru_hidden, n_classes)
        else:
            self.sent_gru = nn.GRU(word_gru_hidden, sent_gru_hidden, bidirectional= True)        
            self.weight_W_sent = nn.Parameter(torch.Tensor(sent_gru_hidden ,sent_gru_hidden))
            self.bias_sent = nn.Parameter(torch.Tensor(sent_gru_hidden,1))
            self.weight_proj_sent = nn.Parameter(torch.Tensor(sent_gru_hidden, 1))
            self.final_linear = nn.Linear(sent_gru_hidden, n_classes)
        self.softmax_sent = nn.Softmax()
        self.final_softmax = nn.Softmax()
        self.bias_sent.data.uniform_(-initrange, initrange)
        #self.sent_gru.data.uniform_(-initrange,initrange)
        self.weight_W_sent.data.uniform_(-initrange, initrange)
        self.weight_proj_sent.data.uniform_(-initrange,initrange)
        
        
    def forward(self, word_attention_vectors, state_sent):
        #MANUALLY DROPOUT THE GRU
        #state_word = self.drop(state_sent)
        output_sent, state_sent = self.sent_gru(word_attention_vectors, state_sent)   
        #state_word = self.drop(state_sent)
        output_sent = self.drop(output_sent)
        sent_squish = self.drop(batch_matmul_bias(output_sent, self.weight_W_sent,self.bias_sent, nonlinearity='tanh'))
        sent_attn = self.drop(batch_matmul(sent_squish, self.weight_proj_sent))
        sent_attn_norm = self.drop(self.softmax_sent(sent_attn.transpose(1,0)))
        sent_attn_vectors = self.drop(attention_mul(output_sent, sent_attn_norm.transpose(1,0)))    
        # final classifier
        final_map = self.final_linear(sent_attn_vectors.squeeze(0))
        return F.log_softmax(final_map), state_sent, sent_attn_norm
    
    def init_hidden(self):
        if self.bidirectional == True:
            return Variable(torch.zeros(2, self.batch_size, self.sent_gru_hidden))
        else:
            return Variable(torch.zeros(1, self.batch_size, self.sent_gru_hidden))
'''

"\nclass AttentionSentRNN(nn.Module):\n    \n    \n    def __init__(self, batch_size, sent_gru_hidden, word_gru_hidden, n_classes, dropout, bidirectional= True):        \n        \n        super(AttentionSentRNN, self).__init__()\n        \n        self.batch_size = batch_size\n        self.sent_gru_hidden = sent_gru_hidden\n        self.n_classes = n_classes\n        self.word_gru_hidden = word_gru_hidden\n        self.bidirectional = bidirectional\n        \n        self.drop = nn.Dropout(dropout)\n\n        initrange = 0.1\n        \n        \n        if bidirectional == True:\n            self.sent_gru = nn.GRU(2 * word_gru_hidden, sent_gru_hidden, bidirectional= True)        \n            self.weight_W_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden ,2* sent_gru_hidden))\n            self.bias_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden,1))\n            self.weight_proj_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden, 1))\n            self.final_linear = nn.Linear(

In [3]:
#import model
#import data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import sklearn.metrics

# Dropout probability handed to AttentionWordRNN below.
dropout=0.5
# Mini-batch size; AttentionWordRNN.init_hidden() uses it as the batch
# dimension of the hidden state it creates.
my_batch=64
# Language code; Corpus uses it to pick the 'wiki.<lang>.vec' vector file.
lang='en'
# NOTE(review): Corpus.__init__ joins its file names with '' rather than with
# the path it receives, so this directory is passed in but not used to locate
# the files.
datapath = './data/'+lang
# Loads the pretrained vectors and tokenizes train.csv / test.csv.
corpus = Corpus(datapath, lang)
ntokens = len(corpus.dictionary)
# Vectors collected by the dictionary while the vector file was read.
pretrain = corpus.dictionary.pretrain_vec

# embed_size=300 matches the vector length Corpus.add_pretrained accepts.
word_attn = AttentionWordRNN(embeds=pretrain, batch_size=my_batch, num_tokens=ntokens, embed_size=300, 
                             word_gru_hidden=100, dropout=dropout, n_classes=2, bidirectional= True)

#sent_attn = AttentionSentRNN(batch_size=my_batch, sent_gru_hidden=100, word_gru_hidden=100, 
#                             n_classes=2, dropout=dropout, bidirectional= True)

def train_data(mini_batch, targets, word_attn_model, word_optimizer, criterion):
    """Run one optimisation step on a padded mini-batch.

    The model is applied to each slice ``mini_batch[i]`` in turn, carrying the
    hidden state from one slice to the next; the outputs are concatenated
    along axis 0 and scored against ``targets``.

    Args:
        mini_batch: 3-D integer tensor as produced by ``pad_batch``.
        targets: Class indices, one per row of the concatenated output.
        word_attn_model: Model exposing ``init_hidden()`` and returning
            ``(output, state, extra)`` from its forward call.
        word_optimizer: Optimizer over the model's parameters.
        criterion: Loss callable taking ``(predictions, targets)``.

    Returns:
        The loss of this step as a Python float.
    """
    # Work on whatever device the batch already lives on instead of forcing
    # CUDA, so the same code runs on CPU-only machines.
    state_word = word_attn_model.init_hidden().to(mini_batch.device)
    max_sents, _, _ = mini_batch.size()
    word_optimizer.zero_grad()

    outputs = []
    for i in range(max_sents):
        _s, state_word, _ = word_attn_model(mini_batch[i, :, :].transpose(0, 1), state_word)
        outputs.append(_s)
    # Concatenate once instead of re-copying the growing tensor every step.
    y_pred = torch.cat(outputs, 0)

    loss = criterion(y_pred, targets.to(y_pred.device))
    # The autograd graph keeps the intermediates alive until backward() runs,
    # so clearing locals or the CUDA cache beforehand frees nothing useful.
    loss.backward()
    word_optimizer.step()

    return loss.item()



def get_predictions(val_tokens, word_attn_model):
    """Return the model outputs for a padded mini-batch, without gradients.

    The model is applied to each slice ``val_tokens[i]`` in turn, carrying the
    hidden state from one slice to the next, and the outputs are concatenated
    along axis 0.

    Args:
        val_tokens: 3-D integer tensor as produced by ``pad_batch``.
        word_attn_model: Model exposing ``init_hidden()`` and returning
            ``(output, state, extra)`` from its forward call.

    Returns:
        The concatenated outputs. They are computed under ``torch.no_grad()``
        and therefore cannot be back-propagated through.
    """
    max_sents, _, _ = val_tokens.size()
    # Follow the device of the input rather than forcing CUDA.
    state_word = word_attn_model.init_hidden().to(val_tokens.device)

    outputs = []
    # Predictions are only read, never differentiated, so skip graph building.
    with torch.no_grad():
        for i in range(max_sents):
            _s, state_word, _ = word_attn_model(val_tokens[i, :, :].transpose(0, 1), state_word)
            outputs.append(_s)
    return torch.cat(outputs, 0)



#learning_rate = 0.001
#momentum = 0.9
#word_optimizer = torch.optim.SGD(word_attn.parameters(), lr=learning_rate, momentum= momentum)
#sent_optimizer = torch.optim.SGD(sent_attn.parameters(), lr=learning_rate, momentum= momentum)
# Put the model on its device *before* building the optimizer, as the
# torch.optim documentation recommends, and fall back to the CPU when no GPU
# is present instead of failing on an unconditional .cuda().
word_attn.to('cuda' if torch.cuda.is_available() else 'cpu')

word_optimizer = torch.optim.Adam(word_attn.parameters())
#sent_optimizer = torch.optim.Adam(sent_attn.parameters())
# The model returns log-probabilities (log_softmax), which is what NLLLoss expects.
criterion = nn.NLLLoss()
#sent_attn.cuda()



def pad_batch(mini_batch):
    """Pack a batch of nested id lists into one zero-padded integer tensor.

    Each element of ``mini_batch`` is a list of sentences, each sentence a
    list of token ids. Both target lengths are the *mean* (not the maximum)
    over the batch, truncated to an int, so longer entries are cut off and
    shorter ones are padded with 0.

    Args:
        mini_batch: Sequence of documents (lists of lists of ints).

    Returns:
        int64 tensor of shape ``(sentences, batch, tokens)``, i.e. the padded
        matrix with its first two axes swapped.
    """
    mini_batch_size = len(mini_batch)
    max_sent_len = int(np.mean([len(x) for x in mini_batch]))
    max_token_len = int(np.mean([len(val) for sublist in mini_batch for val in sublist]))
    # np.int was removed from NumPy; int64 is also what a LongTensor holds.
    main_matrix = np.zeros((mini_batch_size, max_sent_len, max_token_len), dtype=np.int64)
    for i, doc in enumerate(mini_batch):
        for j, sent in enumerate(doc[:max_sent_len]):
            kept = sent[:max_token_len]
            if len(kept):
                main_matrix[i, j, :len(kept)] = kept
    return torch.from_numpy(main_matrix).transpose(0, 1)



def test_accuracy_mini_batch(tokens, labels, word_attn):
    """Fraction of one mini-batch that the model classifies correctly.

    Args:
        tokens: Padded mini-batch passed on to ``get_predictions``.
        labels: Tensor holding the expected class index of every row.
        word_attn: Model used for the prediction.

    Returns:
        Number of matching predictions divided by the number of predictions.
    """
    scores = get_predictions(tokens, word_attn)
    # torch.max over axis 1 yields (values, indices); the indices are the classes.
    predicted_classes = torch.max(scores, 1)[1]
    predicted_np = predicted_classes.data.cpu().numpy().flatten()
    expected_np = labels.data.cpu().numpy().flatten()
    hits = sum(predicted_np == expected_np)
    return float(hits) / len(predicted_np)

def test_accuracy_full_batch(tokens, labels, mini_batch_size, word_attn):
    """Accuracy and ROC-AUC of the model over a data set, batch by batch.

    Batches come from ``gen_minibatch`` with its default settings, so only
    complete mini-batches are scored and their order is shuffled.

    Args:
        tokens: Per-example nested id lists (object array).
        labels: Integer class labels aligned with ``tokens``.
        mini_batch_size: Number of examples per batch.
        word_attn: Model used for the prediction.

    Returns:
        ``(accuracy, roc_auc)`` where the ROC-AUC is computed from
        ``exp(output[:, 1])``, the model's score for class 1.

    Raises:
        ValueError: If the data does not fill a single mini-batch.
    """
    predicted, positive_log_scores, expected = [], [], []
    for token, label in gen_minibatch(tokens, labels, mini_batch_size):
        # gen_minibatch already places the batch on its device, so no further
        # transfer is needed here.
        y_pred = get_predictions(token, word_attn)
        positive_log_scores.append(y_pred[:, 1].data.cpu().numpy().flatten())
        _, y_class = torch.max(y_pred, 1)
        predicted.append(y_class.data.cpu().numpy().flatten())
        expected.append(label.data.cpu().numpy().flatten())

    if not predicted:
        raise ValueError('no complete mini-batch of size %d could be formed' % mini_batch_size)

    p = np.concatenate(predicted)
    l = np.concatenate(expected)
    # The model emits log-probabilities; exponentiate to get the class-1 score.
    p_nonlinear = np.exp(np.concatenate(positive_log_scores))
    num_correct = int((p == l).sum())
    return float(num_correct) / len(p), sklearn.metrics.roc_auc_score(l, p_nonlinear)

def test_data(mini_batch, targets, word_attn_model):    
    state_word = word_attn_model.init_hidden().cuda()
    #state_sent = sent_attn_model.init_hidden().cuda()
    max_sents, batch_size, max_tokens = mini_batch.size()
    s = None
    for i in range(max_sents):
        _s, state_word, _ = word_attn_model(mini_batch[i,:,:].transpose(0,1), state_word)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)            
    #y_pred, state_sent,_ = sent_attn_model(s, state_sent)
    loss = criterion(s.cuda(), targets)     
    return loss.data.item()

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` slices of exactly ``batchsize`` rows.

    Trailing rows that do not fill a whole batch are never yielded. With
    ``shuffle`` the rows are visited in a random order drawn from NumPy's
    global random state.

    Args:
        inputs: Array indexed along axis 0.
        targets: Array with the same length along axis 0 as ``inputs``.
        batchsize: Number of rows per yielded batch.
        shuffle: Whether to randomise the row order.
    """
    assert inputs.shape[0] == targets.shape[0]
    n_rows = inputs.shape[0]

    order = None
    if shuffle:
        order = np.arange(n_rows)
        np.random.shuffle(order)

    # The last admissible start leaves room for one complete batch.
    for lo in range(0, n_rows - batchsize + 1, batchsize):
        hi = lo + batchsize
        picker = slice(lo, hi) if order is None else order[lo:hi]
        yield inputs[picker], targets[picker]



def gen_minibatch(tokens, labels, mini_batch_size, shuffle= True, device=None):
    """Yield padded ``(tokens, labels)`` tensor pairs, one per mini-batch.

    Batches are cut by ``iterate_minibatches`` and the token side is packed by
    ``pad_batch``.

    Args:
        tokens: Per-example nested id lists (object array).
        labels: Integer class labels aligned with ``tokens``.
        mini_batch_size: Number of examples per batch.
        shuffle: Whether to randomise the example order.
        device: Where to place the tensors. Defaults to CUDA when it is
            available (the previous, unconditional behaviour) and to the CPU
            otherwise.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for token, label in iterate_minibatches(tokens, labels, mini_batch_size, shuffle= shuffle):
        yield pad_batch(token).to(device), torch.LongTensor(label).to(device)

def check_val_loss(val_tokens, val_labels, mini_batch_size, word_attn_model):
    """Mean ``test_data`` loss over the shuffled mini-batches of a data set.

    Args:
        val_tokens: Per-example nested id lists (object array).
        val_labels: Integer class labels aligned with ``val_tokens``.
        mini_batch_size: Number of examples per batch.
        word_attn_model: Model to evaluate.

    Returns:
        ``np.mean`` of the per-batch losses.
    """
    # gen_minibatch performs the same padding, tensor conversion and device
    # transfer that would otherwise be spelled out here.
    batches = gen_minibatch(val_tokens, val_labels, mini_batch_size, shuffle= True)
    per_batch_loss = [test_data(token, label, word_attn_model) for token, label in batches]
    return np.mean(per_batch_loss)

import time
import math

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: A ``time.time()`` timestamp taken earlier.

    Returns:
        Whole minutes and the remaining seconds, both truncated to integers.
    """
    elapsed = time.time() - since
    whole_minutes = math.floor(elapsed / 60)
    leftover_seconds = elapsed - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def train_early_stopping(mini_batch_size, X_train, y_train, X_test, y_test, word_attn_model,
                         word_attn_optimiser, loss_criterion, num_epoch, 
                         print_val_loss_every = 1000, print_loss_every = 50, max_epochs = 20):
    """Train the model on mini-batches and report metrics along the way.

    Each loop iteration either trains on the next mini-batch or, when the
    batch generator is exhausted, closes the epoch: it evaluates accuracy and
    ROC-AUC on the test and training data, prints them together with the best
    values so far, and restarts the generator. An iteration that closes an
    epoch performs no training step.

    Args:
        mini_batch_size: Number of examples per batch.
        X_train, y_train: Training examples and labels.
        X_test, y_test: Held-out examples and labels.
        word_attn_model: Model to train.
        word_attn_optimiser: Optimizer over the model's parameters.
        loss_criterion: Loss callable used for the training step.
        num_epoch: Upper bound on loop iterations (mini-batch steps plus
            epoch-closing iterations), not on epochs.
        print_val_loss_every: Validate every this-many iterations.
        print_loss_every: Print the running loss every this-many iterations.
        max_epochs: Stop once this many epochs have been completed.

    Returns:
        List with the training loss of every mini-batch step.
    """
    # Best [accuracy, aucroc] seen so far for each evaluated split.
    best = {'Test': [0, 0], 'Train': [0, 0]}

    def report_split(label, X, y):
        """Evaluate one split, update its best scores and print all four values."""
        acc, aucroc = test_accuracy_full_batch(X, y, mini_batch_size, word_attn_model)
        record = best[label]
        if acc > record[0]:
            record[0] = acc
        if aucroc > record[1]:
            record[1] = aucroc
        print(label + " accuracy:", acc)
        print("Max " + label.lower() + " accruacy:", record[0])
        print(label + " aucroc:", aucroc)
        print("Max " + label.lower() + " aucroc:", record[1])

    word_attn_model.train()
    start = time.time()
    loss_full = []
    loss_epoch = []
    accuracy_epoch = []
    epoch_counter = 0
    g = gen_minibatch(X_train, y_train, mini_batch_size)
    for i in range(1, num_epoch + 1):
        word_attn_model.train()
        # Only the generator signals the end of an epoch; keeping the try this
        # narrow stops an unrelated StopIteration from being read as one.
        try:
            tokens, labels = next(g)
        except StopIteration:
            epoch_counter += 1
            print('Reached %d epochs' % epoch_counter)
            print('i %d' % i)
            word_attn_model.eval()
            # Evaluate on the data this function was given rather than on
            # notebook-level globals.
            report_split('Test', X_test, y_test)
            # NOTE(review): the training split is scored in train mode, i.e.
            # with dropout active. Kept as in the original; confirm intent.
            word_attn_model.train()
            report_split('Train', X_train, y_train)
            g = gen_minibatch(X_train, y_train, mini_batch_size)
            loss_epoch = []
            accuracy_epoch = []
            if epoch_counter == max_epochs:
                break
            continue

        loss = train_data(tokens, labels, word_attn_model, word_attn_optimiser, loss_criterion)
        # Scored right after the update, with the model still in train mode.
        acc = test_accuracy_mini_batch(tokens, labels, word_attn_model)
        accuracy_epoch.append(acc)
        loss_full.append(loss)
        loss_epoch.append(loss)
        # print loss every n passes
        if i % print_loss_every == 0:
            print('Loss at %d minibatches, %d epoch,(%s) is %f' %(i, epoch_counter, timeSince(start), np.mean(loss_epoch)))
            print('Accuracy at %d minibatches is %f' % (i, np.mean(accuracy_epoch)))
        # check validation loss every n passes
        if i % print_val_loss_every == 0:
            word_attn_model.eval()
            val_loss = check_val_loss(X_test, y_test, mini_batch_size, word_attn_model)
            print('Average training loss at this epoch..minibatch..%d..is %f' % (i, np.mean(loss_epoch)))
            print('Validation loss after %d passes is %f' %(i, val_loss))
            if val_loss > np.mean(loss_full):
                # NOTE(review): despite the message, training is not stopped
                # here; the loop only ends via num_epoch or max_epochs.
                print('Validation loss is higher than training loss at %d is %f , stopping training!' % (i, val_loss))
                print('Average training loss at %d is %f' % (i, np.mean(loss_full)))
    return loss_full

In [4]:
# Run at most 5000 loop iterations, printing the running loss and validating
# after every single mini-batch.
loss_full = train_early_stopping(
    mini_batch_size=my_batch,
    X_train=corpus.X_train,
    y_train=corpus.y_train,
    X_test=corpus.X_test,
    y_test=corpus.y_test,
    word_attn_model=word_attn,
    word_attn_optimiser=word_optimizer,
    loss_criterion=criterion,
    num_epoch=5000,
    print_val_loss_every=1,
    print_loss_every=1,
)



Loss at 1 minibatches, 0 epoch,(0m 1s) is 0.675891
Accuracy at 1 minibatches is 0.671875
Average training loss at this epoch..minibatch..1..is 0.675891
Validation loss after 1 passes is 0.677166
Validation loss is higher than training loss at 1 is 0.677166 , stopping training!
Average training loss at 1 is 0.675891
Loss at 2 minibatches, 0 epoch,(0m 2s) is 0.666181
Accuracy at 2 minibatches is 0.671875
Average training loss at this epoch..minibatch..2..is 0.666181
Validation loss after 2 passes is 0.673878
Validation loss is higher than training loss at 2 is 0.673878 , stopping training!
Average training loss at 2 is 0.666181
Loss at 3 minibatches, 0 epoch,(0m 3s) is 0.646948
Accuracy at 3 minibatches is 0.697917
Average training loss at this epoch..minibatch..3..is 0.646948
Validation loss after 3 passes is 0.665389
Validation loss is higher than training loss at 3 is 0.665389 , stopping training!
Average training loss at 3 is 0.646948
Loss at 4 minibatches, 0 epoch,(0m 5s) is 0.63097

Test accuracy: 0.578125
Max test accruacy: 0.828125
Test aucroc: 0.890890890890891
Max test aucroc: 0.9020833333333332
Train accuracy: 0.7256944444444444
Max train accruacy: 0.7256944444444444
Train aucroc: 0.7697061584001155
Max train aucroc: 0.7697061584001155
Loss at 31 minibatches, 3 epoch,(0m 41s) is 0.602595
Accuracy at 31 minibatches is 0.750000
Average training loss at this epoch..minibatch..31..is 0.602595
Validation loss after 31 passes is 0.604774
Validation loss is higher than training loss at 31 is 0.604774 , stopping training!
Average training loss at 31 is 0.568150
Loss at 32 minibatches, 3 epoch,(0m 42s) is 0.530447
Accuracy at 32 minibatches is 0.796875
Average training loss at this epoch..minibatch..32..is 0.530447
Validation loss after 32 passes is 0.561377
Loss at 33 minibatches, 3 epoch,(0m 43s) is 0.523794
Accuracy at 33 minibatches is 0.760417
Average training loss at this epoch..minibatch..33..is 0.523794
Validation loss after 33 passes is 0.583567
Validation lo

Loss at 63 minibatches, 6 epoch,(1m 23s) is 0.452768
Accuracy at 63 minibatches is 0.807292
Average training loss at this epoch..minibatch..63..is 0.452768
Validation loss after 63 passes is 0.361183
Loss at 64 minibatches, 6 epoch,(1m 24s) is 0.413320
Accuracy at 64 minibatches is 0.835938
Average training loss at this epoch..minibatch..64..is 0.413320
Validation loss after 64 passes is 0.329434
Loss at 65 minibatches, 6 epoch,(1m 26s) is 0.392932
Accuracy at 65 minibatches is 0.856250
Average training loss at this epoch..minibatch..65..is 0.392932
Validation loss after 65 passes is 0.322961
Loss at 66 minibatches, 6 epoch,(1m 27s) is 0.371830
Accuracy at 66 minibatches is 0.864583
Average training loss at this epoch..minibatch..66..is 0.371830
Validation loss after 66 passes is 0.426804
Loss at 67 minibatches, 6 epoch,(1m 28s) is 0.344281
Accuracy at 67 minibatches is 0.881696
Average training loss at this epoch..minibatch..67..is 0.344281
Validation loss after 67 passes is 0.523657


Loss at 93 minibatches, 9 epoch,(2m 3s) is 0.105242
Accuracy at 93 minibatches is 0.973958
Average training loss at this epoch..minibatch..93..is 0.105242
Validation loss after 93 passes is 0.387151
Loss at 94 minibatches, 9 epoch,(2m 5s) is 0.105788
Accuracy at 94 minibatches is 0.972656
Average training loss at this epoch..minibatch..94..is 0.105788
Validation loss after 94 passes is 0.373160
Loss at 95 minibatches, 9 epoch,(2m 6s) is 0.111424
Accuracy at 95 minibatches is 0.971875
Average training loss at this epoch..minibatch..95..is 0.111424
Validation loss after 95 passes is 0.335475
Loss at 96 minibatches, 9 epoch,(2m 7s) is 0.117337
Accuracy at 96 minibatches is 0.973958
Average training loss at this epoch..minibatch..96..is 0.117337
Validation loss after 96 passes is 0.390301
Loss at 97 minibatches, 9 epoch,(2m 8s) is 0.105575
Accuracy at 97 minibatches is 0.977679
Average training loss at this epoch..minibatch..97..is 0.105575
Validation loss after 97 passes is 0.253214
Loss 

Loss at 122 minibatches, 12 epoch,(2m 42s) is 0.029286
Accuracy at 122 minibatches is 0.992188
Average training loss at this epoch..minibatch..122..is 0.029286
Validation loss after 122 passes is 0.414570
Validation loss is higher than training loss at 122 is 0.414570 , stopping training!
Average training loss at 122 is 0.326015
Loss at 123 minibatches, 12 epoch,(2m 43s) is 0.067008
Accuracy at 123 minibatches is 0.989583
Average training loss at this epoch..minibatch..123..is 0.067008
Validation loss after 123 passes is 0.389506
Validation loss is higher than training loss at 123 is 0.389506 , stopping training!
Average training loss at 123 is 0.324361
Loss at 124 minibatches, 12 epoch,(2m 44s) is 0.065410
Accuracy at 124 minibatches is 0.988281
Average training loss at this epoch..minibatch..124..is 0.065410
Validation loss after 124 passes is 0.248444
Loss at 125 minibatches, 12 epoch,(2m 46s) is 0.057977
Accuracy at 125 minibatches is 0.990625
Average training loss at this epoch..m

Loss at 148 minibatches, 14 epoch,(3m 16s) is 0.013098
Accuracy at 148 minibatches is 0.998047
Average training loss at this epoch..minibatch..148..is 0.013098
Validation loss after 148 passes is 0.354768
Validation loss is higher than training loss at 148 is 0.354768 , stopping training!
Average training loss at 148 is 0.271821
Loss at 149 minibatches, 14 epoch,(3m 17s) is 0.014484
Accuracy at 149 minibatches is 0.996528
Average training loss at this epoch..minibatch..149..is 0.014484
Validation loss after 149 passes is 0.472368
Validation loss is higher than training loss at 149 is 0.472368 , stopping training!
Average training loss at 149 is 0.269997
Reached 15 epochs
i 150
Test accuracy: 0.921875
Max test accruacy: 0.9375
Test aucroc: 0.978978978978979
Max test aucroc: 0.978978978978979
Train accuracy: 0.9965277777777778
Max train accruacy: 0.9965277777777778
Train aucroc: 0.9998982306674614
Max train aucroc: 0.9998982306674614
Loss at 151 minibatches, 15 epoch,(3m 20s) is 0.002976

Loss at 175 minibatches, 17 epoch,(3m 52s) is 0.009288
Accuracy at 175 minibatches is 1.000000
Average training loss at this epoch..minibatch..175..is 0.009288
Validation loss after 175 passes is 0.557881
Validation loss is higher than training loss at 175 is 0.557881 , stopping training!
Average training loss at 175 is 0.232897
Loss at 176 minibatches, 17 epoch,(3m 53s) is 0.008140
Accuracy at 176 minibatches is 1.000000
Average training loss at this epoch..minibatch..176..is 0.008140
Validation loss after 176 passes is 0.567012
Validation loss is higher than training loss at 176 is 0.567012 , stopping training!
Average training loss at 176 is 0.231447
Loss at 177 minibatches, 17 epoch,(3m 54s) is 0.015260
Accuracy at 177 minibatches is 1.000000
Average training loss at this epoch..minibatch..177..is 0.015260
Validation loss after 177 passes is 0.485398
Validation loss is higher than training loss at 177 is 0.485398 , stopping training!
Average training loss at 177 is 0.230363
Loss at

Test accuracy: 0.921875
Max test accruacy: 0.9375
Test aucroc: 0.9807692307692308
Max test aucroc: 0.9862745098039216
Train accuracy: 0.9965277777777778
Max train accruacy: 0.9982638888888888
Train aucroc: 0.9999857036655802
Max train aucroc: 0.9999857036655802


In [5]:
#print(test_accuracy_full_batch(corpus.X_test, corpus.y_test, my_batch, word_attn, sent_attn))

In [6]:
#test_accuracy_full_batch(corpus.X_train, corpus.y_train, my_batch, word_attn, sent_attn)