In [1]:
import os
import torch
import numpy as np
import csv
import collections
from img_to_vec import Img2Vec
from PIL import Image
import requests
from io import BytesIO

img2vec = Img2Vec(cuda=True)



class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = [] #index gives word
        self.idx2abv = [] #discrete feature: is index nazi abv
        self.idx2phrase = []
        self.idx2german = []
        self.idx2group = []
        self.pretrain_vec = [] # should match index order of words in dict.
        
        self.abvs = []
        self.phrases = [] #only 1-word phrases + echo
        self.german = [] #only 1-word german 
        self.groups = [] #only 1-word groups
        
        self.load_nazi_features()
    
    def load_nazi_features(self):
        with open('abbreviations.txt', 'r', encoding='utf-8') as f:
            for line in f:
                words = line.split()
                #print(words[0].lower())
                self.abvs.append(words[0].lower())
        self.phrases.append('(((')
        self.phrases.append(')))')
        '''
        with open('phrases.txt', 'r', encoding='utf-8') as f:
            for line in f:
                words = line.split()
                if len(words)==1:
                    self.phrases.append(words[0].lower())
        
        with open('german.txt', 'r', encoding='utf-8') as f:
            for line in f:
                words = line.split()
                if len(words)==1:
                    self.german.append(words[0].lower())
        with open('groups.txt', 'r', encoding='utf-8') as f:
            for line in f:
                words = line.split()
                if len(words)==1:
                    self.groups.append(words[0].lower())
        '''
    
    #just abbreviations for now
    #cause finding chunks means keeping track of last 6 words
    def find_nazi_features(self,word):#create lookup for discrete feature
        if word.lower() in self.abvs:
            #print("ding",word.lower())
            self.idx2abv.append([0.1]) #yes
        else:
            self.idx2abv.append([0]) #no
            
        #For subphrases
        found = False
        for i in self.phrases:
            if i in word.lower():
                #print("ding",word.lower())
                found = True
                break
                
        if found:
            self.idx2phrase.append([0.1])
        else:
            self.idx2phrase.append([0])
            
        '''
        #For subphrases
        found = False
        for i in self.german:
            if i in word.lower():
                #print("ding",word.lower())
                found = True
                break
                
        if found:
            self.idx2german.append([1])
        else:
            self.idx2german.append([0])
            
        #For subphrases
        found = False
        for i in self.groups:
            if i in word.lower():
                #print("ding",word.lower())
                found = True
                break
                
        if found:
            self.idx2group.append([1])
        else:
            self.idx2group.append([0])
        '''
            
        
    def add_word(self, word, ahead_six=None, vec=None):
        if vec is None:
            if word not in self.word2idx:
                self.idx2word.append(word)
                self.word2idx[word] = len(self.idx2word) - 1
                self.find_nazi_features(word)
        else:
            if word not in self.word2idx:
                self.pretrain_vec.append(vec)
                self.idx2word.append(word)
                self.word2idx[word] = len(self.idx2word) - 1
                self.find_nazi_features(word)
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path, language):
        self.dictionary = Dictionary()
        if language is not None:
            self.pretrained = self.add_pretrained(os.path.join('', 'wiki.' + language + '.vec'))
        #self.trainid, self.trainlab, self.trainidx = self.tokenize_by_user(os.path.join(path, 'train.csv'),True)
        #self.validid, self.validlab, self.valididx = self.tokenize_by_user(os.path.join(path, 'valid.csv'),False)
        #self.testid, self.testlab, self.testidx = self.tokenize_by_user(os.path.join(path, 'test.csv'),False)
        self.X_train, self.y_train, self.pic_train, self.feat_train = self.tokenize(os.path.join('', 'train.csv'),True)
        #self.X_valid, self.y_valid = self.tokenize(os.path.join(path, 'valid.csv'),False)
        self.X_test, self.y_test, self.pic_test, self.feat_test = self.tokenize(os.path.join('', 'test.csv'),False)
        

    def add_pretrained(self, path):
        assert os.path.exists(path)

        # Add words with pretrained vectors to the dictionary
        # might be weird because no eos was added?
        with open(path, 'r', encoding='utf-8') as f:
            tokens = 0
            for line in f:
                words = line.split()
                if len(words) == 2: #first line
                    continue
                word = words[0]
                vec = words[1:]
                if len(vec) != 300:
                    continue #this skips the space embedding
                #vec = np.array(list(map(float, vec)))
                vec = list(map(float,vec))
                tokens += 1
                
                self.dictionary.add_word(word, vec)
    def tokenize(self, path, header):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            tokens = 0
            prev = None
            if header:
                first = True
            else:
                first = False
            tweet_count = 0
            user_idx = -1
            for row in reader:
                if first:
                    first = False
                    continue
                if len(row) is not 6:
                    continue
                
                tweet = row[0]
                label = row[1]
                if not label.isdigit():
                    continue
                extra = row[2:5] #bio, tweet pic, profile pic, user id
                if row[2] != prev: #new user
                    prev = row[2]
                    tweet_count = 0
                    user_idx += 1
                    
                    
                    words = row[2].split() #add bio
                    for word in words:
                        self.dictionary.add_word(word)

                words = tweet.split()
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding='utf-8') as f:
            x = np.zeros(user_idx+1,dtype='object')
            y = np.zeros(user_idx+1,dtype='int')
            z = np.zeros((user_idx+1,512),dtype='float')
            q = np.zeros((user_idx+1,2),dtype='float') #does the user ever have any nazi words?
            #ids = torch.LongTensor(tokens)
            #idxs = torch.LongTensor(user_idx+1)
            #labels = torch.LongTensor(user_idx+1)
            #print(user_idx+1)
            token = 0
            prev = None

            reader = csv.reader(f)
            if header:
                first = True
            else:
                first = False
            user_idx = -1
            for row in reader:
                if first:
                    first = False
                    continue
                if len(row) is not 6:
                    continue
                
                tweet = row[0]
                label = row[1]
                if not label.isdigit():
                    continue
                extra = row[2:5] #bio, tweet pic, profile pic, user id
                if row[2] != prev:
                    print(user_idx,"user")
                    tweet_idx = -1
                    user_idx += 1
                    prev = row[2]
                    y[user_idx] = int(label)
                    x[user_idx] = []
                    #print(token, "NEW USER")
                    #idxs[user_idx] = token
                    x[user_idx].append([])
                    
                    pic_url = row[4]
                    if len(pic_url) >4: #http:
                        response = requests.get(pic_url)
                        if(response.status_code != 200):
                            #print(response.status_code)
                            z[user_idx] = np.zeros(512)
                        else:

                            img = Image.open(BytesIO(response.content)).convert("RGB")
                            #print(img.size)
                            vec = img2vec.get_vec(img, tensor=False)
                            z[user_idx] = vec
                    else:
                        z[user_idx] = np.zeros(512)
                    
                    words = row[2].split() #take the bio as the 'first tweet'
                    tweet_idx+=1
                    for word in words:
                        x[user_idx][0].append(self.dictionary.word2idx[word])
                

                words = tweet.split()
                token = 0
                tweet_idx+=1
                if tweet_idx >=21:
                    #print(tweet_idx)
                    continue
                x[user_idx].append([])
                for word in words:
                    idx = self.dictionary.word2idx[word]
                    if self.dictionary.idx2abv[idx] == 0.1:
                        q[user_idx][0] = 0.1
                    if self.dictionary.idx2phrase[idx] == 0.1:
                        q[user_idx][1] = 0.1
                    x[user_idx][tweet_idx].append(idx)
                    token+=1
                

        return x, y, q, q


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

#attention functions

def batch_matmul_bias(seq, weight, bias, nonlinearity=''):
    s = None
    bias_dim = bias.size()
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight) 
        _s_bias = _s + bias.expand(bias_dim[0], _s.size()[0]).transpose(0,1)
        if(nonlinearity=='tanh'):
            _s_bias = torch.tanh(_s_bias)
        _s_bias = _s_bias.unsqueeze(0)
        if(s is None):
            s = _s_bias
        else:
            s = torch.cat((s,_s_bias),0)
    return s.squeeze()

def batch_matmul(seq, weight, nonlinearity=''):
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        if(nonlinearity=='tanh'):
            _s = torch.tanh(_s)
        _s = _s.unsqueeze(0)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)
    return s.squeeze()

def attention_mul(rnn_outputs, att_weights):
    attn_vectors = None
    for i in range(rnn_outputs.size(0)):
        h_i = rnn_outputs[i]
        a_i = att_weights[i].unsqueeze(1).expand_as(h_i)
        h_i = a_i * h_i
        h_i = h_i.unsqueeze(0)
        if(attn_vectors is None):
            attn_vectors = h_i
        else:
            attn_vectors = torch.cat((attn_vectors,h_i),0)
    return torch.sum(attn_vectors, 0).unsqueeze(0)

'''
def batch_matmul_bias(seq, weight, bias, nonlinearity=''):
    s = None
    bias_dim = bias.size()
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight) 
        _s_bias = _s + bias.expand(bias_dim[0], _s.size()[0]).transpose(0,1)
        if(nonlinearity=='tanh'):
            _s_bias = torch.tanh(_s_bias)
        _s_bias = _s_bias.unsqueeze(0)
        if(s is None):
            s = _s_bias
        else:
            s = torch.cat((s,_s_bias),0)
    return s.squeeze()
    
def batch_matmul(seq, weight, nonlinearity=''):
    s = None
    for i in range(seq.size(0)):
        _s = torch.mm(seq[i], weight)
        if(nonlinearity=='tanh'):
            _s = torch.tanh(_s)
        _s = _s.unsqueeze(0)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)
    return s.squeeze()

def attention_mul(self, rnn_outputs, att_weights):
    attn_vectors = None
    for i in range(rnn_outputs.size(0)):
        h_i = rnn_outputs[i]
        a_i = att_weights[i].unsqueeze(1).expand_as(h_i)
        h_i = a_i * h_i
        h_i = h_i.unsqueeze(0)
        if(attn_vectors is None):
            attn_vectors = h_i
        else:
            attn_vectors = torch.cat((attn_vectors,h_i),0)
    return torch.sum(attn_vectors, 0).unsqueeze(0)
'''
class AttentionWordRNN(nn.Module):
    
    
    def __init__(self, embeds, abvs, phrases, german, groups, batch_size,
                 num_tokens, embed_size, word_gru_hidden, dropout, bidirectional= True):        
        
        super(AttentionWordRNN, self).__init__()
        
        self.batch_size = batch_size
        self.num_tokens = num_tokens
        self.embed_size = embed_size #add abv feature size on
        self.word_gru_hidden = word_gru_hidden
        self.bidirectional = bidirectional
        self.drop = nn.Dropout(dropout)

        
        
        self.lookup = nn.Embedding(num_tokens, 300)
        self.lookup_abv = nn.Embedding(num_tokens, 1)
        self.lookup_phr = nn.Embedding(num_tokens, 1)
        #self.lookup_ger = nn.Embedding(num_tokens, 1)
        #self.lookup_gro = nn.Embedding(num_tokens, 1)
        #self.abvs = abvs #doesn't need gradient, it's static/discrete

        #init lookup table
        #print(len(embeds),embeds[0])
        #print(len(abvs),abvs[0])

        
        initrange = 0.1

        k = len(embeds) # the first k indices are pretrained. the rest are unknown
        
        if k is not 0:
            first = np.array(embeds)
            second = np.random.uniform(-initrange,initrange,size=(num_tokens-k,300))
            self.lookup.weight.data.copy_(torch.from_numpy(np.concatenate((first,second),axis=0)))
        else:
            self.lookup.weight.data.uniform_(-initrange, initrange)
        self.lookup_abv.weight.requires_grad=False
        self.lookup_abv.weight.data.copy_(torch.from_numpy(np.array(abvs)))
        self.lookup_phr.weight.requires_grad=False
        self.lookup_phr.weight.data.copy_(torch.from_numpy(np.array(phrases)))
        #self.lookup_ger.weight.requires_grad=False
        #self.lookup_ger.weight.data.copy_(torch.from_numpy(np.array(german)))
        #self.lookup_gro.weight.requires_grad=False
        #self.lookup_gro.weight.data.copy_(torch.from_numpy(np.array(groups)))
        

        if bidirectional == True:
            self.word_gru = nn.GRU(embed_size, word_gru_hidden, bidirectional= True)
            self.weight_W_word = nn.Parameter(torch.Tensor(2* word_gru_hidden,2*word_gru_hidden))
            self.bias_word = nn.Parameter(torch.Tensor(2* word_gru_hidden,1))
            self.weight_proj_word = nn.Parameter(torch.Tensor(2*word_gru_hidden, 1))
        else:
            self.word_gru = nn.GRU(embed_size, word_gru_hidden, bidirectional= False)
            self.weight_W_word = nn.Parameter(torch.Tensor(word_gru_hidden, word_gru_hidden))
            self.bias_word = nn.Parameter(torch.Tensor(word_gru_hidden,1))
            self.weight_proj_word = nn.Parameter(torch.Tensor(word_gru_hidden, 1))
            
        self.softmax_word = nn.Softmax()
        #self.word_gru.data.uniform_(-initrange,initrange)
        self.weight_W_word.data.uniform_(-initrange, initrange)
        self.weight_proj_word.data.uniform_(-initrange,initrange)
        self.bias_word.data.uniform_(-initrange,initrange)

        
        
    def forward(self, embed, state_word):
        # embeddings
        #print(embed)
        embedded = self.drop(self.lookup(embed))
        abv_feature = self.lookup_abv(embed)
        phr_feature = self.lookup_phr(embed)
        #ger_feature = self.lookup_ger(embed)
        #gro_feature = self.lookup_gro(embed)
        #print(len(embedded), len(abv_feature))
        #embedded = torch.cat((embedded,abv_feature,phr_feature,ger_feature,gro_feature),dim=2)
        embedded = torch.cat((embedded,abv_feature,phr_feature),dim=2)
        # word level gru
        #state_word = self.drop(state_word) #idk
        output_word, state_word = self.word_gru(embedded, state_word)
        state_word = self.drop(state_word) #idk
        output_word = self.drop(output_word)
#         print output_word.size()
        word_squish = self.drop(batch_matmul_bias(output_word, self.weight_W_word,self.bias_word, nonlinearity='tanh'))
        word_attn = self.drop(batch_matmul(word_squish, self.weight_proj_word))
        word_attn_norm = self.drop(self.softmax_word(word_attn.transpose(1,0)))
        word_attn_vectors = self.drop(attention_mul(output_word, word_attn_norm.transpose(1,0)))
        return word_attn_vectors, state_word, word_attn_norm
    
    def init_hidden(self):
        if self.bidirectional == True:
            return Variable(torch.zeros(2, self.batch_size, self.word_gru_hidden))
        else:
            return Variable(torch.zeros(1, self.batch_size, self.word_gru_hidden))

class AttentionSentRNN(nn.Module):
    
    
    def __init__(self, batch_size, sent_gru_hidden, word_gru_hidden, n_classes, dropout, bidirectional= True):        
        
        p = 2
        super(AttentionSentRNN, self).__init__()
        
        self.batch_size = batch_size
        self.sent_gru_hidden = sent_gru_hidden
        self.n_classes = n_classes
        self.word_gru_hidden = word_gru_hidden
        self.bidirectional = bidirectional
        
        self.drop = nn.Dropout(dropout)

        initrange = 0.1
        
        
        if bidirectional == True:
            self.sent_gru = nn.GRU(2 * word_gru_hidden, sent_gru_hidden, bidirectional= True)        
            self.weight_W_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden+p ,2* sent_gru_hidden+p))
            self.bias_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden+p,1))
            self.weight_proj_sent = nn.Parameter(torch.Tensor(2* sent_gru_hidden+p, 1))
            self.final_linear = nn.Linear(2* sent_gru_hidden+p, n_classes)
        else:
            self.sent_gru = nn.GRU(word_gru_hidden, sent_gru_hidden, bidirectional= True)        
            self.weight_W_sent = nn.Parameter(torch.Tensor(sent_gru_hidden+p ,sent_gru_hidden+p))
            self.bias_sent = nn.Parameter(torch.Tensor(sent_gru_hidden+p,1))
            self.weight_proj_sent = nn.Parameter(torch.Tensor(sent_gru_hidden+p, 1))
            self.final_linear = nn.Linear(sent_gru_hidden+p, n_classes)
        self.softmax_sent = nn.Softmax()
        self.final_softmax = nn.Softmax()
        self.bias_sent.data.uniform_(-initrange, initrange)
        #self.sent_gru.data.uniform_(-initrange,initrange)
        self.weight_W_sent.data.uniform_(-initrange, initrange)
        self.weight_proj_sent.data.uniform_(-initrange,initrange)
        
        
    def forward(self, word_attention_vectors, state_sent,pics):
        #pics is a a batchxvec dump of prof image vectors
        #MANUALLY DROPOUT THE GRU
        #state_word = self.drop(state_sent)
        pics = pics.unsqueeze(0)
        pics = pics.repeat(word_attention_vectors.size(0),1,1)
        
        #word_attention_vectors = torch.cat((word_attention_vectors,pics.unsqueeze(0)),dim=0)
        output_sent, state_sent = self.sent_gru(word_attention_vectors, state_sent) 
        #when it comes out of the gru, concatenate it.
        state_word = self.drop(state_sent)
        output_sent = self.drop(output_sent)
        #print(output_sent.size(), pics.size())
        output_sent = torch.cat((output_sent,pics),dim=2)
        sent_squish = self.drop(batch_matmul_bias(output_sent, self.weight_W_sent,self.bias_sent, nonlinearity='tanh'))
        sent_attn = self.drop(batch_matmul(sent_squish, self.weight_proj_sent))
        sent_attn_norm = self.drop(self.softmax_sent(sent_attn.transpose(1,0)))
        sent_attn_vectors = self.drop(attention_mul(output_sent, sent_attn_norm.transpose(1,0)))    
        # final classifier
        final_map = self.final_linear(sent_attn_vectors.squeeze(0))
        return F.log_softmax(final_map), state_sent, sent_attn_norm
    
    def init_hidden(self):
        if self.bidirectional == True:
            return Variable(torch.zeros(2, self.batch_size, self.sent_gru_hidden))
        else:
            return Variable(torch.zeros(1, self.batch_size, self.sent_gru_hidden))

In [3]:
#import model
#import data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import sklearn.metrics

dropout=0
my_batch=64
lang='en'
datapath = './data/'+lang
corpus = Corpus(datapath, lang)
ntokens = len(corpus.dictionary)
pretrain = corpus.dictionary.pretrain_vec
idx2abv = corpus.dictionary.idx2abv
idx2phrase = corpus.dictionary.idx2phrase
#idx2german = corpus.dictionary.idx2german
#idx2group = corpus.dictionary.idx2group

word_attn = AttentionWordRNN(embeds=pretrain, abvs=idx2abv, phrases=idx2phrase, german=None, groups=None,
                             batch_size=my_batch, num_tokens=ntokens, embed_size=300+2, 
                             word_gru_hidden=100, dropout=dropout, bidirectional= True)

sent_attn = AttentionSentRNN(batch_size=my_batch, sent_gru_hidden=100, word_gru_hidden=100, 
                             n_classes=2, dropout=dropout, bidirectional= True)

def train_data(mini_batch, targets, pics, word_attn_model, sent_attn_model, word_optimizer, sent_optimizer, criterion):
    state_word = word_attn_model.init_hidden().cuda()
    state_sent = sent_attn_model.init_hidden().cuda()
    max_sents, batch_size, max_tokens = mini_batch.size()
    word_optimizer.zero_grad()
    sent_optimizer.zero_grad()
    s = None
    for i in range(max_sents):
        #torch.cuda.empty_cache()
        _s, state_word, _ = word_attn_model(mini_batch[i,:,:].transpose(0,1), state_word)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)            
    y_pred, state_sent, _ = sent_attn_model.forward(s, state_sent,pics)
    loss = criterion(y_pred.cuda(), targets)

    state_word = None
    state_sent = None
    max_sents = None
    batch_size = None
    max_tokens = None 
    mini_batch = None
    torch.cuda.empty_cache()
    loss.backward()
    
    word_optimizer.step()
    sent_optimizer.step()
    
    return loss.data.item()



def get_predictions(val_tokens, pics, word_attn_model, sent_attn_model):
    max_sents, batch_size, max_tokens = val_tokens.size()
    state_word = word_attn_model.init_hidden().cuda()
    state_sent = sent_attn_model.init_hidden().cuda()
    s = None
    for i in range(max_sents):
        _s, state_word, _ = word_attn_model(val_tokens[i,:,:].transpose(0,1), state_word)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)            
    y_pred, state_sent, _ = sent_attn_model.forward(s, state_sent, pics)    
    return y_pred



#learning_rate = 0.001
#momentum = 0.9
#word_optimizer = torch.optim.SGD(word_attn.parameters(), lr=learning_rate, momentum= momentum)
#sent_optimizer = torch.optim.SGD(sent_attn.parameters(), lr=learning_rate, momentum= momentum)
#word_optimizer = torch.optim.Adam(word_attn.parameters())
word_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, word_attn.parameters()))
sent_optimizer = torch.optim.Adam(sent_attn.parameters())
criterion = nn.NLLLoss()

word_attn.cuda()
sent_attn.cuda()



def pad_batch(mini_batch):
    mini_batch_size = len(mini_batch)
    max_sent_len = int(np.mean([len(x) for x in mini_batch]))
    max_token_len = int(np.mean([len(val) for sublist in mini_batch for val in sublist]))
    main_matrix = np.zeros((mini_batch_size, max_sent_len, max_token_len), dtype= np.int)
    for i in range(main_matrix.shape[0]):
        for j in range(main_matrix.shape[1]):
            for k in range(main_matrix.shape[2]):
                try:
                    main_matrix[i,j,k] = mini_batch[i][j][k]
                except IndexError:
                    pass
    #return Variable(torch.from_numpy(main_matrix).transpose(0,1))
    return Variable(torch.LongTensor(main_matrix).transpose(0,1))



def test_accuracy_mini_batch(tokens, labels, pics, word_attn, sent_attn):
    y_pred = get_predictions(tokens, pics, word_attn, sent_attn)
    _, y_pred = torch.max(y_pred, 1)
    correct = np.ndarray.flatten(y_pred.data.cpu().numpy())
    labels = np.ndarray.flatten(labels.data.cpu().numpy())
    num_correct = sum(correct == labels)
    return float(num_correct) / len(correct)

def test_accuracy_full_batch(tokens, labels, pics, mini_batch_size, word_attn, sent_attn):
    p = []
    p_nonlinear = []
    l = []
    g = gen_minibatch(tokens, labels, pics, mini_batch_size)
    for token, label, pic in g:
        y_pred = get_predictions(token.cuda(), pic, word_attn, sent_attn)
        #print("BEFORE",y_pred)
        p_nonlinear.append(np.ndarray.flatten(y_pred[:,1].data.cpu().numpy()))
        _, y_pred = torch.max(y_pred, 1)
        #print("AFTER",y_pred)
        p.append(np.ndarray.flatten(y_pred.data.cpu().numpy()))
        l.append(np.ndarray.flatten(label.data.cpu().numpy()))
    p = [item for sublist in p for item in sublist]
    l = [item for sublist in l for item in sublist]
    p_nonlinear = [np.exp(item) for sublist in p_nonlinear for item in sublist]
    p = np.array(p)
    l = np.array(l)
    #print("TOKEN LEN",len(tokens))
    #print("NONLINEAR",p_nonlinear)
    #print("PREDICT",p)
    #print("LABEL",l)
    num_correct = sum(p == l)
    return float(num_correct)/ len(p), sklearn.metrics.roc_auc_score(l, p_nonlinear)

def test_data(mini_batch, targets, pics, word_attn_model, sent_attn_model):    
    state_word = word_attn_model.init_hidden().cuda()
    state_sent = sent_attn_model.init_hidden().cuda()
    max_sents, batch_size, max_tokens = mini_batch.size()
    s = None
    for i in range(max_sents):
        _s, state_word, _ = word_attn_model(mini_batch[i,:,:].transpose(0,1), state_word)
        if(s is None):
            s = _s
        else:
            s = torch.cat((s,_s),0)            
    y_pred, state_sent,_ = sent_attn_model.forward(s, state_sent,pics)
    loss = criterion(y_pred.cuda(), targets)     
    return loss.data.item()

def iterate_minibatches(inputs, targets, pics, batchsize, shuffle=False):
    assert inputs.shape[0] == targets.shape[0]
    if shuffle:
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
    #print(inputs.shape[0] - batchsize+1, batchsize, "HOO")
    for start_idx in range(0, inputs.shape[0] - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt], pics[excerpt]



def gen_minibatch(tokens, labels, pics, mini_batch_size, shuffle= True):
    for token, label, pic in iterate_minibatches(tokens, labels, pics, mini_batch_size, shuffle= shuffle):
        token = pad_batch(token)
        #yield token.cuda(), Variable(torch.LongTensor(label), requires_grad= False).cuda(), pic
        yield token.cuda(), Variable(torch.LongTensor(label), requires_grad= False).cuda(), Variable(torch.FloatTensor(np.array(pic)), requires_grad=False).cuda()

def check_val_loss(val_tokens, val_labels, val_pics, mini_batch_size, word_attn_model, sent_attn_model):
    val_loss = []
    for token, label, pic in iterate_minibatches(val_tokens, val_labels, val_pics, mini_batch_size, shuffle= True):
        val_loss.append(test_data(pad_batch(token).cuda(), Variable(torch.LongTensor(label), requires_grad= False).cuda(),
                                  Variable(torch.FloatTensor(np.array(pic)).cuda(), requires_grad= False), 
                                  word_attn_model, sent_attn_model))
    return np.mean(val_loss)

import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def train_early_stopping(mini_batch_size, X_train, y_train, pic_train, X_test, y_test, pic_test, word_attn_model, sent_attn_model, 
                         word_attn_optimiser, sent_attn_optimiser, loss_criterion, num_epoch, 
                         print_val_loss_every = 1000, print_loss_every = 50):
    #for i in word_attn_model.parameters():
        #print(i.data, "PARAM")
    max_eval_acc = 0
    max_train_acc = 0
    max_eval_aucroc = 0
    max_train_aucroc = 0
    word_attn_model.train()
    sent_attn_model.train()
    start = time.time()
    loss_full = []
    loss_epoch = []
    accuracy_epoch = []
    loss_smooth = []
    accuracy_full = []
    epoch_counter = 0
    g = gen_minibatch(X_train, y_train, pic_train, mini_batch_size)
    for i in range(1, num_epoch + 1):
        try:
            word_attn_model.train()
            sent_attn_model.train()
            tokens, labels, pics = next(g)
            loss = train_data(tokens, labels, pics, word_attn_model, sent_attn_model, word_attn_optimiser, sent_attn_optimiser, loss_criterion)
            acc = test_accuracy_mini_batch(tokens, labels, pics, word_attn_model, sent_attn_model)
            accuracy_full.append(acc)
            accuracy_epoch.append(acc)
            loss_full.append(loss)
            loss_epoch.append(loss)
            # print loss every n passes
            if i % print_loss_every == 0:
                print('Loss at %d minibatches, %d epoch,(%s) is %f' %(i, epoch_counter, timeSince(start), np.mean(loss_epoch)))
                print('Accuracy at %d minibatches is %f' % (i, np.mean(accuracy_epoch)))
            # check validation loss every n passes
            if i % print_val_loss_every == 0:
                word_attn_model.eval()
                sent_attn_model.eval()
                val_loss = check_val_loss(X_test, y_test, pic_test, mini_batch_size, word_attn_model, sent_attn_model)
                print('Average training loss at this epoch..minibatch..%d..is %f' % (i, np.mean(loss_epoch)))
                print('Validation loss after %d passes is %f' %(i, val_loss))
                if val_loss > np.mean(loss_full):
                    print('Validation loss is higher than training loss at %d is %f , stopping training!' % (i, val_loss))
                    print('Average training loss at %d is %f' % (i, np.mean(loss_full)))
        except StopIteration:
            epoch_counter += 1
            print('Reached %d epochs' % epoch_counter)
            print('i %d' % i)
            word_attn_model.eval()
            sent_attn_model.eval()
            acc, aucroc = test_accuracy_full_batch(corpus.X_test, corpus.y_test, corpus.pic_test, my_batch, word_attn, sent_attn)
            if acc>max_eval_acc:
                max_eval_acc = acc
            if aucroc>max_eval_aucroc:
                max_eval_aucroc = aucroc
            print("Test accuracy:",acc)
            print("Max test accruacy:",max_eval_acc)
            print("Test aucroc:",aucroc)
            print("Max test aucroc:",max_eval_aucroc)
            word_attn_model.train()
            sent_attn_model.train()
            tacc, taucroc = test_accuracy_full_batch(corpus.X_train, corpus.y_train, corpus.pic_train, my_batch, word_attn, sent_attn)
            if tacc>max_train_acc:
                max_train_acc = tacc
            if taucroc>max_train_aucroc:
                max_train_aucroc = taucroc
            print("Train accuracy:",tacc)
            print("Max train accruacy:",max_train_acc)
            print("Train aucroc:",taucroc)
            print("Max train aucroc:",max_train_aucroc)
            #if epoch_counter == 1:
                #break
            g = gen_minibatch(X_train, y_train, pic_train, mini_batch_size)
            loss_epoch = []
            accuracy_epoch = []
            if epoch_counter==20:
                return acc, aucroc, max_eval_acc, max_eval_aucroc
    return loss_full, None, None, None

-1 user
0 user
1 user
2 user
3 user
4 user
5 user
6 user
7 user
8 user
9 user
10 user
11 user
12 user
13 user
14 user
15 user
16 user
17 user
18 user
19 user
20 user
21 user
22 user
23 user
24 user
25 user
26 user
27 user
28 user
29 user
30 user
31 user
32 user
33 user
34 user
35 user
36 user
37 user
38 user
39 user
40 user
41 user
42 user
43 user
44 user
45 user
46 user
47 user
48 user
49 user
50 user
51 user
52 user
53 user
54 user
55 user
56 user
57 user
58 user
59 user
60 user
61 user
62 user
63 user
64 user
65 user
66 user
67 user
68 user
69 user
70 user
71 user
72 user
73 user
74 user
75 user
76 user
77 user
78 user
79 user
80 user
81 user
82 user
83 user
84 user
85 user
86 user
87 user
88 user
89 user
90 user
91 user
92 user
93 user
94 user
95 user
96 user
97 user
98 user
99 user
100 user
101 user
102 user
103 user
104 user
105 user
106 user
107 user
108 user
109 user
110 user
111 user
112 user
113 user
114 user
115 user
116 user
117 user
118 user
119 user
120 user
121 user
122 

In [4]:
acc, aucroc, max_eval_acc, max_eval_aucroc = train_early_stopping(my_batch, corpus.X_train, corpus.y_train, corpus.pic_train, corpus.X_test, corpus.y_test, corpus.pic_test, word_attn, sent_attn, word_optimizer, sent_optimizer, 
                            criterion, 5000, 1, 1)



Loss at 1 minibatches, 0 epoch,(0m 0s) is 0.682358
Accuracy at 1 minibatches is 0.687500
Average training loss at this epoch..minibatch..1..is 0.682358
Validation loss after 1 passes is 0.639590
Loss at 2 minibatches, 0 epoch,(0m 2s) is 0.663557
Accuracy at 2 minibatches is 0.695312
Average training loss at this epoch..minibatch..2..is 0.663557
Validation loss after 2 passes is 0.628000
Loss at 3 minibatches, 0 epoch,(0m 3s) is 0.646223
Accuracy at 3 minibatches is 0.703125
Average training loss at this epoch..minibatch..3..is 0.646223
Validation loss after 3 passes is 0.593806
Loss at 4 minibatches, 0 epoch,(0m 4s) is 0.636836
Accuracy at 4 minibatches is 0.703125
Average training loss at this epoch..minibatch..4..is 0.636836
Validation loss after 4 passes is 0.609659
Loss at 5 minibatches, 0 epoch,(0m 5s) is 0.623818
Accuracy at 5 minibatches is 0.709375
Average training loss at this epoch..minibatch..5..is 0.623818
Validation loss after 5 passes is 0.577160
Reached 1 epochs
i 6
Test

Average training loss at this epoch..minibatch..40..is 0.036271
Validation loss after 40 passes is 0.370826
Loss at 41 minibatches, 6 epoch,(0m 50s) is 0.029988
Accuracy at 41 minibatches is 0.996875
Average training loss at this epoch..minibatch..41..is 0.029988
Validation loss after 41 passes is 0.394111
Validation loss is higher than training loss at 41 is 0.394111 , stopping training!
Average training loss at 41 is 0.374950
Reached 7 epochs
i 42
Test accuracy: 0.890625
Max test accruacy: 0.890625
Test aucroc: 0.9655797101449276
Max test aucroc: 0.9655797101449276
Train accuracy: 0.996875
Max train accruacy: 0.996875
Train aucroc: 0.9958315570081947
Max train aucroc: 0.9958315570081947
Loss at 43 minibatches, 7 epoch,(0m 52s) is 0.012920
Accuracy at 43 minibatches is 1.000000
Average training loss at this epoch..minibatch..43..is 0.012920
Validation loss after 43 passes is 0.411292
Validation loss is higher than training loss at 43 is 0.411292 , stopping training!
Average training l

Average training loss at this epoch..minibatch..67..is 0.000614
Validation loss after 67 passes is 0.864138
Validation loss is higher than training loss at 67 is 0.864138 , stopping training!
Average training loss at 67 is 0.239934
Loss at 68 minibatches, 11 epoch,(1m 23s) is 0.001117
Accuracy at 68 minibatches is 1.000000
Average training loss at this epoch..minibatch..68..is 0.001117
Validation loss after 68 passes is 0.955011
Validation loss is higher than training loss at 68 is 0.955011 , stopping training!
Average training loss at 68 is 0.235753
Loss at 69 minibatches, 11 epoch,(1m 24s) is 0.001101
Accuracy at 69 minibatches is 1.000000
Average training loss at this epoch..minibatch..69..is 0.001101
Validation loss after 69 passes is 0.956997
Validation loss is higher than training loss at 69 is 0.956997 , stopping training!
Average training loss at 69 is 0.231706
Loss at 70 minibatches, 11 epoch,(1m 25s) is 0.001834
Accuracy at 70 minibatches is 1.000000
Average training loss at 

Loss at 94 minibatches, 15 epoch,(1m 55s) is 0.000754
Accuracy at 94 minibatches is 1.000000
Average training loss at this epoch..minibatch..94..is 0.000754
Validation loss after 94 passes is 1.122883
Validation loss is higher than training loss at 94 is 1.122883 , stopping training!
Average training loss at 94 is 0.170460
Loss at 95 minibatches, 15 epoch,(1m 56s) is 0.000696
Accuracy at 95 minibatches is 1.000000
Average training loss at this epoch..minibatch..95..is 0.000696
Validation loss after 95 passes is 1.200686
Validation loss is higher than training loss at 95 is 1.200686 , stopping training!
Average training loss at 95 is 0.168335
Reached 16 epochs
i 96
Test accuracy: 0.875
Max test accruacy: 0.8984375
Test aucroc: 0.9602150537634409
Max test aucroc: 0.9726751207729469
Train accuracy: 1.0
Max train accruacy: 1.0
Train aucroc: 1.0
Max train aucroc: 1.0
Loss at 97 minibatches, 16 epoch,(1m 58s) is 0.000588
Accuracy at 97 minibatches is 1.000000
Average training loss at this ep

Train accuracy: 1.0
Max train accruacy: 1.0
Train aucroc: 1.0
Max train aucroc: 1.0


In [5]:
#print(test_accuracy_full_batch(corpus.X_test, corpus.y_test, my_batch, word_attn, sent_attn))

In [6]:
#test_accuracy_full_batch(corpus.X_train, corpus.y_train, my_batch, word_attn, sent_attn)