In [13]:
import pandas as pd
import re
import gensim
import torch
import torch.nn as nn
import numpy as np
import sys
import time
import datetime
from typing import Any
from nltk.tokenize import TreebankWordTokenizer
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from torch.autograd import Variable
from sklearn.model_selection import KFold, train_test_split

np.random.seed(15)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print (device)

cpu


In [14]:
class LoadData():
    def __init__(self, dataset_name,
                 mr = '../../../../data/MR-rt-polaritydata/rt-polaritydata/',
                 sst = '../../../../data/SST-stanfordSentimentTreebank/',
                 subj = '../../../../data/Subj-rotten_imdb/',
                 trec = '../../../../data/TREC-Trec6/',
                 cr = '../../../../data/CR-customer review data/',
                 mpqa = '../../../../data/MRQA-database.mpqa.1.2/'):
        
        self.mr, self.sst, self.subj, self.trec, self.cr, self.mpqa = mr, sst, subj, trec, cr, mpqa
        self.data = []
        assert dataset_name in ['MR', 'SST1', 'SST2', 'Subj', 'TREC', 'CR', 'MPQA']
        if dataset_name == 'MR':
            self.load_mr_data()
            
        if dataset_name == 'SST1':
            self.load_sst1_data()
            
        if dataset_name == 'SST2':
            self.load_sst2_data()
            
        if dataset_name == 'Subj':
            self.load_subj_data()
            
        if dataset_name == 'TREC':
            self.load_trec_data()
            
        if dataset_name == 'CR':
            self.load_cr_data()
            
        if dataset_name == 'MPQA':
            self.load_mpqa_data()
    
    def load_data_in_kfolds(self, x, y, folds=10):
        kf = KFold(n_splits=10, shuffle=True)
        kf.get_n_splits(x)
        for train_index, test_index in kf.split(x):

            self.data.append({'x_train': [x[i] for i in train_index], 
                              'y_train': [y[i] for i in train_index], 
                              'x_test': [x[i] for i in test_index], 
                              'y_test': [y[i] for i in test_index]})
        
    def load_mr_data(self):
        with open(self.mr + '/rt-polarity.pos', 'r', encoding='utf-8', errors='ignore') as f:
            pos_statements = f.read().split('\n')
        with open(self.mr + '/rt-polarity.neg', 'r', encoding='utf-8', errors='ignore') as f:
            neg_statements = f.read().split('\n')
        x = pos_statements + neg_statements
        y = ['pos'] * len(pos_statements) + ['neg'] * len(neg_statements)
        self.load_data_in_kfolds(x, y)
        
    def load_sst1_data(self):
        sentences = pd.read_csv(self.sst + '/datasetSentences.txt', sep='\t')
        sentences_type = pd.read_csv(self.sst + '/datasetSplit.txt', sep=',')
        phase_labels = pd.read_csv(self.sst + '/sentiment_labels.txt', sep='|')
        phase_dict = pd.read_csv(self.sst + '/dictionary.txt', sep='|', 
                                 header=None, 
                                 names=['phase', 'phase_id'])
        
        data = pd.merge(sentences, sentences_type, on='sentence_index', how='left')
        data = pd.merge(data, phase_dict, left_on='sentence', right_on='phase', how='left')
        data = pd.merge(data, phase_labels, left_on='phase_id', right_on='phrase ids', how='left')
        
        sentiment = []
        for index, dp in data.iterrows():
            if dp['sentiment values'] <= 0.20:
                sentiment.append('very negative')
            elif dp['sentiment values'] > 0.20 and dp['sentiment values'] <= 0.40:
                sentiment.append('negative')
            elif dp['sentiment values'] > 0.40 and dp['sentiment values'] <= 0.60:
                sentiment.append('neutral')
            elif dp['sentiment values'] > 0.60 and dp['sentiment values'] <= 0.80:
                sentiment.append('positive')
            elif dp['sentiment values'] > 0.80:
                sentiment.append('very positive')
            else:
                sentiment.append(np.NaN)

        data['sentiment'] = sentiment
        data.dropna(subset=['sentence', 'sentiment'], inplace=True)
        self.data.append({'x_train': data[data.splitset_label.isin([1, 3])]['sentence'].tolist(),
                          'y_train': data[data.splitset_label.isin([1, 3])]['sentiment'].tolist(),
                          'x_test': data[data.splitset_label == 2]['sentence'].tolist(),
                          'y_test': data[data.splitset_label == 2]['sentiment'].tolist()})
        
    def load_sst2_data(self):
        sentences = pd.read_csv(self.sst + '/datasetSentences.txt', sep='\t')
        sentences_type = pd.read_csv(self.sst + '/datasetSplit.txt', sep=',')
        phase_labels = pd.read_csv(self.sst + '/sentiment_labels.txt', sep='|')
        phase_dict = pd.read_csv(self.sst + '/dictionary.txt', sep='|', 
                                 header=None, 
                                 names=['phase', 'phase_id'])
        
        data = pd.merge(sentences, sentences_type, on='sentence_index', how='left')
        data = pd.merge(data, phase_dict, left_on='sentence', right_on='phase', how='left')
        data = pd.merge(data, phase_labels, left_on='phase_id', right_on='phrase ids', how='left')
        
        sentiment = []
        for index, dp in data.iterrows():
            if dp['sentiment values'] <= 0.40:
                sentiment.append('negative')
            elif dp['sentiment values'] > 0.60:
                sentiment.append('positive')
            else:
                sentiment.append(np.NaN)

        data['sentiment'] = sentiment
        data.dropna(subset=['sentence', 'sentiment'], inplace=True)
        self.data.append({'x_train': data[data.splitset_label.isin([1, 3])]['sentence'].tolist(),
                          'y_train': data[data.splitset_label.isin([1, 3])]['sentiment'].tolist(),
                          'x_test': data[data.splitset_label == 2]['sentence'].tolist(),
                          'y_test': data[data.splitset_label == 2]['sentiment'].tolist()})
        
    def load_subj_data(self):
        with open(self.subj + '/plot.tok.gt9.5000', 'r',  encoding='utf-8', errors='ignore') as f:
            subjective_sentences = f.readlines()
        with open(self.subj + '/quote.tok.gt9.5000', 'r',  encoding='utf-8', errors='ignore') as f:
            objective_sentences = f.readlines()

        x = subjective_sentences + objective_sentences
        y = ['subj'] * len(subjective_sentences) + ['obj'] * len(objective_sentences)
        self.load_data_in_kfolds(x, y)
        
    def load_trec_data(self):
        data_train = pd.read_csv(self.trec + '/train_5500.label.txt', 
                sep=":", encoding='latin8', header=None, names=['Topic', 'Sentence'])
        data_test = pd.read_csv(self.trec + '/TREC_10.label.txt',
                sep=":", encoding='latin8', header=None, names=['Topic', 'Sentence'])
        
        self.data.append({'x_train': data_train['Sentence'].tolist(),
                          'y_train': data_train['Topic'].tolist(),
                          'x_test': data_test['Sentence'].tolist(),
                          'y_test': data_test['Topic'].tolist()})
        
    def load_cr_data(self):
        reviews = []
        product_type = []

        with open(self.cr + '/Canon G3.txt', 
                  'r',  encoding='utf-8', errors='ignore') as f:
            data = f.readlines()[11:]    
            for text in data:
                reviews.append(text.strip().split('##')[-1].replace('[t]', ''))
                product_type.append('camera')

        with open(self.cr + '/Apex AD2600 Progressive-scan DVD player.txt', 
                  'r',  encoding='utf-8', errors='ignore') as f:
            data = f.readlines()[11:]    
            for text in data:
                reviews.append(text.strip().split('##')[-1].replace('[t]', ''))
                product_type.append('mp3s etc')

        with open(self.cr + '/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt', 
                  'r',  encoding='utf-8', errors='ignore') as f:
            data = f.readlines()[11:]    
            for text in data:
                reviews.append(text.strip().split('##')[-1].replace('[t]', ''))
                product_type.append('mp3s etc')

        with open(self.cr + '/Nikon coolpix 4300.txt', 
                  'r',  encoding='utf-8', errors='ignore') as f:
            data = f.readlines()[11:]    
            for text in data:
                reviews.append(text.strip().split('##')[-1].replace('[t]', ''))
                product_type.append('camera')

        with open(self.cr + '/Nokia 6610.txt', 
                  'r',  encoding='utf-8', errors='ignore') as f:
            data = f.readlines()[11:]    
            for text in data:
                reviews.append(text.strip().split('##')[-1].replace('[t]', ''))
                product_type.append('mp3s etc')
        x = reviews
        y = product_type
        self.load_data_in_kfolds(x, y)
        
    def load_mpqa_data(self):
        text, sentiment = [], []
        with open(self.mpqa + '/mpqa.neg.txt', 'r', encoding='utf-8', errors='ignore') as f:
            data = f.readlines()
            text = text + data
            sentiment = sentiment + ['neg'] * len(data)
        
        with open(self.mpqa + '/mpqa.pos.txt', 'r', encoding='utf-8', errors='ignore') as f:
            data = f.readlines()
            text = text + data
            sentiment = sentiment + ['pos'] * len(data)
        x = text
        y = sentiment
        self.load_data_in_kfolds(x, y)

In [15]:
class PreprocessData():
    def __init__(self, use_pretrained_vector=False,
                 w2v_path='../../../../data/google.news.word2vec/GoogleNews-vectors-negative300.bin'):
        self.word2vec = None
        if use_pretrained_vector:
            print('Loading Word2vec')
            self.word2vec = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
        self.re_word_tokenizer = re.compile(r"\w+", re.I)
        self.reset()

    def set_dataset_name(self, dataset_name):
        self.dataset_name = dataset_name

    def set_maximum_sentence_length(self, sentences):
        sentences_tok = self.tokenize_sentences(sentences)
        self.max_sen_len = np.max([len(s) for s in sentences_tok])

    def reset(self):
        self.dictionary = {}
        self.class2index = {}
        self.index2class = {}
        self.classCount = 0
        self.word2index = {'unk': 0}
        self.wordcount = {'unk': 0}
        self.index2word = {0: 'unk'}
        self.wordCount = 0
        self.wordCount_w2v = 0
        self.weights = None

    def tokenize_sentences(self, sentences):
        if self.dataset_name == 'SST1' or self.dataset_name == 'SST2':
            return [self.clean_str_sst(sen).split(' ') for sen in sentences]
        else:
            return [self.clean_str(sen).split(' ') for sen in sentences]

    def get_average_sentence_length(self, sentences):
        sentences_tok = self.tokenize_sentences(sentences)
        return np.mean([len(s) for s in sentences_tok])

    def update_dict(self, sent):
        for word in sent:
            if not word.lower() in self.word2index:
                self.wordCount += 1
                self.word2index[word.lower()] = self.wordCount
                self.index2word[self.wordCount] = word.lower()
                self.wordcount[word.lower()] = 0
            self.wordcount[word.lower()] += 1

    def train_dictionary(self, sentences, use_pretrained_vector=False):
        sentences_tok = self.tokenize_sentences(sentences)
        for sent_tok in sentences_tok:
            self.update_dict(sent_tok)

        if use_pretrained_vector:
            self.weights = np.zeros((self.wordCount + 1, self.word2vec.vector_size), np.float)
            for i in self.index2word:
                if self.index2word[i] in self.word2vec:
                    self.weights[i, :] = self.word2vec[self.index2word[i]]
                    self.wordCount_w2v += 1
                elif self.wordcount[self.index2word[i]] >= 5:
                    self.weights[i, :] = np.random.uniform(-0.25, 0.25, self.word2vec.vector_size)
                else:
                    pass

            # Old method the weights were directly picked up from word2vec,
            # weight values were not matching in th end thus skipped to self retaining weights
            # self.word2index = {token: token_index for token_index, token in enumerate(self.word2vec.index2word)}
            # self.index2word = {token_index: token for token_index, token in enumerate(self.word2vec.index2word)}
            # self.update_wordCount_w2v()

    def train_classes(self, classes):
        for cl in np.unique(classes):
            if cl not in self.class2index:
                self.class2index[cl] = self.classCount
                self.index2class[self.classCount] = cl
                self.classCount += 1

    def sent2Index(self, sentences):
        sentIndexed = []
        sentences_tok = self.tokenize_sentences(sentences)
        for sent_tok in sentences_tok:
            sentIndx = []
            for w in sent_tok:
                if w.lower() in self.word2index:
                    sentIndx.append(self.word2index[w.lower()])
                else:
                    sentIndx.append(self.word2index['unk'])

            if len(sentIndx) < self.max_sen_len:
                sentIndx = sentIndx + ([self.word2index['unk']] * (self.max_sen_len - len(sentIndx)))
            sentIndexed.append(sentIndx)

            ## As per paper we initially used variable sentence length
            ## adding 'unk' parameter only where it was necessary to achieve min filter
            ## length. While similar accuracy is achieved by the method it lacks speed.
            ## Missing out matrix multiplication as the modelling layer
            # if len(sentIndx) < 5:
            #     sentIndx = sentIndx + ([self.word2index['unk']] * (5 - len(sentIndx)))
            # sentIndexed.append(torch.LongTensor(sentIndx).to(device))

        return torch.LongTensor(sentIndexed).to(device)

    def clean_str(self, sent):
        sent = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sent)
        sent = re.sub(r"\'s", " \'s", sent)
        sent = re.sub(r"\'ve", " \'ve", sent)
        sent = re.sub(r"n\'t", " n\'t", sent)
        sent = re.sub(r"\'re", " \'re", sent)
        sent = re.sub(r"\'d", " \'d", sent)
        sent = re.sub(r"\'ll", " \'ll", sent)
        sent = re.sub(r",", " , ", sent)
        sent = re.sub(r"!", " ! ", sent)
        sent = re.sub(r"\(", " ( ", sent)
        sent = re.sub(r"\)", " ) ", sent)
        sent = re.sub(r"\?", " ? ", sent)
        sent = re.sub(r"\s{2,}", " ", sent)
        return sent.strip().lower()

    def clean_str_sst(self, sent):
        sent = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sent)
        sent = re.sub(r"\s{2,}", " ", sent)
        return sent.strip().lower()

    def class2Index(self, classList):
        return torch.LongTensor([self.class2index[c] for c in classList]).to(device)

    def proprocess_data(self, x, y):
        x, y = shuffle(x, y, random_state=17)
        x = self.sent2Index(x)
        y = self.class2Index(y)

        return x, y

In [17]:
dataview = {'Dataset Name': [],
            'No. of Classes': [],
            'Average Length of Sentences': [],
            'Max Length of Sentence': [],
            'Dataset Size': [],
            'Number of Words': [],
            'Number of Words in Word2Vec': [],
            'Test Data Size': []}

data_preprocessor = PreprocessData(use_pretrained_vector=True)
for ds in ['MR', 'SST1', 'SST2', 'Subj', 'TREC', 'CR', 'MPQA']:
    data_loader = LoadData(ds)
    data_preprocessor.reset()
    data_preprocessor.set_dataset_name(ds)
    data_preprocessor.set_maximum_sentence_length(data_loader.data[0]['x_train'] + data_loader.data[0]['x_test'])
    data_preprocessor.train_dictionary(data_loader.data[0]['x_train'] + data_loader.data[0]['x_test'], use_pretrained_vector=True)
    data_preprocessor.train_classes(data_loader.data[0]['y_train'])
    
    dataview['Dataset Name'].append(ds)
    dataview['No. of Classes'].append(data_preprocessor.classCount)
    dataview['Average Length of Sentences'].append(np.round(data_preprocessor.get_average_sentence_length(data_loader.data[0]['x_train'] + 
                                                                                                 data_loader.data[0]['x_test']), 0))
    dataview['Max Length of Sentence'].append(data_preprocessor.max_sen_len)
    dataview['Dataset Size'].append(len(data_loader.data[0]['x_train'] + data_loader.data[0]['x_test']))
    dataview['Number of Words'].append(data_preprocessor.wordCount)
    dataview['Number of Words in Word2Vec'].append(data_preprocessor.wordCount_w2v)
    dataview['Test Data Size'].append('CV' if len(data_loader.data) > 1 else len(data_loader.data[0]['x_test']))

df = pd.DataFrame(dataview)
df

Loading Word2vec


Unnamed: 0,Dataset Name,No. of Classes,Average Length of Sentences,Max Length of Sentence,Dataset Size,Number of Words,Number of Words in Word2Vec,Test Data Size
0,MR,2,20.0,56,10664,18779,16417,CV
1,SST1,5,18.0,53,11286,17200,15748,2125
2,SST2,2,18.0,53,9142,15603,14338,1749
3,Subj,2,23.0,120,10000,21335,17897,CV
4,TREC,6,11.0,38,5952,8708,7475,500
5,CR,2,16.0,105,4260,5226,4755,CV
6,MPQA,2,3.0,36,10606,6247,6084,CV


|   Data | c | l  |    N  |   V   |  Vpre | Test |
|--------|---|----|-------|-------|-------|------|
|   MR   | 2 | 20 | 10662 | 18765 | 16448 |  CV  |
|  SST1  | 5 | 18 | 11855 | 17836 | 16262 | 2210 |
|  SST2  | 2 | 19 | 9613  | 16185 | 14838 | 1821 |
|  Subj  | 2 | 23 | 10000 | 21323 | 17913 |  CV  |
|  TREC  | 6 | 10 | 5952  | 9592  | 9125  | 500  |
|   CR   | 2 | 19 | 3775  | 5340  | 5046  |  CV  |
|  MPQA  | 2 |  3 | 10606 | 6246  | 6083  |  CV  |

In [5]:
import torch
import torch.nn as nn
from torch.autograd import Variable


class CNNClassificationModel(nn.Module):
    def __init__(self,
                 use_pretrained_vector=False,
                 word_count=300000,
                 embedding_size=128,
                 number_of_classes=2,
                 batch_size=50,
                 keep_embeddings_static=False,
                 pretrained_vector_weight=None,
                 use_multi_channel=False):
        super(CNNClassificationModel, self).__init__()

        self.keep_embeddings_static = keep_embeddings_static
        self.number_of_classes = number_of_classes
        self.batch_size = batch_size
        self.input_channel = 1
        self.use_multi_channel = use_multi_channel

        # Setting up embeddings
        if use_pretrained_vector:
            self.embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(pretrained_vector_weight),
                                                                freeze=False)
            self.embedding_size = pretrained_vector_weight.shape[1]
        else:
            self.embedding_layer = nn.Embedding(word_count, embedding_size)
            self.embedding_size = embedding_size
            nn.init.uniform_(self.embedding_layer.weight, -1.0, 1.0)

        if use_multi_channel:
            self.embedding_layer2 = nn.Embedding.from_pretrained(torch.FloatTensor(pretrained_vector_weight),
                                                                 freeze=True)
            self.input_channel = 2


        self.convolution_layer_3dfilter = nn.Conv2d(self.input_channel, 100, (3, self.embedding_size))
        nn.init.xavier_uniform_(self.convolution_layer_3dfilter.weight)

        self.convolution_layer_4dfilter = nn.Conv1d(self.input_channel, 100, (4, self.embedding_size))
        nn.init.xavier_uniform_(self.convolution_layer_4dfilter.weight)

        self.convolution_layer_5dfilter = nn.Conv1d(self.input_channel, 100, (5, self.embedding_size))
        nn.init.xavier_uniform_(self.convolution_layer_4dfilter.weight)

        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(300, self.number_of_classes)
        nn.init.xavier_uniform_(self.linear.weight)

        self.softmax = nn.Softmax(dim=1)

    def forward(self, input):
        embedded = self.embedding_layer(input)
        if self.keep_embeddings_static:
            embedded = Variable(embedded)

        if self.use_multi_channel:
            embedded2 = self.embedding_layer2(input)
            embedded = torch.stack([embedded, embedded2], dim=1)
        else:
            embedded = embedded.unsqueeze(1)

        conv_opt3 = self.convolution_layer_3dfilter(embedded)
        conv_opt4 = self.convolution_layer_4dfilter(embedded)
        conv_opt5 = self.convolution_layer_5dfilter(embedded)

        conv_opt3 = nn.functional.relu(conv_opt3).squeeze(3)
        conv_opt4 = nn.functional.relu(conv_opt4).squeeze(3)
        conv_opt5 = nn.functional.relu(conv_opt5).squeeze(3)

        conv_opt3 = nn.functional.max_pool1d(conv_opt3, conv_opt3.size(2)).squeeze(2)
        conv_opt4 = nn.functional.max_pool1d(conv_opt4, conv_opt4.size(2)).squeeze(2)
        conv_opt5 = nn.functional.max_pool1d(conv_opt5, conv_opt5.size(2)).squeeze(2)


        conv_opt = torch.cat((conv_opt3, conv_opt4, conv_opt5), 1)
        conv_opt = self.dropout(conv_opt)

        linear_opt = self.linear(conv_opt)

        return linear_opt

    ## I used sentences with Variable  earlier as implemented in the paper but as the training was too slow
    ## sentences were padded to maximum sentence length
    # def forward(self, input):
    #     conv_output = []
    #     for inp in input:
    #         embedded = self.embedding_layer(inp).view(1, self.embedding_size, -1)
    #         if self.keep_embeddings_static:
    #             embedded = Variable(embedded)
    #         conv_opt3 = self.convolution_layer_3dfilter(embedded)
    #         conv_opt4 = self.convolution_layer_4dfilter(embedded)
    #         conv_opt5 = self.convolution_layer_5dfilter(embedded)
    #         conv_opt3 = nn.functional.relu(conv_opt3)
    #         conv_opt4 = nn.functional.relu(conv_opt4)
    #         conv_opt5 = nn.functional.relu(conv_opt5)
    #
    #         # Maxpooling to take out the max from each one 100 fitera
    #         conv_opt3 = nn.functional.max_pool1d(conv_opt3, conv_opt3.size(2))
    #         conv_opt4 = nn.functional.max_pool1d(conv_opt4, conv_opt4.size(2))
    #         conv_opt5 = nn.functional.max_pool1d(conv_opt5, conv_opt5.size(2))
    #
    #         conv_opt = torch.cat((conv_opt3, conv_opt4, conv_opt5), 2).view(1, -1)
    #         conv_output.append(conv_opt)
    #
    #     conv_output = torch.cat(conv_output, 0)
    #     conv_output = self.dropout(conv_output)
    #
    #     output = self.linear(conv_output)
    #
    #     return output

In [6]:
def get_batch(x, y, batch_size=50):
    x, y = shuffle(x, y, random_state=13)
    start_index, end_index = 0, 0
    data_batches = []
    while end_index < len(x):
        end_index = (start_index + batch_size) if (start_index + batch_size) < len(x) else len(x)
        data_batches.append((x[start_index:end_index], y[start_index:end_index]))
        start_index = start_index + batch_size
    return data_batches

In [7]:
def train(x_train, y_train, x_test, y_test,
          model, optimizer, criterion,
          model_name = 'model', model_store = False,
          batch_size=50, epochs=100, log_iter=10):
    
    data_batches = get_batch(x_train, y_train, batch_size)
    train_loss, train_accuracy, test_loss, test_accuracy = [], [], [], []
    start_time = start_time = time.time()
    for epoch in range(epochs):        
        # Setting model intot training mode
        model.train() # setting model in train mode
        for batch_num, data in enumerate(data_batches):
            x, y  = data[0], data[1]
            y_pred = model(x)
            
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            
#             This piece is to constrain the normalized weight ||w|| of 
#             the output linear layer be constrained at 3
#             l2_weights = torch.norm(model.linear.weight).item()
#             if l2_weights > 3:
#                 model.linear.parameters = model.linear.weight/l2_weights
            optimizer.step()
            
            sys.stdout.write('\rEpoch:{} | Batch:{} | Time Running: {}'.format(epoch, 
                                                                               batch_num, 
                                                                               datetime.timedelta(seconds=np.round(time.time() - start_time, 0))))

            break
            
        trainloss, trainacc = evaluate(x_train, y_train, model, criterion)
        testloss, testacc = evaluate(x_test, y_test, model, criterion)
        
        if epoch>log_iter and testacc > np.max(test_accuracy) and model_store == True:
            torch.save(model, './models/' + model_name + '.torch')
            
        train_loss.append(trainloss)
        train_accuracy.append(trainacc)
        test_loss.append(testloss)
        test_accuracy.append(testacc)
        
        if epoch%log_iter == 0:
            print (' Train Acc {:.4f}, Train Loss {:.4f}, Test Acc {:.4f}, Test Loss {:.4f}'.format(trainacc, 
                                                                                                    trainloss, 
                                                                                                    testacc,
                                                                                                    testloss))
        
                
    print ('Accuracy Test {:.4f}'.format(np.max(test_accuracy)))
    return train_loss, train_accuracy, test_loss, test_accuracy

In [8]:
def evaluate(x, y, model, criterion):
    model.eval() # setting model to eval mode
    y_pred = model(x)
    loss = criterion(y_pred, y)
    accuracy = accuracy_score(y, y_pred.argmax(-1))
    return loss.item(), accuracy

In [9]:
def predict(x):
    model.eval() # setting model to eval mode
    y_pred = model(x)    
    return y_pred.argmax(-1)

In [10]:
modelling_results = {'MR': {}, 'SST1': {}, 'SST2': {}, 'Subj': {}, 'TREC': {}, 'CR': {}}

In [12]:
# data_preprocessor = PreprocessData()

# MR
data_loader = LoadData('MR')

## CNN Rand
accuracy = []
for d in data_loader.data:
    data_preprocessor.reset()
    data_preprocessor.set_dataset_name(ds)
    data_preprocessor.set_maximum_sentence_length(d['x_train'] + d['x_test'])
    data_preprocessor.train_dictionary(d['x_train'] + d['x_test'])
    data_preprocessor.train_classes(d['y_train'])
    
    x_train = data_preprocessor.sent2Index(d['x_train'])
    y_train = data_preprocessor.class2Index(d['y_train'])
    
    x_test = data_preprocessor.sent2Index(d['x_test'])
    y_test = data_preprocessor.class2Index(d['y_test'])
    
    model = CNNClassificationModel(use_pretrained_vector=False, 
                                   word_count=data_preprocessor.wordCount+1, 
                                   embedding_size=128,
                                   number_of_classes=data_preprocessor.classCount,
                                   batch_size=50, 
                                   keep_embeddings_static=False)
    
    
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    train_loss, train_accuracy, test_loss, test_accuracy = train(x_train, y_train, x_test, y_test, 
                                                                 model, optimizer, criterion, epochs=200, batch_size=50, 
                                                                 log_iter=20, model_name='MR', model_store=True)
    
    accuracy.append(test_accuracy[-1])
    break

print ('CV Test Accuracy {}'.format(np.mean(accuracy)))
modelling_results['MR']['CNNRand'] = np.mean(accuracy)

Epoch:0 | Batch:0 | Time Running: 0:00:00 Train Acc 0.5060, Train Loss 0.7025, Test Acc 0.4995, Test Loss 0.7065
Epoch:20 | Batch:0 | Time Running: 0:03:12 Train Acc 0.5452, Train Loss 0.7051, Test Acc 0.5380, Test Loss 0.7112
Epoch:40 | Batch:0 | Time Running: 0:06:34 Train Acc 0.5480, Train Loss 0.7269, Test Acc 0.5211, Test Loss 0.7330
Epoch:60 | Batch:0 | Time Running: 0:10:09 Train Acc 0.5482, Train Loss 0.7339, Test Acc 0.5342, Test Loss 0.7394
Epoch:80 | Batch:0 | Time Running: 0:13:30 Train Acc 0.5483, Train Loss 0.7390, Test Acc 0.5380, Test Loss 0.7445
Epoch:100 | Batch:0 | Time Running: 0:16:40 Train Acc 0.5484, Train Loss 0.7434, Test Acc 0.5323, Test Loss 0.7489
Epoch:120 | Batch:0 | Time Running: 0:19:54 Train Acc 0.5482, Train Loss 0.7471, Test Acc 0.5333, Test Loss 0.7527
Epoch:140 | Batch:0 | Time Running: 0:22:53 Train Acc 0.5497, Train Loss 0.7505, Test Acc 0.5398, Test Loss 0.7557
Epoch:160 | Batch:0 | Time Running: 0:25:51 Train Acc 0.5508, Train Loss 0.7529, Test 

In [None]:
if not os.path.isfile('./results/MR.csv'):
    df = pd.DataFrame({'Date': [], 'Model Type': [], 'Test Accuracy': [], 'Parameters': []})
else:
    df = pd.read_csv('./results/MR.csv')

df = pd.concat([df, pd.DataFrame({'Date': [datetime.datetime.now().strftime('%d %b %Y')],
                                  'Model Type': ['CNN-Rand'],
                                  'Test Accuracy': [np.mean(accuracy)],
                                  'Parameters': []})])
    
df.to_csv('./results/MR.csv')