In [1]:
import re
# import nltk
import numpy as np
import torch as th
import torch.autograd as ag
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    f = open(filename)
    dataset=[]
    line =  f.readline()
    cpt=1
    skip=0
    while line :
        cleanline = clean_str(f.readline()).split()
        if cleanline: 
            dataset.append(cleanline)
        else: 
            line = f.readline()
            skip+=1
            continue
        if limit > 0 and cpt >= limit: 
            break
        line = f.readline()
        cpt+=1        
        
    f.close()
    print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
    return dataset


In [3]:
LIM=5000
txtfile = "imdb.pos"  # path of the file containing positive reviews
postxt = loadTexts(txtfile,limit=LIM)

txtfile = "imdb.neg"  # path of the file containing negative reviews
negtxt = loadTexts(txtfile,limit=LIM)

Load  5000  lines from  imdb.pos  /  1  lines discarded
Load  5000  lines from  imdb.neg  /  1  lines discarded


In [4]:
train_data = postxt[:int(len(postxt)*0.8)]+negtxt[:int(len(negtxt)*0.8)]
train_label = np.array([1.0 for i in range(int(len(train_data)/2))]+[0.0 for i in range(int(len(train_data)/2))])

dev_data =postxt[int(len(postxt)*0.8):int(len(postxt)*0.9)]+negtxt[int(len(postxt)*0.8):int(len(negtxt)*0.9)]
dev_label = np.array([1.0 for i in range(int(len(dev_data)/2))]+[0.0 for i in range(int(len(dev_data)/2))])

test_data = postxt[int(len(postxt)*0.9):]+negtxt[int(len(postxt)*0.9):]  
test_label = np.array([1.0 for i in range(int(len(test_data)/2))]+[0.0 for i in range(int(len(test_data)/2))])

In [126]:
train_txt = ""
for s in train_data:
    for w in s:
        train_txt+= w +" "
        
token = train_txt.lower().split()
words = Counter(token)
words = sorted(words, key=words.get, reverse=True)
vocab_size = len(words)
word2idx = {o:i for i,o in enumerate(words)}
word2idx["UNK"]=len(word2idx)
print("vocab size : ",vocab_size)


vocab size :  6997


In [193]:
class Conv_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, kernel_size, feat_size):
        super(Conv_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.conv1 = nn.Linear(embedding_dim*kernel_size, feat_size)
        self.l1 = nn.Linear(feat_size, 1)
        
        self.kernel_size = kernel_size
        nn.init.xavier_uniform_(self.l1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.l1.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, inputs):
        if(inputs.shape[0]<self.kernel_size):
            pad = th.tensor([0 for i in range(self.kernel_size-inputs.shape[0])])
            inputs = th.cat((inputs, pad), 0)
        
        inputs = self.embedding(inputs)
        cat_inputs= []
        h = th.Tensor(inputs.shape[0], inputs.shape[1]*self.kernel_size)
        
        for i in range(len(inputs)-self.kernel_size+1):
            tab = [inputs[j] for j in range(i, self.kernel_size+i)]
            cat = th.cat(tab, 0)
            cat_inputs.append(self.conv1(cat))
        
        h = th.stack(cat_inputs)
        h, _ = th.max(h, 0)
        
        out = self.l1(h)
        return th.sigmoid(out)

In [194]:
loss_fn = nn.BCELoss()
lr = 1e-1
m = Conv_classifier(len(word2idx), 5, 3, 5)
train_accuracies = []
train_losses = []
dev_accuracies = []
dev_losses = []

In [195]:
def train(model, max_epochs=20):
    optim = th.optim.SGD(params=model.parameters(), lr =lr, weight_decay=1e-4)
#     model.train()
    idx_train = np.arange(len(train_data))
    idx_dev = np.arange(len(dev_data))
    
    for e in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0
        
        np.random.shuffle(idx_train)
        np.random.shuffle(idx_dev)
        for i in idx_train:
            s = train_data[i]
            y = train_label[i]
            
            idxsentence = th.LongTensor([word2idx[w] for w in s])
            label = th.tensor([y])
            pred = model(idxsentence)
            loss = loss_fn(pred, label)

            train_mean_loss+=loss.item()
            optim.zero_grad()
            loss.backward()
            optim.step()
            
            if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                train_accuracy+=1
        ###Dev test
        for i in idx_dev:
            s = dev_data[i]
            y = dev_label[i]
            idxsentence = th.LongTensor([word2idx[w] for w in s if w in word2idx])
            label = th.tensor([y])
            pred = model(idxsentence)
            loss = loss_fn(pred, label)
            dev_mean_loss+=loss.item()
            if((pred<0.5 and label==0) or (pred>0.5 and label == 1)):
                dev_accuracy+=1
        train_accuracies.append(train_accuracy/len(train_data))
        train_losses.append(train_mean_loss/len(train_data))
        dev_accuracies.append(dev_accuracy/len(dev_data))
        dev_losses.append(dev_mean_loss/len(dev_data))
        print("EPOCH {}".format(e+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")

In [196]:
train(m)

EPOCH 1
Train Accuracy :  0.5055
Dev Accuracy :  0.523
Train Mean loss :  0.7166379799523857
Dev Mean loss :  0.7075098401606082
----------------------------------------
EPOCH 2
Train Accuracy :  0.538125
Dev Accuracy :  0.533
Train Mean loss :  0.7055273558418267
Dev Mean loss :  0.680806639611721
----------------------------------------
EPOCH 3
Train Accuracy :  0.6165
Dev Accuracy :  0.644
Train Mean loss :  0.656465456299693
Dev Mean loss :  0.6737398589443182
----------------------------------------
EPOCH 4
Train Accuracy :  0.687625
Dev Accuracy :  0.68
Train Mean loss :  0.5847738357541057
Dev Mean loss :  0.6046349857150563
----------------------------------------
EPOCH 5
Train Accuracy :  0.729
Dev Accuracy :  0.679
Train Mean loss :  0.5271868862870419
Dev Mean loss :  0.5939605705873546
----------------------------------------
EPOCH 6
Train Accuracy :  0.762
Dev Accuracy :  0.737
Train Mean loss :  0.48723723436920024
Dev Mean loss :  0.5203046619867964
---------------------