In [41]:
import re
# import nltk
import numpy as np
import torch as th
import torch.autograd as ag
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Tokenize a sentence
def clean_str(string, tolower=True):
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    f = open(filename)
    dataset=[]
    line =  f.readline()
    cpt=1
    skip=0
    while line :
        cleanline = clean_str(f.readline()).split()
        if cleanline: 
            dataset.append(cleanline)
        else: 
            line = f.readline()
            skip+=1
            continue
        if limit > 0 and cpt >= limit: 
            break
        line = f.readline()
        cpt+=1        
        
    f.close()
    print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
    return dataset


In [256]:
LIM=5000
txtfile = "imdb.pos"  # path of the file containing positive reviews
postxt = loadTexts(txtfile,limit=LIM)

txtfile = "imdb.neg"  # path of the file containing negative reviews
negtxt = loadTexts(txtfile,limit=LIM)

Load  5000  lines from  imdb.pos  /  1  lines discarded
Load  5000  lines from  imdb.neg  /  1  lines discarded


In [274]:
train_data = postxt[:int(len(postxt)*0.8)]+negtxt[:int(len(negtxt)*0.8)]
train_label = np.array([1 for i in range(int(len(train_data)/2))]+[0 for i in range(int(len(train_data)/2))], dtype=np.float32)

dev_data =postxt[int(len(postxt)*0.8):int(len(postxt)*0.9)]+negtxt[int(len(postxt)*0.8):int(len(negtxt)*0.9)]
dev_label = np.array([1 for i in range(int(len(dev_data)/2))]+[0 for i in range(int(len(dev_data)/2))], dtype=np.float32)

test_data = postxt[int(len(postxt)*0.9):]+negtxt[int(len(postxt)*0.9):]  
test_label = np.array([1 for i in range(int(len(test_data)/2))]+[0 for i in range(int(len(test_data)/2))], dtype=np.float32)

In [275]:
train_txt = ""
for s in train_data:
    for w in s:
        train_txt+= w +" "
        
token = train_txt.lower().split()
words = Counter(token)
words = sorted(words, key=words.get, reverse=True)
words = ['_PAD','_UNK'] + words
word2idx = {o:i for i,o in enumerate(words)}
vocab_size = len(word2idx)
print("vocab size : ",vocab_size)


vocab size :  6999


In [276]:
train_data = [[word2idx[w] for w in s] for s in train_data]
dev_data = [[word2idx[w] if w in word2idx else 0 for w in s] for s in dev_data]
test_data = [[word2idx[w] if w in word2idx else 0 for w in s] for s in test_data]

In [277]:
print(train_data[0])

[36, 26, 742, 11, 63, 22, 80]


In [278]:
def pad_input(data, seq_len):
    features = np.zeros((len(data), seq_len), dtype=int)
    for ii, review in enumerate(data):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [279]:
seq_len = max([len(s) for s in train_data])

train_data = pad_input(train_data, seq_len)
dev_data = pad_input(dev_data, seq_len)
test_data = pad_input(dev_data, seq_len)

In [280]:
train_data = TensorDataset(th.from_numpy(train_data), th.from_numpy(train_label))
dev_data = TensorDataset(th.from_numpy(dev_data), th.from_numpy(dev_label))
test_data = TensorDataset(th.from_numpy(test_data), th.from_numpy(test_label))

batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [296]:
class Conv_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, kernel_size, feat_size):
        super(Conv_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.conv1 = nn.Linear(embedding_dim*kernel_size, feat_size)
        self.l1 = nn.Linear(feat_size, 1)
        
        self.kernel_size = kernel_size
        nn.init.xavier_uniform_(self.l1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.l1.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, inputs):
        batch_size = inputs.shape[0]
#         inputs = inputs.view(batch_size, inputs.shape[1], 1)
        
#         if(inputs.shape[0]<self.kernel_size):
#             pad = th.tensor([0 for i in range(self.kernel_size-inputs.shape[0])])
#             inputs = th.cat((inputs, pad), 0)

        inputs = self.embedding(inputs)
        
        cat_inputs= []
        h = th.Tensor(batch_size,inputs.shape[1], inputs.shape[2]*self.kernel_size)
        
        for i in range(len(inputs)-self.kernel_size+1):
            tab = [inputs[:,j,:] for j in range(i, self.kernel_size+i)]
            cat = th.cat(tab, 1)
            cat_inputs.append(self.conv1(cat))
        
        h = th.stack(cat_inputs)
        h, _ = th.max(h, 0)
        
        out = self.l1(h)
        out = out.view(-1)
        return th.sigmoid(out)

In [297]:
class LSTM_Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim,nb_cell, hidden_dim, feat_size):
        super(LSTM_Classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_dim = hidden_dim
        self.nb_cell = nb_cell
        
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim, nb_cells, batch_first = True)
        self.l1 = nn.Linear(hidden_dim, 1)
        
        
        nn.init.xavier_uniform_(self.l1.weight.data)  # Xavier/Glorot init for tanh
        nn.init.zeros_(self.l1.bias.data)  # Xavier/Glorot init for tanh
        
    def forward(self, inputs, hidden):
        
        inputs = self.embedding(inputs)
        
        
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = th.Tensor(self.nb_cells, batch_size, self.hidden_size)
        if self.is_lstm:
            hidden = (weight.new(self.nb_cells, batch_size, self.hidden_size).zero_(),
                      weight.new(self.nb_cells, batch_size, self.hidden_size).zero_())
        else:
            hidden = weight.new(self.nb_cells, batch_size, self.hidden_size).zero_()
            
        return hidden

In [298]:
loss_fn = nn.BCELoss()
lr = 1e-1
m = Conv_classifier(len(word2idx), 5, 3, 5)
train_accuracies = []
train_losses = []
dev_accuracies = []
dev_losses = []
clip=5

In [299]:
def train(model, max_epochs=20):
    optim = th.optim.SGD(params=model.parameters(), lr =lr, weight_decay=1e-4)
#     model.train()

    
    for e in range(max_epochs):
        train_accuracy = 0
        dev_accuracy = 0
        train_mean_loss = 0
        dev_mean_loss = 0
        
        
        for x,labels in train_loader:

            preds = model(x)
            
            loss = loss_fn(preds, labels)
            train_mean_loss+=loss.item()
            optim.zero_grad()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            loss.backward()
            optim.step()
            
            for pred, label in zip(preds, labels):
                if (pred<0.5 and label==0) or (pred>0.5 and label==1):
                    train_accuracy+=1
            
                
        ###Dev test
        for x, labels in dev_loader:
            preds = model(x)
            loss = loss_fn(preds, labels)
            dev_mean_loss+=loss.item()
            
            for pred, label in zip(preds, labels):
                if (pred<0.5 and label==0) or (pred>0.5 and label==1):
                    dev_accuracy+=1
                    
        train_accuracies.append(train_accuracy/len(train_data))
        train_losses.append(train_mean_loss/len(train_data))
        dev_accuracies.append(dev_accuracy/len(dev_data))
        dev_losses.append(dev_mean_loss/len(dev_data))
        print("EPOCH {}".format(e+1))
        print("Train Accuracy : ",train_accuracy/len(train_data))
        print("Dev Accuracy : ",dev_accuracy/len(dev_data))
        print("Train Mean loss : ",train_mean_loss/len(train_data))
        print("Dev Mean loss : ",dev_mean_loss/len(dev_data))
        print("----------------------------------------")

In [300]:
train(m)

EPOCH 1
Train Accuracy :  0.53775
Dev Accuracy :  0.557
Train Mean loss :  0.013837529040873051
Dev Mean loss :  0.013682193636894225
----------------------------------------
EPOCH 2
Train Accuracy :  0.558375
Dev Accuracy :  0.582
Train Mean loss :  0.013676310524344443
Dev Mean loss :  0.013619078636169433
----------------------------------------
EPOCH 3
Train Accuracy :  0.567125
Dev Accuracy :  0.583
Train Mean loss :  0.01362375996261835
Dev Mean loss :  0.013570053398609162
----------------------------------------
EPOCH 4
Train Accuracy :  0.56925
Dev Accuracy :  0.589
Train Mean loss :  0.013579626128077508
Dev Mean loss :  0.01352558070421219
----------------------------------------
EPOCH 5
Train Accuracy :  0.57875
Dev Accuracy :  0.594
Train Mean loss :  0.013543844170868397
Dev Mean loss :  0.013506351232528687
----------------------------------------
EPOCH 6
Train Accuracy :  0.580125
Dev Accuracy :  0.582
Train Mean loss :  0.013473866969347
Dev Mean loss :  0.013470309495