In [1]:
import numpy as np
import torch
import os
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f
from torch.utils.data import DataLoader,Dataset
import re
import pickle

Define a function to remove the punctuation

In [2]:
def tokenlize(content):
    content = re.sub('<.*?>','', content)
    filters = ['#','$','%','&',',','\.',':','\(','\)']
    content = re.sub("|".join(filters),'',content)
    content = re.sub('\'s', ' is', content)
    content = re.sub('\'re', ' are', content)
    content = re.sub('\'m', ' am', content)
    tokens = [i.lower() for i in content.split()]
    return tokens

Define a class to load Dataset and get the data

In [3]:
class imdb_dataset(Dataset):
    def __init__(self, train=True):
        self.train_path = '../../dataset/aclImdb_v1/aclImdb/train/'
        self.test_path = '../../dataset/aclImdb_v1/aclImdb/test/'
        data_path = self.train_path if train else self.test_path
        temp_data_path = [data_path+'pos/', data_path+'neg/']
        self.total_file_path = []
        for path in temp_data_path:
            file_name_list = os.listdir(path)
            file_path_list = [path + i for i in file_name_list]
            self.total_file_path.extend(file_path_list)
    
    def __getitem__(self, index):
        file_path = self.total_file_path[index]
        label_list = file_path.split('/')[-2]
        label = 0 if label_list == 'neg' else 1
        tokens = tokenlize(open(file_path, errors='ignore').read())
        return tokens, label
    
    def __len__(self):
        return len(self.total_file_path)

In [4]:
def collate_fn(batch):
    content, label = list(zip(*batch))
    content = [ws.transform(i, sentence_len = 128) for i in content]
    content = torch.LongTensor(content)
    label = torch.LongTensor(label)
    return content, label

Get the dataloader of training set and test set

In [5]:
imdb_data = imdb_dataset()
data_loader = DataLoader(imdb_data, batch_size=128, shuffle=True, collate_fn=collate_fn)
test_data = imdb_dataset(train=False)
test_loader = DataLoader(test_data, batch_size=128, shuffle=True, collate_fn=collate_fn)

This class is to build up a dictionary of data and transform words to numbers

In [11]:
class word2sequence():
    unknown_tag = 'UK'   #set a dictionary key for unknown words
    padding_tag = 'PAD'  #set a dictionary key for padding
    uk = 0               #initial unknown word number sequence as 0
    pad = 1              #initial padding sequence as 1
    
    def __init__(self):
        self.dict = {
            self.unknown_tag : self.uk,     #add the unknown key into dictionary
            self.padding_tag : self.pad     #add the padding key into dictionary
        }
        self.count = {}                     #initial a set for count the frequence of words
    
    def fit(self, sentence):
        for word in sentence:
            self.count[word] = self.count.get(word,0) + 1    #counting the words requence
            
    def get_count():
        print(self.count)

    #build_vocab function is to build up a dictionary of words
    #min_word_times, max_word_times are the condition whether delete some less and very frequence words
    #max_dic_len is the maximun words we gonna use in the dictionary
    def build_vocab(self, min_word_times=None, max_word_times=None, max_dic_len=None):
        if min_word_times is not None:
            self.count = {word : value for word, value in self.count.items() if value > min_word_times}
        if max_word_times is not None:
            self.count = {word : value for word, value in self.count.items() if value < max_word_times}
        if max_dic_len is not None: #ranking the words by its frequence
            temp = sorted(self.count.items(),key = lambda x:x[-1], reverse=True)[:max_word_times]
            self.count = temp
        for word in self.count:  #complete the dictionary, each words' sequence depend on its frequence
            self.dict[word] = len(self.dict)
    
    #transform function is tor tranform a sequence of words into a sequence of numbers
    #I invoked this function and set the sentence_len=128
    def transform(self, sentence, sentence_len=None):
        if sentence_len is not None:   
            if sentence_len > len(sentence):   #If the number of words is less than 128, its would add some padding at the end
                sentence = sentence + [self.padding_tag] * (sentence_len - len(sentence))
            if sentence_len < len(sentence):   #If the number of words is more than 128, its would delete the words at the end
                sentence = sentence[:sentence_len]
        return [self.dict.get(word, self.uk) for word in sentence]
    
    
    def __len__(self):
        return len(self.dict)

RNN Model class

In [7]:
class RNN_Model(nn.Module):
    def __init__(self, max_sentence_len, wv_dim, hidden_size):
        super(RNN_Model, self).__init__()
        self.wv_dim = wv_dim               #wv_dim is a hyperparameter to define the dimonsion each word would embadding
        self.hidden_size = hidden_size     #hidden_Size is the hidden parameter size
        self.num_layers = 2                #the layer numer of RNN
        self.dropout = 0.5                 #dropout is a hyperparameter for RNN layer
        self.embadding = nn.Embedding(len(ws), self.wv_dim)   #define a embadding layer by pytorch.nn lib
        self.rnn = nn.RNN(input_size=self.wv_dim,             #define a RNN layer
                          hidden_size=self.hidden_size,
                          num_layers=self.num_layers, 
                          dropout=self.dropout, 
                          batch_first=True,
                          bidirectional=True)
        self.fc = nn.Linear(self.hidden_size*2, 2)            #difine a linear layer,input dimension is hidden size * 2 because I use bidirectional RNN
        
    def forward(self, x):
        x = self.embadding(x)
        x, h = self.rnn(x)
        output1 = h[-1,:,:]
        output2 = h[-2,:,:]
        output = torch.cat((output1, output2), dim=-1)        #combine two output as linear layer input
        x = self.fc(output)
        return f.log_softmax(x,dim=1)

LSTM Model class

In [8]:
class LSTM_Model(nn.Module):
    def __init__(self, max_sentence_len, wv_dim, hidden_size):
        super(LSTM_Model, self).__init__()
        self.wv_dim = wv_dim
        self.hidden_size = hidden_size
        self.num_layers = 2
        self.dropout = 0.5
        self.embadding = nn.Embedding(len(ws), self.wv_dim)
        self.lstm = nn.LSTM(input_size=self.wv_dim, 
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers, 
                            dropout=self.dropout, 
                            batch_first=True, 
                            bidirectional=True)
        self.fc = nn.Linear(self.hidden_size*2, 2)
        
    def forward(self, x):
        x = self.embadding(x)
        x, (h, c) = self.lstm(x)
        output1 = h[-1,:,:]
        output2 = h[-2,:,:]
        output = torch.cat((output1, output2), dim=-1)
        x = self.fc(output)
        return f.log_softmax(x,dim=1)

Initial a word2sequence object(It take a long time every time I fit the data, so I save it as a pkl file and load it when I use it.The annotating code is how I initial this object and fit it)

In [12]:
if __name__ == '__main__':
    ws = word2sequence()
    data_path = '../../dataset/aclImdb_v1/aclImdb/train/'
    temp_data_path = [data_path+'pos/', data_path+'neg/']
    total_file_path = []
    for path in temp_data_path:
        file_name_list = os.listdir(path)
        file_path_list = [path + i for i in file_name_list]
        for file_path in file_path_list:
            sentence = tokenlize(open(file_path,errors='ignore').read())
            ws.fit(sentence)
    ws.build_vocab(min_word_times=10, max_word_times=10000)
    pickle.dump(ws, open('ws.pkl','wb'))
    print(len(ws))
#     ws = pickle.load(open('../../Pytorch TEST/ws.pkl', 'rb'))
#     hidden_size_list = [20,50,100,200,500]

19957


define a train function to training the model

In [24]:
def train(epoch, model):       #this function need to pass the model
    training_loss = []
    accuracy = []
    model = model
    optimizer = torch.optim.Adam(model.parameters(), 0.001)     #initial the optimizer  
    for i in range(epoch):
        for idx,(features, labels) in enumerate(data_loader):   #load the data
            optimizer.zero_grad()    
            label_pre = model(features)                         #prediction the data
            loss = f.nll_loss(label_pre, labels)                #get the loss
            loss.backward()                                     #backward propagation
            optimizer.step()                                    #update parameters
            if idx % 50 == 0:                                   #every 50 step, print the training loss
                training_loss.append(loss.item())
                print('Epoch:%2d    Step:%3d    Loss:%.5f' % (i+1, idx+1, loss.item()))
        accuracy.append(test(model))                            #every epoch run a test function and record the accuracy
    return training_loss, accuracy

define a test function to test the model

In [25]:
def test(model):
    loss_list = []
    acc_list = []
    model = model
    for idx, (features, labels) in enumerate(test_loader):
        with torch.no_grad():
            y_pre = model(features)
            loss = f.nll_loss(y_pre, labels)
            loss_list.append(loss.item())
            pred = y_pre.max(dim=-1)[-1]
            acc = pred.eq(labels).float().mean()
            acc_list.append(acc.item())
    print('Average_loss:%.5f   Accuracy:%.5f'%(np.mean(loss_list), np.mean(acc_list))) 
    return np.mean(acc_list)

In [27]:
for hidden_size in hidden_size_list:                                                #train the RNN Model with different hidden size
    rnn_model = RNN_Model(max_sentence_len=128, wv_dim=64, hidden_size=hidden_size) #initial the RNN model
    training_loss, accuracy = train(10, rnn_model)                                  #training the model

Epoch: 1    Step:  1    Loss:0.72763
Epoch: 1    Step: 51    Loss:0.70999
Epoch: 1    Step:101    Loss:0.69719
Epoch: 1    Step:151    Loss:0.69308
Average_loss:0.69226   Accuracy:0.52714
Epoch: 2    Step:  1    Loss:0.69554
Epoch: 2    Step: 51    Loss:0.69771
Epoch: 2    Step:101    Loss:0.68866
Epoch: 2    Step:151    Loss:0.71870
Average_loss:0.68487   Accuracy:0.55658
Epoch: 3    Step:  1    Loss:0.66870
Epoch: 3    Step: 51    Loss:0.69839
Epoch: 3    Step:101    Loss:0.66749
Epoch: 3    Step:151    Loss:0.66684
Average_loss:0.65001   Accuracy:0.62376
Epoch: 4    Step:  1    Loss:0.60061
Epoch: 4    Step: 51    Loss:0.58995
Epoch: 4    Step:101    Loss:0.61256
Epoch: 4    Step:151    Loss:0.62906
Average_loss:0.62650   Accuracy:0.65386
Epoch: 5    Step:  1    Loss:0.62077
Epoch: 5    Step: 51    Loss:0.56112
Epoch: 5    Step:101    Loss:0.55891
Epoch: 5    Step:151    Loss:0.60401
Average_loss:0.59340   Accuracy:0.68927
Epoch: 6    Step:  1    Loss:0.52740
Epoch: 6    Step: 51   

Epoch: 4    Step:151    Loss:0.71366
Average_loss:0.73748   Accuracy:0.49890
Epoch: 5    Step:  1    Loss:0.69907
Epoch: 5    Step: 51    Loss:0.68664
Epoch: 5    Step:101    Loss:0.68376
Epoch: 5    Step:151    Loss:0.66413
Average_loss:0.69426   Accuracy:0.54134
Epoch: 6    Step:  1    Loss:0.69483
Epoch: 6    Step: 51    Loss:0.67158
Epoch: 6    Step:101    Loss:0.65860
Epoch: 6    Step:151    Loss:0.77928
Average_loss:0.71696   Accuracy:0.52782
Epoch: 7    Step:  1    Loss:0.67236
Epoch: 7    Step: 51    Loss:0.66996
Epoch: 7    Step:101    Loss:0.73036
Epoch: 7    Step:151    Loss:0.67668
Average_loss:0.70153   Accuracy:0.54045
Epoch: 8    Step:  1    Loss:0.67262
Epoch: 8    Step: 51    Loss:0.69555
Epoch: 8    Step:101    Loss:0.76229
Epoch: 8    Step:151    Loss:0.66108
Average_loss:0.70242   Accuracy:0.54379
Epoch: 9    Step:  1    Loss:0.66471
Epoch: 9    Step: 51    Loss:0.60784
Epoch: 9    Step:101    Loss:0.70460
Epoch: 9    Step:151    Loss:0.69242
Average_loss:0.72086   

In [29]:
for hidden_size in hidden_size_list:                                                   #train the LSTM model with different hidden size
    lstm_model = LSTM_Model(max_sentence_len=128, wv_dim=64, hidden_size=hidden_size)  #intial the LSTM Model
    training_loss2, accuracy2 = train(10, lstm_model)                                  #training the model

Epoch: 1    Step:  1    Loss:0.68859
Epoch: 1    Step: 51    Loss:0.69103
Epoch: 1    Step:101    Loss:0.68680
Epoch: 1    Step:151    Loss:0.67059
Average_loss:0.69324   Accuracy:0.59749
Epoch: 2    Step:  1    Loss:0.69469
Epoch: 2    Step: 51    Loss:0.64526
Epoch: 2    Step:101    Loss:0.64956
Epoch: 2    Step:151    Loss:0.51612
Average_loss:0.56728   Accuracy:0.71349
Epoch: 3    Step:  1    Loss:0.56267
Epoch: 3    Step: 51    Loss:0.56462
Epoch: 3    Step:101    Loss:0.47008
Epoch: 3    Step:151    Loss:0.49458
Average_loss:0.51612   Accuracy:0.74405
Epoch: 4    Step:  1    Loss:0.46314
Epoch: 4    Step: 51    Loss:0.44098
Epoch: 4    Step:101    Loss:0.42901
Epoch: 4    Step:151    Loss:0.39966
Average_loss:0.47173   Accuracy:0.78273
Epoch: 5    Step:  1    Loss:0.38456
Epoch: 5    Step: 51    Loss:0.39024
Epoch: 5    Step:101    Loss:0.34309
Epoch: 5    Step:151    Loss:0.31015
Average_loss:0.44700   Accuracy:0.79963
Epoch: 6    Step:  1    Loss:0.28338
Epoch: 6    Step: 51   

Epoch: 4    Step:151    Loss:0.53905
Average_loss:0.48081   Accuracy:0.77963
Epoch: 5    Step:  1    Loss:0.45143
Epoch: 5    Step: 51    Loss:0.38667
Epoch: 5    Step:101    Loss:0.43056
Epoch: 5    Step:151    Loss:0.45227
Average_loss:0.45828   Accuracy:0.77951
Epoch: 6    Step:  1    Loss:0.36694
Epoch: 6    Step: 51    Loss:0.29234
Epoch: 6    Step:101    Loss:0.29500
Epoch: 6    Step:151    Loss:0.40045
Average_loss:0.41725   Accuracy:0.81314
Epoch: 7    Step:  1    Loss:0.35777
Epoch: 7    Step: 51    Loss:0.32385
Epoch: 7    Step:101    Loss:0.24643
Epoch: 7    Step:151    Loss:0.31021
Average_loss:0.42391   Accuracy:0.81973
Epoch: 8    Step:  1    Loss:0.27387
Epoch: 8    Step: 51    Loss:0.31804
Epoch: 8    Step:101    Loss:0.25637
Epoch: 8    Step:151    Loss:0.26698
Average_loss:0.44923   Accuracy:0.81761
Epoch: 9    Step:  1    Loss:0.24818
Epoch: 9    Step: 51    Loss:0.21091
Epoch: 9    Step:101    Loss:0.18121
Epoch: 9    Step:151    Loss:0.22933
Average_loss:0.46527   