In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
os.chdir("/content/gdrive/My Drive/Colab Notebooks/data")

# LSTM for language modeling

In [3]:
import re
import os
import random
import nltk
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
device  

'cuda'

In [4]:
def set_seed(seed = 27):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)

In [5]:
set_seed()

## Pride and Prejudice data

In [6]:
text_all = []
with open('prideAndPrejudice.txt') as f:
    for line in f:
        text_all.append(line.strip())

In [7]:
text_all[:5]

['It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.',
 'However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.',
 '"My dear Mr. Bennet," said his lady to him one day, "have you heard that Netherfield Park is let at last?"',
 'Mr. Bennet replied that he had not.',
 '"But it is," returned she; "for Mrs. Long has just been here, and she told me all about it."']

In [8]:
def tokenize(text_all):
    text = ' '.join(text_all)
    sentences = nltk.sent_tokenize(text)
    tokens = []
    for sent in sentences:
        token = ['<s>'] + nltk.word_tokenize(sent) + ['</s>']
        tokens.append(token)
    return tokens

In [9]:
nltk.download('punkt')
text_tokens = tokenize(text_all)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
def get_dict(text_tokens,appear_time=1):
    vocab = Counter(sum(text_tokens, []))
    # Removing the words that only appear once
    vocab = {k:v for k,v in vocab.items() if v>appear_time}
    # Sorting the words according to the number of appearances, with the most common word being first
    vocab = sorted(vocab, key=vocab.get, reverse=True)
    # Adding padding and unknown to our vocabulary so that they will be assigned an index
    vocab = ['<pad>','<unk>'] + vocab
    print("vocab number: {}".format(len(vocab)))
    # Dictionaries to store the word to index mappings and vice versa
    word2idx = {o:i for i,o in enumerate(vocab)}
    idx2word = {i:o for i,o in enumerate(vocab)}

    return word2idx, idx2word

In [11]:
word2idx, idx2word = get_dict(text_tokens)

vocab number: 4152


In [12]:
# create sequences of length 5 tokens
def create_seq(tokens, seq_len = 5):
    
    sequences = []

    if len(tokens) > seq_len:
        for i in range(0, len(tokens)-seq_len):
            # select sequence of tokens
            seq = tokens[i:i+seq_len]
            # add to the list
            sequences.append(" ".join(seq))

        return sequences

    else:
        seq = tokens[:]
        # pad sequence to 5
        for i in range(len(tokens), seq_len):
            seq.append('<pad>') 
        return [" ".join(seq)]

In [13]:
def get_integer_seq(seq):
    return [word2idx[w] if w in word2idx.keys() else word2idx['<unk>'] for w in seq.split()]

In [14]:
class SeqDataset(Dataset):

    def __init__(self, text_tokens, seq_len=5):
        seqs = [create_seq(tokens, seq_len) for tokens in text_tokens]
        seqs = sum(seqs, [])

        # create inputs and targets (x and y)
        x, y = [], []

        for s in seqs:
            x.append(" ".join(s.split()[:-1]))
            y.append(" ".join(s.split()[1:]))

        # convert text sequences to integer sequences
        x = [get_integer_seq(i) for i in x]
        y = [get_integer_seq(i) for i in y]

        # convert lists to numpy arrays
        x = np.array(x)
        y = np.array(y)

        self.x = x
        self.y = y
    
    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


In [15]:
class WordLSTM(nn.Module):
    
    def __init__(self, vocab_size=len(word2idx), embed_dim=200, n_hidden=256, n_layers=4, drop_prob=0.3):
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        
        self.emb_layer = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''

        embedded = self.emb_layer(x)
        if hidden != None:
            lstm_output, hidden = self.lstm(embedded, hidden)
        else:
            lstm_output, hidden = self.lstm(embedded)
          
        out = self.dropout(lstm_output)
        out = out.reshape(-1, self.n_hidden) 
        out = self.fc(out)
        
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return hidden

In [16]:
def train(num_epochs=10, dataloader=None, clip=1, interval=300):
    
    model.train()
    min_loss = 99
    for epoch in range(num_epochs):
        all_loss = []
        hidden = model.init_hidden(batch_size)
        for i, (x, y) in enumerate(dataloader):
            inputs, targets = x.long().to(device), y.long().to(device)
            # hidden = tuple([i.data for i in hidden])

            output, hidden = model(inputs, None)
            loss = loss_fn(output, targets.view(-1))

            model.zero_grad()
            loss.backward()
            all_loss.append(loss.item())
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            
            if i % interval == 0:
                print("Epoch: {}/{}, Step: {}/{}, Train Loss: {}".format(epoch+1, num_epochs, i, len(dataloader), loss))
        epoch_loss = np.mean(all_loss)
        print("Epoch: {}/{}, Train Loss: {}".format(epoch+1, num_epochs, epoch_loss))
        
        if epoch_loss < min_loss:
            torch.save(model.state_dict(), 'model_best.pt')
            print('Model saved successfully')
            min_loss = epoch_loss
        print()

## Sequence length: 5

In [17]:
batch_size = 32
train_set = SeqDataset(text_tokens, seq_len=5)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

In [18]:
model = WordLSTM(vocab_size=len(word2idx), embed_dim=200)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)
loss_fn = loss_fn.to(device)
model

WordLSTM(
  (emb_layer): Embedding(4152, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=4152, bias=True)
)

In [19]:
train(dataloader=train_loader, num_epochs=5, clip=1, interval=800)

Epoch: 1/5, Step: 0/3925, Train Loss: 8.338584899902344
Epoch: 1/5, Step: 800/3925, Train Loss: 6.1500468254089355
Epoch: 1/5, Step: 1600/3925, Train Loss: 6.0997419357299805
Epoch: 1/5, Step: 2400/3925, Train Loss: 6.155903339385986
Epoch: 1/5, Step: 3200/3925, Train Loss: 5.876415252685547
Epoch: 1/5, Train Loss: 6.08282847252621
Model saved successfully

Epoch: 2/5, Step: 0/3925, Train Loss: 5.812653541564941
Epoch: 2/5, Step: 800/3925, Train Loss: 5.874948024749756
Epoch: 2/5, Step: 1600/3925, Train Loss: 5.4947829246521
Epoch: 2/5, Step: 2400/3925, Train Loss: 5.484261512756348
Epoch: 2/5, Step: 3200/3925, Train Loss: 5.39902925491333
Epoch: 2/5, Train Loss: 5.594286084266225
Model saved successfully

Epoch: 3/5, Step: 0/3925, Train Loss: 5.508958339691162
Epoch: 3/5, Step: 800/3925, Train Loss: 5.37017297744751
Epoch: 3/5, Step: 1600/3925, Train Loss: 5.195843696594238
Epoch: 3/5, Step: 2400/3925, Train Loss: 5.228837013244629
Epoch: 3/5, Step: 3200/3925, Train Loss: 5.0636191368

In [20]:
def top_k_logits(logits, k):
    value, idx = torch.topk(logits, k)
    logits[logits < value[:, [-1]]] = -float('Inf')
    return logits

In [21]:
@torch.no_grad()
# predict next token
def predict(word, hidden, top_k, sample):
         
    # tensor inputs
    word = word if word in word2idx else '<unk>'
    x = np.array([[word2idx[word]]])
    inputs = torch.tensor(x).to(device)
    hidden = tuple([i.data for i in hidden])

    # get the output of the model
    logits, hidden = model(inputs, hidden)

    # get the token probabilities
    if top_k is not None:
        logits = top_k_logits(logits, top_k)
    # apply softmax to convert to probabilities
    probs = F.softmax(logits, dim=-1)
    # sample from the distribution or take the most likely
    if sample:
        idx = torch.multinomial(probs, num_samples=1)
    else:
        value, idx = torch.topk(probs, k=1, dim=-1)
    # return the encoded value of the predicted word and the hidden state
    return idx2word[idx.item()], hidden

In [22]:
# function to generate text
def sample(max_step=100, context='<s>', top_k=None, sample=True):
        
    # push to GPU
    model.to(device)
    model.eval()
    hidden = model.init_hidden(1)
    tokens = context.split()

    # predict subsequent tokens
    for i in range(max_step):
        token, hidden = predict(tokens[-1], hidden, top_k, sample)
        tokens.append(token)
        if token == '<\s>':
            return ' '.join(tokens)

    return ' '.join(tokens)

In [23]:
generate_sent = [] 
for i in range(10):
    sent = sample(max_step=30,context='<s>',top_k=20,sample=True)
    generate_sent.append(sent)
with open('generate_text.txt','w') as f:
    for sent in generate_sent:
        f.write("{}\n".format(sent))

In [24]:
def get_prob(word,hidden,target):
    word = word if word in word2idx else '<unk>'
    target = target if target in word2idx else '<unk>'
    x = np.array([[word2idx[word]]])
    y = np.array([[word2idx[target]]])
    inputs = torch.tensor(x).to(device)
    hidden = tuple([i.data for i in hidden])

    # get the output of the model
    logits, hidden = model(inputs, hidden)
    probs = F.softmax(logits, dim=-1)
    prob = probs[0,y].item()
    return prob, hidden

In [25]:
@torch.no_grad()
def compute_perplex(model, test_tokens):
    all_perplex = []
    model.to(device)
    model.eval()
    for tokens in test_tokens:
        sent_perplex = 0
        hidden = model.init_hidden(1)
        for i in range(len(tokens)-1):
            prob, hidden = get_prob(tokens[i],hidden,tokens[i+1])
            sent_perplex += -np.log(prob)
        all_perplex.append(sent_perplex/len(tokens))
    print("Test perplexity: {}".format(np.exp(np.mean(all_perplex))))

In [26]:
test_tokens = []
with open('test_1.txt') as f:
    for line in f:
        test_tokens.append(line.strip().split())

In [27]:
compute_perplex(model, test_tokens)

Test perplexity: 134.93943483325958


## Sequence length: 25

In [28]:
train_set_2 = SeqDataset(text_tokens, seq_len=25)
train_loader_2 = DataLoader(train_set_2, batch_size=batch_size, shuffle=True)

In [29]:
model = WordLSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)
loss_fn = loss_fn.to(device)
model

WordLSTM(
  (emb_layer): Embedding(4152, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=4152, bias=True)
)

In [30]:
train(dataloader=train_loader_2, num_epochs=5, clip=1, interval=800)

Epoch: 1/5, Step: 0/1467, Train Loss: 8.327515602111816
Epoch: 1/5, Step: 800/1467, Train Loss: 6.036366939544678
Epoch: 1/5, Train Loss: 6.02771867857389
Model saved successfully

Epoch: 2/5, Step: 0/1467, Train Loss: 5.889590740203857
Epoch: 2/5, Step: 800/1467, Train Loss: 5.848283767700195
Epoch: 2/5, Train Loss: 5.862490297420218
Model saved successfully

Epoch: 3/5, Step: 0/1467, Train Loss: 5.7718892097473145
Epoch: 3/5, Step: 800/1467, Train Loss: 5.745514392852783
Epoch: 3/5, Train Loss: 5.748726030556458
Model saved successfully

Epoch: 4/5, Step: 0/1467, Train Loss: 5.903743743896484
Epoch: 4/5, Step: 800/1467, Train Loss: 5.521963119506836
Epoch: 4/5, Train Loss: 5.47387253742439
Model saved successfully

Epoch: 5/5, Step: 0/1467, Train Loss: 5.176084518432617
Epoch: 5/5, Step: 800/1467, Train Loss: 5.152724742889404
Epoch: 5/5, Train Loss: 5.066474747479003
Model saved successfully



In [31]:
generate_sent = [] 
for i in range(10):
    sent = sample(max_step=30,context='<s>',top_k=20,sample=True)
    generate_sent.append(sent)
with open('generate_text_2.txt','w') as f:
    for sent in generate_sent:
        f.write("{}\n".format(sent))

In [32]:
compute_perplex(model, test_tokens)

Test perplexity: 144.19671284051736


## better model on test_2

In [33]:
test_tokens_2 = []
with open('test_2.txt') as f:
    for line in f:
        test_tokens_2.append(line.strip().split())

In [34]:
model.load_state_dict(torch.load('model_best.pt', map_location=device))

<All keys matched successfully>

In [35]:
compute_perplex(model, test_tokens_2)

Test perplexity: 228.35428312845553


## Glove vector

In [36]:
import torchtext

In [37]:
embed_dim = 100
Glove = torchtext.vocab.GloVe(name='6B', dim=embed_dim, cache='./')

In [38]:
def get_pretrained_embed(word2idx):
    pretrained_embed = torch.zeros((len(word2idx), embed_dim))

    for i, word in enumerate(word2idx.keys()):
        if word in Glove.stoi:
            pretrained_embed[i] = Glove[word]
        else:
            pretrained_embed[i] = np.random.normal(embed_dim)
    print("pretrained embedding size: {}".format(pretrained_embed.shape))
    return pretrained_embed

In [39]:
pretrained_embed = get_pretrained_embed(word2idx)

pretrained embedding size: torch.Size([4152, 100])


In [40]:
model = WordLSTM(vocab_size=len(word2idx), embed_dim=100)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)
loss_fn = loss_fn.to(device)
model

WordLSTM(
  (emb_layer): Embedding(4152, 100)
  (lstm): LSTM(100, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=4152, bias=True)
)

In [41]:
# load pretrain_embed
model.emb_layer.weight.data.copy_(pretrained_embed)
model.emb_layer.weight.requires_grad = True

In [42]:
train(dataloader=train_loader, num_epochs=5, clip=1, interval=800)

Epoch: 1/5, Step: 0/3925, Train Loss: 8.325315475463867
Epoch: 1/5, Step: 800/3925, Train Loss: 6.068885326385498
Epoch: 1/5, Step: 1600/3925, Train Loss: 5.694124698638916
Epoch: 1/5, Step: 2400/3925, Train Loss: 6.004619121551514
Epoch: 1/5, Step: 3200/3925, Train Loss: 6.01119327545166
Epoch: 1/5, Train Loss: 6.095573914521819
Model saved successfully

Epoch: 2/5, Step: 0/3925, Train Loss: 6.211051940917969
Epoch: 2/5, Step: 800/3925, Train Loss: 5.377532958984375
Epoch: 2/5, Step: 1600/3925, Train Loss: 5.4888153076171875
Epoch: 2/5, Step: 2400/3925, Train Loss: 5.583943843841553
Epoch: 2/5, Step: 3200/3925, Train Loss: 5.578301429748535
Epoch: 2/5, Train Loss: 5.670285832714883
Model saved successfully

Epoch: 3/5, Step: 0/3925, Train Loss: 4.9225053787231445
Epoch: 3/5, Step: 800/3925, Train Loss: 5.4769415855407715
Epoch: 3/5, Step: 1600/3925, Train Loss: 5.49036979675293
Epoch: 3/5, Step: 2400/3925, Train Loss: 5.086743354797363
Epoch: 3/5, Step: 3200/3925, Train Loss: 5.284717

In [43]:
generate_sent = [] 
for i in range(10):
    sent = sample(max_step=30,context='<s>',top_k=20,sample=True)
    generate_sent.append(sent)
with open('generate_text_3.txt','w') as f:
    for sent in generate_sent:
        f.write("{}\n".format(sent))

In [44]:
compute_perplex(model, test_tokens)

Test perplexity: 128.33346522969657


## Tweet data

In [45]:
tweet_text = []
with open('tweet.txt') as f:
    for line in f:
        tweet_text.append(line.strip())

In [46]:
text_tokens = [text.split() for text in tweet_text]

In [47]:
word2idx, idx2word = get_dict(text_tokens)

vocab number: 3997


## Sequence length: 5

In [48]:
batch_size = 32
train_set_3 = SeqDataset(text_tokens, seq_len=5)
train_loader_3 = DataLoader(train_set_3, batch_size=batch_size, shuffle=True)

In [49]:
model = WordLSTM(vocab_size=len(word2idx), embed_dim=200)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)
loss_fn = loss_fn.to(device)

In [50]:
train(dataloader=train_loader_3, num_epochs=5, clip=1, interval=800)

Epoch: 1/5, Step: 0/2465, Train Loss: 8.283422470092773
Epoch: 1/5, Step: 800/2465, Train Loss: 6.224873065948486
Epoch: 1/5, Step: 1600/2465, Train Loss: 6.236417770385742
Epoch: 1/5, Step: 2400/2465, Train Loss: 5.787410736083984
Epoch: 1/5, Train Loss: 6.137932942218278
Model saved successfully

Epoch: 2/5, Step: 0/2465, Train Loss: 6.110302925109863
Epoch: 2/5, Step: 800/2465, Train Loss: 6.350065231323242
Epoch: 2/5, Step: 1600/2465, Train Loss: 6.189631462097168
Epoch: 2/5, Step: 2400/2465, Train Loss: 5.770730495452881
Epoch: 2/5, Train Loss: 5.921329023001402
Model saved successfully

Epoch: 3/5, Step: 0/2465, Train Loss: 5.921454906463623
Epoch: 3/5, Step: 800/2465, Train Loss: 5.723166465759277
Epoch: 3/5, Step: 1600/2465, Train Loss: 5.641420841217041
Epoch: 3/5, Step: 2400/2465, Train Loss: 5.6212358474731445
Epoch: 3/5, Train Loss: 5.695617106284863
Model saved successfully

Epoch: 4/5, Step: 0/2465, Train Loss: 5.821914196014404
Epoch: 4/5, Step: 800/2465, Train Loss: 5.6

In [51]:
generate_sent = [] 
for i in range(10):
    sent = sample(max_step=30,context='<s>',top_k=20,sample=True)
    generate_sent.append(sent)
with open('generate_text_4.txt','w') as f:
    for sent in generate_sent:
        f.write("{}\n".format(sent))

In [52]:
compute_perplex(model, test_tokens_2)

Test perplexity: 114.09393206329511


## Sequence length: 15

In [53]:
batch_size = 32
train_set_4 = SeqDataset(text_tokens, seq_len=15)
train_loader_4 = DataLoader(train_set_4, batch_size=batch_size, shuffle=True)

In [54]:
model = WordLSTM(vocab_size=len(word2idx), embed_dim=200)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)
loss_fn = loss_fn.to(device)

In [55]:
train(dataloader=train_loader_4, num_epochs=5, clip=1, interval=800)

Epoch: 1/5, Step: 0/1061, Train Loss: 8.298203468322754
Epoch: 1/5, Step: 800/1061, Train Loss: 5.970295429229736
Epoch: 1/5, Train Loss: 6.135001772198331
Model saved successfully

Epoch: 2/5, Step: 0/1061, Train Loss: 5.942286014556885
Epoch: 2/5, Step: 800/1061, Train Loss: 5.8520073890686035
Epoch: 2/5, Train Loss: 5.903343921557111
Model saved successfully

Epoch: 3/5, Step: 0/1061, Train Loss: 5.9632062911987305
Epoch: 3/5, Step: 800/1061, Train Loss: 5.996168613433838
Epoch: 3/5, Train Loss: 5.84365752010723
Model saved successfully

Epoch: 4/5, Step: 0/1061, Train Loss: 5.6815385818481445
Epoch: 4/5, Step: 800/1061, Train Loss: 5.9780755043029785
Epoch: 4/5, Train Loss: 5.793503226478857
Model saved successfully

Epoch: 5/5, Step: 0/1061, Train Loss: 5.68289041519165
Epoch: 5/5, Step: 800/1061, Train Loss: 5.526974678039551
Epoch: 5/5, Train Loss: 5.701874151868038
Model saved successfully



In [56]:
generate_sent = [] 
for i in range(10):
    sent = sample(max_step=30,context='<s>',top_k=20,sample=True)
    generate_sent.append(sent)
with open('generate_text_5.txt','w') as f:
    for sent in generate_sent:
        f.write("{}\n".format(sent))

In [57]:
compute_perplex(model, test_tokens_2)

Test perplexity: 127.75258888220837


## better model on test_1

In [58]:
model.load_state_dict(torch.load('model_best.pt', map_location=device))

<All keys matched successfully>

In [59]:
compute_perplex(model, test_tokens)

Test perplexity: 145.24113031324816


## Glove vectors

In [60]:
pretrained_embed = get_pretrained_embed(word2idx)

pretrained embedding size: torch.Size([3997, 100])


In [61]:
model = WordLSTM(vocab_size=len(word2idx), embed_dim=100)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)
loss_fn = loss_fn.to(device)

In [62]:
# load pretrain_embed
model.emb_layer.weight.data.copy_(pretrained_embed)
model.emb_layer.weight.requires_grad = True

In [63]:
train(dataloader=train_loader_3, num_epochs=5, clip=1, interval=800)

Epoch: 1/5, Step: 0/2465, Train Loss: 8.285900115966797
Epoch: 1/5, Step: 800/2465, Train Loss: 5.965482234954834
Epoch: 1/5, Step: 1600/2465, Train Loss: 6.151297569274902
Epoch: 1/5, Step: 2400/2465, Train Loss: 5.83143949508667
Epoch: 1/5, Train Loss: 6.108056863664856
Model saved successfully

Epoch: 2/5, Step: 0/2465, Train Loss: 5.952112674713135
Epoch: 2/5, Step: 800/2465, Train Loss: 5.813508987426758
Epoch: 2/5, Step: 1600/2465, Train Loss: 6.064155578613281
Epoch: 2/5, Step: 2400/2465, Train Loss: 5.619847774505615
Epoch: 2/5, Train Loss: 5.928172902693372
Model saved successfully

Epoch: 3/5, Step: 0/2465, Train Loss: 5.819441318511963
Epoch: 3/5, Step: 800/2465, Train Loss: 5.717911720275879
Epoch: 3/5, Step: 1600/2465, Train Loss: 5.784095764160156
Epoch: 3/5, Step: 2400/2465, Train Loss: 5.605437278747559
Epoch: 3/5, Train Loss: 5.720361549617311
Model saved successfully

Epoch: 4/5, Step: 0/2465, Train Loss: 5.382509708404541
Epoch: 4/5, Step: 800/2465, Train Loss: 5.330

In [64]:
generate_sent = [] 
for i in range(10):
    sent = sample(max_step=30,context='<s>',top_k=20,sample=True)
    generate_sent.append(sent)
with open('generate_text_6.txt','w') as f:
    for sent in generate_sent:
        f.write("{}\n".format(sent))

In [65]:
compute_perplex(model, test_tokens_2)

Test perplexity: 118.29567983443539


## GloveTwitter vectors

In [66]:
embed_dim = 100
Glove = torchtext.vocab.GloVe(name='twitter.27B', dim=embed_dim, cache='./')

In [67]:
model = WordLSTM(vocab_size=len(word2idx), embed_dim=100)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)
loss_fn = loss_fn.to(device)

In [68]:
train(dataloader=train_loader_3, num_epochs=5, clip=1, interval=800)

Epoch: 1/5, Step: 0/2465, Train Loss: 8.287731170654297
Epoch: 1/5, Step: 800/2465, Train Loss: 5.763376712799072
Epoch: 1/5, Step: 1600/2465, Train Loss: 5.944653511047363
Epoch: 1/5, Step: 2400/2465, Train Loss: 6.010117053985596
Epoch: 1/5, Train Loss: 6.140636523210011
Model saved successfully

Epoch: 2/5, Step: 0/2465, Train Loss: 5.856949329376221
Epoch: 2/5, Step: 800/2465, Train Loss: 5.867694854736328
Epoch: 2/5, Step: 1600/2465, Train Loss: 5.806730270385742
Epoch: 2/5, Step: 2400/2465, Train Loss: 5.4784369468688965
Epoch: 2/5, Train Loss: 5.923018555515436
Model saved successfully

Epoch: 3/5, Step: 0/2465, Train Loss: 5.412802696228027
Epoch: 3/5, Step: 800/2465, Train Loss: 5.90138053894043
Epoch: 3/5, Step: 1600/2465, Train Loss: 5.712285041809082
Epoch: 3/5, Step: 2400/2465, Train Loss: 5.912449359893799
Epoch: 3/5, Train Loss: 5.734100012556777
Model saved successfully

Epoch: 4/5, Step: 0/2465, Train Loss: 5.593371868133545
Epoch: 4/5, Step: 800/2465, Train Loss: 5.67

In [69]:
generate_sent = [] 
for i in range(10):
    sent = sample(max_step=30,context='<s>',top_k=20,sample=True)
    generate_sent.append(sent)
with open('generate_text_7.txt','w') as f:
    for sent in generate_sent:
        f.write("{}\n".format(sent))

In [70]:
compute_perplex(model, test_tokens_2)

Test perplexity: 121.52823262088273


# LSTM for classification

In [124]:
import csv

In [125]:
train_sentences = []
train_labels = []
with open('sentiment-train.csv') as f:
    reader = csv.reader(f)
    next(reader)
    for line in reader:
        label, text = line
        train_sentences.append(text)
        train_labels.append(float(label))

In [126]:
test_sentences = []
test_labels = []
with open('sentiment-test.csv') as f:
    reader = csv.reader(f)
    next(reader)
    for line in reader:
        label, text = line
        test_sentences.append(text)
        test_labels.append(float(label))

In [127]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for i, review in enumerate(sentences):
        if len(review) > 0:
            features[i, :len(review)] = np.array(review)[:seq_len]
    return features

In [128]:
def process_data(train_sentences, train_labels, test_sentences, test_labels, padding_length=30):
    for i in range(len(train_sentences)):
        train_sentences[i] = re.sub('\d','0',train_sentences[i])
    for i in range(len(test_sentences)):
        test_sentences[i] = re.sub('\d','0',test_sentences[i])

    # Modify URLs to <url>
    for i in range(len(train_sentences)):
        if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
            train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i]) 
    for i in range(len(test_sentences)):
        if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
            test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

    vocab = Counter() #Dictionary that will map a word to the number of times it appeared in all the training sentences
    train_tokens = []
    for i, sentence in enumerate(train_sentences):
        #The sentences will be stored as a list of words/tokens
        tokens = []
        for word in nltk.word_tokenize(sentence): #Tokenizing the words
            vocab.update([word.lower()]) #Converting all the words to lower case
            tokens.append(word)
        train_tokens.append(tokens)
        if i%(len(train_sentences)/10) == 0:
            print(str((i*100)/len(train_sentences)) + "% done")
    print("100% done")

    vocab = {k:v for k,v in vocab.items() if v>1}
    # Sorting the words according to the number of appearances, with the most common word being first
    vocab = sorted(vocab, key=vocab.get, reverse=True)
    # Adding padding and unknown to our vocabulary so that they will be assigned an index
    vocab = ['<pad>','<unk>'] + vocab
    # Dictionaries to store the word to index mappings and vice versa
    word2idx = {o:i for i,o in enumerate(vocab)}
    idx2word = {i:o for i,o in enumerate(vocab)}
    print("vocab number: {}".format(len(vocab)))

    train_idx = []
    test_idx = []
    for i, tokens in enumerate(train_tokens):
        # Looking up the mapping dictionary and assigning the index to the respective words
        train_idx.append([word2idx[word] if word in word2idx else word2idx['<unk>'] for word in tokens])

    for i, sentence in enumerate(test_sentences):
        # For test sentences, we have to tokenize the sentences as well
        test_idx.append([word2idx[word.lower()] if word.lower() in word2idx else word2idx['<unk>'] for word in nltk.word_tokenize(sentence)])
    seq_len = padding_length #The length that the sentences will be padded/shortened to
    train_idx = pad_input(train_idx, seq_len)
    test_idx = pad_input(test_idx, seq_len)

    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)

    train_data = TensorDataset(torch.from_numpy(train_idx), torch.from_numpy(train_labels))
    test_data = TensorDataset(torch.from_numpy(test_idx), torch.from_numpy(test_labels))

    return word2idx, idx2word, train_data, test_data

In [129]:
word2idx, idx2word, train_data, test_data = process_data(train_sentences, train_labels, test_sentences, test_labels, padding_length=30)

0.0% done
10.0% done
20.0% done
30.0% done
40.0% done
50.0% done
60.0% done
70.0% done
80.0% done
90.0% done
100% done
vocab number: 19959


In [130]:
batch_size = 32
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [131]:
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=False):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if use_model.lower() == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=0.5, batch_first=True, bidirectional=bidirectional)
        if use_model.lower() == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=0.5, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(0.2)
        if bidirectional:
            self.fc = nn.Linear(hidden_dim*2, output_size)
        else:
            self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        out, hidden = self.rnn(embeds, hidden)
        out = out[:,-1,:]
        out = self.dropout(out)
        out = self.fc(out)
        out = self.sigmoid(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden


In [132]:
def train(num_epochs=10, train_loader=None, valid_loader=None, clip=5, interval=300, valid_acc_all=[]):

    loss_min = 99
    max_acc = 0
    for epoch in range(num_epochs):
        model.train()
        train_loss = []
        hidden = model.init_hidden(batch_size)
        for i, (inputs, labels) in enumerate(train_loader):
            # hidden = tuple([e.data for e in hidden])
            inputs, labels = inputs.long().to(device), labels.to(device)

            output, hidden = model(inputs, None)
            loss = loss_fn(output.squeeze(-1), labels.float())
            train_loss.append(loss.item())

            model.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            
            if i % interval == 0:
                print("Epoch: {}/{}, Step: {}/{}, Train Loss: {}".format(epoch+1, num_epochs, i, len(train_loader), loss))
        epoch_loss = np.mean(train_loss)
        print("Epoch: {}/{}, Train Loss: {}".format(epoch+1, num_epochs, epoch_loss))

        if valid_loader != None:
            model.eval()
            valid_loss = []
            num_correct = 0
            for i, (inputs, labels) in enumerate(valid_loader):
                # val_hidden = tuple([each.data for each in val_h])
                inputs, labels = inputs.long().to(device), labels.to(device)
                output, hidden = model(inputs, None)
                loss = loss_fn(output.squeeze(-1), labels.float())
                valid_loss.append(loss.item())
                pred = torch.round(output.squeeze(-1))
                correct = pred.eq(labels.float()).cpu().numpy()
                num_correct += np.sum(correct)

            print("Epoch: {}/{}, Valid Loss: {}".format(epoch+1, num_epochs, np.mean(valid_loss)))
            valid_acc = num_correct/len(test_loader.dataset)
            max_acc = max(max_acc, valid_acc)
            print("Epoch: {}/{}, Valid accuracy: {:.3f}%".format(epoch+1, num_epochs, valid_acc*100))

        if epoch_loss < loss_min:
            torch.save(model.state_dict(), 'bestmodel_sentiment.pt')
            print('Model saved successfully')
            loss_min = epoch_loss
        print()
    if valid_loader != None:
        valid_acc_all.append(max_acc)

In [133]:
def test(test_loader):
    test_loss = []
    num_correct = 0
    hidden = model.init_hidden(batch_size)

    model.eval()
    for inputs, labels in test_loader:
        # hidden = tuple([each.data for each in hidden])
        inputs, labels = inputs.long().to(device), labels.to(device)

        output, hidden = model(inputs, None)
        loss = loss_fn(output.squeeze(-1), labels.float())
        test_loss.append(loss.item())
        pred = torch.round(output.squeeze(-1))
        
        correct = pred.eq(labels.float()).cpu().numpy()
        num_correct += np.sum(correct)
            
    print("Test loss: {:.3f}".format(np.mean(test_loss)))
    test_acc = num_correct/len(test_loader.dataset)
    print("Test accuracy: {:.3f}%".format(test_acc*100))

In [134]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=False)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.BCELoss()
model.to(device)
model

SentimentNet(
  (embedding): Embedding(19959, 400)
  (rnn): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [135]:
train(num_epochs=5, train_loader=train_loader, valid_loader=None, clip=5, interval=500)

Epoch: 1/5, Step: 0/1875, Train Loss: 0.6896765232086182
Epoch: 1/5, Step: 500/1875, Train Loss: 0.6008710861206055
Epoch: 1/5, Step: 1000/1875, Train Loss: 0.6363990306854248
Epoch: 1/5, Step: 1500/1875, Train Loss: 0.6959047317504883
Epoch: 1/5, Train Loss: 0.5760922355969746
Model saved successfully

Epoch: 2/5, Step: 0/1875, Train Loss: 0.5280606746673584
Epoch: 2/5, Step: 500/1875, Train Loss: 0.4512268006801605
Epoch: 2/5, Step: 1000/1875, Train Loss: 0.4189934730529785
Epoch: 2/5, Step: 1500/1875, Train Loss: 0.5200663805007935
Epoch: 2/5, Train Loss: 0.4904069847583771
Model saved successfully

Epoch: 3/5, Step: 0/1875, Train Loss: 0.5496572256088257
Epoch: 3/5, Step: 500/1875, Train Loss: 0.5087374448776245
Epoch: 3/5, Step: 1000/1875, Train Loss: 0.5441999435424805
Epoch: 3/5, Step: 1500/1875, Train Loss: 0.496160626411438
Epoch: 3/5, Train Loss: 0.4626253471215566
Model saved successfully

Epoch: 4/5, Step: 0/1875, Train Loss: 0.5688390731811523
Epoch: 4/5, Step: 500/1875, T

In [136]:
test(test_loader)

Test loss: 0.517
Test accuracy: 75.209%


## Use GRU

In [137]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='gru', bidirectional=False)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.BCELoss()
model.to(device)
model

SentimentNet(
  (embedding): Embedding(19959, 400)
  (rnn): GRU(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [138]:
train(num_epochs=5, train_loader=train_loader, valid_loader=None, clip=5, interval=500)

Epoch: 1/5, Step: 0/1875, Train Loss: 0.7007261514663696
Epoch: 1/5, Step: 500/1875, Train Loss: 0.706928551197052
Epoch: 1/5, Step: 1000/1875, Train Loss: 0.5427626967430115
Epoch: 1/5, Step: 1500/1875, Train Loss: 0.6162347793579102
Epoch: 1/5, Train Loss: 0.6264342344919841
Model saved successfully

Epoch: 2/5, Step: 0/1875, Train Loss: 0.692589521408081
Epoch: 2/5, Step: 500/1875, Train Loss: 0.5782080888748169
Epoch: 2/5, Step: 1000/1875, Train Loss: 0.652542233467102
Epoch: 2/5, Step: 1500/1875, Train Loss: 0.6577836871147156
Epoch: 2/5, Train Loss: 0.6053669723510742
Model saved successfully

Epoch: 3/5, Step: 0/1875, Train Loss: 0.6928296089172363
Epoch: 3/5, Step: 500/1875, Train Loss: 0.40084144473075867
Epoch: 3/5, Step: 1000/1875, Train Loss: 0.509823203086853
Epoch: 3/5, Step: 1500/1875, Train Loss: 0.8983087539672852
Epoch: 3/5, Train Loss: 0.6368058365186056

Epoch: 4/5, Step: 0/1875, Train Loss: 0.44613561034202576
Epoch: 4/5, Step: 500/1875, Train Loss: 0.5744693279266

In [139]:
test(test_loader)

Test loss: 0.678
Test accuracy: 61.838%


## Bidirectional LSTM

In [140]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.BCELoss()
model.to(device)
model

SentimentNet(
  (embedding): Embedding(19959, 400)
  (rnn): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [141]:
train(num_epochs=5, train_loader=train_loader, valid_loader=None, clip=5, interval=800)

Epoch: 1/5, Step: 0/1875, Train Loss: 0.6933279633522034
Epoch: 1/5, Step: 800/1875, Train Loss: 0.556958794593811
Epoch: 1/5, Step: 1600/1875, Train Loss: 0.6091010570526123
Epoch: 1/5, Train Loss: 0.6177551045735677
Model saved successfully

Epoch: 2/5, Step: 0/1875, Train Loss: 0.5861363410949707
Epoch: 2/5, Step: 800/1875, Train Loss: 0.3751044273376465
Epoch: 2/5, Step: 1600/1875, Train Loss: 0.5037902593612671
Epoch: 2/5, Train Loss: 0.5140378169616063
Model saved successfully

Epoch: 3/5, Step: 0/1875, Train Loss: 0.6879075765609741
Epoch: 3/5, Step: 800/1875, Train Loss: 0.2774255573749542
Epoch: 3/5, Step: 1600/1875, Train Loss: 0.6646953821182251
Epoch: 3/5, Train Loss: 0.48483073558012646
Model saved successfully

Epoch: 4/5, Step: 0/1875, Train Loss: 0.30227038264274597
Epoch: 4/5, Step: 800/1875, Train Loss: 0.5301316976547241
Epoch: 4/5, Step: 1600/1875, Train Loss: 0.3859880864620209
Epoch: 4/5, Train Loss: 0.46576744728088376
Model saved successfully

Epoch: 5/5, Step: 

In [142]:
test(test_loader)

Test loss: 0.597
Test accuracy: 74.652%


## Bidirectional GRU

In [143]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='gru', bidirectional=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.BCELoss()
model.to(device)
model

SentimentNet(
  (embedding): Embedding(19959, 400)
  (rnn): GRU(400, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [144]:
train(num_epochs=5, train_loader=train_loader, valid_loader=None, clip=5, interval=800)

Epoch: 1/5, Step: 0/1875, Train Loss: 0.6966218948364258
Epoch: 1/5, Step: 800/1875, Train Loss: 0.7064805030822754
Epoch: 1/5, Step: 1600/1875, Train Loss: 0.8709197044372559
Epoch: 1/5, Train Loss: 0.6324259975115458
Model saved successfully

Epoch: 2/5, Step: 0/1875, Train Loss: 0.5157490968704224
Epoch: 2/5, Step: 800/1875, Train Loss: 0.7551617622375488
Epoch: 2/5, Step: 1600/1875, Train Loss: 0.536025881767273
Epoch: 2/5, Train Loss: 0.6216688517649969
Model saved successfully

Epoch: 3/5, Step: 0/1875, Train Loss: 0.682677149772644
Epoch: 3/5, Step: 800/1875, Train Loss: 0.662162184715271
Epoch: 3/5, Step: 1600/1875, Train Loss: 0.4738426208496094
Epoch: 3/5, Train Loss: 0.6465229260285695

Epoch: 4/5, Step: 0/1875, Train Loss: 0.778519332408905
Epoch: 4/5, Step: 800/1875, Train Loss: 0.5623020529747009
Epoch: 4/5, Step: 1600/1875, Train Loss: 0.49415072798728943
Epoch: 4/5, Train Loss: 0.6205930606047313
Model saved successfully

Epoch: 5/5, Step: 0/1875, Train Loss: 0.57388418

In [145]:
test(test_loader)

Test loss: 0.606
Test accuracy: 65.738%


## GloveTwitter Vectors

In [146]:
import torchtext

In [147]:
embed_dim = 100
Glove = torchtext.vocab.GloVe(name='twitter.27B', dim=embed_dim, cache='./')

In [148]:
pretrained_embed = get_pretrained_embed(word2idx)

pretrained embedding size: torch.Size([19959, 100])


In [149]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 100
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.BCELoss()
model.to(device)
model

SentimentNet(
  (embedding): Embedding(19959, 100)
  (rnn): LSTM(100, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [150]:
model.embedding.weight.data.copy_(pretrained_embed)
model.embedding.weight.requires_grad = True

In [151]:
train(num_epochs=5, train_loader=train_loader, valid_loader=None, clip=5, interval=800)

Epoch: 1/5, Step: 0/1875, Train Loss: 0.7052767276763916
Epoch: 1/5, Step: 800/1875, Train Loss: 0.5163572430610657
Epoch: 1/5, Step: 1600/1875, Train Loss: 0.5856012105941772
Epoch: 1/5, Train Loss: 0.5601503868897756
Model saved successfully

Epoch: 2/5, Step: 0/1875, Train Loss: 0.3237488865852356
Epoch: 2/5, Step: 800/1875, Train Loss: 0.24358971416950226
Epoch: 2/5, Step: 1600/1875, Train Loss: 0.7080134153366089
Epoch: 2/5, Train Loss: 0.47873155713876087
Model saved successfully

Epoch: 3/5, Step: 0/1875, Train Loss: 0.470812052488327
Epoch: 3/5, Step: 800/1875, Train Loss: 0.5095195770263672
Epoch: 3/5, Step: 1600/1875, Train Loss: 0.22125974297523499
Epoch: 3/5, Train Loss: 0.4454321674823761
Model saved successfully

Epoch: 4/5, Step: 0/1875, Train Loss: 0.30066439509391785
Epoch: 4/5, Step: 800/1875, Train Loss: 0.40145742893218994
Epoch: 4/5, Step: 1600/1875, Train Loss: 0.44392138719558716
Epoch: 4/5, Train Loss: 0.4250964507738749
Model saved successfully

Epoch: 5/5, Ste

In [152]:
test(test_loader)

Test loss: 0.647
Test accuracy: 74.095%


## k-fold Cross Validation

In [153]:
def random_split_data(dataset, ratio=0.8, batch_size = 32):
    train_size = int(ratio * len(dataset))
    valid_size = len(dataset) - train_size
    train_data, valid_data = random_split(dataset, [train_size,valid_size])
    print('Train samples: {}, Valid samples: {}'.format(len(train_data), len(valid_data)))

    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    return train_loader, test_loader

### hidden size=128, embedding size=100

In [154]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 100
hidden_dim = 128
n_layers = 2

In [155]:
valid_acc_all = []
for i in range(5):
    print("k-fold validation: {}".format(i))
    train_loader, valid_loader = random_split_data(train_data, ratio=0.8)
    model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    loss_fn = nn.BCELoss()
    model.to(device)
    train(num_epochs=5, train_loader=train_loader, valid_loader=valid_loader, clip=5, interval=2500, valid_acc_all=valid_acc_all)
print("Average valid accuracy: {}".format(np.mean(valid_acc_all)))

k-fold validation: 0
Train samples: 48000, Valid samples: 12000
Epoch: 1/5, Step: 0/1500, Train Loss: 0.6941091418266296
Epoch: 1/5, Train Loss: 0.6502778763969739
Epoch: 1/5, Valid Loss: 0.5388427227735519
Epoch: 1/5, Valid accuracy: 74.930%
Model saved successfully

Epoch: 2/5, Step: 0/1500, Train Loss: 0.6555651426315308
Epoch: 2/5, Train Loss: 0.5003530928393205
Epoch: 2/5, Valid Loss: 0.47868334253629047
Epoch: 2/5, Valid accuracy: 77.159%
Model saved successfully

Epoch: 3/5, Step: 0/1500, Train Loss: 0.5481796860694885
Epoch: 3/5, Train Loss: 0.43169332920511566
Epoch: 3/5, Valid Loss: 0.4529117209215959
Epoch: 3/5, Valid accuracy: 79.109%
Model saved successfully

Epoch: 4/5, Step: 0/1500, Train Loss: 0.42115145921707153
Epoch: 4/5, Train Loss: 0.3914153158267339
Epoch: 4/5, Valid Loss: 0.5539292941490809
Epoch: 4/5, Valid accuracy: 75.766%
Model saved successfully

Epoch: 5/5, Step: 0/1500, Train Loss: 0.3646601736545563
Epoch: 5/5, Train Loss: 0.3632792010456324
Epoch: 5/5, V

### hidden size=128, embedding size=400

In [156]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim =400
hidden_dim = 128
n_layers = 2

In [157]:
valid_acc_all = []
for i in range(5):
    print("k-fold validation: {}".format(i))
    train_loader, valid_loader = random_split_data(train_data, ratio=0.8)
    model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    loss_fn = nn.BCELoss()
    model.to(device)
    train(num_epochs=5, train_loader=train_loader, valid_loader=valid_loader, clip=5, interval=2500, valid_acc_all=valid_acc_all)
print("Average valid accuracy: {}".format(np.mean(valid_acc_all)))

k-fold validation: 0
Train samples: 48000, Valid samples: 12000
Epoch: 1/5, Step: 0/1500, Train Loss: 0.7055128216743469
Epoch: 1/5, Train Loss: 0.5742330858111382
Epoch: 1/5, Valid Loss: 0.4743967851003011
Epoch: 1/5, Valid accuracy: 75.487%
Model saved successfully

Epoch: 2/5, Step: 0/1500, Train Loss: 0.4238418936729431
Epoch: 2/5, Train Loss: 0.4699208450814088
Epoch: 2/5, Valid Loss: 0.4842695916692416
Epoch: 2/5, Valid accuracy: 77.994%
Model saved successfully

Epoch: 3/5, Step: 0/1500, Train Loss: 0.22738242149353027
Epoch: 3/5, Train Loss: 0.42285908152659735
Epoch: 3/5, Valid Loss: 0.5085679888725281
Epoch: 3/5, Valid accuracy: 76.045%
Model saved successfully

Epoch: 4/5, Step: 0/1500, Train Loss: 0.37004488706588745
Epoch: 4/5, Train Loss: 0.39532965379953383
Epoch: 4/5, Valid Loss: 0.5667300696174303
Epoch: 4/5, Valid accuracy: 76.323%
Model saved successfully

Epoch: 5/5, Step: 0/1500, Train Loss: 0.23610080778598785
Epoch: 5/5, Train Loss: 0.3745675835410754
Epoch: 5/5,

### hidden size=512, embedding size=100

In [158]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim =100
hidden_dim = 512
n_layers = 2

In [159]:
valid_acc_all = []
for i in range(5):
    print("k-fold validation: {}".format(i))
    train_loader, valid_loader = random_split_data(train_data, ratio=0.8)
    model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    loss_fn = nn.BCELoss()
    model.to(device)
    train(num_epochs=5, train_loader=train_loader, valid_loader=valid_loader, clip=5, interval=2500, valid_acc_all=valid_acc_all)
print("Average valid accuracy: {}".format(np.mean(valid_acc_all)))

k-fold validation: 0
Train samples: 48000, Valid samples: 12000
Epoch: 1/5, Step: 0/1500, Train Loss: 0.6905481815338135
Epoch: 1/5, Train Loss: 0.6981073155005773
Epoch: 1/5, Valid Loss: 0.6888065859675407
Epoch: 1/5, Valid accuracy: 49.304%
Model saved successfully

Epoch: 2/5, Step: 0/1500, Train Loss: 0.6853047013282776
Epoch: 2/5, Train Loss: 0.697849326968193
Epoch: 2/5, Valid Loss: 0.686759427189827
Epoch: 2/5, Valid accuracy: 52.646%
Model saved successfully

Epoch: 3/5, Step: 0/1500, Train Loss: 0.6507456302642822
Epoch: 3/5, Train Loss: 0.7078756774266561
Epoch: 3/5, Valid Loss: 0.731641819079717
Epoch: 3/5, Valid accuracy: 50.696%

Epoch: 4/5, Step: 0/1500, Train Loss: 0.6796683073043823
Epoch: 4/5, Train Loss: 0.67979761048158
Epoch: 4/5, Valid Loss: 0.7475843131542206
Epoch: 4/5, Valid accuracy: 49.861%
Model saved successfully

Epoch: 5/5, Step: 0/1500, Train Loss: 0.7133102416992188
Epoch: 5/5, Train Loss: 0.6244572818080584
Epoch: 5/5, Valid Loss: 0.6310108179847399
Epo

### hidden size=512, embedding size=400

In [160]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim =400
hidden_dim = 512
n_layers = 2

In [161]:
valid_acc_all = []
for i in range(5):
    print("k-fold validation: {}".format(i))
    train_loader, valid_loader = random_split_data(train_data, ratio=0.8)
    model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    loss_fn = nn.BCELoss()
    model.to(device)
    train(num_epochs=5, train_loader=train_loader, valid_loader=valid_loader, clip=5, interval=2500, valid_acc_all=valid_acc_all)
print("Average valid accuracy: {}".format(np.mean(valid_acc_all)))

k-fold validation: 0
Train samples: 48000, Valid samples: 12000
Epoch: 1/5, Step: 0/1500, Train Loss: 0.6969070434570312
Epoch: 1/5, Train Loss: 0.6368362758755683
Epoch: 1/5, Valid Loss: 0.5641574785113335
Epoch: 1/5, Valid accuracy: 67.409%
Model saved successfully

Epoch: 2/5, Step: 0/1500, Train Loss: 0.5850573778152466
Epoch: 2/5, Train Loss: 0.5346089209318161
Epoch: 2/5, Valid Loss: 0.5165652744472027
Epoch: 2/5, Valid accuracy: 74.373%
Model saved successfully

Epoch: 3/5, Step: 0/1500, Train Loss: 0.4991030693054199
Epoch: 3/5, Train Loss: 0.4944921514689922
Epoch: 3/5, Valid Loss: 0.4880125472942988
Epoch: 3/5, Valid accuracy: 77.159%
Model saved successfully

Epoch: 4/5, Step: 0/1500, Train Loss: 0.6507980227470398
Epoch: 4/5, Train Loss: 0.469894880960385
Epoch: 4/5, Valid Loss: 0.4722542588909467
Epoch: 4/5, Valid accuracy: 77.159%
Model saved successfully

Epoch: 5/5, Step: 0/1500, Train Loss: 0.4828230142593384
Epoch: 5/5, Train Loss: 0.4537943410774072
Epoch: 5/5, Valid

## Best combination on test data

In [109]:
batch_size = 32
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [110]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 100
hidden_dim = 128
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.BCELoss()
model.to(device)

SentimentNet(
  (embedding): Embedding(19959, 100)
  (rnn): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [111]:
train(num_epochs=4, train_loader=train_loader, valid_loader=None, clip=5, interval=800)

Epoch: 1/4, Step: 0/1875, Train Loss: 0.6855311989784241
Epoch: 1/4, Step: 800/1875, Train Loss: 0.6409757137298584
Epoch: 1/4, Step: 1600/1875, Train Loss: 0.5881306529045105
Epoch: 1/4, Train Loss: 0.6273406866709391
Model saved successfully

Epoch: 2/4, Step: 0/1875, Train Loss: 0.4047505855560303
Epoch: 2/4, Step: 800/1875, Train Loss: 0.5136011838912964
Epoch: 2/4, Step: 1600/1875, Train Loss: 0.6156495213508606
Epoch: 2/4, Train Loss: 0.506865639368693
Model saved successfully

Epoch: 3/4, Step: 0/1875, Train Loss: 0.4363364577293396
Epoch: 3/4, Step: 800/1875, Train Loss: 0.4549494981765747
Epoch: 3/4, Step: 1600/1875, Train Loss: 0.25772589445114136
Epoch: 3/4, Train Loss: 0.45821071407000225
Model saved successfully

Epoch: 4/4, Step: 0/1875, Train Loss: 0.3221054673194885
Epoch: 4/4, Step: 800/1875, Train Loss: 0.49739134311676025
Epoch: 4/4, Step: 1600/1875, Train Loss: 0.4270251393318176
Epoch: 4/4, Train Loss: 0.4289244349161784
Model saved successfully



In [112]:
test(test_loader)

Test loss: 0.529
Test accuracy: 76.602%


## Bonus

In [113]:
train_sentences = []
train_labels = []
with open('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1') as f:
    reader = csv.reader(f)
    for line in reader:
        label, text = line[0], line[5]
        if label == '0':
            train_sentences.append(text)
            train_labels.append(0.)
        if label == '4':
            train_sentences.append(text)
            train_labels.append(1.)

In [114]:
word2idx, idx2word, train_data, test_data = process_data(train_sentences, train_labels, test_sentences, test_labels, padding_length=30)

0.0% done
10.0% done
20.0% done
30.0% done
40.0% done
50.0% done
60.0% done
70.0% done
80.0% done
90.0% done
100% done
vocab number: 247693


In [115]:
batch_size = 32
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [116]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, use_model='lstm', bidirectional=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = nn.BCELoss()
model.to(device)

SentimentNet(
  (embedding): Embedding(247693, 400)
  (rnn): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [117]:
train(num_epochs=1, train_loader=train_loader, valid_loader=None, clip=5, interval=5000)

Epoch: 1/1, Step: 0/50000, Train Loss: 0.6925088763237
Epoch: 1/1, Step: 5000/50000, Train Loss: 0.7107133865356445
Epoch: 1/1, Step: 10000/50000, Train Loss: 0.5370061993598938
Epoch: 1/1, Step: 15000/50000, Train Loss: 0.5049312114715576
Epoch: 1/1, Step: 20000/50000, Train Loss: 0.3669494390487671
Epoch: 1/1, Step: 25000/50000, Train Loss: 0.3891751170158386
Epoch: 1/1, Step: 30000/50000, Train Loss: 0.784136176109314
Epoch: 1/1, Step: 35000/50000, Train Loss: 0.5021012425422668
Epoch: 1/1, Step: 40000/50000, Train Loss: 0.5080928802490234
Epoch: 1/1, Step: 45000/50000, Train Loss: 0.4689303934574127
Epoch: 1/1, Train Loss: 0.5369125262585283
Model saved successfully



In [118]:
test(test_loader)

Test loss: 0.498
Test accuracy: 76.880%
