In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import sys
import os
import string, nltk
#nltk.download('stopwords')
nltk.data.path.append("/home/ubuntu/nltk_data")
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
#nltk.download('punkt')
#nltk.download('wordnet')
import numpy as np
import torch
from string import punctuation
from collections import Counter
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
from torchsummary import summary

In [3]:
Vocab = []
seqence_len = 50
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english') + list(string.punctuation)) 

def Norm(text,wordnet_lemmatizer,stop_words):
    text = text.lower().replace("\s+"," ")
    word_tokens = word_tokenize(text) 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            w = wordnet_lemmatizer.lemmatize(w, pos="v")
            filtered_sentence.append(w) 
    texts=" ".join(str(x) for x in filtered_sentence)
    return text

def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append([0]*(seq_length-len(review)) + review)
        
    return np.array(reviews)

def LoadData(file, Vocab=Vocab):
    with open(file, "r",encoding="ISO-8859-1") as f:
        data_x = []
        data_y = []
        contents = f.read().splitlines()
        for line in contents:
            try:
                _,text,label = line.split("#")
            except:
                continue
            text = text.split(" ",1)[1]
            
            text = "".join([ch for ch in text if ch not in punctuation])
            text = Norm(text,wordnet_lemmatizer,stop_words)
            
            data_x.append(text)
            data_y.append(label)
            Vocab = Vocab + text.split(" ")
        return data_x, data_y, Vocab

# Data Preprocessing

In [4]:

train_x, train_y, Vocab = LoadData("../data/train.txt",Vocab)
dev_x, dev_y, Vocab = LoadData("../data/dev.txt",Vocab)
test_x, test_y, Vocab = LoadData("../data/test.txt",Vocab)


word_counts = Counter(Vocab)
word_list = sorted(word_counts, key = word_counts.get, reverse = True)
vocab_to_int = {word:idx+1 for idx, word in enumerate(word_list)}
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()}


encoded_train = [[vocab_to_int[word] for word in review.split(" ")] for review in train_x]
train_x = pad_text(encoded_train, seq_length = seqence_len)
train_y = np.array([1 if label == "pos" else 0 for label in train_y])

encoded_dev = [[vocab_to_int[word] for word in review.split(" ")] for review in dev_x]
dev_x = pad_text(encoded_dev, seq_length = seqence_len)
dev_y = np.array([1 if label == "pos" else 0 for label in dev_y])

encoded_test = [[vocab_to_int[word] for word in review.split(" ")] for review in test_x]
test_x = pad_text(encoded_test, seq_length = seqence_len)
test_y = np.array([1 if label == "pos" else 0 for label in test_y])



train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

# Models

In [5]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class NetworkLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        
        self.lstm = nn.LSTM(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
        
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):                       # => batch size, sent len
        embedded_words = self.embedding(input_words)    # => (batch_size, seq_length, n_embed)
        lstm_out, hidden = self.lstm(embedded_words)         # =>  (batch_size, seq_length, n_hidden)
        
        out = self.fc(lstm_out[:, -1, :])
        
        sig = self.sigmoid(out)
        return sig, hidden

class NetworkLSTM_(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.Wir = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Whr = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.Wif = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Whf = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.Wig = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Whg = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.Wio = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Who = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.hidden_node = hidden_node
        self.layers = layers
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
    def forward (self, input_words):                    # => (batch size, sent len)
        
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
        hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
        c = torch.zeros(input_words.size(0), self.hidden_node).to(device)
        
        for i in range(input_words.size(1)):           #for i in seq_length

            ir=embedded_words[i].matmul(self.Wir)
            hr=hidden.matmul(   self.Whr)
            r= ir.add(hr)
            rt = self.sigmoid(r)
            
            iff=embedded_words[i].matmul(self.Wif)
            hff=hidden.matmul(   self.Whf)
            ff= iff.add(hff)
            fft = self.sigmoid(ff)
            
            ig=embedded_words[i].matmul(self.Wig)
            hg=hidden.matmul(   self.Whg)
            g= ig.add(hg)
            gt = self.tanh(g)
            
            io=embedded_words[i].matmul(self.Wio)
            ho=hidden.matmul(   self.Who)
            o= io.add(ho)
            ot = self.sigmoid(o)
            
            c = fft*c + rt*gt
            hidden = ot*self.tanh(c)
        
        out = self.fc(hidden)
        
        sig = self.sigmoid(out)
        return sig, hidden
    
class NetworkGRU_(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.Wir = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Whr = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.Wiz = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Whz = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.Win = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Whn = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.hidden_node = hidden_node
        self.layers = layers
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
    def forward (self, input_words):                    # => (batch size, sent len)
        
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
        hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
        for i in range(input_words.size(1)):           #for i in seq_length

            ir=embedded_words[i].matmul(self.Wir)
            hr=hidden.matmul(   self.Whr)
            r= ir.add(hr)
            rt = self.sigmoid(r)
            
            #print(rt.shape)
            
            iz=embedded_words[i].matmul(self.Wiz)
            hz=hidden.matmul(   self.Whz)
            z= iz.add(hz)
            zt = self.sigmoid(z)
            
            iN=embedded_words[i].matmul(self.Win)
            hN=hidden.matmul(   self.Whz)*rt
            N= iN.add(hN)
            Nt = self.tanh(N)
            
            hidden = (1-zt)*Nt + zt*hidden
        
        out = self.fc(hidden)
        
        sig = self.sigmoid(out)
        return sig, hidden
    
class NetworkRNN_(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.Wi = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
        self.Wh = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.hidden_node = hidden_node
        self.layers = layers
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
#         self.rnn = nn.RNN(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
#         self.rnn_cell = nn.RNNCell(n_embed, hidden_node)
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
    def forward (self, input_words):                    # => (batch size, sent len)
        
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
        hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
        for i in range(input_words.size(1)):           #for i in seq_length

            A=embedded_words[i].matmul(self.Wi)
            B=hidden.matmul(   self.Wh)
            C = A.add(B)
            hidden = self.sigmoid(C)
        
        out = self.fc(hidden)
        
        sig = self.sigmoid(out)
        return sig, hidden
    
class NetworkGRU(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        
        self.gru = nn.GRU(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
        
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        gru_out, hidden = self.gru(embedded_words)         # (batch_size, seq_length, n_hidden)
        
        out = self.fc(gru_out[:, -1, :])
        
        sig = self.sigmoid(out)
        return sig, hidden
    
    
    
    
    
    
class NetworkRNN(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        
        self.rnn = nn.RNN(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
        
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        gru_out, hidden = self.rnn(embedded_words)         # (batch_size, seq_length, n_hidden)
        
        out = self.fc(gru_out[:, -1, :])
        
        sig = self.sigmoid(out)
        return sig, hidden

# Training

In [6]:

n_vocab = len(vocab_to_int)
n_embed = 300
n_hidden = 512
n_output = 1   # 1 ("positive") or 0 ("negative")
layers = 1

net = NetworkGRU(n_vocab, n_embed, n_hidden, n_output, layers).cuda()

criterion = nn.BCELoss()
criterion = criterion.cuda()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

print(net)

# inp = torch.zeros((1,200), dtype=torch.long) # [length, batch_size]
# print(summary(net,(300) ))

print_every = 100
step = 0
n_epochs = 8 #4
clip = 5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
count = 0
sums = 0 
print(device)
for epoch in range(n_epochs):
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        
        net.zero_grad()
        output, h = net(inputs)
        try:
            loss = criterion(output.squeeze(), labels.float())
        except:
            output[output < 0.0] = 0.0
            output[output > 1.0] = 1.0
            loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []
            
            for v_inputs, v_labels in test_loader:
                sums = sums + len(v_inputs)
                v_inputs, v_labels = inputs.to(device), labels.to(device)

                
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())
                
                output = torch.round(v_output.squeeze()).detach().cpu().numpy().astype(int)
                #print(len(output))
                ground = v_labels.detach().cpu().numpy().astype(int)
                #print(len(ground))
                
                count = count + np.sum(output == ground)

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            print(count/len(test_x))
            count = 0
            print(sums)
            print(len(test_x))
            sums = 0
            net.train()
            
            
            
            
            
            
            
            
# torch.save(net.state_dict(), "LSTM.pt")



# net.eval()


NetworkGRU(
  (embedding): Embedding(20451, 300)
  (gru): GRU(300, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
cuda




Epoch: 1/8 Step: 100 Training Loss: 0.6645 Validation Loss: 0.6242
0.6814602720114531
1397
1397
Epoch: 2/8 Step: 200 Training Loss: 0.5667 Validation Loss: 0.5059
0.7616320687186829
1397
1397
Epoch: 2/8 Step: 300 Training Loss: 0.5082 Validation Loss: 0.4429
0.7616320687186829
1397
1397
Epoch: 3/8 Step: 400 Training Loss: 0.3689 Validation Loss: 0.2847
0.9219756621331424
1397
1397
Epoch: 4/8 Step: 500 Training Loss: 0.0857 Validation Loss: 0.0590
0.9821045096635648
1397
1397
Epoch: 4/8 Step: 600 Training Loss: 0.1311 Validation Loss: 0.0653
0.9620615604867573
1397
1397
Epoch: 5/8 Step: 700 Training Loss: 0.0311 Validation Loss: 0.0212
1.0021474588403723
1397
1397
Epoch: 6/8 Step: 800 Training Loss: 0.0091 Validation Loss: 0.0041
1.0021474588403723
1397
1397
Epoch: 6/8 Step: 900 Training Loss: 0.0181 Validation Loss: 0.0048
1.0021474588403723
1397
1397
Epoch: 7/8 Step: 1000 Training Loss: 0.0029 Validation Loss: 0.0029
1.0021474588403723
1397
1397
Epoch: 8/8 Step: 1100 Training Loss: 0.

# Predict test

In [7]:
net.eval().cpu()#.to(device)
count = 0
sums = 0

valid_losses = []

for v_inputs, v_labels in test_loader:
    sums = sums + len(v_inputs)
    v_inputs, v_labels = v_inputs, v_labels
    
    #print(len(inputs))
    
    v_output, v_h = net(v_inputs)
    
    
#     v_loss = criterion(v_output.squeeze(), v_labels.float())
#     valid_losses.append(v_loss.item())
        

#     output = torch.round(v_output.squeeze()).detach().cpu().numpy().astype(int)
#     #print(len(output))
#     ground = v_labels.detach().cpu().numpy().astype(int)
#     #print(len(ground))
#     count = count + np.sum(output == ground)
    
print(count/len(test_x))
print(len(test_x))
print(sums)

print("Test Loss: {:.4f}".format(np.mean(valid_losses)))

RuntimeError: index out of range: Tried to access index 20451 out of table with 20450 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:418

# Inference Sample

In [None]:


# 

def inference(net, review, seq_length = seqence_len):
    device = "cuda" #"cuda" if torch.cuda.is_available() else "cpu"
    
    text = review.lower()
    text = "".join([ch for ch in text if ch not in punctuation])
    words = text
    
    encoded_words = [vocab_to_int[word] for word in words.split(" ")]
    padded_words = pad_text([encoded_words], seq_length)
    padded_words = torch.from_numpy(padded_words).to(device)

    
    net.eval().to(device)
    output, h = net(padded_words )#, h)
    pred = torch.round(output.squeeze())  
    return pred


inference(net, "I am sad") 


# 

In [None]:






def predict(checkpoint, test_file, Vocab):

    #     n_vocab = len(vocab_to_int)
#     n_embed = 400
#     n_hidden = 512
#     n_output = 1   # 1 ("positive") or 0 ("negative")
#     layers = 1

#     net = NetworkRNN(n_vocab, n_embed, n_hidden, n_output, layers).cuda()
    
#     test_x, test_y, Vocab = LoadData("../data/test.txt",Vocab)

#     net.load_state_dict(torch.load("LSTM.pt"))
    
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    net.eval().to(device)
    count = 0
    for v_inputs, v_labels in test_loader:
        v_inputs, v_labels = inputs.to(device), labels.to(device)

        v_output, v_h = net(v_inputs)

        output = torch.round(v_output.squeeze()).detach().cpu().numpy().astype(int)
        #print(len(output))
        ground = labels.detach().cpu().numpy().astype(int)
        #print(len(ground))
        count = count + np.sum(output == ground)
    print(count/len(test_x))


# In[162]:


predict("LSTM.pt","../data/test.txt",Vocab)
