In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,6,7"

In [10]:
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import sys
import os
import string, nltk
#nltk.download('stopwords')
nltk.data.path.append("/home/ubuntu/nltk_data")
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
#nltk.download('punkt')
#nltk.download('wordnet')
import numpy as np
import torch
from string import punctuation
from collections import Counter
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
#from torchsummary import summary
from torchinfo import summary

import pandas as pd
import tensorflow as tf

import re


In [88]:
seqence_len = 40
embed_len = 300

glove = pd.read_csv('glove.6B.'+str(embed_len)+'d.txt', sep=" ", quoting=3, header=None, index_col=0)
glove_embedding = {key: val.values for key, val in glove.T.items()}

In [89]:
Vocab = []


wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english') + list(string.punctuation)) 

def create_embedding_matrix(word_index,embedding_dict,dimension):
    embedding_matrix=np.zeros((len(word_index)+1,dimension))

    for word,index in word_index.items():
        if word in embedding_dict:
            embedding_matrix[index]=embedding_dict[word]
    return embedding_matrix


def Norm(text,wordnet_lemmatizer,stop_words):
    text = text.lower().strip()
    text =  re.sub(' +', ' ', text)
    word_tokens = word_tokenize(text) 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            w = wordnet_lemmatizer.lemmatize(w, pos="v")
            filtered_sentence.append(w) 
    texts=" ".join(str(x) for x in filtered_sentence)
    return text

def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append([0]*(seq_length-len(review)) + review)
        
    return np.array(reviews)

def LoadData(file, Vocab=Vocab):
    with open(file, "r",encoding="ISO-8859-1") as f:
        data_x = []
        data_y = []
        contents = f.read().splitlines()
        for line in contents:
            try:
                _,text,label = line.split("#")
            except:
                continue
            text = text.split(" ",1)[1]
            
            text = "".join([ch for ch in text if ch not in punctuation])
            text = Norm(text,wordnet_lemmatizer,stop_words)
            
            data_x.append(text)
            data_y.append(label)
            Vocab = Vocab + text.split(" ")
        return data_x, data_y, Vocab

# Data Preprocessing

In [90]:
train_x, train_y, Vocab = LoadData("../data/train.txt",Vocab)
dev_x, dev_y, Vocab = LoadData("../data/dev.txt",Vocab)
test_x, test_y, Vocab = LoadData("../data/test.txt",Vocab)

# train_x = train_x + dev_x + test_x
# train_y = train_y + dev_y + test_y
print(test_x[:5])

['sandra bullock and hugh grant make a great team but this predictable romantic comedy should get a pink slip', 'those eternally devoted to the insanity of black will have an intermittently good time feel free to go get popcorn whenever hes not onscreen', 'this is wild surreal stuff but brilliant and the camera just kind of sits there and lets you look at this and its like youre going from one room to the next and none of them have any relation to the other', 'this is a harrowing movie about how parents know where all the buttons are and how to push them', 'without shakespeares eloquent language the update is dreary and sluggish']


In [91]:



tokenizer=tf.keras.preprocessing.text.Tokenizer(split=" ")
tokenizer.fit_on_texts(train_x+dev_x+test_x)

encoded_train =tokenizer.texts_to_sequences(train_x)
encoded_dev =tokenizer.texts_to_sequences(dev_x)
encoded_test =tokenizer.texts_to_sequences(test_x)


train_x = pad_text(encoded_train, seq_length = seqence_len)
train_y = np.array([1 if label == "pos" else 0 for label in train_y])


dev_x = pad_text(encoded_dev, seq_length = seqence_len)
dev_y = np.array([1 if label == "pos" else 0 for label in dev_y])


test_x = pad_text(encoded_test, seq_length = seqence_len)
test_y = np.array([1 if label == "pos" else 0 for label in test_y])

# print(len(type(encoded_test)))


train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [92]:


embedding_matrix=create_embedding_matrix(tokenizer.word_index,embedding_dict=glove_embedding,dimension=embed_len)

# Models

In [93]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class NetworkLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.embedding.weight=torch.nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))
        
        self.lstm = nn.LSTM(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
        
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.5)
        
        
    def forward (self, input_words):                       # => batch size, sent len
        embedded_words = self.embedding(input_words)    # => (batch_size, seq_length, n_embed)
        lstm_out, hidden = self.lstm(embedded_words)         # =>  (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        out = self.fc(lstm_out[:, -1, :])
        
        sig = self.sigmoid(out)
        return sig, hidden

class NetworkLSTM_(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.linear_hidden = nn.Linear(n_hidden, n_hidden)
        self.linear_input = nn.Linear(n_embed, n_hidden)
        
        self.hidden_node = hidden_node
        self.layers = layers
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.5)
        
    def forward (self, input_words):                    # => (batch size, sent len)
        
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
        hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
        c = torch.zeros(input_words.size(0), self.hidden_node).to(device)
        
        for i in range(input_words.size(1)):           #for i in seq_length

            ir=self.linear_input(embedded_words[i])
            hr=self.linear_hidden(hidden)
            r= ir.add(hr)
            rt = self.sigmoid(r)
            
            iff=self.linear_input(embedded_words[i])
            hff=self.linear_hidden(hidden)
            ff= iff.add(hff)
            fft = self.sigmoid(ff)
            
            ig=self.linear_input(embedded_words[i])
            hg=self.linear_hidden(hidden)
            g= ig.add(hg)
            gt = self.tanh(g)
            
            io=self.linear_input(embedded_words[i])
            ho=self.linear_hidden(hidden)
            o= io.add(ho)
            ot = self.sigmoid(o)
            
            c = fft*c + rt*gt
            hidden = ot*self.tanh(c)
        
        out = self.fc(hidden)
        
        sig = self.sigmoid(out)
        return sig, hidden
    
class NetworkGRU_(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.hidden_node = hidden_node
        self.layers = layers
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
        self.linear_hidden_r = nn.Linear(n_hidden, n_hidden)
        self.linear_hidden_z = nn.Linear(n_hidden, n_hidden)
        self.linear_hidden_n = nn.Linear(n_hidden, n_hidden)
        self.linear_input_r = nn.Linear(n_embed, n_hidden)
        self.linear_input_z = nn.Linear(n_embed, n_hidden)
        self.linear_input_n = nn.Linear(n_embed, n_hidden)
        
    def forward (self, input_words):                    # => (batch size, sent len)
        
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
        hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
        for i in range(input_words.size(1)):           #for i in seq_length

            ir=self.linear_input_r(embedded_words[i])
            hr=self.linear_hidden_r(hidden)
            r= ir.add(hr)
            rt = self.sigmoid(r)
            
            iz=self.linear_input_z(embedded_words[i])
            hz=self.linear_hidden_z(hidden)
            z= iz.add(hz)
            zt = self.sigmoid(z)
            
            
            iN=self.linear_input_n(embedded_words[i])
            hN=self.linear_hidden_n(hidden)*rt
            N= iN.add(hN)
            Nt = self.tanh(N)
            
            hidden = (1-zt)*Nt + zt*hidden
        
        out = self.fc(hidden)
        
        sig = self.sigmoid(out)
        return sig, hidden
    
class NetworkRNN_(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
#         self.Wi = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Wh = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
        self.hidden_node = hidden_node
        self.embedding = nn.Embedding(n_vocab, n_embed)
#         self.rnn = nn.RNN(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
#         self.rnn_cell = nn.RNNCell(n_embed, hidden_node)
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.5)

        self.linear_hidden = nn.Linear(n_hidden, n_hidden)
        self.linear_input = nn.Linear(n_embed, n_hidden)
        
    def forward (self, input_words):                    # => (batch size, sent len)
        
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
        hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
        for i in range(input_words.size(1)):           #for i in seq_length

            A = self.linear_hidden(hidden)
            B = self.linear_input(embedded_words[i])
            C =  A.add(B)
            hidden = self.sigmoid(C)
        
        out = self.fc(hidden)
        
        sig = self.sigmoid(out)
        return sig, hidden
    
class NetworkGRU(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        
        self.gru = nn.GRU(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
        
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.8)
        
        
    def forward (self, input_words):
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        gru_out, hidden = self.gru(embedded_words)         # (batch_size, seq_length, n_hidden)
        gru_out = self.dropout(gru_out)
        
        out = self.fc(gru_out[:, -1, :])
        
        sig = self.sigmoid(out)
        return sig, hidden
    
class NetworkRNN(nn.Module):
    
    def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
        super().__init__()
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        
        self.rnn = nn.RNN(n_embed, hidden_node, layers, batch_first = True, bidirectional=False)
        
        self.fc = nn.Linear(n_hidden, n_output)
        
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.6)
        
        
    def forward (self, input_words):
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        rnn_out, hidden = self.rnn(embedded_words)         # (batch_size, seq_length, n_hidden)
        rnn_out = self.dropout(rnn_out)
        out = self.fc(rnn_out[:, -1, :])
        
        sig = self.sigmoid(out)
        return sig, hidden

# Training

In [94]:
n_vocab=embedding_matrix.shape[0]
n_embed=embedding_matrix.shape[1]
n_hidden = 512
n_output = 1   # 1 ("positive") or 0 ("negative")
layers = 2

net = NetworkLSTM_(n_vocab, n_embed, n_hidden, n_output, layers).cuda()

criterion = nn.BCELoss()
criterion = criterion.cuda()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

print(net)

summary(
    net,
    (1, 50),
    dtypes=[torch.long],
    verbose=2,
    col_width=16,
    col_names=["kernel_size", "output_size", "num_params", "mult_adds"],
)

NetworkLSTM_(
  (linear_hidden): Linear(in_features=512, out_features=512, bias=True)
  (linear_input): Linear(in_features=300, out_features=512, bias=True)
  (embedding): Embedding(20451, 300)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (relu): ReLU()
  (tanh): Tanh()
  (dropout): Dropout(p=0.5, inplace=False)
)
Layer (type:depth-idx)                   Kernel Shape     Output Shape     Param #          Mult-Adds
├─Embedding: 1-1                         [300, 20451]     [1, 50, 300]     6,135,300        6,135,300
├─Linear: 1-2                            [300, 512]       [1, 512]         154,112          153,600
├─Linear: 1-3                            [512, 512]       [1, 512]         262,656          262,144
├─Sigmoid: 1-4                           --               [1, 512]         --               --
├─Linear: 1-5                            [300, 512]       [1, 512]         (recursive)      153,600
├─Linear: 1-6                            [512,

Layer (type:depth-idx)                   Kernel Shape     Output Shape     Param #          Mult-Adds
├─Embedding: 1-1                         [300, 20451]     [1, 50, 300]     6,135,300        6,135,300
├─Linear: 1-2                            [300, 512]       [1, 512]         154,112          153,600
├─Linear: 1-3                            [512, 512]       [1, 512]         262,656          262,144
├─Sigmoid: 1-4                           --               [1, 512]         --               --
├─Linear: 1-5                            [300, 512]       [1, 512]         (recursive)      153,600
├─Linear: 1-6                            [512, 512]       [1, 512]         (recursive)      262,144
├─Sigmoid: 1-7                           --               [1, 512]         --               --
├─Linear: 1-8                            [300, 512]       [1, 512]         (recursive)      153,600
├─Linear: 1-9                            [512, 512]       [1, 512]         (recursive)      262,144
├─Tanh

In [95]:




print_every = 50
step = 0
n_epochs = 5
clip = 5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
for epoch in range(n_epochs):
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        
        net.zero_grad()
        output, h = net(inputs)
        try:
            loss = criterion(output.squeeze(), labels.float())
        except:
            output[output < 0.0] = 0.0
            output[output > 1.0] = 1.0
            loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = v_inputs.to(device), v_labels.to(device)

                
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())
                
                
            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))

            
            net.train()
            
#torch.save(net.state_dict(), "LSTM.pt")


cuda




Epoch: 1/5 Step: 50 Training Loss: 0.6951 Validation Loss: 0.6765
Epoch: 1/5 Step: 100 Training Loss: 0.6041 Validation Loss: 0.6557
Epoch: 1/5 Step: 150 Training Loss: 0.6533 Validation Loss: 0.6454
Epoch: 2/5 Step: 200 Training Loss: 0.4413 Validation Loss: 0.6420
Epoch: 2/5 Step: 250 Training Loss: 0.4695 Validation Loss: 0.6357
Epoch: 2/5 Step: 300 Training Loss: 0.4018 Validation Loss: 0.6205
Epoch: 3/5 Step: 350 Training Loss: 0.4028 Validation Loss: 0.6924
Epoch: 3/5 Step: 400 Training Loss: 0.2568 Validation Loss: 0.6817
Epoch: 3/5 Step: 450 Training Loss: 0.2336 Validation Loss: 0.6692
Epoch: 4/5 Step: 500 Training Loss: 0.1627 Validation Loss: 0.8162
Epoch: 4/5 Step: 550 Training Loss: 0.1214 Validation Loss: 0.7937
Epoch: 4/5 Step: 600 Training Loss: 0.2556 Validation Loss: 0.7061
Epoch: 5/5 Step: 650 Training Loss: 0.1269 Validation Loss: 0.9031
Epoch: 5/5 Step: 700 Training Loss: 0.0383 Validation Loss: 0.9053
Epoch: 5/5 Step: 750 Training Loss: 0.0448 Validation Loss: 1.0

# Predict test

In [96]:
net.eval().to(device)
count = 0
sums = 0

test_losses = []

for v_inputs, v_labels in test_loader:
    sums = sums + len(v_inputs)
    v_inputs, v_labels = v_inputs.to(device), v_labels.to(device)
    
    #print(len(inputs))
    
    v_output, v_h = net(v_inputs)
    
    
    v_loss = criterion(v_output.squeeze(), v_labels.float())
    test_losses.append(v_loss.item())
        

    output = torch.round(v_output.squeeze()).detach().cpu().numpy().astype(int)
    #print(len(output))
    ground = v_labels.detach().cpu().numpy().astype(int)
    #print(len(ground))
    count = count + np.sum(output == ground)
    
print(count/len(test_x))
print(len(test_x))
print(sums)

print("Test Loss: {:.4f}".format(np.mean(test_losses)))

0.7036506800286327
1397
1397
Test Loss: 0.8004


# Inference Sample

In [87]:


# 

def inference(net, review, seq_length = 200):
    device = "cuda" #"cuda" if torch.cuda.is_available() else "cpu"
    
    text = review.lower()
    text = "".join([ch for ch in text if ch not in punctuation])
    words = text
    
    encoded_train =tokenizer.texts_to_sequences([words])
    padded_words = pad_text(encoded_train, seq_length = 200)
    padded_words = torch.from_numpy(padded_words).to(device)

    
    net.eval().to(device)
    output, h = net(padded_words )#, h)
    pred = torch.round(output.squeeze())  
    return pred


inference(net, "I am funny") 


# 

tensor(1., device='cuda:0', grad_fn=<RoundBackward>)

In [None]:
# class NetworkLSTM_(nn.Module):
    
#     def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
#         super().__init__()
        
#         self.Wir = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Whr = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
#         self.Wif = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Whf = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
#         self.Wig = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Whg = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
#         self.Wio = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Who = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
#         self.hidden_node = hidden_node
#         self.layers = layers
        
#         self.embedding = nn.Embedding(n_vocab, n_embed)
#         self.fc = nn.Linear(n_hidden, n_output)
#         self.sigmoid = nn.Sigmoid()
#         self.relu = nn.ReLU()
#         self.tanh = nn.Tanh()
#         self.dropout = nn.Dropout(0.5)
        
#     def forward (self, input_words):                    # => (batch size, sent len)
        
#         embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
#         embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
#         hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
#         c = torch.zeros(input_words.size(0), self.hidden_node).to(device)
        
#         for i in range(input_words.size(1)):           #for i in seq_length

#             ir=embedded_words[i].matmul(self.Wir)
#             hr=hidden.matmul(   self.Whr)
#             r= ir.add(hr)
#             rt = self.sigmoid(r)
            
#             iff=embedded_words[i].matmul(self.Wif)
#             hff=hidden.matmul(   self.Whf)
#             ff= iff.add(hff)
#             fft = self.sigmoid(ff)
            
#             ig=embedded_words[i].matmul(self.Wig)
#             hg=hidden.matmul(   self.Whg)
#             g= ig.add(hg)
#             gt = self.tanh(g)
            
#             io=embedded_words[i].matmul(self.Wio)
#             ho=hidden.matmul(   self.Who)
#             o= io.add(ho)
#             ot = self.sigmoid(o)
            
#             c = fft*c + rt*gt
#             hidden = ot*self.tanh(c)
        
#         out = self.fc(hidden)
        
#         sig = self.sigmoid(out)
#         return sig, hidden
    
# class NetworkGRU_(nn.Module):
    
#     def __init__(self, n_vocab, n_embed, hidden_node, n_output, layers):
#         super().__init__()
        
#         self.Wir = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Whr = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
#         self.Wiz = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Whz = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
#         self.Win = nn.Parameter(torch.randn( (n_embed,hidden_node), requires_grad=True, dtype=torch.float))
#         self.Whn = nn.Parameter(torch.randn( (hidden_node,hidden_node) , requires_grad=True, dtype=torch.float))
        
#         self.hidden_node = hidden_node
#         self.layers = layers
        
#         self.embedding = nn.Embedding(n_vocab, n_embed)
#         self.fc = nn.Linear(n_hidden, n_output)
#         self.sigmoid = nn.Sigmoid()
#         self.relu = nn.ReLU()
#         self.tanh = nn.Tanh()
#         self.dropout = nn.Dropout(0.6)
        
        
#     def forward (self, input_words):                    # => (batch size, sent len)
        
#         embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
#         embedded_words = embedded_words.permute(1,0,2)   #  (seq_length,batch_size,  n_embed)
#         hidden = torch.zeros(input_words.size(0), self.hidden_node).to(device)  # batch-node
        
#         for i in range(input_words.size(1)):           #for i in seq_length

#             ir=embedded_words[i].matmul(self.Wir)
#             hr=hidden.matmul(   self.Whr)
#             r= ir.add(hr)
#             rt = self.sigmoid(r)
            
#             #print(rt.shape)
            
#             iz=embedded_words[i].matmul(self.Wiz)
#             hz=hidden.matmul(   self.Whz)
#             z= iz.add(hz)
#             zt = self.sigmoid(z)
            
#             iN=embedded_words[i].matmul(self.Win)
#             hN=hidden.matmul(   self.Whz)*rt
#             N= iN.add(hN)
#             Nt = self.tanh(N)
            
#             hidden = (1-zt)*Nt + zt*hidden
        
#         out = self.fc(hidden)
        
#         sig = self.sigmoid(out)
#         return sig, hidden