In [76]:
import nltk
import re
import string
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from os import getcwd
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset


In [2]:
nltk.download("twitter_samples")
nltk.download("stopwords")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/surajkarki/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/surajkarki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples

In [4]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [5]:
# split the data into two pieces, one for training and one for testing (validation set) 
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [6]:
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [7]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickets like $GE
    tweet = re.sub(r'\$\w*', ' ', tweet)
    # remove old style retweet text RT
    tweet = re.sub(r'^RT[\s]+', ' ', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', ' ', tweet)
    # remove hash tags
    tweet = re.sub(r'#', '', tweet)
    # instantiate tokenizer class
    tokenizer = TweetTokenizer(
        preserve_case=False, strip_handles=True, reduce_len=True)
    # tokenize tweets
    tweet_tokens = tokenizer.tokenize(tweet)

    clean_tweets = []
    for word in tweet_tokens:
        if (word not in stopwords_english) and (word not in string.punctuation):
            # clean_tweets.append(word)
            stem_word = stemmer.stem(word)
            clean_tweets.append(stem_word)
    return clean_tweets


In [8]:
for i in range(len(train_x)):
    print(process_tweet(train_x[i]))
    break

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [9]:
train_X = []
test_X = []

In [10]:
for i in range(len(train_x)):
    tweet = process_tweet(train_x[i])
    train_X.append(tweet)

In [11]:
print(train_X[0:2])

[['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)'], ['hey', 'jame', 'odd', ':/', 'pleas', 'call', 'contact', 'centr', '02392441234', 'abl', 'assist', ':)', 'mani', 'thank']]


In [12]:
for i in range(len(test_x)):
    tweet = process_tweet(test_x[i])
    test_X.append(tweet)

In [13]:
print(test_X[0:2])

[['bro', 'u', 'wan', 'cut', 'hair', 'anot', 'ur', 'hair', 'long', 'liao', 'bo', 'sinc', 'ord', 'liao', 'take', 'easi', 'lor', 'treat', 'save', 'leav', 'longer', ':)', 'bro', 'lol', 'sibei', 'xialan'], ['back', 'thnx', 'god', "i'm", 'happi', ':)']]


In [15]:
X = train_x + test_x

In [20]:
for i in X:
    x.append(process_tweet(i))
     

In [36]:
words = []
for i in x:
    words += i
    

In [37]:
words[0:3]

['followfriday', 'top', 'engag']

In [40]:
count_words = Counter(words)
total_words=len(words)
sorted_words=count_words.most_common(total_words)

In [44]:
# Create a dictionary to convert words to Integers based on the number of occurrence of the word

vocab_to_int={w:i+1 for i,(w,c) in enumerate(sorted_words)}
#print(vocab_to_int)

In [58]:
def encode(tweets):
    encoded_tweets=list()
    for tweet in tweets:
        encoded_tweet=list()
        for word in tweet:
            if word not in vocab_to_int.keys():
                encoded_tweet.append(0)
            else:
                encoded_tweet.append(vocab_to_int[word])
        encoded_tweets.append(encoded_tweet)
    return encoded_tweets

In [59]:
def pad_sequences(encoded_tweets, sequence_length=100):
    ''' 
    Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features=np.zeros((len(encoded_tweets), sequence_length), dtype=int)
    
    for i, tweet in enumerate(encoded_tweets):
        tweet_len=len(tweet)
        if (tweet_len<=sequence_length):
            zeros=list(np.zeros(sequence_length-tweet_len))
            new=zeros+tweet
        else:
            new=tweet[:sequence_length]
        features[i,:]=np.array(new)
    return features

In [60]:
def preprocess(tweets):
    """
    This Function will tranform tweets in to model readable form
    """
    encoded_tweets=encode(tweets)
    features=pad_sequences(encoded_tweets, 100)
    return features

In [67]:
train_X = preprocess(train_X)
test_X = preprocess(test_X)

In [66]:
train_X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,  408,  257, 1107,  449,  287,   51,
          2])

In [69]:
test_X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,  378,   16, 1696,
        558,  315, 3879,  152,  315,  127, 3994, 2984,  192, 9182, 3994,
         97,  611, 3995,  908,  533,  180,  691,    2,  378,   79, 9183,
       9184])

In [72]:
train_y

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [73]:
test_y

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [77]:
# Create Tensor Dataset

train_data=TensorDataset(torch.from_numpy(train_X), torch.from_numpy(train_y))
valid_data=TensorDataset(torch.from_numpy(test_X), torch.from_numpy(test_y))

In [78]:
# Dataloader
batch_size = 64

train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)

In [79]:
# obtain one batch of training data

data_iter = iter(train_loader)
sample_x, sample_y = data_iter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([64, 100])
Sample input: 
 tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
Sample label size:  torch.Size([64, 1])
Sample label: 
 tensor([[1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0

In [80]:
class Model(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):    
        """
        Initialize the model by setting up the layers
        """
        super().__init__()
        self.output_size=output_size
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim
        
        #Embedding and LSTM layers
        self.embedding=nn.Embedding(vocab_size, embedding_dim)
        self.lstm=nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        
        #dropout layer
        self.dropout=nn.Dropout(0.3)
        
        #Linear and sigmoid layer
        self.fc1=nn.Linear(hidden_dim, 64)
        self.fc2=nn.Linear(64, 16)
        self.fc3=nn.Linear(16,output_size)
        self.sigmoid=nn.Sigmoid()
        
    def forward(self, x):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size=x.size()
        
        #Embedding and LSTM output
        embedd=self.embedding(x)
        lstm_out, hidden=self.lstm(embedd)
        
        #stack up the lstm output(dimension reduction)(batch, sequence_len, emd_dim)
        lstm_out=lstm_out.contiguous().view(-1, self.hidden_dim)
        
        #(batch x sequence_len, emb_dim)
        #dropout and fully connected layers
        out=self.dropout(lstm_out)
        out=self.fc1(out)
        out=self.dropout(out)
        out=self.fc2(out)
        out=self.dropout(out)
        out=self.fc3(out)
        sig_out=self.sigmoid(out)
        
        sig_out=sig_out.view(batch_size, -1)
        sig_out=sig_out[:, -1]
        
        return sig_out, hidden
    

In [81]:
# Instantiate the model w/ hyperparams

vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

In [83]:
net = Model(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

Model(
  (embedding): Embedding(10417, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [84]:
# Train the model
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [88]:
# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

In [89]:
# training params

epochs = 3 
counter = 0
print_every = 100
clip=5 # gradient clipping

In [90]:
net.train()

Model(
  (embedding): Embedding(10417, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [92]:
# train for some number of epochs
for e in range(epochs):
    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()


        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                inputs, labels = inputs.cuda(), labels.cuda()  
                output, val_h = net(inputs)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

KeyboardInterrupt: 