In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import time
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

In [None]:
# Loads the pre-trained GloVe embeddings (word2vec text format) into `glove`.
# It can take anywhere from 15 to 100 seconds depending on your computer.

path = "glove.6B.50d.txt.w2v"
# perf_counter() is a monotonic clock, so the measured duration cannot be skewed
# by system clock adjustments the way a difference of time.time() values can.
t0 = time.perf_counter()
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.perf_counter()
print("elapsed %ss" % (t1 - t0))
# 50d: elapsed 17.67420792579651s
# 100d: (not recorded)

In [None]:
# Sanity check on the loaded embeddings: membership test for a known word.
# Raises NameError if the loading cell was not run (so `glove` is undefined);
# otherwise the cell is expected to display True.
'meme' in glove

In [None]:
# Optional: peek at the raw test data to see how it is formatted before the processing.
# THIS CELL DOES NOT NEED TO BE RUN.

# Read the archive inside a `with` block (as the label-loading cell does) so the
# underlying .npz file handle is closed as soon as the array has been read.
with np.load('./data/test_twitter_data.npz') as archive:
    play_with_test_data = archive["arr_0"]
play_with_test_data[100:110]

In [None]:
#Imports the training and testing data and formats them into usable arrays.

# NOTE(review): new_preprocess comes from the project-local `preprocess` module, which is
# not shown here. It is called with an .npz path and returns a pair, unpacked below as
# (data, max); presumably the second item is the longest tweet length -- TODO confirm.
from preprocess import new_preprocess

x_test, test_max = new_preprocess("./data/test_twitter_data.npz")
x_train, train_max = new_preprocess("./data/train_twitter_data.npz")

# Labels are read from the "arr_0" entry of each archive and cast to int;
# the training cell later wraps them in torch.LongTensor.
with np.load("./data/train_twitter_label.npz") as f:
    y_train = f["arr_0"].astype(int)

with np.load("./data/test_twitter_label.npz") as f:
    y_test = f["arr_0"].astype(int)

In [None]:
#Allows you to see how the data is currently formatted

# First preprocessed training example.
print(x_train[0])
# Its label, printed after an empty line.
print('\n', y_train[0])
# Number of training examples.
print(len(x_train))

In [None]:
class Model(nn.Module):
    '''
    Gated recurrent classifier.

    A hidden state of size dim_recurrent is updated once per sequence step and
    the final hidden state is mapped to dim_output scores. Per step t:

        candidate = relu(dense1(x_t) + dense2(h))
        z         = sigmoid(dense3(x_t) + dense4(h))
        h         = z * h + (1 - z) * candidate
    '''

    def __init__(self, dim_input = 50, dim_recurrent = 100, dim_output = 2):

        '''
        Initializes the model.

        INPUTS:
            dim_input - The dimensionality of the input at each step.
                Defaults to 50, the size of the word embeddings.
            dim_recurrent - The dimensionality of the hidden (recurrent) state.
                This is a hyperparameter. Defaults to 100.
            dim_output - The number of scores produced per input sequence.
                Defaults to 2.

        OUTPUTS:
            None
        '''
        #Initializes model as a pytorch object
        super(Model, self).__init__()

        #Initializes internal variables
        self.C = dim_input
        self.D = dim_recurrent
        self.K = dim_output

        #Initializes the internal layers of the network.
        #dense1/dense2 build the candidate state, dense3/dense4 build the gate,
        #dense5 maps the final hidden state to the output scores.
        #The hidden-to-hidden layers carry no bias; the input layers already have one.
        self.dense1 = nn.Linear(dim_input, dim_recurrent)
        self.dense2 = nn.Linear(dim_recurrent, dim_recurrent, bias = False)
        self.dense3 = nn.Linear(dim_input, dim_recurrent)
        self.dense4 = nn.Linear(dim_recurrent, dim_recurrent, bias = False)
        self.dense5 = nn.Linear(dim_recurrent, dim_output)

    def forward(self, x):

        '''
        Runs the recurrence over a batch of sequences and returns one row of
        scores per sequence.

        INPUT:
            x - tensor indexed as (N, dim_input, T); step i reads x[:, :, i].
                Every step is processed, including any zero padding.

        OUTPUT:
            predictions - tensor of shape (N, dim_output) holding raw scores
                (no softmax is applied here). For T == 0 this is dense5
                applied to the all-zero initial state.
        '''

        #Initial hidden state. It is created with the input's dtype and device so
        #the model keeps working after .double() / .to(device); a plain
        #torch.zeros(...) would always be a CPU float32 tensor.
        hidden = torch.zeros(len(x), self.D, dtype = x.dtype, device = x.device)

        #Processes each step of the sequence
        for i in range(x.shape[2]):
            row = x[:, :, i]

            #Candidate state from the current input and the previous hidden state
            candidate = F.relu(self.dense1(row) + self.dense2(hidden))

            #Update gate; torch.sigmoid replaces the deprecated F.sigmoid alias
            z = torch.sigmoid(self.dense3(row) + self.dense4(hidden))

            #z close to 1 keeps the old state, z close to 0 takes the candidate
            hidden = z * hidden + (1 - z) * candidate

        #Converts the final hidden state to predictions.
        return self.dense5(hidden)

In [None]:
def process(tweets, dim = 50):
    '''
    Takes in a batch of tweets and formats them for training.

    INPUT:
        tweets - batch of tweets to process; each tweet is a sequence of words.
        dim (optional) - size of the word embeddings stored in the global
            `glove`. Defaults to 50.

    OUTPUT:
        ret - float tensor of shape (len(tweets), longest tweet length, dim).
            Position [n, x] holds the embedding of word x of tweet n. Words
            missing from `glove` and positions past the end of a shorter tweet
            stay all-zero. An empty batch gives a (0, 0, dim) tensor.
    '''
    #default = 0 keeps an empty batch from raising ValueError inside max()
    longest = max((len(tweet) for tweet in tweets), default = 0)
    ret = torch.zeros((len(tweets), longest, dim))

    for n, tweet in enumerate(tweets):
        for x, word in enumerate(tweet):
            #Out-of-vocabulary words keep their zero vector
            if word in glove:
                ret[n, x] = torch.tensor(glove[word])
    return ret

In [None]:
def accuracy(pred, truth):
    '''
    Fraction of rows in pred whose highest-scoring index (along the last
    dimension) equals the corresponding entry of truth.

    Returns a 0-dimensional float tensor.
    '''
    hits = (torch.argmax(pred, dim = -1) == truth)
    return hits.type(torch.FloatTensor).mean()

In [None]:
#Initializes the network.

# Model() is built with its defaults: dim_input=50, dim_recurrent=100, dim_output=2.
net = Model()
from torch.optim import Adam

# Adam over all of the network's parameters, using the optimizer's default settings.
optim = Adam(net.parameters())

In [None]:
#Creates a cool liveplot!

# NOTE(review): `liveplot` is not defined in this notebook. The returned `plotter` is used by
# the training cell through set_train_batch / set_test_batch / plot_train_epoch / plot_test_epoch.
# Two metrics are tracked: "loss" and "accuracy".
%matplotlib notebook
import liveplot
plotter, fig, ax = liveplot.create_plot(metrics=["loss", "accuracy"], refresh=5)

In [None]:
#This code saves the model, then reopens it as a newNet.
#Does not run when shift-tabbing through the data due to the liveplot.
#
# NOTE: the file captures `net` exactly as it is when this cell runs. Run top to bottom,
# that is before the training cell, so re-run this cell after training to store the
# trained weights that sentiment() later loads from 'sentnet.dat'.

from pickle import dump, load

# Context managers guarantee the file is flushed and closed before it is read back,
# and that neither handle is leaked.
with open('sentnet.dat', 'wb') as model_file:
    dump(net, model_file)

with open('sentnet.dat', 'rb') as model_file:
    newNet = load(model_file)

In [None]:
# Trains `net` for 10 epochs with mini-batches, then evaluates on the test set after
# every epoch. Per-batch loss and accuracy are fed to the live plot.

batch_size = 100

# Cross-entropy on the raw scores returned by the network.
soft = nn.CrossEntropyLoss()

for epoch_cnt in range(10):

    # Fresh random visiting order for the training examples each epoch.
    # np.random.shuffle works in place and returns None, so its result is not assigned.
    idxs = np.arange(len(x_train))
    np.random.shuffle(idxs)

    # Integer division: a trailing partial batch is skipped.
    for batch_cnt in range(len(x_train) // batch_size):

        # Slice the permutation once and reuse it for both inputs and labels.
        batch_ids = idxs[batch_cnt * batch_size : (batch_cnt + 1) * batch_size]

        # process() returns (N, T, 50); the model steps over the last axis,
        # so swap to (N, 50, T).
        batch = process([x_train[i] for i in batch_ids])
        batch = torch.transpose(batch, 1, 2)

        truth = torch.LongTensor([y_train[i] for i in batch_ids])

        prediction = net(batch)
        loss = soft(prediction, truth)

        optim.zero_grad()
        loss.backward()
        optim.step()

        # Accuracy of the predictions made before this step's weight update.
        acc = accuracy(prediction, truth)

        plotter.set_train_batch({"loss" : loss.item(),
                                 "accuracy" : acc.item()},
                                 batch_size=batch_size)

    # Evaluation pass: no gradients are needed and the test set is visited in its
    # stored order (shuffling would not change the aggregated metrics).
    with torch.no_grad():
        for batch_cnt in range(len(x_test) // batch_size):
            batch_indices = slice(batch_cnt * batch_size, (batch_cnt + 1) * batch_size)

            batch = process(x_test[batch_indices])
            batch = torch.transpose(batch, 1, 2)

            truth = torch.LongTensor(y_test[batch_indices])

            prediction = net(batch)
            loss = soft(prediction, truth)
            acc = accuracy(prediction, truth)

            plotter.set_test_batch({"loss" : loss.item(),
                                     "accuracy" : acc.item()},
                                     batch_size=batch_size)
    plotter.plot_train_epoch()
    plotter.plot_test_epoch()

In [None]:
from pickle import load

def sentiment(sentence, path = 'sentnet.dat'):
    '''
    Decides whether or not a string has happy sentiment.

    INPUTS:
        sentence - string to be analyzed; it is split on whitespace into words.
        path (optional) - string with the path to the pickled model.
            Defaults to 'sentnet.dat'.

    OUTPUT:
        sentiment (int) - index of the largest score the model produces for the
            sentence: 0 if the sentiment is negative, 1 if it's positive.
    '''

    # SECURITY NOTE: unpickling executes code stored in the file, so only point
    # `path` at model files you created yourself.
    # The `with` block closes the file handle instead of leaking it.
    with open(path, 'rb') as model_file:
        net = load(model_file)

    # NOTE(review): words are looked up exactly as typed. If the training data was
    # normalised (e.g. lower-cased) by new_preprocess, do the same here -- confirm.
    sentence = process([sentence.split()])

    # process() returns (1, T, 50); the model steps over the last axis.
    sentence = torch.transpose(sentence, 1, 2)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        return torch.argmax(net(sentence)).item()

In [None]:
# Example call using the model pickled at the default path 'sentnet.dat'.
# Displays 1 for positive or 0 for negative; a trained model is expected to give 1 here.
#It knows that loving my mom is good! Woo-hoo!
sentiment('I love my mom')