In [82]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import random
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

#from gensim.models import word2vec

SEED = 1234
UNK = '<UNK>'
PAD = '<PAD>'

TWEET_LEN = 20

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DATA_FOLDER = '~/Local Documents/CS230/Project/Twitter-Sentiment/data/Data-mini/'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelcai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [83]:
# Load train, dev, and test sets

In [84]:
train_data = pd.read_csv(DATA_FOLDER + 'train_mini.csv', encoding = 'latin-1')
train_n = train_data.shape[0]
print (train_data)

dev_data = pd.read_csv(DATA_FOLDER + 'dev_mini.csv', encoding = 'latin-1')
dev_n = dev_data.shape[0]
#print (dev_data)

test_data = pd.read_csv(DATA_FOLDER + 'test_mini.csv', encoding = 'latin-1')
test_n = test_data.shape[0]
#print (test_data)

dataset = pd.concat([train_data, dev_data, test_data])
dataset_n = dataset.shape[0]
#print (dataset)

       Unnamed: 0  Pos_Neg          ID                          Date  \
0               0        0  1467810369  Mon Apr 06 22:19:45 PDT 2009   
1               1        0  1467815988  Mon Apr 06 22:21:09 PDT 2009   
2               2        0  1467823851  Mon Apr 06 22:23:09 PDT 2009   
3               3        0  1467836500  Mon Apr 06 22:26:28 PDT 2009   
4               4        0  1467841832  Mon Apr 06 22:27:55 PDT 2009   
5               5        0  1467853356  Mon Apr 06 22:30:54 PDT 2009   
6               6        0  1467859820  Mon Apr 06 22:32:36 PDT 2009   
7               7        0  1467871661  Mon Apr 06 22:35:41 PDT 2009   
8               8        0  1467876652  Mon Apr 06 22:37:03 PDT 2009   
9               9        0  1467882491  Mon Apr 06 22:38:37 PDT 2009   
10             10        0  1467894600  Mon Apr 06 22:41:51 PDT 2009   
11             11        0  1467899753  Mon Apr 06 22:43:18 PDT 2009   
12             12        0  1467909222  Mon Apr 06 22:45:53 PDT 

In [85]:
# Define custome dataset class for twitter data

In [86]:
class TweetDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, X, Y):
        self.x = X
        self.y = Y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        content = self.x[idx]
        label = self.y[idx]
        sample = {'content': content, 'label': label}

        return sample

In [87]:
# Get ground truth x and y values
train_x_raw = train_data.loc[:]["Content"]
train_y = [0.0 if y == 0 else 1.0 for y in train_data.loc[:]["Pos_Neg"]]
#print (train_y)

dev_x_raw = dev_data.loc[:]["Content"]
dev_y = [0.0 if y == 0 else 1.0 for y in dev_data.loc[:]["Pos_Neg"]]
#print (dev_y)

test_x_raw = test_data.loc[:]["Content"]
test_y = [0.0 if y == 0 else 1.0 for y in test_data.loc[:]["Pos_Neg"]]
#print (test_y)

# Load word embeddings from pretrained embeddings file en-cw.txt, courtesy CS224N
# For final, use Word2Vec embeddings, but for now this should suffice
word_vectors = {}
for line in open('./data/en-cw.txt').readlines():
    sp = line.strip().split()
    word_vectors[sp[0]] = [float(x) for x in sp[1:]]

In [88]:
# Generate token IDs from full dataset

In [89]:
tok2id = {}

for ex in dataset['Content']:
    for w in word_tokenize(ex):
        if w in string.punctuation:
            continue
        if not w in tok2id:
            tok2id[w] = len(tok2id)

tok2id[UNK] = len(tok2id)
tok2id[PAD] = len(tok2id)
# print (tok2id)

In [90]:
# Vectorize datasets with token ids

In [91]:
def vectorize(examples):
    vec_examples = []
    for ex in examples:
        #print (ex)
        sentence = []
        for w in word_tokenize(ex):
            if w in string.punctuation:
                continue
            if w in tok2id:
                sentence.append(tok2id[w])
        if len(sentence) < TWEET_LEN:
            sentence += [tok2id[PAD] for i in range(TWEET_LEN - len(sentence))]
        else:
            sentence = sentence[:TWEET_LEN]
        vec_examples.append(sentence)
    return vec_examples

In [92]:
# Vectorize train, dev, and test sets

In [93]:
train_x = vectorize(train_x_raw)

dev_x = vectorize(dev_x_raw)

test_x = vectorize(test_x_raw)

train_dataset = TweetDataset(train_x, train_y)
dev_dataset = TweetDataset(dev_x, dev_y)
test_dataset = TweetDataset(test_x, test_y)



In [94]:
# Generate embeddings matrix

In [95]:
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (len(tok2id), 50)), dtype='float32')

for token in tok2id:
    i = tok2id[token]
    if token in word_vectors:
        embeddings_matrix[i] = word_vectors[token]
    elif token.lower() in word_vectors:
        embeddings_matrix[i] = word_vectors[token.lower()]

#print (embeddings_matrix)

In [96]:
# CNN class definition, courtesy https://github.com/bentrevett/pytorch-sentiment-analysis

In [97]:
class CNN(nn.Module):
    def __init__(self, embeddings, n_filters, filter_sizes, output_dim, dropout=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        self.embedding.weight = nn.Parameter(torch.tensor(embeddings))
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embeddings.shape[1])) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        x = x.permute(1, 0)
                
        #x = [batch size, sent len]
        
        embedded = self.embedding(x)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim=1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [98]:
# RNN class definition

In [99]:
class RNN(nn.Module):
    def __init__(self, embeddings, hidden_dim, n_layers, output_dim, bidirectional = True, dropout = 0.5):
        super().__init__()
        self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        self.embedding.weight = nn.Parameter(torch.tensor(embeddings))
        self.rnn = nn.LSTM(embeddings.shape[1], hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [None]:
# Initialize RNN Model

In [None]:
# hidden_dim = 256
# output_dim = 1
# n_layers = 2

# RNN_model = RNN(embeddings_matrix, hidden_dim, n_layers, output_dim)

In [None]:
# Initialize the model

In [None]:
n_filters = 200
filter_sizes = [1, 2, 3, 4]
output_dim = 1
dropout = 0.5

CNN_model = CNN(embeddings_matrix, n_filters, filter_sizes, output_dim, dropout)

In [None]:
# Set optimizer and criterion (if GPU is available)

In [None]:
# Binary accuracy score, i.e. percentage correct per batch

In [None]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [None]:
# Define training function

In [None]:
def train(model, train_loader, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    loss_func = nn.CrossEntropyLoss()
    
    for batchnum, batch in enumerate(train_loader):
        #print ("Training on batch #" + str(batchnum))
        train_x = torch.stack(batch['content'])
        #print (train_x)
        train_y = batch['label'].float()
        #train_y = batch['label'].long()
        if train_x.shape[1] == 1: continue
        #print (train_y.view(-1).shape)
        
        predictions = model.forward(train_x).squeeze(1)
        #print (predictions.shape)
        loss = criterion(predictions, train_y)
        # print (loss)
        loss.backward()
        optimizer.step()
        acc = binary_accuracy(predictions, train_y)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(train_loader), epoch_acc / len(train_loader)

In [None]:
# train_loader = DataLoader(train_dataset,
#                       batch_size=40,
#                       shuffle=True,
#                       num_workers=4
#                      # pin_memory=True # CUDA only
#                      )

# train(RNN_model, enumerate(train_loader), optimizer, criterion)

In [None]:
# Define evaluation function

In [None]:
def evaluate(model, dev_loader, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    

    
    model.eval()
    
    with torch.no_grad():
    
        for batchnum, batch in enumerate(dev_loader):
            dev_x = torch.stack(batch['content'])
            #print (train_x)
            dev_y = batch['label'].float()
            
            
            predictions = model(dev_x).squeeze(1)
            #print (predictions)
            
            loss = criterion(predictions, dev_y)
            
            acc = binary_accuracy(predictions, dev_y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(dev_loader), epoch_acc / len(dev_loader)

In [None]:
#evaluate(model, dev_dataset, criterion)

In [None]:
# Train CNN Model over 10 epochs

In [None]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(CNN_model.parameters())

criterion = nn.BCEWithLogitsLoss()

CNN_model = CNN_model.to(device)

criterion = criterion.to(device)

N_EPOCHS = 10

train_loader = DataLoader(train_dataset,
                      batch_size=64,
                      shuffle=True,
                      num_workers=4
                     # pin_memory=True # CUDA only
                     )
    
dev_loader = DataLoader(dev_dataset,
                  shuffle=False,
                  num_workers=4
                 # pin_memory=True # CUDA only
                 )

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(CNN_model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(CNN_model, dev_loader, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 15.092 | Train Acc: 51.66% | Val. Loss: 31.181 | Val. Acc: 50.10% |
| Epoch: 02 | Train Loss: 75.332 | Train Acc: 60.70% | Val. Loss: 112.320 | Val. Acc: 57.13% |
| Epoch: 03 | Train Loss: 136.276 | Train Acc: 71.26% | Val. Loss: 260.590 | Val. Acc: 59.24% |
| Epoch: 04 | Train Loss: 134.067 | Train Acc: 78.23% | Val. Loss: 742.478 | Val. Acc: 61.95% |
| Epoch: 05 | Train Loss: 128.239 | Train Acc: 81.71% | Val. Loss: 2106.801 | Val. Acc: 59.64% |
| Epoch: 06 | Train Loss: 151.424 | Train Acc: 83.67% | Val. Loss: 4782.700 | Val. Acc: 58.53% |


In [None]:
# Evaluate model on test set

In [None]:
test_loader = DataLoader(test_dataset,
                      shuffle=True,
                      num_workers=4
                     # pin_memory=True # CUDA only
                     )

test_loss, test_acc = evaluate(CNN_model, test_loader, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')


In [141]:
# Train RNN Model over 5 epochs (SLOW)

In [None]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(RNN_model.parameters())

criterion = nn.BCEWithLogitsLoss()

RNN_model = RNN_model.to(device)

criterion = criterion.to(device)

N_EPOCHS = 5 # Since the RNN is so slow

train_loader = DataLoader(train_dataset,
                      batch_size=40,
                      shuffle=True,
                      num_workers=4
                     # pin_memory=True # CUDA only
                     )
    
dev_loader = DataLoader(dev_dataset,
                  batch_size=40,
                  shuffle=False,
                  num_workers=4
                 # pin_memory=True # CUDA only
                 )

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(RNN_model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(RNN_model, dev_loader, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')