In [199]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import random
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

#from gensim.models import word2vec

SEED = 1234
UNK = '<UNK>'
PAD = '<PAD>'

TWEET_LEN = 30

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DATA_FOLDER = '~/Local Documents/CS230/Project/Twitter-Sentiment/data/Data-mini/'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelcai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [200]:
# Load train, dev, and test sets

In [201]:
train_data = pd.read_csv(DATA_FOLDER + 'train_mini.csv', encoding = 'latin-1')
train_n = train_data.shape[0]
#print (train_data)

dev_data = pd.read_csv(DATA_FOLDER + 'dev_mini.csv', encoding = 'latin-1')
dev_n = dev_data.shape[0]
#print (dev_data)

test_data = pd.read_csv(DATA_FOLDER + 'test_mini.csv', encoding = 'latin-1')
test_n = test_data.shape[0]
#print (test_data)

dataset = pd.concat([train_data, dev_data, test_data])
dataset_n = dataset.shape[0]
#print (dataset)

In [202]:
# Define custome dataset class for twitter data

In [203]:
class TweetDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, X, Y):
        self.x = X
        self.y = Y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        content = self.x[idx]
        label = self.y[idx]
        sample = {'content': content, 'label': label}

        return sample

In [239]:
# Get ground truth x and y values
train_x_raw = train_data.loc[:]["Content"]
train_y = [0.0 if y == 0 else 1.0 for y in train_data.loc[:]["Pos_Neg"]]
#print (train_y)

dev_x_raw = dev_data.loc[:]["Content"]
dev_y = [0.0 if y == 0 else 1.0 for y in dev_data.loc[:]["Pos_Neg"]]
#print (dev_y)

test_x_raw = test_data.loc[:]["Content"]
test_y = [0.0 if y == 0 else 1.0 for y in test_data.loc[:]["Pos_Neg"]]
#print (test_y)

# Load word embeddings from pretrained embeddings file en-cw.txt, courtesy CS224N
# For final, use Word2Vec embeddings, but for now this should suffice
word_vectors = {}
for line in open('./data/en-cw.txt').readlines():
    sp = line.strip().split()
    word_vectors[sp[0]] = [float(x) for x in sp[1:]]

In [205]:
# Generate token IDs from full dataset

In [206]:
tok2id = {}

for ex in dataset['Content']:
    for w in word_tokenize(ex):
        if w in string.punctuation:
            continue
        if not w in tok2id:
            tok2id[w] = len(tok2id)

tok2id[UNK] = len(tok2id)
tok2id[PAD] = len(tok2id)
# print (tok2id)

In [207]:
# Vectorize datasets with token ids

In [208]:
def vectorize(examples):
    vec_examples = []
    for ex in examples:
        #print (ex)
        sentence = []
        for w in word_tokenize(ex):
            if w in string.punctuation:
                continue
            if w in tok2id:
                sentence.append(tok2id[w])
        if len(sentence) < TWEET_LEN:
            sentence += [tok2id[PAD] for i in range(TWEET_LEN - len(sentence))]
        else:
            sentence = sentence[:TWEET_LEN]
        vec_examples.append(sentence)
    return vec_examples

In [209]:
# Vectorize train, dev, and test sets

In [210]:
train_x = vectorize(train_x_raw)

dev_x = vectorize(dev_x_raw)

test_x = vectorize(test_x_raw)

train_dataset = TweetDataset(train_x, train_y)
dev_dataset = TweetDataset(dev_x, dev_y)
test_dataset = TweetDataset(test_x, test_y)



In [211]:
# Generate embeddings matrix

In [212]:
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (len(tok2id), 50)), dtype='float32')

for token in tok2id:
    i = tok2id[token]
    if token in word_vectors:
        embeddings_matrix[i] = word_vectors[token]
    elif token.lower() in word_vectors:
        embeddings_matrix[i] = word_vectors[token.lower()]

#print (embeddings_matrix)

In [213]:
# CNN class definition, courtesy https://github.com/bentrevett/pytorch-sentiment-analysis

In [227]:
class CNN(nn.Module):
    def __init__(self, embeddings, n_filters, filter_sizes, output_dim, dropout=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        self.embedding.weight = nn.Parameter(torch.tensor(embeddings))
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embeddings.shape[1])) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        x = x.permute(1, 0)
                
        #x = [batch size, sent len]
        
        embedded = self.embedding(x)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim=1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        fc = self.fc(cat)
        
        return self.sigmoid(fc)

In [228]:
# Initialize the model

In [229]:
n_filters = 100
filter_sizes = [3,4,5]
output_dim = 1
dropout = 0.5

model = CNN(embeddings_matrix, n_filters, filter_sizes, output_dim, dropout)

In [230]:
# Set optimizer and criterion (if GPU is available)

In [231]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)


In [232]:
# Binary accuracy score, i.e. percentage correct per batch

In [233]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [234]:
# Define training function

In [319]:
def train(model, train_dataset, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    train_loader = DataLoader(train_dataset,
                          batch_size=30,
                          shuffle=True,
                          num_workers=4
                         # pin_memory=True # CUDA only
                         )
    
    loss_func = nn.CrossEntropyLoss()
    
    for batchnum, batch in enumerate(train_loader):
        train_x = torch.stack(batch['content'])
        #print (train_x)
        train_y = batch['label'].reshape([batch['label'].shape[0], 1]).float()
        #train_y = batch['label'].long()
        #print (train_x.shape)
        #print (train_y.view(-1).shape)
        
        predictions = model.forward(train_x)
        # print (logits)
        loss = criterion(predictions, train_y)
        # print (loss)
        loss.backward()
        optimizer.step()
        acc = binary_accuracy(predictions, train_y)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(train_loader), epoch_acc / len(train_loader)

In [320]:
#train(model, train_dataset, optimizer, criterion)

In [321]:
# Define evaluation function

In [326]:
def evaluate(model, dev_loader, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    

    
    model.eval()
    
    with torch.no_grad():
    
        for batchnum, batch in enumerate(dev_loader):
            #print (batch)
            dev_x = torch.stack(batch['content'])
            #print (train_x)
            dev_y = batch['label'].float()
            
            
            predictions = model(dev_x).squeeze(1)
            #print (predictions)
            
            loss = criterion(predictions, dev_y)
            
            acc = binary_accuracy(predictions, dev_y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(dev_loader), epoch_acc / len(dev_loader)

In [327]:
#evaluate(model, dev_dataset, criterion)

In [328]:
# Train over 5 epochs

In [329]:
N_EPOCHS = 5

dev_loader = DataLoader(dev_dataset,
                  batch_size=30,
                  shuffle=False,
                  num_workers=4
                 # pin_memory=True # CUDA only
                 )

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_dataset, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dev_loader, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.693 | Train Acc: 49.87% | Val. Loss: 0.693 | Val. Acc: 49.02% |
| Epoch: 02 | Train Loss: 0.693 | Train Acc: 50.37% | Val. Loss: 0.693 | Val. Acc: 49.02% |
| Epoch: 03 | Train Loss: 0.693 | Train Acc: 49.53% | Val. Loss: 0.693 | Val. Acc: 49.02% |
| Epoch: 04 | Train Loss: 0.693 | Train Acc: 50.04% | Val. Loss: 0.693 | Val. Acc: 49.02% |
| Epoch: 05 | Train Loss: 0.693 | Train Acc: 49.87% | Val. Loss: 0.693 | Val. Acc: 49.02% |
