In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import random
import os
import pandas as pd
import numpy as np

import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from TweetDataset import TweetDataset
from vocab import VocabEntry
from convblock import ConvBlock

#from gensim.models import word2vec

SEED = 1234
UNK = '<unk>'
PAD = '<pad>'

TWEET_LEN = 20
EMBED_LEN = 100

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DATA_FOLDER = '~/Local Documents/CS230/Project/Twitter-Sentiment/data/Data-mini/'

train_data = pd.read_csv(DATA_FOLDER + 'train_mini.csv', encoding = 'latin-1')
train_n = train_data.shape[0]
#print (train_data)

dev_data = pd.read_csv(DATA_FOLDER + 'dev_mini.csv', encoding = 'latin-1')
dev_n = dev_data.shape[0]
#print (dev_data)

test_data = pd.read_csv(DATA_FOLDER + 'test_mini.csv', encoding = 'latin-1')
test_n = test_data.shape[0]
#print (test_data)

dataset = pd.concat([train_data, dev_data, test_data])
dataset_n = dataset.shape[0]
#print (dataset)

# Get ground truth x and y values
train_x_raw = train_data.loc[:]["Content"]
train_y = [0.0 if y == 0 else 1.0 for y in train_data.loc[:]["Pos_Neg"]]
#print (train_y)

dev_x_raw = dev_data.loc[:]["Content"]
dev_y = [0.0 if y == 0 else 1.0 for y in dev_data.loc[:]["Pos_Neg"]]
#print (dev_y)

test_x_raw = test_data.loc[:]["Content"]
test_y = [0.0 if y == 0 else 1.0 for y in test_data.loc[:]["Pos_Neg"]]
#print (test_y)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelcai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# # Load character embeddings from pretrained embeddings file char-embeddings.txt, courtesy github user minimaxir
# char_vectors = {}
# i = 0
# tokens = open('./data/metadata.txt').readlines()
# embeddings = open('./data/character-embeddings.txt').readlines()

# for i in range(len(tokens)):
#     token = tokens[i].strip()
#     sp = embeddings[i].strip().split()
#     char_vectors[token] = [float(x) for x in sp]

# print (char_vectors['three'])
# print (len(char_vectors))
# print (len(char_vectors['three']))

char_vectors = {}
for line in open('./glove.6B/glove.6B.100d.txt').readlines():
    sp = line.strip().split()
    if len(sp) == 0: continue
    char_vectors[sp[0]] = [float(x) for x in sp[1:]]
    
print (char_vectors)
print (len(char_vectors))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [3]:
tok2id = {}

for ex in dataset['Content']:
    for w in word_tokenize(ex):
        if w in string.punctuation:
            continue
        if not w in tok2id:
            tok2id[w] = len(tok2id)

tok2id[UNK] = len(tok2id)
tok2id[PAD] = len(tok2id)
# print (tok2id)

In [4]:
def vectorize(examples):
    vec_examples = []
    for ex in examples:
        #print (ex)
        sentence = []
        for w in word_tokenize(ex):
            if w in string.punctuation:
                continue
            if w in tok2id:
                sentence.append(tok2id[w])
        if len(sentence) < TWEET_LEN:
            sentence += [tok2id[PAD] for i in range(TWEET_LEN - len(sentence))]
        else:
            sentence = sentence[:TWEET_LEN]
        vec_examples.append(sentence)
    return vec_examples

In [5]:
train_x = vectorize(train_x_raw)

dev_x = vectorize(dev_x_raw)

test_x = vectorize(test_x_raw)

train_dataset = TweetDataset(train_x, train_y)
dev_dataset = TweetDataset(dev_x, dev_y)
test_dataset = TweetDataset(test_x, test_y)

In [6]:
embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (len(tok2id), EMBED_LEN)), dtype='float32')

for token in tok2id:
    i = tok2id[token]
    if token in char_vectors:
        embeddings_matrix[i] = char_vectors[token]
    elif token.lower() in char_vectors:
        embeddings_matrix[i] = char_vectors[token.lower()]

#print (embeddings_matrix)

In [7]:
# class CNN(nn.Module):
#     def __init__(self, embeddings, n_filters, filter_sizes, output_dim, dropout=0.5):
#         super().__init__()
        
#         self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
#         self.embedding.weight = nn.Parameter(torch.tensor(embeddings))
#         self.convs = nn.ModuleList([nn.Conv1d(in_channels=EMBED_LEN, out_channels=n_filters, kernel_size=fs) for fs in filter_sizes])
        
#         for conv in self.convs:
#             torch.nn.init.xavier_uniform_(conv.weight)
        
#         self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
#         torch.nn.init.xavier_uniform_(self.fc.weight)
        
#         self.sigmoid = nn.Sigmoid()
#         self.dropout = nn.Dropout(dropout)
        
#     def forward(self, x):
        
#         #x = [sent len, batch size]
        
#         x = x.permute(1, 0)
                
#         #x = [batch size, sent len]
        
#         embedded = self.embedding(x).permute(0, 2, 1)
                
#         #embedded = [batch size, emb dim, sent len]
#         print (embedded.shape)
# #         embedded = embedded.unsqueeze(1)
        
#         #embedded = [batch size, 1, sent len, emb dim]
        
#         conved = [F.relu(conv(embedded)) for conv in self.convs]
            
#         #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
#         pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
#         #pooled_n = [batch size, n_filters]
        
#         cat = self.dropout(torch.cat(pooled, dim=1))

#         #cat = [batch size, n_filters * len(filter_sizes)]
            
#         return self.fc(cat)

In [8]:
class DeepCNN(nn.Module):
    def __init__(self, embeddings, dropout=0.5):
        super().__init__()
        
        self.batch_size = embeddings.shape[1]
        
        # Freeze pretrained GloVe embeddings 
        self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        self.embedding.weight = nn.Parameter(torch.tensor(embeddings))
        self.embedding.weight.requires_grad = False
        
        self.conv = nn.Conv1d(in_channels=embeddings.shape[1], out_channels=128, kernel_size=5, padding = 2)
        #torch.nn.init.xavier_uniform_(self.conv.weight)
        self.block = ConvBlock(128, 256)
        self.block2 = ConvBlock(256, 256)
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(5 * 256, 256)
        self.dropout2 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(256, 1)
        #torch.nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x, print_sizes = False):
        # x.shape = (sent_len, batch_size)
        x = x.permute(1, 0)
        if print_sizes: print ("x.shape: " + str(x.shape))
        # x.shape = (batch_size, sent_len)
        
        x_emb = self.embedding(x).permute(0, 2, 1)
        if print_sizes: print ("x_emb.shape: " + str(x_emb.shape))
        # x_emb.shape = (batch_size, emb_size, sent_len)
        
        x_conv = self.conv(x_emb)
        # x_conv.shape = (batch_size, 128, sent_len)
        if print_sizes: print ("x_conv.shape: " + str(x_conv.shape))
        
        x_block = self.block.forward(x_conv)
        # x_block.shape = (batch_size, 128, sent_len / 2)
        if print_sizes: print ("x_block.shape: " + str(x_block.shape))
            
        x_block_2 = self.block2.forward(x_block)
        # x_block.shape = (batch_size, 256, sent_len / 4)
        if print_sizes: print ("x_block_2.shape: " + str(x_block_2.shape))

        x_cat = x_block_2.view(-1, x_block_2.shape[1] * x_block_2.shape[2])
        # x_cat.shape = (batch_size, 128 * sent_len / 2)
        if print_sizes: print ("x_cat.shape: " + str(x_cat.shape))
            
        x_fc = self.fc(self.dropout(x_cat))
        # x_fc.shape = (batch_size, 1)
        if print_sizes: print ("x_fc.shape: " + str(x_fc.shape))
            
        x_fc2 = self.fc2(self.dropout2(x_fc))

        return x_fc2


In [9]:
# model = DeepCNN(embeddings_matrix)
# x = torch.tensor(np.zeros((20, 128)), dtype = torch.long)
# model.forward(x, True)

In [10]:
# n_filters = 100
# filter_sizes = [1, 2, 3, 4]
# output_dim = 1
# dropout = 0.5

# CNN_model = CNN(embeddings_matrix, n_filters, filter_sizes, output_dim, dropout)
CNN_model = DeepCNN(embeddings_matrix)

In [11]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

def train(model, train_loader, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
        
    for batchnum, batch in enumerate(train_loader):
        #print ("Training on batch #" + str(batchnum))
        train_x = torch.stack(batch['content'])
        #print (train_x.shape)
        train_y = batch['label'].float()
        #train_y = batch['label'].long()
        if train_x.shape[1] == 1: continue
        #print (train_y.view(-1).shape)

        predictions = model.forward(train_x).squeeze(1)
        #print (predictions.shape)
        loss = criterion(predictions, train_y)
        # print (loss)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        acc = binary_accuracy(predictions, train_y)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(train_loader), epoch_acc / len(train_loader)

def evaluate(model, dev_loader, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    

    
    model.eval()
    
    with torch.no_grad():
    
        for batchnum, batch in enumerate(dev_loader):
            dev_x = torch.stack(batch['content'])
            #print (train_x)
            dev_y = batch['label'].float()
            
            
            predictions = model(dev_x).squeeze(1)
            #print (torch.round(predictions))
            
            loss = criterion(predictions, dev_y)
            
            acc = binary_accuracy(predictions, dev_y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(dev_loader), epoch_acc / len(dev_loader)

In [12]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(CNN_model.parameters())

criterion = nn.BCEWithLogitsLoss()

CNN_model = CNN_model.to(device)

criterion = criterion.to(device)

N_EPOCHS = 40

train_loader = DataLoader(train_dataset,
                      batch_size=128,
                      shuffle=True,
                      num_workers=4
                     # pin_memory=True # CUDA only
                     )
    
dev_loader = DataLoader(dev_dataset,
                  batch_size=128,
                  shuffle=False,
                  num_workers=4
                 # pin_memory=True # CUDA only
                 )

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(CNN_model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(CNN_model, dev_loader, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.947 | Train Acc: 49.84% | Val. Loss: 0.799 | Val. Acc: 54.13% |
| Epoch: 02 | Train Loss: 0.834 | Train Acc: 49.71% | Val. Loss: 0.890 | Val. Acc: 47.15% |
| Epoch: 03 | Train Loss: 0.808 | Train Acc: 49.72% | Val. Loss: 1.659 | Val. Acc: 52.40% |
| Epoch: 04 | Train Loss: 0.828 | Train Acc: 50.47% | Val. Loss: 0.861 | Val. Acc: 50.72% |
| Epoch: 05 | Train Loss: 0.828 | Train Acc: 50.20% | Val. Loss: 0.708 | Val. Acc: 50.32% |
| Epoch: 06 | Train Loss: 0.824 | Train Acc: 49.23% | Val. Loss: 0.803 | Val. Acc: 50.78% |
| Epoch: 07 | Train Loss: 0.819 | Train Acc: 51.68% | Val. Loss: 1.349 | Val. Acc: 58.49% |
| Epoch: 08 | Train Loss: 0.800 | Train Acc: 54.55% | Val. Loss: 0.835 | Val. Acc: 57.93% |
| Epoch: 09 | Train Loss: 0.769 | Train Acc: 52.96% | Val. Loss: 0.804 | Val. Acc: 48.73% |
| Epoch: 10 | Train Loss: 0.827 | Train Acc: 51.69% | Val. Loss: 0.663 | Val. Acc: 60.94% |
| Epoch: 11 | Train Loss: 0.774 | Train Acc: 55.85% | Val. Loss: 1.264 | Val. Ac

In [13]:
test_loader = DataLoader(test_dataset,
                      batch_size=128,
                      shuffle=True,
                      num_workers=4
                     # pin_memory=True # CUDA only
                     )

test_loss, test_acc = evaluate(CNN_model, test_loader, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')


| Test Loss: 1.440 | Test Acc: 50.57% |
