In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import random
import os
import pandas as pd
import numpy as np

import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from TweetDataset import TweetDataset
from vocab import VocabEntry

#from gensim.models import word2vec

SEED = 1234
UNK = '<unk>'
PAD = '<pad>'

TWEET_LEN = 20
EMBED_LEN = 300

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DATA_FOLDER = '~/Local Documents/CS230/Project/Twitter-Sentiment/data/Data-mini/'

train_data = pd.read_csv(DATA_FOLDER + 'train_mini.csv', encoding = 'latin-1')
train_n = train_data.shape[0]
#print (train_data)

dev_data = pd.read_csv(DATA_FOLDER + 'dev_mini.csv', encoding = 'latin-1')
dev_n = dev_data.shape[0]
#print (dev_data)

test_data = pd.read_csv(DATA_FOLDER + 'test_mini.csv', encoding = 'latin-1')
test_n = test_data.shape[0]
#print (test_data)

dataset = pd.concat([train_data, dev_data, test_data])
dataset_n = dataset.shape[0]
#print (dataset)

# Get ground truth x and y values
train_x_raw = train_data.loc[:]["Content"]
train_y = [0.0 if y == 0 else 1.0 for y in train_data.loc[:]["Pos_Neg"]]
#print (train_y)

dev_x_raw = dev_data.loc[:]["Content"]
dev_y = [0.0 if y == 0 else 1.0 for y in dev_data.loc[:]["Pos_Neg"]]
#print (dev_y)

test_x_raw = test_data.loc[:]["Content"]
test_y = [0.0 if y == 0 else 1.0 for y in test_data.loc[:]["Pos_Neg"]]
#print (test_y)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelcai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load character embeddings from pretrained embeddings file char-embeddings.txt, courtesy github user minimaxir
char_vectors = {}
i = 0
for line in open('./data/char-embeddings.txt').readlines():
    sp = line.split()
    if len(sp) == 0: continue
    if i == 32: 
        char_vectors['\n'] = [float(x) for x in sp]
    elif i == 35: 
        char_vectors[' '] = [float(x) for x in sp]
    else: 
        char_vectors[sp[0]] = [float(x) for x in sp[1:]]
    i += 1
    
# print (char_vectors)
# print (len(char_vectors))

In [3]:
# Generate char IDs from full dataset

char2id = {}

for w in char_vectors:
    char2id[w] = len(char2id)

char2id[UNK] = len(char2id)
char2id[PAD] = len(char2id)

PAD_IDX = len(char2id) - 1

print (char2id)

ve = VocabEntry()
ve.char2id = char2id

{'Y': 0, 'F': 1, 'd': 2, 'N': 3, 'h': 4, '(': 5, 'C': 6, 'w': 7, '@': 8, 'V': 9, 'z': 10, 'Z': 11, 'e': 12, 'i': 13, 'y': 14, 'T': 15, 'q': 16, '+': 17, 'O': 18, 'l': 19, ']': 20, '8': 21, '[': 22, 'u': 23, "'": 24, 'H': 25, '.': 26, '}': 27, 'f': 28, ';': 29, '|': 30, 'M': 31, '\n': 32, '*': 33, 'c': 34, ' ': 35, '"': 36, '/': 37, '7': 38, 'A': 39, ')': 40, 'D': 41, 'S': 42, 'o': 43, '5': 44, 'x': 45, 'R': 46, 'W': 47, 'Q': 48, '&': 49, '6': 50, '!': 51, '?': 52, '9': 53, 'K': 54, 'U': 55, 't': 56, 'P': 57, 'g': 58, '3': 59, 's': 60, 'J': 61, 'I': 62, 'B': 63, '#': 64, '^': 65, '0': 66, 'E': 67, 'p': 68, 'r': 69, '_': 70, 'k': 71, 'm': 72, ',': 73, '4': 74, 'v': 75, 'G': 76, 'n': 77, 'a': 78, 'X': 79, '2': 80, 'L': 81, '1': 82, '~': 83, '%': 84, '-': 85, '{': 86, 'j': 87, '$': 88, ':': 89, 'b': 90, '<unk>': 91, '<pad>': 92}


In [4]:
def vectorize(examples):
    vec_examples = []
    for ex in examples:
        #print (ex)
        sentence = [w for w in ex.strip().split()]
#         for w in word_tokenize(ex):
#             if w in string.punctuation:
#                 continue
#             sentence.append(w)
        vec_examples.append(sentence)
    return vec_examples

In [5]:
# Vectorize train, dev, and test sets and convert to TweetDataset objects

#train_x = ve.to_input_tensor_char(vectorize(train_x_raw), torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

#dev_x = ve.to_input_tensor_char(vectorize(dev_x_raw), torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

#test_x = ve.to_input_tensor_char(vectorize(test_x_raw), torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

#print(dev_x.shape)

train_dataset = TweetDataset(train_x_raw, train_y)
dev_dataset = TweetDataset(dev_x_raw, dev_y)
test_dataset = TweetDataset(test_x_raw, test_y)
print (train_dataset)

<TweetDataset.TweetDataset object at 0x10d8e83c8>


In [6]:
# Build embeddings matrix

embeddings_matrix = np.asarray(np.random.normal(0, 0.9, (len(char2id), EMBED_LEN)), dtype='float32')

for char in char2id:
    i = char2id[char]
    if char in char_vectors:
        embeddings_matrix[i] = char_vectors[char]

# print (embeddings_matrix.shape)

In [7]:
class CNN(nn.Module):
    def __init__(self, embeddings, n_filters, filter_sizes, output_dim, dropout=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        #torch.nn.init.xavier_uniform_(self.embedding.weight)
        self.embedding.weight = nn.Parameter(torch.tensor(embeddings))
        self.embedding.weight.requires_grad = False
        self.convs = nn.ModuleList([nn.Conv1d(in_channels=EMBED_LEN, out_channels=n_filters, kernel_size=(fs,embeddings.shape[1])) for fs in filter_sizes])
        
        for conv in self.convs:
            torch.nn.init.xavier_uniform_(conv.weight)
        
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        torch.nn.init.xavier_uniform_(self.fc.weight)
        
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        print (x.shape)
        # x.shape = (sent_len, batch_size, word_len)
        
        #x = [sent len, batch size]
        
#         x = x.permute(1, 0)
                
#         #x = [batch size, sent len]
        
        embedded = self.embedding(x)
        print (embedded.shape)
                
#         #embedded = [batch size, sent len, emb dim]
        
#         embedded = embedded.unsqueeze(1)
        
#         #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
#         #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
#         pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
#         #pooled_n = [batch size, n_filters]
        
#         cat = self.dropout(torch.cat(pooled, dim=1))

#         #cat = [batch size, n_filters * len(filter_sizes)]
            
#         return self.fc(cat)
        return None

In [8]:
# class CNN(nn.Module):
#     def __init__(self, embeddings, num_filters, k = 5, stride = 1, dropout = 0.5):
#         """ Initialize a CNN network with a kernel of size k,  """
#         super().__init__()

#         self.input_size = embeddings.shape[1]
#         self.num_filters = num_filters
#         self.k = k
#         self.stride = stride
        
#         self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
#         #torch.nn.init.xavier_uniform_(self.embedding.weight)
#         self.embedding.weight = nn.Parameter(torch.tensor(embeddings))
#         self.embedding.weight.requires_grad = False
        
#         self.conv = nn.Conv1d(in_channels = embeddings.shape[1], out_channels = num_filters, kernel_size = k, bias = True)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(dropout)

#         #self.pool = nn.MaxPool1d(kernel_size = k, stride = stride)

#     def forward(self, x, print_sizes = False) -> torch.Tensor:
#         """ Input x of size (max_sent_len, batch_size, max_word_length) and output of size [batchsize, embed_size] """
#         #x = x.permute(1, 3, 0, 2)
#         x_emb = self.embedding(x).permute(0, 1, 3, 2)
        
#         cnn_output = torch.stack([my_conv.forward(sent) for sent in x_emb])

#         x_conv = self.relu(self.conv(x_emb))
#         if print_sizes: print (x_conv.shape)

#         x_pool, x_pool_indices = torch.max(x_conv, dim = 2)
#         # print (x_pool)
#         if print_sizes: print(x_pool.shape)

#         print (x_pool.shape)
#         return x_pool

In [9]:
n_filters = 100
filter_sizes = [1, 2, 3, 4]
output_dim = 1
dropout = 0.5

CNN_model = CNN(embeddings_matrix, n_filters, filter_sizes, output_dim, dropout)

In [10]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
   # print (rounded_preds)
   # print (y)
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [11]:
def train(model, train_loader, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    loss_func = nn.CrossEntropyLoss()
    
    for batchnum, batch in enumerate(train_loader):
        #print ("Training on batch #" + str(batchnum))
#         print (batch)
        #train_x = torch.stack(batch['content'])
        train_x = ve.to_input_tensor_char(batch['content'], device)
        print (train_x.shape)
        #print (train_x.shape)
        train_y = batch['label'].float()
        #train_y = batch['label'].long()
        if train_x.shape[1] == 1: continue
        #print (train_y.view(-1).shape)

        predictions = model.forward(train_x).squeeze(1)
        #print (predictions.shape)
        loss = criterion(predictions, train_y)
        # print (loss)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        acc = binary_accuracy(predictions, train_y)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(train_loader), epoch_acc / len(train_loader)

In [12]:
def evaluate(model, dev_loader, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    

    
    model.eval()
    
    with torch.no_grad():
    
        for batchnum, batch in enumerate(dev_loader):
            dev_x = torch.stack(batch['content'])
            #print (train_x)
            dev_y = batch['label'].float()
            
            
            predictions = model(dev_x).squeeze(1)
            print (torch.round(torch.sigmoid(predictions)))
            
            loss = criterion(predictions, dev_y)
            
            acc = binary_accuracy(predictions, dev_y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(dev_loader), epoch_acc / len(dev_loader)

In [13]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(CNN_model.parameters())

criterion = nn.BCEWithLogitsLoss()

CNN_model = CNN_model.to(device)

criterion = criterion.to(device)

N_EPOCHS = 10

train_loader = DataLoader(train_dataset,
                      batch_size=128,
                      shuffle=True,
                      num_workers=4
                     # pin_memory=True # CUDA only
                     )
    
dev_loader = DataLoader(dev_dataset,
                  batch_size=128,
                  shuffle=False,
                  num_workers=4
                 # pin_memory=True # CUDA only
                 )

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(CNN_model, train_loader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(CNN_model, dev_loader, criterion, device)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

torch.Size([21, 128, 21])
torch.Size([21, 128, 21, 300])


RuntimeError: Given groups=1, weight of size [100, 300, 1, 300], expected input[21, 128, 21, 300] to have 300 channels, but got 128 channels instead