In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import pandas as pd
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [2]:
tweets_df = pd.read_csv('./training.1600000.processed.noemoticon.csv', engine='python', header=None, encoding='latin1')
tweets_df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
tweets_df[0].value_counts()

0
0    800000
4    800000
Name: count, dtype: int64

In [4]:
tweets_df['sentiment_cat'] = tweets_df[0].astype('category')
tweets_df['sentiment'] = tweets_df['sentiment_cat'].cat.codes

In [5]:
tweets_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,sentiment_cat,sentiment
832206,4,1557555594,Sun Apr 19 04:48:51 PDT 2009,NO_QUERY,JamesHancox,@Courageous_one Awww... she just wants to be c...,4,1
585102,0,2215511752,Wed Jun 17 18:14:30 PDT 2009,NO_QUERY,valsan71,@dannygokey Iam so sad I missed your chat sees...,0,0
323916,0,2005830414,Tue Jun 02 10:14:57 PDT 2009,NO_QUERY,ChelseySyrnyk,No sleep last night. And my mind is still a cl...,0,0
1141358,4,1977169998,Sat May 30 20:16:46 PDT 2009,NO_QUERY,HaasDesigns,HaasDesignsis finally home for the night. Wher...,4,1
838953,4,1559257355,Sun Apr 19 10:40:27 PDT 2009,NO_QUERY,LindsayWhite,@paulaabdul have a great sunday! Have fun danc...,4,1
1411945,4,2056571878,Sat Jun 06 11:23:26 PDT 2009,NO_QUERY,xosaraa,@JessChristine13 I know I have a problem,4,1
29278,0,1562360062,Sun Apr 19 19:47:55 PDT 2009,NO_QUERY,brucemjackson,@BeckyBuckwild wonder if u told him about the ...,0,0
1550088,4,2183696539,Mon Jun 15 14:51:40 PDT 2009,NO_QUERY,vortexsquid,Yeah i ended up ordering the dvd AND watching ...,4,1
1005565,4,1880524238,Fri May 22 01:33:04 PDT 2009,NO_QUERY,freeek0804,@AshleyLTMSYF http://twitpic.com/5o7al - You l...,4,1
675519,0,2248119186,Fri Jun 19 20:07:50 PDT 2009,NO_QUERY,tifanguyen,#dontyouhate exams?,0,0


In [6]:
tweets_df.to_csv('train-processed.csv', header=None, index=None)
tweets_df.sample(10000).to_csv('train-processed-sample.csv', header=None, index=None)

In [7]:
nlp = spacy.load('en_core_web_sm')

def tokenize(tweet) -> list:
    return [word.text.lower() for word in nlp(tweet)]


def build_vocab(tweets, min_freq=1):
    counter = Counter()
    for tweet in tweets:
        counter.update(tweet)

    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    
    return vocab

def collate_fn(batch):
    tweets, labels = zip(*batch)
    tweets = pad_sequence(tweets, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)

    return tweets, labels

class TweetDataset(Dataset):
    def __init__(self, tweets, labels, vocab=None) -> None:
        self.tokens = [tokenize(tweet) for tweet in tweets]
        self.labels = labels
        self.vocab = vocab or build_vocab(self.tokens)
        self.data = [self.token_to_num(tokens) for tokens in self.tokens]
    
    def token_to_num(self, tokens):
        return [self.vocab.get(token, self.vocab["<unk>"]) for token in tokens]

    def __len__(self) -> int:
        return len(self.labels)
    
    def __getitem__(self, index):
        return torch.tensor(self.data[index]), torch.tensor(self.labels[index])

In [8]:

batch_size = 64
num_workers = 0
sample_tweets_df = pd.read_csv('./train-processed-sample.csv', header=None)

tweets = sample_tweets_df.loc[:, 5].values
labels = sample_tweets_df.loc[:, 7].values

tweets_train, tweets_temp, labels_train, labels_temp = train_test_split(
    tweets, labels, test_size=0.3, random_state=42, stratify=labels
)
tweets_val, tweets_test, labels_val, labels_test = train_test_split(
    tweets_temp, labels_temp, test_size=0.5, random_state=42, stratify=labels_temp
)

train_data = TweetDataset(tweets=tweets_train, labels=labels_train)
val_data = TweetDataset(tweets=tweets_val, labels=labels_val)
test_data = TweetDataset(tweets=tweets_test, labels=labels_test)

train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=64, num_workers=num_workers, collate_fn=collate_fn)
val_loader = DataLoader(dataset=val_data, batch_size=64, num_workers=num_workers, collate_fn=collate_fn)
test_loader = DataLoader(dataset=test_data, batch_size=64, num_workers=num_workers, collate_fn=collate_fn)

In [9]:
class SentimentLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_size, vocab_size):
        super(SentimentLSTM, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.predictor = nn.Linear(in_features=hidden_size, out_features=2)

    def forward(self, seq):
        output, (hidden, _) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds



In [10]:
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

In [11]:
sentiment_lstm = SentimentLSTM(100, 300, 20002)
sentiment_lstm.to(device)

SentimentLSTM(
  (embedding): Embedding(20002, 100)
  (encoder): LSTM(100, 300, batch_first=True)
  (predictor): Linear(in_features=300, out_features=2, bias=True)
)

In [12]:
optimizer = optim.Adam(params=sentiment_lstm.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [13]:
def train(model=sentiment_lstm, train_loader=train_loader, val_loader=val_loader, loss_fn=criterion, optimizer=optimizer, device=device, epochs=100):
    for epoch in range(1, epochs+1):
        train_loss = 0.0
        val_loss = 0.0
        model.train()
        for inputs, target in train_loader:
            optimizer.zero_grad()
            inputs = inputs.to(device)
            target = target.to(device)
            output = model(inputs)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
        train_loss /= len(train_loader)

        num_correct = 0.0
        num_examples = 0.0
        model.eval()
        for inputs, target in val_loader:
            inputs = inputs.to(device)
            target = target.to(device)
            output = model(inputs)
            loss = loss_fn(output, target)

            correct = torch.eq(torch.max(F.softmax(output), dim=1)[1], target).view(-1)
            num_correct += torch.sum(correct).item()
            num_examples += correct.shape[0]
            val_loss += loss.item()
        val_loss /= len(val_loader)

        print(f"Epoch: {epoch}, Train Loss: {train_loss:.2f}, Validation Loss: {val_loss:.2f}, Accuracy: {(num_correct/num_examples):.2f}")
train()

  correct = torch.eq(torch.max(F.softmax(output), dim=1)[1], target).view(-1)


Epoch: 1, Train Loss: 0.70, Validation Loss: 0.69, Accuracy: 0.50
Epoch: 2, Train Loss: 0.69, Validation Loss: 0.69, Accuracy: 0.50
Epoch: 3, Train Loss: 0.68, Validation Loss: 0.72, Accuracy: 0.50
Epoch: 4, Train Loss: 0.60, Validation Loss: 0.81, Accuracy: 0.48
Epoch: 5, Train Loss: 0.49, Validation Loss: 0.86, Accuracy: 0.51
Epoch: 6, Train Loss: 0.39, Validation Loss: 0.99, Accuracy: 0.51
Epoch: 7, Train Loss: 0.29, Validation Loss: 1.31, Accuracy: 0.51
Epoch: 8, Train Loss: 0.19, Validation Loss: 1.39, Accuracy: 0.52
Epoch: 9, Train Loss: 0.12, Validation Loss: 1.78, Accuracy: 0.52
Epoch: 10, Train Loss: 0.07, Validation Loss: 2.19, Accuracy: 0.52
Epoch: 11, Train Loss: 0.06, Validation Loss: 2.23, Accuracy: 0.52
Epoch: 12, Train Loss: 0.04, Validation Loss: 2.11, Accuracy: 0.50
Epoch: 13, Train Loss: 0.04, Validation Loss: 2.69, Accuracy: 0.52
Epoch: 14, Train Loss: 0.03, Validation Loss: 2.77, Accuracy: 0.52
Epoch: 15, Train Loss: 0.03, Validation Loss: 2.59, Accuracy: 0.52
Epoc

In [14]:
def evaluate(model, dataloader):
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    return correct / total

In [16]:
evaluate(model=sentiment_lstm, dataloader=test_loader)

0.5173333333333333