In [1]:
import random

import pandas as pd
import torch
from torch import nn
from torchtext import data

In [2]:
# Location of the Sentiment140 CSV, relative to the notebook's working directory.
FILE_PATH = "Sentiment140/training.1600000.processed.noemoticon.csv"

# Single seed shared by every source of randomness in this notebook.
SEED = 20

# Seed the global RNGs up front so that "Restart Kernel -> Run All" repeats the
# same weight initialisation and shuffles. pandas sampling is given SEED
# explicitly where it is used; Python's `random` and PyTorch keep global state
# and therefore have to be seeded here.
# NOTE(review): legacy torchtext appears to draw its split/shuffle order from
# Python's `random` state by default - confirm for the installed version.
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.__version__, device)

2.0.1+cu118 cuda


The data is the Sentiment140 tweet dataset, retrieved from [Kaggle](https://www.kaggle.com/datasets/kazanova/sentiment140/).

In [3]:
# Load the raw CSV. The file is read with header=None, so the column names
# are supplied directly at read time.
header_names = ["label", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(FILE_PATH, header=None, names=header_names, encoding='latin-1')

# the dataset actually only contains labels with 0 and 4
# this will be simplified with 0 for negative and 1 for positive
# (vectorised comparison: same result as a per-row lambda, without running
# Python code once per row)
df['label'] = df['label'].ne(0).astype('int64')

# Shuffle reproducibly and keep a 1000-row subset so the rest of the notebook
# runs quickly.
df = df.sample(frac=1.0, random_state=SEED).head(1000)

# check the distributions
print(f"negative examples: {df['label'].eq(0).sum()}")
print(f"positive examples: {df['label'].eq(1).sum()}")

negative examples: 491
positive examples: 509


In [4]:
# data field for text (text is sequential and to lower)
# fix_length=32 asks the field for fixed-length sequences, using '<pad>' as filler
TEXT = data.Field(sequential=True, batch_first=True, lower=True, pad_token='<pad>', fix_length=32)

# data field for label (label is a float between 0 and 1)
LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

# (name, field) pairs shared by every Example and by the Dataset itself
example_fields = [('text', TEXT), ('label', LABEL)]

# create one Example per DataFrame row; iterating the two columns directly
# avoids building a Series object for every row as `iterrows` does.
# Labels are converted to float to match LABEL's dtype.
examples = [
    data.Example.fromlist([text, float(label)], fields=example_fields)
    for text, label in zip(df['text'], df['label'])
]

# split into train, validation, and test sets
# NOTE(review): legacy torchtext documents a three-element split_ratio as
# (train, test, valid); both hold-out ratios are equal here, so the order
# makes no difference - keep that in mind before changing them.
train_data, valid_data, test_data = data.Dataset(examples, fields=example_fields).split(split_ratio=[0.7, 0.15, 0.15])

# build the vocab (`min_freq=2` will leave out the words that appear only once)
TEXT.build_vocab(train_data, min_freq=2)
vocab_size = len(TEXT.vocab)

# batch the three splits on the device chosen in the configuration cell,
# instead of deriving the device a second time here
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    device=device)


In [5]:
# what does the data look like?
# Pull a single batch from the training iterator and inspect its first example.
batch = next(iter(train_iter), None)

if batch is not None:
    text, label = batch.text, batch.label

    # vocabulary indices of the first text in the batch
    text_vector = text.tolist()[0]

    # label belonging to that same first example
    label_vector = label.tolist()[0]

    # map the indices back to their tokens via the vocabulary
    text_words = ' '.join(TEXT.vocab.itos[index] for index in text_vector)

    # show the index sequence, its tokens, and the label
    print(f"text_vector = {text_vector}")
    print(f"text_words = \"{text_words}\"")
    print(f"label = {label_vector}")


text_vector = [8, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
text_words = "is <unk> <unk> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>"
label = 1.0


In [6]:
import RNN

# Classifier from the project-local RNN module, sized by the keyword
# arguments below; vocab_size comes from the vocabulary built earlier.
model = RNN.Classifier(embedding_size=128, hidden_size=128, output_size=1, num_layers=1, vocab_size=vocab_size)

# move to the device chosen in the configuration cell (GPU when available);
# re-deriving it here would rebind the global `device` to a plain string
model.to(device)

# loss and optimizer
# NOTE(review): BCELoss expects probabilities in [0, 1]; presumably the
# Classifier ends in a sigmoid - confirm in RNN.py.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

RNN.train(model=model, optimizer=optimizer, loss_fn=loss_fn, train_iter=train_iter, val_iter=val_iter, num_epochs=1)


Epoch 1, Average Loss: 70.58%, Validation Accuracy: 47.33%


In [7]:
import LSTM

# Classifier from the project-local LSTM module, sized by the keyword
# arguments below; vocab_size comes from the vocabulary built earlier.
model = LSTM.Classifier(embedding_size=128, hidden_size=128, output_size=1, num_layers=1, vocab_size=vocab_size)

# move to the device chosen in the configuration cell (GPU when available);
# re-deriving it here would rebind the global `device` to a plain string
model.to(device)

# loss and optimizer
# NOTE(review): BCELoss expects probabilities in [0, 1]; presumably the
# Classifier ends in a sigmoid - confirm in LSTM.py.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

LSTM.train(model=model, optimizer=optimizer, loss_fn=loss_fn, train_iter=train_iter, val_iter=val_iter, num_epochs=1)

Epoch 1, Average Loss: 70.04%, Validation Accuracy: 47.33%
