In [1]:
import torchtext
torchtext.__version__

'0.16.1+cpu'

In [2]:
import torch
from torchtext import data
import pandas as pd
from torch import nn

In [3]:
SEED = 20

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.__version__, device)

2.1.0+cu118 cuda


In [19]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 10.1 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [42]:
import spacy
eng = spacy.load("en_core_web_sm") 

Data was retrieved from [kaggle](https://www.kaggle.com/datasets/kazanova/sentiment140/).

In [43]:
import torchdata.datapipes as dp
from nltk.stem.snowball  import SnowballStemmer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T

FILE_PATH = "Sentiment140/training.1600000.processed.noemoticon.csv"

# create our DataPipe which is an iterable of filenames
data_pipe = dp.iter.IterableWrapper([FILE_PATH])

# open the file and parse the CSV; returns an iterable of tuples
data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter=",", as_tuple=True)



In [44]:
# what does a row look like?
for row in data_pipe:
    print(row)
    break

('0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY', '_TheSpecialOne_', "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D")


In [45]:
# we will only be working on the first & last columns
# i.e. the (text, label) 
data_pipe = data_pipe.map(lambda row: (row[len(row) - 1], row[0]))

In [46]:
# our preprocessing will consist of stemming the word
stemmer = SnowballStemmer("english", ignore_stopwords=True)

def get_preprocessed_tokens(text, preprocess = lambda word: stemmer.stem(word)):
    """
    Tokenize a text & preprocess each token using preprocess (function) 
    and return a list of tokens
    """
    return [preprocess(token.text) for token in eng.tokenizer(text)]

def preprocessed_tokens_iter(data_pipe, preprocess = lambda word: stemmer.stem(word)):
    for text, _ in data_pipe:
        yield get_preprocessed_tokens(text, preprocess)

In [36]:
# build our vocab
vocab = build_vocab_from_iterator(
    preprocessed_tokens_iter(data_pipe),
    min_freq=2,

    # special tokens include passing, start & end of sentence, & unknown
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)

# set the default token; if word is not found, use <unk>
vocab.set_default_index(vocab['<unk>'])

print(vocab.get_itos()[:10])

['<pad>', '<sos>', '<eos>', '<unk>', 'i', '!', '.', ' ', 'to', 'the']


In [47]:
text_tranform = T.Sequential(
    # converts the sequence to indices based on given vocabulary
    T.VocabTransform(vocab=vocab),
    
    # add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
    # 1 because index for <sos> in vocab
    T.AddToken(1, begin=True),

    # add <eos> at beginning of each sentence
    # 2 because index for <eos> in vocab
    T.AddToken(2, begin=False)
)

In [48]:
# what does our data look like?
for original_sequence, label in data_pipe:
    itos = vocab.get_itos()
    transformed_sequence = text_tranform(get_preprocessed_tokens(original_sequence))
    preprocessed_sequence = [itos[i] for i in transformed_sequence]
    print(f"Label:                 {label}")
    print(f"Original Sequence:     {original_sequence}")
    print(f"Preprocessed Sequence: {' '.join(preprocessed_sequence)}")
    print(f"Transformed Sequence:  {transformed_sequence}")
    break

Label:                 0
Original Sequence:     @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
Preprocessed Sequence: <sos> @switchfoot <unk> - awww , that 's a bummer .   you shoulda got david carr of third day to do it . ;d <eos>
Transformed Sequence:  [1, 28448, 3, 34, 498, 10, 23, 22, 11, 1197, 6, 7, 15, 3250, 56, 866, 8024, 21, 1837, 41, 8, 31, 13, 6, 2009, 2]


In [49]:
# apply our transformation to our pipe
def apply_transform(row):
    text, label = row
    return (
        text_tranform(get_preprocessed_tokens(text)), 
        0 if int(label) == 0 else 1
    )
data_pipe = data_pipe.map(apply_transform)

# what does a row look like?
for row in data_pipe:
    print(row)
    break

([1, 28448, 3, 34, 498, 10, 23, 22, 11, 1197, 6, 7, 15, 3250, 56, 866, 8024, 21, 1837, 41, 8, 31, 13, 6, 2009, 2], 0)


In [62]:
batch_data_pipe = data_pipe.shuffle(buffer_size=1600000).bucketbatch(
    batch_size=32,
    batch_num=1,
    bucket_num=100,
    use_in_batch_shuffle=False,
)

# what does a row look like?
for row in batch_data_pipe:
    print(row)
    break

[([1, 1635, 7, 14, 8343, 44, 7347, 2], 0), ([1, 211, 21, 44, 3778, 5, 4, 97, 11, 459, 6, 180, 25, 350, 6, 7, 371, 1112, 6, 4, 43, 309, 3, 6, 7, 14, 12, 189753, 3, 2], 1), ([1, 15, 72, 63, 4, 83, 53, 81, 55, 69, 509, 217, 103, 17, 7, 636, 1414, 5, 7, 193, 715, 4, 76, 23, 17, 7, 136, 2], 1), ([1, 92083, 563, 17, 5, 194, 917, 10, 4, 73, 2], 1), ([1, 33, 56, 68, 71, 3, 7, 106, 414, 8, 11, 202, 184, 5, 2], 1), ([1, 50, 4, 67, 16, 8, 29, 103, 10, 375, 1652, 43, 102, 46, 1297, 10, 14, 1461, 87, 39776, 63, 4, 125, 27, 125, 8, 31, 16, 40, 44, 424, 775, 268, 2], 0), ([1, 110765, 4, 33, 85, 23, 1162, 35, 5000, 532, 18, 2], 0), ([1, 50, 44, 233, 81, 41107, 14, 24097, 22, 693, 16, 98, 26, 6273, 21, 417, 14, 3, 22, 693, 537, 34, 1850, 2043, 513, 2], 1), ([1, 4, 51, 44, 466, 3045, 13, 22, 388, 7, 136, 2], 1), ([1, 35, 12, 1699, 9465, 10, 887, 2360, 3, 115, 84, 46, 277, 5, 810, 2], 1), ([1, 61789, 31480, 9764, 86918, 2], 0), ([1, 2189, 10, 4, 75, 65, 15, 183, 42, 1854, 42, 28, 65, 15, 2], 1), ([1, 3, 

In [74]:
def apply_paddings(row):
    text, label = row
    return (T.ToTensor(0)(text), label)
batch_padded_data_pipe = batch_data_pipe.map(apply_transform)

# what does a row look like?
for row in batch_data_pipe:
    print(row)
    break


KeyboardInterrupt: 

In [70]:
T.ToTensor(0)(["test","test"])

TypeError: Input type not supported

In [3]:
df = pd.read_csv(FILE_PATH, header=None, encoding='latin-1')
header_names = ["label", "ids", "date", "flag", "user", "text"]
df.columns = header_names

# the dataset actually only contains labels with 0 and 4
# this will be simplified with 0 for negative and 1 for positive
df['label'] = df['label'].apply(lambda value: 1 if value != 0 else 0)
df = df.sample(frac=1.0, random_state=SEED).head(1000)

# check the distributions
print(f"negative examples: {df['label'].eq(0).sum()}")
print(f"positive examples: {df['label'].eq(1).sum()}")

negative examples: 491
positive examples: 509


In [15]:
# data field for text (text is sequential and to lower)
TEXT = data.Field(sequential=True, batch_first=True, lower=True, pad_token='<pad>', fix_length=32)

# data field for label (label is a float between 0 and 1)
LABEL = data.LabelField(dtype=torch.float)

# create examples by iterating over DataFrame
examples = []
for _, row in df.iterrows():
    # get text
    text = row['text']

     # convert to float
    label = float(row['label'])
    examples.append(data.Example.fromlist([text, label], fields=[('text', TEXT), ('label', LABEL)]))

# split into train, validation, and test sets
train_data, valid_data, test_data = data.Dataset(examples, fields=[('text', TEXT), ('label', LABEL)]).split(split_ratio=[0.7, 0.15, 0.15])

# build the vocab (`min_freq=2` will leave out the words that appear only once)
TEXT.build_vocab(train_data, min_freq=2)
vocab_size = len(TEXT.vocab)

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    device='cuda' if torch.cuda.is_available() else 'cpu')


In [14]:
LABEL.vocab

AttributeError: 'LabelField' object has no attribute 'vocab'

In [16]:
# what does the data look like?
for batch in train_iter:
    text = batch.text
    label = batch.label
    
    # take the first embedding in the batch
    text_vector = text.tolist()[0] 
    
    # take the first label in the batch 
    label_vector = label.tolist()[0]
    
    # convert embedding back to words
    text_words = ' '.join([TEXT.vocab.itos[i] for i in text_vector])
    
    # print the vectorized text and label for the first example in the batch
    print(f"text_vector = {text_vector}")
    print(f"text_words = \"{text_words}\"")
    print(f"label = {label_vector}")
    break


AttributeError: 'LabelField' object has no attribute 'vocab'

In [9]:
import RNN

model = RNN.Classifier(embedding_size=128, hidden_size=128, output_size=1, num_layers=1, vocab_size=vocab_size)

# loss and optimizer
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# move to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

RNN.train(model=model, optimizer=optimizer, loss_fn=loss_fn, train_iter=train_iter, val_iter=val_iter, num_epochs=1000)


Epoch 1, Average Loss: 73.18%, Validation Accuracy: 51.33%
Epoch 2, Average Loss: 69.84%, Validation Accuracy: 51.33%
Epoch 3, Average Loss: 69.32%, Validation Accuracy: 51.33%
Epoch 4, Average Loss: 69.69%, Validation Accuracy: 52.00%
Epoch 5, Average Loss: 69.42%, Validation Accuracy: 51.33%
Epoch 6, Average Loss: 69.94%, Validation Accuracy: 51.33%
Epoch 7, Average Loss: 71.93%, Validation Accuracy: 52.00%
Epoch 8, Average Loss: 71.17%, Validation Accuracy: 51.33%
Epoch 9, Average Loss: 70.15%, Validation Accuracy: 51.33%
Epoch 10, Average Loss: 69.46%, Validation Accuracy: 48.67%
Epoch 11, Average Loss: 69.73%, Validation Accuracy: 51.33%
Epoch 12, Average Loss: 69.73%, Validation Accuracy: 51.33%
Epoch 13, Average Loss: 69.62%, Validation Accuracy: 48.67%
Epoch 14, Average Loss: 69.22%, Validation Accuracy: 51.33%
Epoch 15, Average Loss: 69.50%, Validation Accuracy: 48.67%
Epoch 16, Average Loss: 69.08%, Validation Accuracy: 51.33%
Epoch 17, Average Loss: 69.36%, Validation Accura

KeyboardInterrupt: 

In [10]:
import LSTM

model = LSTM.Classifier(embedding_size=128, hidden_size=128, output_size=1, num_layers=1, vocab_size=vocab_size)

# loss and optimizer
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# move to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

LSTM.train(model=model, optimizer=optimizer, loss_fn=loss_fn, train_iter=train_iter, val_iter=val_iter, num_epochs=10)

Epoch 1, Average Loss: 70.59%, Validation Accuracy: 48.67%
Epoch 2, Average Loss: 69.72%, Validation Accuracy: 51.33%
Epoch 3, Average Loss: 69.68%, Validation Accuracy: 48.67%
Epoch 4, Average Loss: 69.34%, Validation Accuracy: 51.33%
Epoch 5, Average Loss: 69.40%, Validation Accuracy: 50.67%
Epoch 6, Average Loss: 69.32%, Validation Accuracy: 51.33%
Epoch 7, Average Loss: 69.41%, Validation Accuracy: 52.67%
Epoch 8, Average Loss: 69.04%, Validation Accuracy: 56.67%
Epoch 9, Average Loss: 68.38%, Validation Accuracy: 55.33%
Epoch 10, Average Loss: 66.89%, Validation Accuracy: 52.67%
