In [1]:
import re
import torch
import numpy as np
import torch.nn as nn
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.autograd as autograd
from torch.utils.data import Dataset, DataLoader

In [2]:
quotes = []
tags = []


with open("training.txt", "r", encoding='utf-8') as f:
    content = f.read()
    for match in re.finditer(r'“(.*?)”', content):
        quotes.append(match.group().strip("”").strip("“").strip("?,.!''"))
    for match in re.finditer(r'tag: (.*)\n', content):
        tags.append(match.group(1).strip("\n"))

print(tags)
print(quotes)

['heart, importance, inspiration, secret', 'reminding', 'caring, inspirational, love, roses, time ', 'inspirational', 'inspirational ', 'friendship, love, responsibility', 'adults, children, philosophy, the-little-prince ', 'butterflies, caterpillars, philosophy', 'mystery, tears', 'caring, inspirational, love, roses, uniqueness  ', 'inspirational , heart ', 'responsibility  ', 'loneliness  ', 'love  ', 'live-well-love-much-love-often', 'grief', 'beauty', 'flowers, love, roses ', 'inspirational, fault', 'creativity, inspirational  ', 'needs, world, uniqueness', 'childlike-wonder, maturity  ', 'existentialism  ', 'loneliness  ', 'existentialism    ', 'heart, time  ', 'inspirational, philosophy', 'love, friendship ', 'love, friendship     ', 'flowers, importance, longing, love, protection, roses, sheep, stars, thorns, uniqueness, war  ', 'inspirational, lovely  ', 'love, confession', 'sad', 'desert, silence', 'afternoon, anticipation, friend, happy, visit ', 'conceit, praise ', 'determin

In [3]:
vocab = list(set(word for quote in quotes for word in quote.split()))
word_to_index = {word: i for i, word in enumerate(vocab)}
print(len(vocab))
sentences_indices = [
    [word_to_index[word] for word in quote.split()]
    for quote in quotes
]
print(sentences_indices)

740
[[641, 632, 178, 535, 161, 646, 569, 510, 422, 518, 534, 535, 615, 32, 728, 560, 345, 90, 604, 645, 11, 273, 535, 633, 535, 322, 221, 728, 408], [564, 614, 48, 736, 424, 146, 615, 191, 63, 386, 712, 142], [534, 535, 728, 229, 255, 122, 47, 737, 79, 30, 345, 567, 79, 30, 376, 94], [521, 373, 375, 721, 475, 728, 388, 441, 672, 362, 590, 499, 187, 115, 108, 32, 728, 560], [74, 567, 728, 302, 250, 603, 728, 538, 369, 167, 345, 553, 142, 673, 569, 640], [529, 355, 575, 737, 125, 74, 724, 345, 16, 281, 329, 535, 349, 42, 251, 514, 500, 603, 728, 49, 148, 289, 221, 194, 8, 194, 670, 383, 603, 728, 49, 36, 106, 255, 115, 211, 490, 663, 282, 569, 538, 260, 86, 535, 608, 416, 569, 525, 387, 151, 538, 162, 641, 529, 122, 469, 131, 63, 173, 641, 13, 334, 79, 114, 122, 469, 131, 63, 244, 442, 255, 529, 355, 490, 663, 282, 569, 31, 416, 569, 525, 387, 151, 372, 482, 679, 255, 394, 106, 396, 318, 412, 131, 660, 157, 442, 106, 255, 76, 672, 233, 475, 248, 728, 627, 442, 13, 529, 412, 672, 233, 475

In [4]:
all_tags = []
for tag in tags: 
    clean_tags = [t.strip() for t in tag.split(",")]
    all_tags.append(clean_tags)

unique_words = set(word for sublist in all_tags for word in sublist)
print(len(unique_words))
word_to_idx = {word: idx for idx, word in enumerate(unique_words)}

def one_hot_encode(words, word_to_idx):
    one_hot = torch.zeros(len(word_to_idx))
    for word in words:
        one_hot[word_to_idx[word]] = 1
    return one_hot

one_hot_tags = []
for tag in tags: 
    tags_one_quote = tag.split(",") #['heart', ' importance', ' inspiration', ' secret'] tags for 1 quote
    one_hot = torch.zeros(len(word_to_idx))
    for tag in tags_one_quote:
        tag = tag.strip()
        one_hot[word_to_idx[tag]] = 1
    one_hot_tags.append(one_hot)
print(one_hot_tags[0])

59
tensor([0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0.])


In [5]:
glove = GloVe(name="6B", dim=100)

tokenized_quotes = []
tokenizer = get_tokenizer("basic_english")
for quote in quotes:
    tokenized_quotes.append(tokenizer(quote))
print(tokenized_quotes)

[['and', 'now', 'here', 'is', 'my', 'secret', ',', 'a', 'very', 'simple', 'secret', 'it', 'is', 'only', 'with', 'the', 'heart', 'that', 'one', 'can', 'see', 'rightly', 'what', 'is', 'essential', 'is', 'invisible', 'to', 'the', 'eye'], ['all', 'grown-ups', 'were', 'once', 'children', '.', '.', '.', 'but', 'only', 'few', 'of', 'them', 'remember', 'it'], ['it', 'is', 'the', 'time', 'you', 'have', 'wasted', 'for', 'your', 'rose', 'that', 'makes', 'your', 'rose', 'so', 'important'], ['the', 'most', 'beautiful', 'things', 'in', 'the', 'world', 'cannot', 'be', 'seen', 'or', 'touched', ',', 'they', 'are', 'felt', 'with', 'the', 'heart'], ['what', 'makes', 'the', 'desert', 'beautiful', ',', "'", 'said', 'the', 'little', 'prince', ',', "'", 'is', 'that', 'somewhere', 'it', 'hides', 'a', 'well'], ['i', 'am', 'looking', 'for', 'friends', '.', 'what', 'does', 'that', 'mean', '--', 'tame', '?', 'it', 'is', 'an', 'act', 'too', 'often', 'neglected', ',', 'said', 'the', 'fox', '.', 'it', 'means', 'to',

In [6]:
indexed_sentences = []
for sentence in tokenized_quotes:
    try:
        indexed_sentences.append([glove.stoi[word] for word in sentence])
    except KeyError:
        indexed_sentences.append(torch.zeros(1))

print(indexed_sentences)

[[5, 114, 187, 14, 192, 1779, 1, 7, 191, 2147, 1779, 20, 14, 91, 17, 0, 1058, 12, 48, 86, 253, 19481, 102, 14, 3720, 14, 11064, 4, 0, 2090], [64, 182404, 35, 442, 271, 2, 2, 2, 34, 91, 306, 3, 101, 2715, 20], [20, 14, 0, 79, 81, 33, 10939, 10, 392, 486, 12, 907, 392, 486, 100, 481], [0, 96, 3366, 654, 6, 0, 85, 1119, 30, 541, 46, 5844, 1, 39, 32, 1349, 17, 0, 1058], [102, 907, 0, 3731, 3366, 1, 57, 16, 0, 333, 1781, 1, 57, 14, 12, 5530, 20, 18969, 7, 143], [41, 913, 862, 10, 1095, 2, 102, 260, 12, 1702, 65, 15807, 188, 20, 14, 29, 743, 317, 456, 11598, 1, 16, 0, 2106, 2, 20, 889, 4, 2617, 1445, 2, 4, 2617, 1445, 188, 120, 12, 1, 16, 0, 2106, 2, 4, 285, 1, 81, 32, 149, 936, 56, 73, 7, 333, 1606, 38, 14, 120, 117, 7, 3079, 4119, 68, 333, 2122, 2, 5, 41, 33, 84, 408, 3, 81, 2, 5, 81, 1, 13, 392, 153, 1, 33, 84, 408, 3, 285, 2, 4, 81, 41, 913, 936, 56, 73, 7, 2106, 117, 7, 3079, 4119, 68, 25823, 2, 34, 83, 81, 15807, 285, 1, 127, 53, 5284, 408, 236, 68, 2, 4, 285, 1, 81, 43, 30, 3006, 6, 6

In [7]:
glove = GloVe(name="6B", dim=100)
weights_matrix = np.zeros((len(vocab), 100))
for i, word in enumerate(vocab):
    try: 
        idx = glove.stoi[word]
        weights_matrix[i] = glove.vectors[idx]
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(100, ))
print(weights_matrix)

[[ 0.0024426  -0.83525997  0.20513999 ...  0.51753998 -0.13812999
   0.24179   ]
 [ 0.53020655 -0.31885417 -0.46479239 ... -0.86193291 -0.00846135
   0.60568033]
 [ 0.57634997  0.83113003  0.54605001 ... -0.57064003 -0.2755
   0.76112998]
 ...
 [-0.14401001  0.32554001  0.14257    ...  0.25398001  1.10780001
  -0.073074  ]
 [-0.0069454  -0.0062469  -0.48311001 ... -0.09669     0.92708999
   0.41914001]
 [-0.59359002 -0.074932    0.53035003 ...  0.23101     0.56844002
   0.48857999]]


In [8]:
weights_tensor = torch.tensor(weights_matrix, dtype=torch.float32)
emb_layer = nn.Embedding(len(vocab), 100)
emb_layer.load_state_dict({'weight': weights_tensor})

<All keys matched successfully>

In [9]:
class LSTMClassifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, label_size):
        super(LSTMClassifier, self).__init__()
        self.word_embeddings = emb_layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden_dim = hidden_dim
        self.hidden = nn.Linear(hidden_dim, label_size)
      
    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, self.hidden_dim)))

    def forward(self, sentence):
        embedded = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embedded)
        hidden_out = self.hidden(lstm_out.shape[0], -1)
        output = nn.LogSoftmax(hidden_out)
        return output

lstm = LSTMClassifier(100, 128, 59)

In [10]:
class TrainingDataset():
    def __init__(self, quotes, labels):
        self.quotes = quotes
        self.labels = labels

    def __len__(self):
        return len(self.quotes)

    def __getitem__(self, idx):
        return self.quotes[idx], self.labels[idx]

dataset = TrainingDataset(sentences_indices, one_hot_tags)

In [11]:
criterion = nn.BCELoss() 
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)

for epoch in range(5):
    for texts, labels in dataset:
        tensor_texts = torch.LongTensor(texts)
        tensor_labels = torch.FloatTensor(labels)
        outputs = lstm(tensor_texts)
        print(outputs)
        loss = criterion(outputs, tensor_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

TypeError: Linear.forward() takes 2 positional arguments but 3 were given