In [None]:
## DAta: https://www.manythings.org/anki/

## Task: classify sentences based on their language

In [1]:
import unicodedata
import regex as re
from tqdm import tqdm
import numpy as np

## Build vocabularies

In [12]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

## optional pairs filtering
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def valid_pair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[0].lower().startswith(eng_prefixes)

In [13]:
## Read the file and split into lines and normalize strings
def read_word_pairs(filename):
    text_lines = open(filename, encoding='utf-8').read().strip().split('\n')
    words_pairs = []
    for line in tqdm(text_lines):
        parts = line.split("\t")
        if len(parts) > 2:
            words_pairs.append((normalizeString(parts[0].strip()), normalizeString(parts[1].strip())))

    return [p for p in words_pairs if valid_pair(p)]#[0:1000]

In [52]:
def read_word_pairs(filename):
    text_lines = open(filename, encoding='utf-8').read().strip().split('\n')
    words_pairs = []
    for line in tqdm(text_lines):
        parts = line.split("\t")
        if len(parts) == 2:
            words_pairs.append((normalizeString(parts[0].strip()), normalizeString(parts[1].strip())))

    return words_pairs
    # return [p for p in words_pairs if valid_pair(p)]


In [54]:
eng_deu_pairs = read_word_pairs("/content/eng-deu.txt")
print(eng_deu_pairs[0:3])

eng_cat_pairs = read_word_pairs("/content/eng-cat.txt")
print(eng_cat_pairs[0:3])

100%|██████████| 10/10 [00:00<00:00, 8466.50it/s]


[('extraverts generally listen to music more often', 'extravertierte horen im allgemeinen haufiger musik'), ('they have more diverse musical preferences', 'sie haben vielfaltigere musikvorlieben'), ('introverts rarely listen to top hits', 'introvertierte horen selten top hits')]


100%|██████████| 10/10 [00:00<00:00, 12409.18it/s]

[('extraverts generally listen to music more often', 'les extravertits escolten musica mes sovint'), ('they have more diverse musical preferences', 'tenen preferencies musicals mes diverses'), ('introverts rarely listen to top hits', 'els introvertits gairebe no escolten exits del top')]





In [55]:
## Creating language vocabularies

EOS_token = 0

word2index = {"EOS": 0}
index2word = {0: "EOS"}
n_words = 1
bow = set(["UNK"])


for eng_sent, deu_sent in eng_deu_pairs:
    for word in eng_sent.split(" "):
        bow.add(word)
    for word in deu_sent.split(" "):
        bow.add(word)
for _, cat_sent in eng_cat_pairs:
    for word in cat_sent.split(" "):
        bow.add(word)
n_words += len(bow)
print("Vocabulary size:", n_words)

for i, word in enumerate(bow):
    word2index[word] = i+1
    index2word[i+1] = word

Vocabulary size: 125


In [56]:
eng_data = []
deu_data = []
cat_data = []
for eng_sent, deu_sent in eng_deu_pairs:
    eng_data.append(([word2index[word] for word in eng_sent.split(" ")], 0))
    deu_data.append(([word2index[word] for word in deu_sent.split(" ")], 1))
for _, cat_sent in eng_cat_pairs:
    cat_data.append(([word2index[word] for word in cat_sent.split(" ")], 2))


In [57]:
eng_data

[([124, 115, 120, 108, 45, 70, 111], 0),
 ([84, 78, 70, 36, 29, 67], 0),
 ([92, 116, 120, 108, 39, 114], 0),
 ([6, 113, 5, 108, 8, 33, 41, 45], 0),
 ([102, 81, 96, 2, 33, 53, 45], 0),
 ([38, 91, 113, 60, 35, 17, 16], 0),
 ([105, 72, 81, 116, 120, 108, 45], 0),
 ([45, 67, 66, 106, 68], 0),
 ([124, 111, 120, 108, 45, 99, 54], 0),
 ([92, 111, 120, 108, 4, 45], 0)]

In [58]:
index2word

{0: 'EOS',
 1: 'gebracht',
 2: 'calm',
 3: 'bei',
 4: 'older',
 5: 'linked',
 6: 'neuroticism',
 7: 'allgemeinen',
 8: 'sad',
 9: 'ruhige',
 10: 'relacionat',
 11: 'troba',
 12: 'trista',
 13: 'amables',
 14: 'de',
 15: 'negativa',
 16: 'fans',
 17: 'chanson',
 18: 'tenen',
 19: 'prefereixen',
 20: 'selten',
 21: 'personalitat',
 22: 'es',
 23: 'mit',
 24: 'haufiger',
 25: 'extravertits',
 26: 'gairebe',
 27: 'menys',
 28: 'diverses',
 29: 'musical',
 30: 'negativer',
 31: 'sich',
 32: 'wider',
 33: 'and',
 34: 'findet',
 35: 'in',
 36: 'diverse',
 37: 'del',
 38: 'low',
 39: 'top',
 40: 'offenheit',
 41: 'negative',
 42: 'reflecteixen',
 43: 'sie',
 44: 'gewissenhafte',
 45: 'music',
 46: 'neurotisme',
 47: 'frequencia',
 48: 'exits',
 49: 'consciencioses',
 50: 'esta',
 51: 'neurotizismus',
 52: 'spiegeln',
 53: 'retro',
 54: 'lyrics',
 55: 'antiga',
 56: 'freundliche',
 57: 'geringe',
 58: 'la',
 59: 'musica',
 60: 'found',
 61: 'vielfaltigere',
 62: 'amb',
 63: 'els',
 64: 'no',
 6

## Modeling

In [59]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [60]:
class ClassifierRNN(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_classes):
        super(ClassifierRNN, self).__init__()

        self.hidden_size = hidden_size
        self.embed_size = embed_size

        self.embeddings = nn.Embedding(input_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True) #nn.RNN
        self.h2o = nn.Linear(hidden_size, num_classes)
        #self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input):
        embedded = self.embeddings(input)
        output, hidden = self.gru(embedded)
        return self.h2o(hidden)

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [61]:
import torch
import torch.nn as nn

class ClassifierRNN(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_classes):
        super(ClassifierRNN, self).__init__()

        self.hidden_size = hidden_size
        self.embed_size = embed_size

        self.embeddings = nn.Embedding(input_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.h2o = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        embedded = self.embeddings(input)  # [batch_size, seq_len, embed_size]
        output, hidden = self.rnn(embedded)  # output: [batch_size, seq_len, hidden_size]
        return self.h2o(hidden.squeeze(0))  # hidden: [1, batch_size, hidden_size]

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)


### Training set

In [62]:
eng_deu_pairs

[('extraverts generally listen to music more often',
  'extravertierte horen im allgemeinen haufiger musik'),
 ('they have more diverse musical preferences',
  'sie haben vielfaltigere musikvorlieben'),
 ('introverts rarely listen to top hits',
  'introvertierte horen selten top hits'),
 ('neuroticism is linked to sad and negative music',
  'neurotizismus wird mit trauriger und negativer musik in verbindung gebracht'),
 ('agreeable people prefer calm and retro music',
  'freundliche menschen bevorzugen ruhige und retro musik'),
 ('low openness is found in chanson fans',
  'geringe offenheit findet sich bei chanson fans'),
 ('highly conscientious people rarely listen to music',
  'sehr gewissenhafte menschen horen selten musik'),
 ('music preferences reflect personality traits',
  'musikvorlieben spiegeln personlichkeitsmerkmale wider'),
 ('extraverts often listen to music with lyrics',
  'extravertierte horen oft musik mit text'),
 ('introverts often listen to older music',
  'introver

In [66]:
sent = eng_deu_pairs[9][0]
print("Original sent:", sent)

input_ind = [word2index[word] for word in sent.split(' ')]
print("IDs:", input_ind)

input_tensor = torch.tensor(input_ind, dtype=torch.long, device=device).view(1, -1)
print(input_tensor)

rnn = ClassifierRNN(input_size=n_words+2, embed_size=100, hidden_size=50, num_classes=3)
emb = rnn.embeddings(input_tensor)
print("Embed size:", emb.size())
out, hid = rnn.rnn(emb)
print("RNN out and hid size:", out.size(), hid.size())
lin = rnn.h2o(hid)
print("After linear tranf:", lin.size())
print(lin)
#print(rnn.softmax(lin))

print(rnn(input_tensor))

Original sent: introverts often listen to older music
IDs: [92, 111, 120, 108, 4, 45]
tensor([[ 92, 111, 120, 108,   4,  45]])
Embed size: torch.Size([1, 6, 100])
RNN out and hid size: torch.Size([1, 6, 50]) torch.Size([1, 1, 50])
After linear tranf: torch.Size([1, 1, 3])
tensor([[[ 0.7834,  0.0677, -0.0795]]], grad_fn=<ViewBackward0>)
tensor([[ 0.7834,  0.0677, -0.0795]], grad_fn=<AddmmBackward0>)


In [67]:
## preparting torch dataset
n = len(eng_data+deu_data+cat_data)
input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
target_ids = np.zeros((n, 3), dtype=np.float64)

for idx, (sentence, label) in enumerate(eng_data+deu_data+cat_data):
    input_ids[idx, :len(sentence)] = sentence
    target_ids[idx, label] = 1


train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                            torch.FloatTensor(target_ids).to(device))

batch_size = 32
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [68]:
idx = 2
print(" ".join([index2word[i] for i in train_data[idx][0].tolist()]))
train_data[idx]

introverts rarely listen to top hits EOS EOS EOS EOS


(tensor([ 92, 116, 120, 108,  39, 114,   0,   0,   0,   0]),
 tensor([1., 0., 0.]))

### Training

In [69]:
rnn = ClassifierRNN(input_size=n_words, embed_size=100, hidden_size=50, num_classes=3)

learning_rate=0.001
n_epochs = 10

loss_func = nn.CrossEntropyLoss()
rnn_optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)

for epoch in range(1, n_epochs + 1):
    total_loss = 0
    for data in tqdm(train_dataloader):
        input_tensor, target_tensor = data

        rnn_optimizer.zero_grad()
        output = rnn(input_tensor)

        loss = loss_func(output, target_tensor.reshape(output.size()))
        loss.backward()

        rnn_optimizer.step()

        total_loss += loss.item()

    print(f"Epoch: {epoch}, loss: {total_loss / len(train_dataloader)}")

100%|██████████| 1/1 [00:00<00:00, 163.18it/s]


Epoch: 1, loss: 1.1216109991073608


100%|██████████| 1/1 [00:00<00:00, 177.98it/s]


Epoch: 2, loss: 1.0881385803222656


100%|██████████| 1/1 [00:00<00:00, 200.41it/s]


Epoch: 3, loss: 1.070046305656433


100%|██████████| 1/1 [00:00<00:00, 185.68it/s]


Epoch: 4, loss: 1.0555486679077148


100%|██████████| 1/1 [00:00<00:00, 202.69it/s]


Epoch: 5, loss: 1.0393638610839844


100%|██████████| 1/1 [00:00<00:00, 202.41it/s]


Epoch: 6, loss: 1.0217127799987793


100%|██████████| 1/1 [00:00<00:00, 136.00it/s]


Epoch: 7, loss: 1.004358172416687


100%|██████████| 1/1 [00:00<00:00, 152.48it/s]


Epoch: 8, loss: 0.9886709451675415


100%|██████████| 1/1 [00:00<00:00, 126.98it/s]


Epoch: 9, loss: 0.9749861359596252


100%|██████████| 1/1 [00:00<00:00, 147.59it/s]

Epoch: 10, loss: 0.9627130031585693





In [70]:
torch.save(rnn.state_dict(), "./seq2seq_class.pt")

In [71]:
rnn = ClassifierRNN(input_size=n_words, embed_size=100, hidden_size=50, num_classes=3)
rnn.load_state_dict(torch.load("./seq2seq_class.pt"))
rnn.eval()

ClassifierRNN(
  (embeddings): Embedding(125, 100)
  (rnn): RNN(100, 50, batch_first=True)
  (h2o): Linear(in_features=50, out_features=3, bias=True)
)

In [72]:
sentence = "introverts rarely"
sentence_tokens = [word2index.get(t, word2index["UNK"]) for t in normalizeString(sentence).split()]
sentence_tokens

[92, 116]