In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np
import random
import time
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [3]:
learning_rate = 0.005
n_iters = 200000
print_every = 5000
plot_every = 1000
n_confusion = 10000

In [4]:
def get_unique_tags(df):
  tags_list = df['Tags']
  unique_tags = set([])
  for tags in tags_list:
    tags = tags.split("'")[1:-1:2]
  for tag in tags:
    unique_tags.add(tag)
  return list(unique_tags)

def get_vector(question, model):
   tokens = simple_preprocess(question)
   vectors = [model.wv[token] for token in tokens if token in model.wv]
   return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

def questionToTensor(question):
    vector = get_vector(question,model)
    tensor = torch.tensor(vector, dtype=torch.float32)
    return tensor

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return categories[category_i], category_i

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(categories)
    question = randomChoice(tagged_questions[category])
    category_tensor = torch.tensor([categories.index(category)], dtype=torch.long)
    question_tensor = questionToTensor(question)
    question_tensor = torch.unsqueeze(question_tensor, 0)
    return category, question, category_tensor, question_tensor


def train(category_tensor, question_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(question_tensor.size()[0]):
        output, hidden = rnn(question_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

def evaluate(question_tensor):
    hidden = rnn.initHidden()

    for i in range(question_tensor.size()[0]):
        output, hidden = rnn(question_tensor[i], hidden)

    return output

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [5]:
df = pd.read_csv('https://github.com/yarathealmighty/dumps/blob/main/data/not_na_tags/latin_374_w2v.csv?raw=true')

# str to float conversion
vectors = []
for i in range(0,len(df)):
  vector = df['Question_Vector'][i][1:-1].split()
  new_vector = []
  for value in vector:
    value = float(value)
    new_vector.append(value)
  vectors.append(new_vector)

df['Question_Vector'] = vectors
df

Unnamed: 0,Id,Tags,Question,Answer,Cosine Similarity (CBOW),Cosine Similarity (Skip-gram),Question_Vector
0,1,"['classical-latin', 'meaning', 'vocabulary', '...","Are ""-que"" and ""et"" equivalent? I was taught t...","The way I was taught was that, as a general r...",0.376483,0.533986,"[0.410459399, -0.238088086, 0.142925978, 0.609..."
1,2,"['contemporary-latin', 'pronouns', 'articles',...","Why is ""ille"" used in Winnie ille Pu and Hobbi...","It's true that in Classical Latin, ille is a ...",0.336087,0.566115,"[0.384361714, -0.26050964, 0.156550348, 0.4222..."
2,4,"['agreement', 'predicate', 'gender', 'adjectiv...",What gender should a predicate adjective be to...,Bennett's New Latin Grammar ( this link will ...,0.387429,0.567729,"[0.35846892, -0.30243662, 0.109872311, 0.48757..."
3,5,['pronunciation'],What effect should a macron have on the sound ...,"In most modern texts, the whole purpose of us...",0.472779,0.607277,"[0.42342234, -0.21060595, 0.13680652, 0.528650..."
4,7,"['etymology', 'particle', 'hebrew']","Does ""ad"" have its origin in Hebrew/Semitic la...","No, the similarity is almost certainly accide...",0.368243,0.542787,"[0.40219393, -0.2743029, 0.16766459, 0.4076303..."
...,...,...,...,...,...,...,...
5879,23416,"['english-to-latin-translation', 'classical-la...","Can someone help translating ""one must die for...",One possibility: necesse est alterum mori ut ...,0.414603,0.562434,"[0.415150702, -0.383984089, 0.0282603055, 0.50..."
5880,23420,['english-to-latin-translation'],Four more loaves please This new question: How...,"In Nicholas Oulton's Book II, p.50: ""plures c...",0.352794,0.520526,"[0.44922554, -0.34840554, 0.23837692, 0.473246..."
5881,23425,"['latin-to-english-translation', 'mathematics']","Mathematical Latin Help So, I'm a PhD student ...","I understand ""ut quotiens sit integer complex...",0.394950,0.557162,"[0.33285397, -0.23168108, 0.12974669, 0.373364..."
5882,23429,"['latin-to-english-translation', 'translation-...",Does “interranima” mean “inner soul”? I came a...,"No. Anima is the Latin word for soul, apart f...",0.334330,0.523648,"[0.486243784, -0.432804465, 0.145411789, 0.460..."


In [6]:
vectors = df['Question_Vector']
tensors = [torch.tensor(vector, dtype=torch.float32) for vector in vectors]

n_letters = len(tensors[0])

categories = get_unique_tags(df)
n_categories = len(categories)

In [7]:
tokenized_questions = [simple_preprocess(q) for q in df['Question']]

model = Word2Vec(sentences=tokenized_questions, vector_size=374, window=5, min_count=1, workers=4)

In [8]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        hidden = F.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

In [10]:
# dict where keys are categories and values are lists of questions in each category
tagged_questions = {}
for tag in categories:
    tagged_questions[tag] = df[df['Tags'].apply(lambda tags: tag in tags)]['Question'].tolist()

for i in range(10):
    category, question, category_tensor, question_tensor = randomTrainingExample()
    print('category =', category, '/ question =', question, '\n\n------------------------------------------------------------------------------------------------------------------------------------------\n')

category = pronunciation / question = How were elided vowels pronounced by the Romans? When I scan a verse, I simply leave out elided vowels: I don't pronounce them at all. But I've heard theories that the Romans did pronounce them, but just as very short vowels of half a syllable's length. What do we know about this? Is there consensus about the pronunciation? 

------------------------------------------------------------------------------------------------------------------------------------------

category = pronunciation / question = Stress and vowel length When pronouncing a word in classical Latin, should the heavy syllable and all other long vowels be lengthened, or just the heavy syllable? For example: fació, is it pronounced [fa:kio:] or [fa:kio] ? (in) Európá [ɛʊ̯roːpa] or [ɛʊ̯ro:pa:]? 

------------------------------------------------------------------------------------------------------------------------------------------

category = diphthong / question = Pronunciation of 

In [None]:
criterion = nn.NLLLoss()

current_loss = 0
all_losses = []

start = time.time()

for iter in range(1, n_iters + 1):
    category, question, category_tensor, question_tensor = randomTrainingExample()
    output, loss = train(category_tensor, question_tensor)
    current_loss += loss

    # decoration
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s ... / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, question[:30], guess, correct))

    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

5000 2% (0m 7s) 0.8102 Abbreviation I haven’t seen be ... / ligatures ✓
10000 5% (0m 14s) 1.3159 How can I translate a slogan " ... / ligatures ✗ (spelling)
15000 7% (0m 22s) 0.7388 Use of ß (“eszett”) in Latin t ... / ligatures ✓
20000 10% (0m 29s) 1.0959 Has these Umbrian words been r ... / diphthong ✗ (spelling)
25000 12% (0m 37s) 1.0301 Does the pronunciation of 'gn' ... / spelling ✗ (pronunciation)
30000 15% (0m 44s) 0.7550 When did genuine and spurious  ... / diphthong ✓
35000 17% (0m 51s) 0.4506 What do we know about Homer's  ... / pronunciation ✓
40000 20% (0m 59s) 0.0089 Is it possible to have a singl ... / ligatures ✓
45000 22% (1m 6s) 0.4821 When is "ei" a diphthong? Many ... / diphthong ✓
50000 25% (1m 14s) 0.3602 How to indicate a diphthong? I ... / ligatures ✓
55000 27% (1m 21s) 1.2313 Compensative lengthening of ε  ... / pronunciation ✗ (diphthong)
60000 30% (1m 29s) 0.0063 Is it possible to have a singl ... / ligatures ✓
65000 32% (1m 36s) 0.0154 Abbreviation I haven’t 

In [None]:
plt.figure()
plt.plot(all_losses)