In [None]:
import requests
import re
import string

import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from numpy import array
from torch.utils.data import DataLoader, TensorDataset

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# book: https://www.gutenberg.org/cache/epub/1497/pg1497.txt

In [None]:
# Download the plain-text UTF-8 edition of the book and store it locally.
url = "https://www.gutenberg.org/ebooks/1497.txt.utf-8"
# Without a timeout, requests.get can block indefinitely on a stalled
# connection, which would hang the whole notebook run.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # The URL asks for the UTF-8 edition, so decode the body as UTF-8 instead
    # of relying on the encoding requests guesses from the response headers.
    response.encoding = "utf-8"
    with open("The_Republic_by_Plato.txt", "w", encoding="utf-8") as file:
        file.write(response.text)
    print("Download successful! The text has been saved to 'The_Republic_by_Plato.txt'.")
else:
    print(f"Error downloading the file. Status code: {response.status_code}")

Download successful! The text has been saved to 'The_Republic_by_Plato.txt'.


In [None]:
def load_doc(filename, encoding='utf-8'):
  """Read a whole text file and return its contents as one string.

  Args:
    filename: path of the file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8,
      which is the encoding the download cell writes the book with.

  Returns:
    The full file contents as a str.
  """
  # The context manager closes the file even if read() raises.
  with open(filename, 'r', encoding=encoding) as file:
    return file.read()


def clean_doc(doc):
  """Turn raw text into a list of lowercase, purely alphabetic word tokens.

  Args:
    doc: the raw text.

  Returns:
    A list of str tokens, in document order.
  """
  # '--' joins two words without whitespace, so split on it as well.
  doc = doc.replace('--', ' ')
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  # Strip ASCII punctuation from every whitespace-separated chunk.
  stripped = (re_punc.sub('', chunk) for chunk in doc.split())
  # Drop anything that is not purely alphabetic (numbers, empty leftovers,
  # words still carrying non-ASCII punctuation), then lowercase the rest.
  return [word.lower() for word in stripped if word.isalpha()]


def save_doc(lines, filename, encoding='utf-8'):
  """Write the given strings to a file, one per line.

  Args:
    lines: iterable of str; joined with newline characters (no trailing newline).
    filename: path of the file to (over)write.
    encoding: text encoding used to write the file. Defaults to UTF-8 so the
      file can be read back with load_doc on any platform.
  """
  data = '\n'.join(lines)
  # The context manager flushes and closes the file even if write() raises.
  with open(filename, 'w', encoding=encoding) as file:
    file.write(data)

In [None]:
# Load the downloaded file and keep only the text between two markers: the
# sentence in tmp_string and the Project Gutenberg end-of-book line.
FILENAME = 'The_Republic_by_Plato.txt'
doc = load_doc(FILENAME)
tmp_string = 'introduced in the Timaeus.'
end_marker = '*** END OF THE PROJECT GUTENBERG EBOOK THE REPUBLIC ***'

start = doc.find(tmp_string)
# str.find returns -1 for a missing marker, which would silently yield a wrong
# slice instead of an error, so check explicitly.
if start == -1:
  raise ValueError('Start marker %r not found in %s' % (tmp_string, FILENAME))
start += len(tmp_string)

# Only look for the end marker after the start marker.
end = doc.find(end_marker, start)
if end == -1:
  raise ValueError('End marker %r not found in %s' % (end_marker, FILENAME))

doc = doc[start:end].strip()
doc[:200]

'BOOK I.\n\n\nI went down yesterday to the Piraeus with Glaucon the son of Ariston,\nthat I might offer up my prayers to the goddess (Bendis, the Thracian\nArtemis.); and also because I wanted to see in wha'

In [None]:
# Tokenise the book body, preview the first tokens and report corpus counts.
tokens = clean_doc(doc)
print(tokens[:200])
total_count = len(tokens)
unique_count = len(set(tokens))
print('Total Tokens: %d' % total_count)
print('Unique Tokens: %d' % unique_count)

['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid',

In [None]:
# sequences
# Every training line holds 50 context words followed by the 1 word to predict.
length = 50 + 1
# i is the exclusive end index of each sliding window. The range must run up
# to len(tokens) inclusive, otherwise the window ending at the very last token
# is never produced.
sequences = [' '.join(tokens[i - length:i]) for i in range(length, len(tokens) + 1)]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 117291


In [None]:
# Write the sequences to disk, one per line.
out_filename = 'republic_sequences.txt'
save_doc(lines=sequences, filename=out_filename)

In [None]:
# Read the saved sequences back and split them into one string per line.
in_filename = 'republic_sequences.txt'
doc = load_doc(filename=in_filename)
lines = doc.split(sep='\n')

In [None]:
# tokenizer
tokenizer = get_tokenizer('basic_english')
# Stream the tokenised lines straight into the vocabulary builder. Converting
# them to a numpy array first only costs memory (and fails on rows of unequal
# length in recent numpy versions); the builder just iterates over token lists.
# A separate name is used so the flat `tokens` word list from the cleaning
# cell is not overwritten and the sequence cell stays re-runnable.
tokenized_lines = (tokenizer(text) for text in lines)
vocab = build_vocab_from_iterator(tokenized_lines)
vocab_size = len(vocab)

In [None]:
# Tokenise every line and map its tokens to vocabulary indices.
sequences = [vocab(line_tokens) for line_tokens in map(tokenizer, lines)]

In [None]:
# Split each encoded sequence into its context (all but the last index) and
# its target (the last index).
sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
seq_length = X.shape[1]

X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y)

# One-hot encode the targets as floats.
# NOTE(review): this is a dense (num_sequences, vocab_size) float tensor;
# confirm it is really needed, since y_tensor already holds the class ids.
y_ohe = F.one_hot(y_tensor, num_classes=vocab_size).float()

In [None]:
# Show the target and input tensor shapes, one per line.
print(y_ohe.shape, X_tensor.shape, sep='\n')

torch.Size([117291, 7323])
torch.Size([117291, 50])


In [None]:
class LM(nn.Module):
  """Next-word language model: embedding -> two LSTMs -> two linear layers.

  Args:
    vocab_size: number of distinct token ids; sizes the embedding table and
      the output layer.
    embed_dim: size of each token embedding (default 50).
    hidden_dim: hidden size of both LSTMs and of the first linear layer
      (default 100).
  """

  def __init__(self, vocab_size, embed_dim=50, hidden_dim=100):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_dim)
    self.lstm1 = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
    self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
    self.lin1 = nn.Linear(hidden_dim, hidden_dim)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(hidden_dim, vocab_size)
    # Kept so callers can turn logits into probabilities at inference time,
    # e.g. model.softmax(model(x)). It is deliberately NOT applied in forward().
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input):
    """Return unnormalised next-token scores (logits).

    Args:
      input: integer token ids of shape (batch, seq_len).

    Returns:
      Tensor of shape (batch, vocab_size) holding raw logits.
      nn.CrossEntropyLoss applies log-softmax itself, so feeding it softmax
      outputs squashes the scores twice and leaves the loss stuck near
      log(vocab_size). argmax over logits equals argmax over probabilities.
    """
    embedded = self.embed(input)
    lstm_out1, _ = self.lstm1(embedded)
    lstm_out2, _ = self.lstm2(lstm_out1)
    # Only the last time step is used to predict the next token.
    last_step = lstm_out2[:, -1, :]
    hidden = self.relu(self.lin1(last_step))
    return self.lin2(hidden)

In [None]:
# Use the integer class ids (y_tensor) as targets: nn.CrossEntropyLoss accepts
# class indices directly, so batches need not carry a vocab-sized one-hot row
# per sample.
dataset = TensorDataset(X_tensor, y_tensor)
# TODO: this may be problematic
# batch_size=2 meant one optimizer step per two sequences (tens of thousands
# of steps per epoch), which made training extremely slow; use a larger batch.
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

In [None]:
# NOTE: training takes a very, very long time for some reason, and I am once again stuck because of torchtext.

In [None]:
model = LM(vocab_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5

# train
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    num_seen = 0
    for X_batch, y_batch in dataloader:
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a short final batch does not skew
        # the epoch average.
        batch_len = X_batch.size(0)
        running_loss += loss.item() * batch_len
        num_seen += batch_len

    # Report the mean loss over the epoch rather than the loss of whichever
    # batch happened to come last.
    epoch_loss = running_loss / max(num_seen, 1)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# eval
# TensorDataset exposes its tensors via `.tensors`; it has no `.data` or
# `.labels` attributes. Evaluate against X_tensor / y_tensor (inputs and
# integer class ids) directly, in batches, so the whole corpus is not pushed
# through the LSTMs in a single forward pass.
# NOTE(review): this measures accuracy on the training data; there is no
# held-out split in this notebook.
model.eval()
eval_loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=1024)
num_correct = 0
with torch.no_grad():
    for X_batch, y_batch in eval_loader:
        predicted = model(X_batch).argmax(dim=1)
        num_correct += (predicted == y_batch).sum().item()
accuracy = num_correct / max(len(y_tensor), 1)
print('Accuracy: %f' % (accuracy * 100))

Epoch [1/5], Loss: 8.8990
Epoch [2/5], Loss: 8.8990
