Get the dataset

In [126]:
import pandas as pd
import requests

# URL of the CSV file
csv_url = 'https://huggingface.co/datasets/readerbench/ro-stories/resolve/main/ro_paragraphs_12516.csv'

# Download the CSV file.
# - timeout: keeps the cell from hanging forever on a stalled connection.
# - raise_for_status(): fail loudly on 4xx/5xx instead of silently saving an
#   HTTP error page and then trying to parse it as the dataset.
response = requests.get(csv_url, timeout=60)
response.raise_for_status()
csv_content = response.content.decode('utf-8')

# Save the CSV content to a local file; the DataFrame below is read from it.
# newline='' writes the text exactly as downloaded (no platform-specific
# line-ending translation).
with open('ro_stories.csv', 'w', encoding='utf-8', newline='') as file:
    file.write(csv_content)

# Read the CSV file into a DataFrame
df = pd.read_csv('ro_stories.csv', encoding='utf-8')

Building the vocabulary and word list

In [127]:
paragraphs = df.paragraph.to_list()

# Training split: rows 0..99 of the `paragraph` column.
words = paragraphs[:100]
# Second split: rows 100..199. It starts at 100 (not 101) so that no row is
# silently dropped between the two splits.
words_test = paragraphs[100:200]

# Character vocabulary: every distinct character that occurs in `words`,
# sorted so that the character -> id assignment is deterministic.
vocab = sorted(set(''.join(words)))

# character -> integer id
stoi = {s: i for i, s in enumerate(vocab)}
# The start / end markers take the next two free ids after the real characters.
# Deriving them from len(vocab) keeps ids contiguous and collision-free for any
# vocabulary size; hard-coded 66 / 67 are only correct when the vocabulary has
# exactly 66 characters.
stoi['<s>'] = len(vocab)
stoi['</s>'] = len(vocab) + 1

# integer id -> character (inverse of stoi)
itos = {i: s for s, i in stoi.items()}

Creating the training set

In [128]:
# Bigram training pairs: xs[k] is the id of a token, ys[k] the id of the token
# that follows it.
xs, ys = [], []

for paragraph in words:
  # Only the first 50 characters of each paragraph are used, wrapped in the
  # start / end markers.
  tokens = ['<s>', *paragraph[:50], '</s>']
  for current, following in zip(tokens[:-1], tokens[1:]):
    xs.append(stoi[current])
    ys.append(stoi[following])

Building the tensors

In [129]:
import torch
import torch.nn as nn
import torch.nn.functional as nnf

# torch.as_tensor converts the Python lists on the first run and hands back an
# existing tensor unchanged on a re-run, so this cell is idempotent
# (torch.tensor(tensor) emits a copy-construct UserWarning).
xs = torch.as_tensor(xs)
ys = torch.as_tensor(ys)

# Number of distinct tokens; used as the one-hot width and as both dimensions
# of the weight matrix.
lne = len(stoi)
# Every token id must fit inside the one-hot width, otherwise one_hot() fails
# later with a much less obvious error.
assert max(stoi.values()) < lne, 'token ids in stoi must lie in [0, len(stoi))'

Building our neural net

In [130]:
# Dedicated, explicitly seeded generator for the weight initialisation.
gen = torch.Generator()
gen.manual_seed(77)

# Single weight matrix of shape (lne, lne), drawn from a standard normal.
# requires_grad=True so that autograd tracks it.
W = torch.randn(lne, lne, generator=gen, requires_grad=True)

Gradient descent

In [156]:
# The one-hot encoding of the inputs does not depend on W, so build it once
# instead of on every pass.
xe = nnf.one_hot(xs, num_classes=lne).float()

for p in range(1000):
  # Forward pass
  logits = xe @ W
  # cross_entropy is softmax followed by the mean negative log-likelihood of
  # the targets, computed through a numerically stable log-softmax (an explicit
  # logits.exp() can overflow to inf and turn the loss into NaN).
  # The second term is an L2 penalty on the weights.
  loss = nnf.cross_entropy(logits, ys) + 0.01 * (W**2).mean()

  # Backward pass
  W.grad = None
  loss.backward()
  # Plain gradient-descent step with learning rate 10. The update runs under
  # no_grad so it is not recorded by autograd (preferred over mutating W.data).
  with torch.no_grad():
    W -= 10 * W.grad
  print(f'pass={p}, loss={loss.item()}')

pass=0, loss=2.390026092529297
pass=1, loss=2.390014171600342
pass=2, loss=2.3900020122528076
pass=3, loss=2.3899898529052734
pass=4, loss=2.3899779319763184
pass=5, loss=2.389965534210205
pass=6, loss=2.389953374862671
pass=7, loss=2.389941692352295
pass=8, loss=2.3899292945861816
pass=9, loss=2.3899176120758057
pass=10, loss=2.3899054527282715
pass=11, loss=2.3898937702178955
pass=12, loss=2.3898818492889404
pass=13, loss=2.3898696899414062
pass=14, loss=2.3898580074310303
pass=15, loss=2.389846086502075
pass=16, loss=2.389833927154541
pass=17, loss=2.389821767807007
pass=18, loss=2.389810085296631
pass=19, loss=2.389798402786255
pass=20, loss=2.3897862434387207
pass=21, loss=2.3897745609283447
pass=22, loss=2.3897626399993896
pass=23, loss=2.3897509574890137
pass=24, loss=2.3897392749786377
pass=25, loss=2.3897275924682617
pass=26, loss=2.3897154331207275
pass=27, loss=2.3897042274475098
pass=28, loss=2.3896920680999756
pass=29, loss=2.3896801471710205
pass=30, loss=2.38966870307922

In [161]:
# Generator used for sampling (no explicit seed is set here).
gen = torch.Generator()

end_ix = stoi['</s>']
# Safety cap on the length of one sample: if the model assigns a negligible
# probability to '</s>', an uncapped loop could run (practically) forever.
max_len = 1000

for i in range(5):
  out = []
  ix = stoi['<s>']

  # Sampling is inference only; no_grad avoids building an autograd graph
  # through W on every step.
  with torch.no_grad():
    while len(out) < max_len:
      xe = nnf.one_hot(torch.tensor([ix]), num_classes=lne).float()
      logits = xe @ W
      # Numerically stable softmax over the next-token logits.
      probs = torch.softmax(logits, dim=1)

      # Draw the next token id from the predicted distribution.
      ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=gen).item()

      if ix == end_ix:
        break
      out.append(itos[ix])

  print(''.join(out))

– niră tear
Ea ca-ui măl şin făre-aparodăruca 
S! şi vol, vi dajd căşinec ai reat ocieu cea-mă, că pupea, şi m ma mistă lum a, ntrergesătreul cheIatâni m Căre,
MarîncinaMdştă o3Îă n steu m joi sc. daluleagrele oa
– ro steapât în-mă înedica brai bde – veou ce Vozi pre luni e, de 
