Get the dataset

In [24]:
import pandas as pd
import requests

# URL of the CSV file
csv_url = 'https://huggingface.co/datasets/readerbench/ro-stories/resolve/main/ro_paragraphs_12516.csv'

# Download the CSV file
response = requests.get(csv_url)
csv_content = response.content.decode('utf-8')

# Save the CSV content to a file (optional)
with open('ro_stories.csv', 'w', encoding='utf-8') as file:
    file.write(csv_content)

# Read the CSV file into a DataFrame
df = pd.read_csv('ro_stories.csv')

Building the vocabulary and word list

In [60]:
words = df.paragraph.to_list()[:50]
words_test = df.paragraph.to_list()[101:200]

vocab = sorted(list(set(''.join(words))))

stoi = {s:i+1 for i, s in enumerate(vocab)}
stoi['_'] = 0

stoi2 = {}
idx = 0
for v in stoi:
  for v2 in stoi:
    stoi2[v + v2] = idx
    idx += 1

itos = {i:s for s, i in stoi.items()}

Creating the training set

In [61]:
xs, ys = [], []
ctx_len = 5

for w in words:
  chars = ['_'] + list(w[:50]) + ['_']
  ctx = [stoi[chr] for chr in w[:ctx_len + 1]]
  for ch in w[ctx_len + 1:]:
    xs.append(ctx)
    ys.append(stoi[ch])
    ctx = ctx[1:] + [stoi[ch]]

Building the tensors

In [63]:
import torch
import torch.nn as nn
import torch.nn.functional as nnf

xs = torch.tensor(xs)
ys = torch.tensor(ys)
lne = len(stoi2)
ln = len(stoi)

  xs = torch.tensor(xs)
  ys = torch.tensor(ys)


Building our neural net

In [28]:
gen = torch.Generator().manual_seed(77)
W = torch.randn((lne, ln), generator=gen, requires_grad=True)

Gradient descent

In [35]:
for p in range(200):
  # Forward pass
  xe = nnf.one_hot(xs, num_classes=lne).float()
  logits = xe @ W
  counts = logits.exp()                                 # Softmax
  probs = counts / counts.sum(1, keepdims=True)         # -------
  loss = -probs[torch.arange(probs.shape[0]), ys].log().mean() + 0.000000001 * (W**2).mean()

  # Backward pass
  W.grad = None
  loss.backward()
  W.data += -100 * W.grad
  print('pass=' + str(p) + ', loss=' + str(loss.item()))

pass=0, loss=1.6883344650268555
pass=1, loss=1.6876860857009888
pass=2, loss=1.6870439052581787
pass=3, loss=1.6864075660705566
pass=4, loss=1.6857770681381226
pass=5, loss=1.6851526498794556
pass=6, loss=1.6845343112945557
pass=7, loss=1.6839213371276855
pass=8, loss=1.683314323425293
pass=9, loss=1.6827127933502197
pass=10, loss=1.6821168661117554
pass=11, loss=1.6815264225006104
pass=12, loss=1.6809414625167847
pass=13, loss=1.6803618669509888
pass=14, loss=1.6797876358032227
pass=15, loss=1.6792188882827759
pass=16, loss=1.6786551475524902
pass=17, loss=1.6780967712402344
pass=18, loss=1.67754328250885
pass=19, loss=1.676994800567627
pass=20, loss=1.6764516830444336
pass=21, loss=1.6759132146835327
pass=22, loss=1.675379753112793
pass=23, loss=1.6748512983322144
pass=24, loss=1.6743274927139282
pass=25, loss=1.6738084554672241
pass=26, loss=1.6732940673828125
pass=27, loss=1.672784447669983
pass=28, loss=1.6722792387008667
pass=29, loss=1.6717787981033325
pass=30, loss=1.6712825298

In [40]:
gen = torch.Generator().manual_seed(77)

for i in range(5):
  out = ['Oa']
  ix = stoi2[out[0]]

  while True:
      xe = nnf.one_hot(torch.tensor([ix]), num_classes=lne).float()
      logits = xe @ W
      counts = logits.exp()
      probs = counts / counts.sum(1, keepdim=True)

      ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=gen).item()

      if ix == stoi['</s>']: break;
      out.append(itos[ix])

  print(''.join(out))

Oa
OaderăS
OaRrăaşiată-
OaCu
Oa
