In [1]:
import requests as r
from bs4 import BeautifulSoup
import torch

# Get the Data

In [2]:
# Remote location of the plain-text corpus (Dante's "Divina Commedia").
URL = "https://dmf.unicatt.it/~della/pythoncourse18/commedia.txt"
# Local file the downloaded text is written to and later read back from.
DATASET = "dataset.txt"

In [3]:
# Download the corpus and keep a local copy so later cells read from disk.
# The timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() aborts on an HTTP error (4xx/5xx) instead of silently
# saving the server's error page as the dataset.
http_response = r.get(URL, timeout=30)
http_response.raise_for_status()
response = http_response.text  # decoded body; name kept for any later cell that uses it
with open(DATASET, mode="w", encoding="utf-8") as f:
    f.write(response)


In [4]:
# Read the saved corpus back into memory as one string.
with open(DATASET, encoding="utf-8") as f:  # text read mode is open()'s default
    data = f.read()

In [5]:
# Corpus size in characters (len() of a str counts code points, not bytes).
print(f"Number of chars in the dataset is {len(data)}")

Number of chars in the dataset is 551846


In [6]:
# Peek at the first 500 characters to check the text loaded correctly.
data[:500]

"LA DIVINA COMMEDIA\n\ndi Dante Alighieri\n\nINFERNO\n\n\n\n\n\n\n\nInferno: Canto I\n\n\n\n  Nel mezzo del cammin di nostra vita\n\nmi ritrovai per una selva oscura\n\nché la diritta via era smarrita.\n\n  Ahi quanto a dir qual era è cosa dura\n\nesta selva selvaggia e aspra e forte\n\nche nel pensier rinova la paura!\n\n  Tant'è amara che poco è più morte;\n\nma per trattar del ben ch'i' vi trovai,\n\ndirò de l'altre cose ch'i' v'ho scorte.\n\n  Io non so ben ridir com'i' v'intrai,\n\ntant'era pien di sonno a quel punto\n\nche la v"

In [7]:
# Character-level vocabulary: every distinct symbol in the corpus, sorted by
# code point so the ordering (and the ids derived from it) is deterministic.
chars = sorted(set(data))
print(*chars, sep="")



 !"'(),-.:;?ABCDEFGHILMNOPQRSTUVXZabcdefghijlmnopqrstuvxyz~àèéìïòóù


In [8]:
# Vocabulary size: the number of distinct characters found in the corpus.
print(len(chars))

68


# Encoder and Decoder

In [9]:
# Lookup tables between characters and their integer ids; an id is simply the
# character's position in the sorted vocabulary `chars`.
int_to_char = dict(enumerate(chars))
char_to_int = {symbol: idx for idx, symbol in int_to_char.items()}


def encode(ch):
    """Return the list of integer ids for the characters of string `ch`.

    Raises KeyError if a character is not part of the vocabulary.
    """
    return [char_to_int[c] for c in ch]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises KeyError if an id is not part of the vocabulary.
    """
    return "".join(int_to_char[i] for i in l)

In [10]:
# Round-trip check: decoding the encoded ids should give back the input text.
sample_ids = encode("Divina Commedia")
print(sample_ids)
print(decode(sample_ids))

[16, 43, 55, 43, 47, 35, 1, 15, 48, 46, 46, 39, 38, 43, 35]
Divina Commedia


## Dataset to Tensor

In [11]:
# Encode the entire corpus as a 1-D tensor of 64-bit integer character ids,
# then show its shape/dtype and the first 200 ids as a sanity check.
dataset_tensor = torch.tensor(encode(data), dtype=torch.int64)
print(dataset_tensor.size(), dataset_tensor.dtype)
print(dataset_tensor[:200])

torch.Size([551846]) torch.int64
tensor([22, 13,  1, 16, 21, 32, 21, 24, 13,  1, 15, 25, 23, 23, 17, 16, 21, 13,
         0,  0, 38, 43,  1, 16, 35, 47, 53, 39,  1, 13, 45, 43, 41, 42, 43, 39,
        51, 43,  0,  0, 21, 24, 18, 17, 28, 24, 25,  0,  0,  0,  0,  0,  0,  0,
         0, 21, 47, 40, 39, 51, 47, 48, 10,  1, 15, 35, 47, 53, 48,  1, 21,  0,
         0,  0,  0,  1,  1, 24, 39, 45,  1, 46, 39, 58, 58, 48,  1, 38, 39, 45,
         1, 37, 35, 46, 46, 43, 47,  1, 38, 43,  1, 47, 48, 52, 53, 51, 35,  1,
        55, 43, 53, 35,  0,  0, 46, 43,  1, 51, 43, 53, 51, 48, 55, 35, 43,  1,
        49, 39, 51,  1, 54, 47, 35,  1, 52, 39, 45, 55, 35,  1, 48, 52, 37, 54,
        51, 35,  0,  0, 37, 42, 62,  1, 45, 35,  1, 38, 43, 51, 43, 53, 53, 35,
         1, 55, 43, 35,  1, 39, 51, 35,  1, 52, 46, 35, 51, 51, 43, 53, 35,  9,
         0,  0,  1,  1, 13, 42, 43,  1, 50, 54, 35, 47, 53, 48,  1, 35,  1, 38,
        43, 51])


## Divide Data into Training and Validation Sets

In [12]:
# Sequential 90/10 split: the first 90% of the corpus is used for training and
# the final 10% is held out for validation (no shuffling).
# NOTE(review): this slices the raw string `data`, not the encoded
# `dataset_tensor` built above — confirm that later cells expect text here
# rather than a tensor of ids.
split = int(len(data) * 0.9)
train_data, validation_data = data[:split], data[split:]


In [13]:
# Number of items in the training portion of the split.
len(train_data)

496661

In [14]:
# Number of items in the held-out validation portion of the split.
len(validation_data)

55185