In [1]:
# Importing Dependencies
import itertools
import re

import torch
from datasets import load_dataset
from tokenizers import Tokenizer, models
from tokenizers.implementations import ByteLevelBPETokenizer
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from x_transformers import TransformerWrapper, Decoder, AutoregressiveWrapper

In [None]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Cleaning Function
def clean_text(text):
    """Reduce ``text`` to Devanagari characters separated by single spaces.

    Characters that are neither whitespace nor inside the Devanagari range
    spanned by the pattern are dropped, every run of whitespace is collapsed
    to one space, and leading/trailing whitespace is trimmed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty when nothing in range survives.
    """
    devanagari_only = re.sub(r"[^ऀ-ॿ\s]", "", text)
    single_spaced = re.sub(r"\s+", " ", devanagari_only)
    return single_spaced.strip()

In [None]:
# Loading the dataset
# Hindi ("hi") configuration of OSCAR 22.01, train split, opened in streaming mode.
# NOTE(review): `use_auth_token=True` presumably relies on locally stored Hugging Face
# credentials; confirm the installed `datasets` version still accepts this argument.
dataset = load_dataset(
    "oscar-corpus/OSCAR-2201",
    language="hi",
    split="train",
    streaming=True,
    use_auth_token=True,
)

In [None]:
# Clean the dataset and make a generator
def cleaned_dataset_generator(dataset):
    """Lazily yield the cleaned ``'text'`` field of each record in ``dataset``.

    Args:
        dataset: Iterable of mappings that each provide a ``'text'`` entry.

    Yields:
        The result of ``clean_text`` applied to every record's text, in order.
    """
    yield from (clean_text(record['text']) for record in dataset)

In [None]:
# Single-pass generator of cleaned texts; it is exhausted after one full iteration
cleaned_dataset = cleaned_dataset_generator(dataset=dataset)

In [None]:
# Device selection, 'cuda' when available and 'cpu' otherwise
# (repeats the definition made near the top of the notebook; the result is the same)
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [None]:
# Tokenizing dataset
# `cleaned_dataset` is a one-shot generator over a streaming corpus, but the texts are
# needed twice (tokenizer training, then encoding), so a bounded sample is materialised.
MAX_DOCS = 100_000   # cap on streamed documents; set to None to consume the whole stream
VOCAB_SIZE = 50000   # keep equal to `num_tokens` of the model

# Drop documents that became empty after cleaning: they carry no training signal
hindi_texts = [text for text in itertools.islice(cleaned_dataset, MAX_DOCS) if text]

# The notebook never defined `tokenizer`; train a byte-level BPE tokenizer on the sample.
# "<pad>" is listed first among the special tokens so that it receives id 0.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    hindi_texts,
    vocab_size=VOCAB_SIZE,
    special_tokens=["<pad>"],
)

# One list of token ids per document
hindi_tokens = [tokenizer.encode(text).ids for text in hindi_texts]

In [None]:
class HindiDataset(Dataset):
    """Map-style dataset that serves pre-tokenized sequences by index."""

    def __init__(self, tokens):
        # Keep a reference to the caller's collection of sequences (no copy is made)
        self.tokens = tokens

    def __getitem__(self, idx):
        # Return the stored entry exactly as it was provided
        sample = self.tokens[idx]
        return sample

    def __len__(self):
        # One dataset item per stored sequence
        sequence_count = len(self.tokens)
        return sequence_count

In [None]:
# prepare dataloader
# Samples are variable-length lists of token ids; the default collate cannot turn them
# into a single tensor, so batches are truncated and padded explicitly.
MAX_TOKENS_PER_SAMPLE = 1024 + 1  # model max_seq_len plus one token
# NOTE(review): the +1 assumes the autoregressive wrapper shifts inputs/targets by one
# position, so the network itself sees at most max_seq_len tokens; confirm.
PAD_TOKEN_ID = 0
# NOTE(review): padded positions count towards the loss unless the wrapper's
# ignore_index equals PAD_TOKEN_ID; confirm id 0 is a padding token in your tokenizer.


def collate_token_batch(batch, max_len=MAX_TOKENS_PER_SAMPLE, pad_id=PAD_TOKEN_ID):
    """Turn a list of token-id sequences into one right-padded LongTensor.

    Args:
        batch: List of sequences of ints, as returned by the dataset's ``__getitem__``.
        max_len: Sequences longer than this are truncated.
        pad_id: Value used to fill positions past the end of shorter sequences.

    Returns:
        Tensor of dtype ``torch.long`` and shape ``(len(batch), longest_clipped_length)``.
    """
    clipped = [list(seq)[:max_len] for seq in batch]
    width = max(len(seq) for seq in clipped)
    padded = torch.full((len(clipped), width), pad_id, dtype=torch.long)
    for row, seq in enumerate(clipped):
        padded[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return padded


# NOTE: rebinding `dataset` shadows the streaming dataset loaded earlier; re-running the
# cleaning cells after this one requires re-running the loading cell first.
dataset = HindiDataset(hindi_tokens)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_token_batch)

In [None]:
# initialize the model
# Decoder stack first, then the token-level wrapper around it
decoder_stack = Decoder(dim=512, depth=12, heads=12)

model = TransformerWrapper(
    num_tokens=50000,    # replace with your vocab size
    max_seq_len=1024,
    attn_layers=decoder_stack,
)
model = model.to(device)  # move all parameters to the selected device

In [None]:
# wrap model for autoregressive language modelling
# Guarded so that re-running this cell does not wrap an already wrapped model
if not isinstance(model, AutoregressiveWrapper):
    model = AutoregressiveWrapper(model)

In [None]:
# optimizer
# Adam over every parameter exposed by the model, with a step size of 1e-3
optimizer = Adam(params=model.parameters(), lr=1e-3)

In [None]:
# training loop
NUM_EPOCHS = 10  # adjust the number of epochs as needed

model.train()  # make the training mode explicit (dropout etc. active)
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    num_batches = 0

    for batch in loader:
        batch = batch.to(device)
        # The autoregressive wrapper is called with the token batch only and its
        # return value is used as the loss.
        # NOTE(review): the previous `return_loss=True` keyword is not a parameter of
        # AutoregressiveWrapper.forward as far as the x-transformers README shows;
        # confirm against the installed version.
        loss = model(batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Without this guard the report below would fail on an undefined loss
        raise RuntimeError("DataLoader produced no batches; check that the dataset is not empty")

    # Report the mean loss over the epoch instead of only the last batch's value
    print(f"Epoch: {epoch}, Loss: {running_loss / num_batches}")