In [1]:
import zipfile
import os

# Path to the uploaded ZIP file (It will be in /content)
zip_path = 'dataset.zip'  # Replace with your uploaded file name
extract_path = 'dataset'  # Directory to extract to

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset extracted successfully!")

Dataset extracted successfully!


Verify the Dataset Folder Structure

In [2]:
# Update the dataset directory path
dataset_dir = 'dataset/dataset'  # Adjusted path

# List the poets' folders
print("Poets Folder:", os.listdir(dataset_dir))

# Pick the first poet's folder
sample_poet = os.listdir(dataset_dir)[0]
sample_poet_path = os.path.join(dataset_dir, sample_poet)

# List the subfolders inside the poet's folder (should be 'en', 'ur', 'hi')
print(f"Subfolders for {sample_poet}: {os.listdir(sample_poet_path)}")


Poets Folder: ['dagh-dehlvi', 'jigar-moradabadi', 'sahir-ludhianvi', 'naseer-turabi', 'kaifi-azmi', 'firaq-gorakhpuri', 'ameer-khusrau', 'parveen-shakir', 'javed-akhtar', 'nazm-tabatabai', 'ahmad-faraz', 'meer-anees', 'jaan-nisar-akhtar', 'waseem-barelvi', 'altaf-hussain-hali', 'mirza-ghalib', 'allama-iqbal', 'nida-fazli', 'fahmida-riaz', 'naji-shakir', 'gulzar', 'wali-mohammad-wali', 'jaun-eliya', 'meer-taqi-meer', 'mohsin-naqvi', 'bahadur-shah-zafar', 'akbar-allahabadi', '.DS_Store', 'noon-meem-rashid', 'faiz-ahmad-faiz', 'habib-jalib']
Subfolders for dagh-dehlvi: ['hi', 'en', 'ur']


Load & Read the Poetry Files from "en" Folders
Now, we need to extract poetry only from the "en" folder of each poet.

In [4]:
import os

# Path to the dataset containing poet folders
dataset_dir = 'dataset/dataset'

# List to store all poetry texts
poetry_texts = []

# Iterate through each poet's folder
for poet in os.listdir(dataset_dir):
    poet_path = os.path.join(dataset_dir, poet)

    # Check if it's a directory (ignore files like .DS_Store)
    if os.path.isdir(poet_path):
        en_path = os.path.join(poet_path, "en")  # Path to "en" poetry folder

        # Check if "en" folder exists
        if os.path.exists(en_path):
            for file in os.listdir(en_path):
                file_path = os.path.join(en_path, file)

                # Read the poetry file and append to list
                with open(file_path, "r", encoding="utf-8") as f:
                    poetry_texts.append(f.read())

# Print sample poetry to verify
print(f"Total Poems Collected: {len(poetry_texts)}")
print("Sample Poem:\n", poetry_texts[0][:500])  # Print first 500 characters


Total Poems Collected: 1314
Sample Poem:
 
tum ā.īna hī na har baar dekhte jaao 
mirī taraf bhī to sarkār dekhte jaao 
na jaao hāl-e-dil-e-zār dekhte jaao 
ki jī na chāhe to nā-chār dekhte jaao 
bahār-e-umr meñ bāġh-e-jahāñ kī sair karo 
khilā huā hai ye gulzār dekhte jaao 
yahī to chashm-e-haqīqat nigar kā surma hai 
nizā-e-kāfir-o-dīñ-dār dekhte jaao 
uThāo aañkh na sharmāo ye to mahfil hai 
ġhazab se jānib-e-aġhyār dekhte jaao 
nahīñ hai jins-e-vafā kī tumheñ jo qadr na ho 
baneñge kitne ḳharīdār dekhte jaao 
tumheñ ġharaz jo karo ra


Preprocess the Text Data
Before feeding data into the model, we need to:

Lowercase the text (optional)
Remove unwanted characters & extra spaces
Tokenize the text into sequences

In [29]:
import re

def preprocess_text(text):
    # Remove special characters except basic punctuations
    # text = re.sub(r'[^a-zA-Z0-9.,!?\'\s]', '', text)

    # Convert multiple spaces into a single space
    # text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply preprocessing to all poems
cleaned_poetry = [preprocess_text(poem) for poem in poetry_texts]

# Print a cleaned sample
print("Cleaned Sample:\n", cleaned_poetry[0][:500])


Cleaned Sample:
 
tum ā.īna hī na har baar dekhte jaao 
mirī taraf bhī to sarkār dekhte jaao 
na jaao hāl-e-dil-e-zār dekhte jaao 
ki jī na chāhe to nā-chār dekhte jaao 
bahār-e-umr meñ bāġh-e-jahāñ kī sair karo 
khilā huā hai ye gulzār dekhte jaao 
yahī to chashm-e-haqīqat nigar kā surma hai 
nizā-e-kāfir-o-dīñ-dār dekhte jaao 
uThāo aañkh na sharmāo ye to mahfil hai 
ġhazab se jānib-e-aġhyār dekhte jaao 
nahīñ hai jins-e-vafā kī tumheñ jo qadr na ho 
baneñge kitne ḳharīdār dekhte jaao 
tumheñ ġharaz jo karo ra


Convert Poetry into Sequences
Since LSTM models require numerical input, we need to:

Tokenize words into unique integers
Create input-output sequences (X: previous words, Y: next word)

In [6]:
!pip install torch==2.0.1 torchtext==0.15.2 --no-cache-dir


Collecting torch==2.0.1
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.2
  Downloading torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Co

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

# Tokenize poetry into words
def tokenize_poetry(poetry_list):
    tokenized_poems = []
    for poem in poetry_list:
        # Replace actual newlines with a special token
        poem = poem.replace("\n", " <NEWLINE> ")
        tokenized_poems.append(poem.split())  # Tokenize normally
    return tokenized_poems


# Build vocabulary
tokenized_poems = tokenize_poetry(cleaned_poetry)

vocab = build_vocab_from_iterator(tokenized_poems, specials=["<PAD>", "<UNK>", "<NEWLINE>"])
vocab.set_default_index(vocab["<UNK>"])


# Convert words to indices
def encode_poetry(poem):
    return [vocab[word] for word in poem]

encoded_poems = [encode_poetry(poem) for poem in tokenized_poems]

# Print vocabulary size
print(f"Vocabulary Size: {len(vocab)}")
print("Encoded Sample Poem:", encoded_poems[0][:20])  # Show first 10 encoded words


Vocabulary Size: 17345
Encoded Sample Poem: [2, 52, 410, 29, 9, 38, 166, 164, 325, 2, 83, 158, 11, 12, 3903, 164, 325, 2, 9, 325]


In [39]:
class SimpleVocab:
    def __init__(self, vocab):
        self.stoi = vocab.get_stoi()  # Word to index mapping
        self.itos = vocab.get_itos()  # Index to word mapping
        self.default_index = vocab.get_default_index()  # Default index for unknown words

    def __getitem__(self, word):
        return self.stoi.get(word, self.default_index)

    def lookup_token(self, index):
        return self.itos[index] if 0 <= index < len(self.itos) else "<UNK>"

In [40]:

import pickle
simple_vocab = SimpleVocab(vocab)
with open('simple_vocab.pkl', 'wb') as f:
    pickle.dump(simple_vocab, f)


Prepare Data for Training
We need to:

Create sequences of words (e.g., first 5 words → predict next word)
Pad sequences for uniform input size
Split data into training & validation

In [33]:
from torch.nn.utils.rnn import pad_sequence

class PoetryDataset(Dataset):
    def __init__(self, poems, seq_length=6):
        self.seq_length = seq_length
        self.data = []

        # Create sequences
        for poem in poems:
            if len(poem) > seq_length:
                for i in range(len(poem) - seq_length):
                    seq = poem[i:i+seq_length]
                    target = poem[i+seq_length]
                    self.data.append((seq, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence, target = self.data[idx]
        return torch.tensor(sequence), torch.tensor(target)

# Create dataset
seq_length = 6
dataset = PoetryDataset(encoded_poems, seq_length)

# DataLoader
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Print sample batch
for x, y in dataloader:
    print("Sample Input:", x[0])
    print("Sample Target:", y[0])
    break


Sample Input: tensor([ 44,   8, 901,  12,  32, 600])
Sample Target: tensor(9)


Build an LSTM Model in PyTorch

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim

class PoetryLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super(PoetryLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

        # Layer Normalization (Added)
        self.layer_norm = nn.LayerNorm(hidden_dim)

        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)

        # Apply Layer Normalization
        lstm_out = self.layer_norm(lstm_out)

        out = self.fc(lstm_out[:, -1])  # Use last LSTM output
        return out

# Model
model = PoetryLSTM(vocab_size=len(vocab)).to("cuda")

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Learning Rate Scheduler (Reduce LR after 25 epochs)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.1)

print("Model Initialized")


Model Initialized


Train the Model

In [35]:
num_epochs = 51

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0

    for x_batch, y_batch in dataloader:
        x_batch, y_batch = x_batch.to("cuda"), y_batch.to("cuda")

        optimizer.zero_grad()
        output = model(x_batch)

        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.detach().item()  # Detach to save memory

    # Reduce learning rate after step_size epochs
    scheduler.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}, LR: {scheduler.get_last_lr()[0]:.6f}")

print("Training Completed!")


Epoch [1/51], Loss: 6.3095, LR: 0.000500
Epoch [2/51], Loss: 5.5091, LR: 0.000500
Epoch [3/51], Loss: 4.8910, LR: 0.000500
Epoch [4/51], Loss: 4.2155, LR: 0.000500
Epoch [5/51], Loss: 3.4983, LR: 0.000500
Epoch [6/51], Loss: 2.8309, LR: 0.000500
Epoch [7/51], Loss: 2.2810, LR: 0.000500
Epoch [8/51], Loss: 1.8264, LR: 0.000500
Epoch [9/51], Loss: 1.4493, LR: 0.000500
Epoch [10/51], Loss: 1.1344, LR: 0.000500
Epoch [11/51], Loss: 0.8883, LR: 0.000500
Epoch [12/51], Loss: 0.6999, LR: 0.000500
Epoch [13/51], Loss: 0.5694, LR: 0.000500
Epoch [14/51], Loss: 0.4815, LR: 0.000500
Epoch [15/51], Loss: 0.4266, LR: 0.000500
Epoch [16/51], Loss: 0.3895, LR: 0.000500
Epoch [17/51], Loss: 0.3618, LR: 0.000500
Epoch [18/51], Loss: 0.3367, LR: 0.000500
Epoch [19/51], Loss: 0.3198, LR: 0.000500
Epoch [20/51], Loss: 0.3062, LR: 0.000500
Epoch [21/51], Loss: 0.2939, LR: 0.000500
Epoch [22/51], Loss: 0.2857, LR: 0.000500
Epoch [23/51], Loss: 0.2744, LR: 0.000500
Epoch [24/51], Loss: 0.2669, LR: 0.000500
E

In [43]:
import random

def generate_poetry(seed_text, model, vocab, max_words=40):
    model.eval()
    words = seed_text.split()

    for _ in range(max_words):
        encoded = torch.tensor([vocab[word] for word in words[-6:]]).unsqueeze(0).to("cuda")
        with torch.no_grad():
            output = model(encoded)
            next_word = vocab.lookup_token(output.argmax().item())
            words.append(next_word)

    return " ".join(words)

# Example Usage
seed = "tum ā.īna hī na har baar"
def print_poetry(generated_text):
    formatted_text = generated_text.replace(" <NEWLINE> ", "\n")
    return formatted_text

generate=generate_poetry(seed, model, vocab)
print(print_poetry(generate))


tum ā.īna hī na har baar dekhte jaao
mirī taraf bhī to sarkār dekhte jaao
na jaao hāl-e-dil-e-zār dekhte jaao
ki jī na chāhe to nā-chār dekhte jaao
bahār-e-umr meñ bāġh-e-jahāñ kī sair karo
khilā huā hai ye gulzār dekhte jaao


In [41]:
# Save the trained model
torch.save(model.state_dict(), "poetry_lstm_model.pth")


In [42]:
from google.colab import files
files.download("/content/poetry_lstm_model.pth")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>