In [None]:

!pip install requests torch nltk beautifulsoup4

import requests
import torch
import torch.nn as nn
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import random
import string




*Data Collection via API*

In [None]:
import requests
def fetch_book_text(url, timeout=30):
    """Download the document at ``url`` and return its body as text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never mistaken for the book text.
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning the error page's body.
    response.raise_for_status()
    return response.text

# Plain-text edition of Project Gutenberg ebook #1342.
book_url = 'https://www.gutenberg.org/files/1342/1342-0.txt'  
book_text = fetch_book_text(book_url)
# Preview the first 500 characters to check what was downloaded.
print(book_text[:500])  


*** START OF THE PROJECT GUTENBERG EBOOK 1342 ***
                            [Illustration:

                             GEORGE ALLEN
                               PUBLISHER

                        156 CHARING CROSS ROAD
                                LONDON

                             RUSKIN HOUSE
                                   ]

                            [Illustration:

               _Reading Jane’s Letters._      _Chap 34._
                                   ]



*Preprocessing the Data*

In [None]:

# Download the NLTK packages `punkt`, `stopwords` and `wordnet`.
# NOTE(review): in the cells shown here only `stopwords` is actually consumed;
# confirm whether `punkt` and `wordnet` are still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
!pip install nltk requests beautifulsoup4

import requests
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
import pandas as pd



In [None]:
# Make sure the stop-word corpus is present before loading it
# (this repeats the download issued in an earlier cell).
nltk.download('stopwords')

# English stop words, stored as a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
# Stemmer that preprocess_text applies to every word it keeps.
stemmer = PorterStemmer()
# NOTE(review): only the first 1000 characters of the download are used; the
# preview printed earlier shows this region is title-page material, so the
# model sees very little of the book itself. Confirm this is intentional.
sample_text = book_text[:1000]
def preprocess_text(text):
    """Normalise raw text into a single space-separated string of stems.

    The text is stripped of URLs, of ``@``-prefixed tokens and of every
    character that is neither an ASCII letter nor whitespace. What remains is
    lower-cased and split on whitespace; words found in the module-level
    ``stop_words`` set are discarded and the rest are stemmed with the
    module-level ``stemmer``.

    Args:
        text: Raw input string.

    Returns:
        The surviving stems joined by single spaces.
    """
    # The order matters: URLs and @-tokens must go before punctuation is
    # removed, otherwise their letters would survive as ordinary words.
    for pattern in (r'http\S+', r'@\S+', r'[^a-zA-Z\s]'):
        text = re.sub(pattern, '', text)

    stems = []
    for word in text.lower().split():
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))

    return ' '.join(stems)

# Clean the sample and show at most the first 500 characters of the result.
cleaned_text = preprocess_text(sample_text)
print("\nCleaned data preview:")
print(cleaned_text[:500])  


Cleaned data preview:
start project gutenberg ebook illustr georg allen publish chare cross road london ruskin hous illustr read jane letter chap pride prejudic jane austen prefac georg saintsburi illustr hugh thomson illustr


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


*Prepare Data for the LSTM Model*

In [None]:

def create_sequences(tokens, seq_length=50):
    """Slide a fixed-size window over ``tokens`` to build training pairs.

    Args:
        tokens: Any sliceable sequence (list of tokens or a string).
        seq_length: Number of items in each input window.

    Returns:
        A list of ``(window, next_item)`` tuples, one per window position,
        where ``next_item`` is the element that directly follows the window.
        The list is empty when ``tokens`` has no more than ``seq_length`` items.
    """
    window_count = len(tokens) - seq_length
    return [
        (tokens[start:start + seq_length], tokens[start + seq_length])
        for start in range(window_count)
    ]

# NOTE(review): cleaned_text is one string, not a list of words, so each input
# window is a 50-character substring and each target is a single character.
# The resulting dataset is therefore character-level.
sequences = create_sequences(cleaned_text)
print(sequences[:5])


[('start project gutenberg ebook illustr georg allen ', 'p'), ('tart project gutenberg ebook illustr georg allen p', 'u'), ('art project gutenberg ebook illustr georg allen pu', 'b'), ('rt project gutenberg ebook illustr georg allen pub', 'l'), ('t project gutenberg ebook illustr georg allen publ', 'i')]


*Define the LSTM Model*

In [None]:
class TextGenerationModel(nn.Module):
    """Embedding -> stacked LSTM -> linear head that scores the next token."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        """Create the three sub-modules.

        Args:
            vocab_size: Number of distinct tokens; sizes both the embedding
                table and the output layer.
            embed_size: Width of each token embedding.
            hidden_size: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
        )
        # batch_first=True: inputs and outputs are laid out (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return one row of vocabulary scores per sequence in the batch.

        Args:
            x: Batch of token-index sequences laid out as (batch, seq).

        Returns:
            Tensor of shape (batch, vocab_size) with unnormalised scores.
        """
        embedded = self.embedding(x)
        sequence_output, _state = self.lstm(embedded)
        # Only the output at the final time step feeds the prediction head.
        final_step = sequence_output[:, -1, :]
        return self.fc(final_step)

# Build the vocabulary from the cleaned text. cleaned_text is a string, so
# iterating it yields individual characters: every "word" here is one character.
vocab = set(cleaned_text)
vocab_size = len(vocab)
# Enumerate in sorted order so each token receives the same index on every run.
# Iterating the set directly can give a different order in each interpreter
# session, which would make saved weights and reported results irreproducible.
word_to_index = {word: i for i, word in enumerate(sorted(vocab))}
index_to_word = {i: word for word, i in word_to_index.items()}

def sequences_to_indices(sequences, word_to_index):
    """Encode ``(input_tokens, target_token)`` pairs as integer index tensors.

    Args:
        sequences: Iterable of ``(seq_in, seq_out)`` pairs, where ``seq_in``
            is an iterable of tokens and ``seq_out`` is a single token.
        word_to_index: Mapping from token to its integer index.

    Returns:
        A tuple ``(X, y)`` of ``torch.long`` tensors: ``X`` holds one row of
        indices per input sequence and ``y`` the index of each target.

    Raises:
        KeyError: If a token is missing from ``word_to_index``.
    """
    X = []
    y = []
    for seq_in, seq_out in sequences:
        X.append([word_to_index[word] for word in seq_in])
        y.append(word_to_index[seq_out])
    # Pin the dtype: with no sequences torch.tensor([]) would default to
    # float32, which embedding lookups and class-index targets cannot use.
    return torch.tensor(X, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# Encode every (input, target) pair as index tensors for training.
X, y = sequences_to_indices(sequences, word_to_index)
# Model hyperparameters.
embed_size = 128  # width of each token embedding
hidden_size = 128  # width of the LSTM hidden state
num_layers = 2  # number of stacked LSTM layers
model = TextGenerationModel(vocab_size, embed_size, hidden_size, num_layers)


*Training the Model*

In [None]:
def train_model(model, X, y, epochs=10, batch_size=64, lr=0.001):
    """Fit ``model`` on ``(X, y)`` with Adam and cross-entropy loss.

    The data is reshuffled into mini-batches every epoch, and the mean of the
    per-batch losses is printed once per epoch. The model is updated in place;
    nothing is returned.

    Args:
        model: Module mapping a batch from ``X`` to per-class scores.
        X: Tensor of inputs, first dimension indexes samples.
        y: Tensor of target class indices aligned with ``X``.
        epochs: Number of full passes over the data.
        batch_size: Samples per mini-batch.
        lr: Learning rate handed to Adam.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    dataloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(X, y),
        batch_size=batch_size,
        shuffle=True,
    )

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for inputs, labels in dataloader:
            # Gradients accumulate by default, so clear them for each batch.
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(dataloader):.4f}')

# Train on all encoded sequences for 10 epochs; batch size and learning rate
# keep the defaults of train_model.
train_model(model, X, y, epochs=10)


Epoch [1/10], Loss: 3.0397
Epoch [2/10], Loss: 2.9924
Epoch [3/10], Loss: 2.9514
Epoch [4/10], Loss: 2.8830
Epoch [5/10], Loss: 2.7907
Epoch [6/10], Loss: 2.7067
Epoch [7/10], Loss: 2.6954
Epoch [8/10], Loss: 2.5781
Epoch [9/10], Loss: 2.5064
Epoch [10/10], Loss: 2.4076


*Generate Text Using the Model*

In [None]:

def generate_text(model, seed, word_to_index, index_to_word, seq_length=50, max_length=100):
    """Extend ``seed`` by repeatedly appending the model's top-scoring token.

    The seed is tokenised at the same granularity as the vocabulary: when
    every key of ``word_to_index`` is a single character the seed is read
    character by character and generated tokens are appended directly;
    otherwise the seed is split on whitespace and generated tokens are joined
    with spaces. Tokenising a word-split seed against a per-character
    vocabulary would drop nearly all of the seed and space out the output.

    Args:
        model: Callable mapping a (1, n) tensor of token indices to a
            (1, vocab_size) tensor of scores; must provide ``eval()``.
        seed: Text to start from; it is lower-cased before use.
        word_to_index: Mapping from token to integer index.
        index_to_word: Inverse mapping from index to token.
        seq_length: Maximum number of most recent tokens fed to the model.
            Values below 1 are treated as 1.
        max_length: Number of tokens to generate.

    Returns:
        The lower-cased seed followed by the generated tokens, or the string
        ``"Error: Seed text contains unknown words."`` when no seed token is
        present in ``word_to_index``.
    """
    model.eval()
    seed = seed.lower()

    # Infer the token granularity from the vocabulary itself.
    char_level = bool(word_to_index) and all(len(token) == 1 for token in word_to_index)
    seed_tokens = list(seed) if char_level else seed.split()
    separator = '' if char_level else ' '

    # Seed tokens the vocabulary does not know are skipped.
    input_seq = [word_to_index[token] for token in seed_tokens if token in word_to_index]
    if len(input_seq) == 0:
        return "Error: Seed text contains unknown words."

    # Keep at most `window` recent tokens so the context can grow from the
    # seed length up to the window size, then slide.
    window = max(1, seq_length)
    input_seq = input_seq[-window:]

    generated_text = seed

    with torch.no_grad():
        for _ in range(max_length):
            input_tensor = torch.tensor(input_seq).unsqueeze(0)
            output = model(input_tensor)
            # Greedy choice: take the highest-scoring token at every step.
            predicted_word_idx = output.argmax(dim=1).item()
            generated_text += separator + index_to_word[predicted_word_idx]

            input_seq = (input_seq + [predicted_word_idx])[-window:]

    return generated_text
# Generate a continuation of a fixed seed phrase using the default seq_length
# and max_length. Seed tokens that are not in word_to_index are skipped by
# generate_text.
generated = generate_text(model, "It is a truth universally acknowledged", word_to_index, index_to_word)
print(generated)


it is a truth universally acknowledged   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s   u s  


*Evaluate the Model*

In [None]:

import math

def calculate_perplexity(model, X, y):
    """Return the perplexity of ``model`` on ``(X, y)``.

    Perplexity is the exponential of the mean negative log-likelihood that
    the model assigns to the target indices. All of ``X`` is evaluated in a
    single forward pass with gradient tracking switched off.

    Args:
        model: Callable mapping ``X`` to per-class scores, one row per sample;
            must provide ``eval()``.
        X: Input tensor passed to the model unchanged.
        y: Tensor of target class indices, one per row of the model output.

    Returns:
        The perplexity as a Python float.
    """
    model.eval()
    with torch.no_grad():
        scores = model(X)
        mean_nll = torch.nn.functional.nll_loss(torch.log_softmax(scores, dim=1), y)
    return math.exp(mean_nll.item())

# NOTE(review): X and y are the same tensors the model was trained on, so this
# is a training-set perplexity rather than a held-out estimate.
perplexity = calculate_perplexity(model, X, y)
print(f"Perplexity: {perplexity:.2f}")


Perplexity: 10.44
