In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Function to preprocess the text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens if token.isalnum()]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Load and preprocess text files
def load_and_preprocess(file_paths):
    tokens = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        tokens.extend(preprocess_text(text))
    return tokens

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.targets[idx], dtype=torch.long)

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])
        return output

def train_and_save_model(file_paths, model_path):
    tokens = load_and_preprocess(file_paths)
    vocab = list(set(tokens))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for idx, word in enumerate(vocab)}

    sequences = []
    targets = []
    sequence_length = 5
    
    for i in range(len(tokens) - sequence_length):
        seq = tokens[i:i + sequence_length]
        target = tokens[i + sequence_length]
        sequences.append([word_to_idx[word] for word in seq])
        targets.append(word_to_idx[target])

    dataset = TextDataset(sequences, targets)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    
    vocab_size = len(vocab)
    embedding_dim = 100
    hidden_dim = 128
    output_dim = vocab_size
    
    model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')
    
    # Save the model and dictionaries
    torch.save({
        'model_state_dict': model.state_dict(),
        'word_to_idx': word_to_idx,
        'idx_to_word': idx_to_word
    }, model_path)

if __name__ == "__main__":
    file_paths = ["C:/Users/varsh/Downloads/Sherlock Holmes.txt"]
    model_path = "lstm_model.pth"
    train_and_save_model(file_paths, model_path)

Epoch 1/10, Loss: 8.29773235321045
Epoch 2/10, Loss: 6.782962322235107
Epoch 3/10, Loss: 6.601043701171875
Epoch 4/10, Loss: 4.7232584953308105
Epoch 5/10, Loss: 5.592641830444336
Epoch 6/10, Loss: 4.45326042175293
Epoch 7/10, Loss: 4.140463352203369
Epoch 8/10, Loss: 2.6678214073181152
Epoch 9/10, Loss: 2.4368321895599365
Epoch 10/10, Loss: 3.4015085697174072


In [33]:
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import json

# Download necessary NLTK data files
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Function to preprocess the text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens if token.isalnum()]
    return tokens

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])
        return output

def load_model(model_path):
    checkpoint = torch.load(model_path)
    word_to_idx = checkpoint['word_to_idx']
    idx_to_word = checkpoint['idx_to_word']
    
    vocab_size = len(word_to_idx)
    embedding_dim = 100
    hidden_dim = 128
    output_dim = vocab_size
    
    model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    return model, word_to_idx, idx_to_word

def predict_next_word_with_probs(model, word_to_idx, idx_to_word, input_sentence, sequence_length=5, top_n=10):
    input_tokens = preprocess_text(input_sentence)
    print(f"Input tokens: {input_tokens}")  # Debugging: print input tokens
    
    if not input_tokens:
        return "No valid input tokens found."
    
    unk_token = '<UNK>'
    if unk_token not in word_to_idx:
        word_to_idx[unk_token] = len(word_to_idx)
        idx_to_word[len(idx_to_word)] = unk_token
    
    input_sequence = [word_to_idx.get(token, word_to_idx[unk_token]) for token in input_tokens[-sequence_length:]]
    print(f"Input sequence (token indices): {input_sequence}")  # Debugging: print input sequence
    
    if len(input_sequence) == 0:
        return "Input sequence is empty after preprocessing."
    
    # Ensure all token indices are within the range of the vocabulary size
    vocab_size = model.embedding.num_embeddings
    input_sequence = [idx if idx < vocab_size else word_to_idx[unk_token] for idx in input_sequence]
    
    input_sequence = torch.tensor([input_sequence], dtype=torch.long)
    
    with torch.no_grad():
        output = model(input_sequence)
        print(f"Model output: {output}")  # Debugging: print model output
        probabilities = torch.softmax(output, dim=1).squeeze().tolist()
        
        predictions = {idx_to_word[idx]: prob for idx, prob in enumerate(probabilities)}
        
        # Sort predictions by probability
        sorted_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:top_n]
        return sorted_predictions

if __name__ == "__main__":
    model_path = "lstm_model.pth"
    model, word_to_idx, idx_to_word = load_model(model_path)
    
    input_sentence = input("Enter a word or a sentence: ")
    
    predictions_lstm = predict_next_word_with_probs(model, word_to_idx, idx_to_word, input_sentence)
    
    print("LSTM Predictions:", predictions_lstm)
    
    # Save predictions to a file
    with open('lstm_predictions.json', 'w') as f:
        json.dump(predictions_lstm, f)

Enter a word or a sentence: good
Input tokens: ['good']
Input sequence (token indices): [2638]
Model output: tensor([[-0.6026,  3.0582, -3.5429,  ..., -2.1907,  0.2534, -1.1391]])
LSTM Predictions: [('enough', 0.0551033690571785), ('deal', 0.03613436594605446), ('heaven', 0.015592057257890701), ('made', 0.014579476788640022), ('sen', 0.013655569404363632), ('cri', 0.008337687700986862), ('chanc', 0.00746711902320385), ('come', 0.007198211271315813), ('word', 0.007075964007526636), ('articl', 0.0069364639930427074)]
