# In [2]:
import nltk
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
import json

# Function to preprocess the text
def preprocess_text(text):
    """Convert raw text into a cleaned list of lowercase tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``; tokens
    that are English stopwords or that are not purely alphanumeric
    (``str.isalnum``) are discarded.

    Args:
        text: The raw string to process.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    words = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    cleaned = []
    for word in words:
        # Drop stopwords first, then anything containing punctuation/symbols
        if word in english_stopwords:
            continue
        if not word.isalnum():
            continue
        cleaned.append(word)
    return cleaned

# Function to create n-grams
def create_ngrams(tokens, n):
    """Return every n-gram of ``tokens`` as a single space-separated string.

    Args:
        tokens: Sequence of string tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of strings, one per n-gram produced by NLTK's ``ngrams``,
        with the tokens of each n-gram joined by a single space.
    """
    join_with_space = ' '.join
    return list(map(join_with_space, ngrams(tokens, n)))

# Load and preprocess the text file
def load_and_preprocess(file_path):
    """Read a UTF-8 text file and return its preprocessed tokens.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The token list produced by ``preprocess_text`` for the file's
        entire contents.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    return preprocess_text(raw_text)

# Predict the next words based on the input sentence
def predict_next_words(tokens, input_sentence, num_words):
    """Suggest words that follow the input phrase somewhere in the corpus.

    The input sentence is cleaned with ``preprocess_text``. The corpus is
    then scanned left to right for positions where those cleaned tokens
    occur consecutively, and the token right after each occurrence is
    collected. Each distinct word is reported once, in order of first
    appearance.

    Args:
        tokens: Indexable sequence of corpus tokens, already preprocessed.
        input_sentence: Raw user text whose continuation is wanted.
        num_words: Maximum number of distinct words to return.

    Returns:
        A list of at most ``num_words`` distinct following words. The list
        is empty when ``num_words`` is not positive, when the input contains
        no usable tokens after preprocessing, or when the phrase never
        occurs in the corpus with a word after it.
    """
    input_tokens = preprocess_text(input_sentence)
    span = len(input_tokens)

    # An empty pattern would trivially "match" at every corpus position and
    # yield arbitrary words; a non-positive request should yield nothing.
    if span == 0 or num_words <= 0:
        return []

    predicted_words = []
    seen = set()

    # Stop one window short of the end so tokens[start + span] always exists.
    for start in range(len(tokens) - span):
        # Short-circuit on the first mismatching position in the window.
        if any(tokens[start + offset] != word
               for offset, word in enumerate(input_tokens)):
            continue

        next_word = tokens[start + span]
        if next_word in seen:
            continue

        seen.add(next_word)
        predicted_words.append(next_word)
        if len(predicted_words) >= num_words:
            break

    return predicted_words

# Main function
def main(file_path="C:/Users/varsh/Downloads/Sherlock Holmes.txt",
         output_path='ngram_predictions.json'):
    """Run the interactive next-word predictor.

    Loads and preprocesses the corpus, asks the user for a phrase and for
    how many words to predict, prints the predictions, and saves them as a
    JSON list.

    Args:
        file_path: Path of the corpus text file. Defaults to the location
            originally hard-coded in this script; pass another path to use
            a different corpus.
        output_path: Path of the JSON file the predictions are written to.

    Returns:
        None. If the requested count is not a positive whole number, a
        message is printed and nothing is predicted or written.
    """
    # Load and preprocess the text file
    tokens = load_and_preprocess(file_path)

    # User input
    input_sentence = input("Enter a word or a sentence: ")
    raw_count = input("How many words do you want to predict? ")

    # Reject non-numeric or non-positive counts with a message instead of
    # letting int() raise an unhandled ValueError.
    try:
        num_words = int(raw_count)
    except ValueError:
        print(f"'{raw_count}' is not a whole number; nothing was predicted.")
        return
    if num_words < 1:
        print("The number of words to predict must be at least 1.")
        return

    # Predict the next words
    predicted_words = predict_next_words(tokens, input_sentence, num_words)

    # Tell the user when fewer words than requested could be found
    if len(predicted_words) < num_words:
        print(f"Only {len(predicted_words)} predicted words are available. These are the available words:")
        print(predicted_words)
    else:
        print("Predicted next words:", predicted_words)

    # Save predictions to a JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(predicted_words, f)

# Run the interactive predictor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Example session:
# Enter a word or a sentence: good
# How many words do you want to predict? 10
# Predicted next words: ['quite', 'enough', 'news', 'deal', 'lose', 'oh', 'bowed', 'look', 'wilson', 'worker']
