# In [None]:
import nltk
nltk.download('punkt_tab')
import requests
import re
from collections import Counter
from nltk.util import bigrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import math
# Download necessary NLTK resources
nltk.download('punkt')

class BigramLanguageModel:
    """Bigram language model with Lidstone (additive) smoothing.

    The model keeps raw unigram and bigram counts and derives smoothed
    conditional probabilities ``P(word2 | word1)`` from them on demand.
    """

    def __init__(self, lambda_smoothing=0.1):
        """Create an untrained model.

        Args:
            lambda_smoothing: Additive constant applied to every count when
                a smoothed probability is computed.
        """
        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.vocab_size = 0
        self.lambda_smoothing = lambda_smoothing

    def preprocess_text(self, text):
        """Collapse whitespace, lowercase the text and tokenize it.

        Punctuation is not stripped before tokenizing.

        Args:
            text: Raw input string.

        Returns:
            The list of tokens produced by ``word_tokenize``.
        """
        text = re.sub(r'\s+', ' ', text)  # Normalize runs of whitespace
        return word_tokenize(text.lower())

    def train(self, text):
        """Add the unigram and bigram counts of ``text`` to the model.

        May be called several times; counts accumulate across calls.

        Args:
            text: Raw training text.
        """
        tokens = self.preprocess_text(text)
        self.unigram_counts.update(tokens)
        self.bigram_counts.update(bigrams(tokens))
        # Derive the vocabulary size from the accumulated counts so it stays
        # consistent with the counters when train() is called more than once.
        self.vocab_size = len(self.unigram_counts)

    def compute_bigram_probability(self, word1, word2):
        """Return the Lidstone-smoothed estimate of ``P(word2 | word1)``.

        Computed as ``(count(word1, word2) + lambda) /
        (count(word1) + lambda * vocab_size)``.

        Args:
            word1: The conditioning (previous) word.
            word2: The word whose probability is estimated.

        Returns:
            The smoothed probability as a float.

        Raises:
            ZeroDivisionError: If the denominator is zero, i.e. ``word1`` has
                a zero count and ``lambda_smoothing * vocab_size`` is zero.
        """
        bigram_count = self.bigram_counts[(word1, word2)]
        unigram_count = self.unigram_counts[word1]
        denominator = unigram_count + self.lambda_smoothing * self.vocab_size
        return (bigram_count + self.lambda_smoothing) / denominator

    def generate_sequence(self, start_word, length=20):
        """Randomly generate a space-joined word sequence.

        Each next word is sampled from the words observed after the current
        last word, weighted by their smoothed bigram probability. When the
        last word has no observed successor, sampling backs off to the
        smoothed unigram distribution over the whole vocabulary.

        Args:
            start_word: First word of the sequence (used as given).
            length: Maximum number of words in the result, including
                ``start_word``.

        Returns:
            The generated words joined by single spaces. The result is
            shorter than ``length`` if the model has no words to sample.
        """
        sequence = [start_word]
        for _ in range(length - 1):
            previous = sequence[-1]
            # Only words actually seen after `previous` are candidates.
            candidates = [
                word for word in self.unigram_counts
                if (previous, word) in self.bigram_counts
            ]
            if candidates:
                weights = [
                    self.compute_bigram_probability(previous, word)
                    for word in candidates
                ]
            else:
                # Back off to smoothed unigram probabilities.
                candidates = list(self.unigram_counts)
                if not candidates:
                    break  # Untrained model: nothing to sample from
                total_unigram_count = sum(self.unigram_counts.values())
                denominator = total_unigram_count + (
                    self.lambda_smoothing * len(self.unigram_counts)
                )
                weights = [
                    (self.unigram_counts[word] + self.lambda_smoothing) / denominator
                    for word in candidates
                ]
            # Renormalize over the candidate set so the weights sum to 1.
            total_weight = sum(weights)
            normalized_weights = [weight / total_weight for weight in weights]
            next_word = random.choices(candidates, weights=normalized_weights, k=1)[0]
            sequence.append(next_word)

        return ' '.join(sequence)

    def compute_perplexity(self, text):
        """Compute the model's perplexity on ``text``.

        Perplexity is ``exp(-(1/N) * sum(log P(w_i | w_{i-1})))`` where ``N``
        is the number of bigrams in the tokenized text.

        Args:
            text: Raw evaluation text.

        Returns:
            The perplexity as a float.

        Raises:
            ValueError: If ``text`` yields fewer than two tokens, so there is
                no bigram to score.
        """
        tokens = self.preprocess_text(text)
        num_bigrams = len(tokens) - 1
        if num_bigrams < 1:
            raise ValueError(
                "compute_perplexity needs at least two tokens (one bigram)"
            )

        log_prob_sum = 0
        for word1, word2 in zip(tokens, tokens[1:]):
            log_prob_sum += math.log(self.compute_bigram_probability(word1, word2))

        return math.exp(-log_prob_sum / num_bigrams)
def read_local_txt(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Example usage: train on a local corpus, then query, generate and evaluate.
file_path = "WarrenBuffet.txt"  # Replace with your local file path
text = read_local_txt(file_path)
# Train the model on the whole file
bigram_model = BigramLanguageModel(lambda_smoothing=0.1)
bigram_model.train(text)

# Query the smoothed probability P(word2 | word1) of a single bigram
word1, word2 = "dog", "is"
prob = bigram_model.compute_bigram_probability(word1, word2)
print(f"P({word2} | {word1}) = {prob:.8f}")

# Generate a random 100-word sequence starting from "we"
generated_sentence = bigram_model.generate_sequence("we", length=100)
print("Generated Sentence:", generated_sentence)
# Compute perplexity on the training data itself (the same text used to train)
perplexity = bigram_model.compute_perplexity(text)
print(f"Perplexity on training data: {perplexity:.4f}")