In [3]:
from collections import defaultdict

class NGramLanguageModel:
    """Count-based n-gram model over whitespace-separated tokens.

    Training slides a window of ``n`` tokens across the corpus and counts
    every n-gram it sees. A sequence is scored as the product, over each of
    its n-grams, of that n-gram's relative frequency in the training corpus
    (``count / total_ngrams``). These are joint n-gram frequencies, not
    conditional next-word probabilities.
    """

    # Factor applied for every n-gram that never occurred in the corpus.
    UNSEEN_NGRAM_PROBABILITY = 1e-10

    def __init__(self, corpus: str, n: int) -> None:
        """Build the model from ``corpus`` using n-grams of length ``n``.

        Args:
            corpus: Training text; tokens are split on whitespace.
            n: Number of tokens per n-gram; must be at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            # n == 0 would count empty tuples and score everything as 1.0.
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        # Maps each n-gram (a tuple of n tokens) to its occurrence count.
        self.ngram_counts = defaultdict(int)
        # Total number of n-gram occurrences seen during training.
        self.total_ngrams = 0
        self.train(corpus)

    def _ngrams(self, text: str):
        """Yield each consecutive n-token tuple of ``text``, left to right."""
        tokens = text.split()
        # A text with fewer than n tokens yields nothing (empty range).
        for start in range(len(tokens) - self.n + 1):
            yield tuple(tokens[start:start + self.n])

    def train(self, corpus: str) -> None:
        """Add the n-gram counts of ``corpus`` to the model.

        Counts accumulate, so calling this again extends the existing model
        rather than replacing it.
        """
        for ngram in self._ngrams(corpus):
            self.ngram_counts[ngram] += 1
            self.total_ngrams += 1

    def probability(self, sequence: str) -> float:
        """Return the model's score for ``sequence``.

        The score is the product of ``count / total_ngrams`` over the
        sequence's n-grams; an n-gram never seen in training contributes
        ``UNSEEN_NGRAM_PROBABILITY`` instead. A sequence with fewer than
        ``n`` tokens has no n-grams and therefore scores 1.0.

        Scoring is read-only: unseen n-grams are not added to the counts.
        """
        sequence_prob = 1.0
        for ngram in self._ngrams(sequence):
            # .get() avoids defaultdict's insert-on-missing behaviour, which
            # would otherwise grow the table with zero-count entries.
            ngram_count = self.ngram_counts.get(ngram, 0)
            if ngram_count > 0:
                sequence_prob *= ngram_count / self.total_ngrams
            else:
                sequence_prob *= self.UNSEEN_NGRAM_PROBABILITY
        return sequence_prob

def main():
    """Train a bigram model on a toy corpus and rank a few sentences.

    Prints the score of every candidate sentence, then the sentence with
    the highest score together with that score.
    """
    training_text = "I like to eat apples. I like to eat bananas. I like to eat oranges."

    # N-gram order used by the model; change this to adjust N.
    order = 2
    model = NGramLanguageModel(training_text, order)

    candidates = [
        "I like to eat apples.",
        "I like to eat bananas.",
        "I like to eat oranges.",
        "Apples like to eat I.",
        "Oranges eat to like I.",
    ]

    # Score every candidate, keeping the candidates' original order.
    scores = {text: model.probability(text) for text in candidates}

    # On ties, max() keeps the first candidate that reached the top score.
    best = max(scores, key=lambda text: scores[text])

    print("Probabilities for each sentence:")
    for text in scores:
        print(f"{text}: {scores[text]}")

    print("\nSentence with the highest probability:")
    print(best)
    print("Probability:", scores[best])

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Probabilities for each sentence:
I like to eat apples.: 0.0007028321532694709
I like to eat bananas.: 0.0007028321532694709
I like to eat oranges.: 0.0007028321532694709
Apples like to eat I.: 4.591836734693877e-22
Oranges eat to like I.: 1.0000000000000001e-40

Sentence with the highest probability:
I like to eat apples.
Probability: 0.0007028321532694709
