In [None]:
import random
import math
from collections import defaultdict

class LanguageModel:
    """Common base for language models trained on a tokenized corpus.

    The corpus is kept as given and every distinct token it contains is
    collected into ``vocab``.  Sentence generation and scoring are left to
    subclasses.
    """

    def __init__(self, corpus):
        """Store ``corpus`` and collect its vocabulary.

        ``corpus`` is iterated as a collection of sentences, each of which is
        iterated as a collection of hashable tokens.
        """
        self.corpus = corpus
        self.vocab = set()
        self.build_vocabulary()

    def build_vocabulary(self):
        """Add every token occurring in the corpus to ``self.vocab``."""
        for tokens in self.corpus:
            self.vocab.update(tokens)

    def generateSentence(self):
        """Return a generated sentence; subclasses must override this."""
        raise NotImplementedError("Subclasses must implement generateSentence method.")

    def getSentenceProbability(self, sen):
        """Return the probability of ``sen``; subclasses must override this."""
        raise NotImplementedError("Subclasses must implement getSentenceProbability method.")

class UnigramModel(LanguageModel):
    """Maximum-likelihood unigram language model.

    A token's probability is its count divided by the total number of tokens
    in the training corpus.  Every token present in the corpus is counted,
    including sentence markers when the corpus contains them.
    """

    def __init__(self, corpus):
        """Build the vocabulary and the unigram counts from ``corpus``."""
        super().__init__(corpus)
        self.unigram_counts = defaultdict(int)
        self.train_unigram_model()
        # Denominator of every unigram probability.  Computed after training
        # so it stays correct even if a subclass customises the counting.
        self.total_tokens = sum(self.unigram_counts.values())

    def train_unigram_model(self):
        """Count how often each token occurs in the training corpus."""
        for sentence in self.corpus:
            for word in sentence:
                self.unigram_counts[word] += 1

    def generateSentence(self):
        """Sample a sentence from the unigram distribution.

        Returns:
            A list of tokens that starts with ``"<s>"`` and ends with
            ``"</s>"``.  Intermediate tokens are drawn independently with
            probability proportional to their training count.

        Raises:
            ValueError: if ``"</s>"`` never occurs in the training corpus,
                because sampling could then never terminate.
        """
        if self.unigram_counts.get("</s>", 0) <= 0:
            raise ValueError(
                'cannot generate a sentence: "</s>" does not occur in the training corpus'
            )

        # "<s>" only opens a sentence, so it is never sampled as a later
        # token.  Zero-count entries (which a defaultdict can accumulate
        # through plain lookups) are skipped as well.
        candidates = [
            word
            for word, count in self.unigram_counts.items()
            if count > 0 and word != "<s>"
        ]
        weights = [self.unigram_counts[word] for word in candidates]

        sentence = ["<s>"]
        while sentence[-1] != "</s>":
            sentence.append(random.choices(candidates, weights=weights)[0])
        return sentence

    def getSentenceProbability(self, sen):
        """Return the product of the unigram probabilities of ``sen``.

        Args:
            sen: iterable of tokens.

        Returns:
            The probability as a float; ``0.0`` as soon as a token that was
            never seen in training is encountered, ``1.0`` for an empty
            sentence.
        """
        total_tokens = sum(self.unigram_counts.values())
        probability = 1.0
        for word in sen:
            # .get() avoids inserting unseen words into the defaultdict.
            count = self.unigram_counts.get(word, 0)
            if count == 0:
                # Unseen token: the whole product is zero.  Returning early
                # also avoids dividing by zero for an empty training corpus.
                return 0.0
            probability *= count / total_tokens
        return probability

class SmoothedUnigramModel(UnigramModel):
    """Unigram model with add-one (Laplace) smoothing.

    Construction and sentence generation are inherited unchanged; only the
    sentence scoring differs.
    """

    def getSentenceProbability(self, sen):
        """Return the add-one smoothed unigram probability of ``sen``.

        Each token contributes ``(count + 1) / (total_tokens + vocab_size)``,
        so tokens unseen in training still receive a non-zero probability.

        Args:
            sen: iterable of tokens.

        Returns:
            The product of the smoothed token probabilities (``1.0`` for an
            empty sentence).
        """
        # Both terms of the denominator are the same for every token.
        denominator = sum(self.unigram_counts.values()) + len(self.vocab)

        probability = 1.0
        for word in sen:
            # .get() keeps scoring read-only: indexing the defaultdict would
            # add a zero-count entry for every unseen word.
            probability *= (self.unigram_counts.get(word, 0) + 1) / denominator
        return probability

class BigramModel(LanguageModel):
    """Maximum-likelihood bigram language model.

    ``P(word | prev) = count(prev, word) / count(prev)`` where the counts are
    taken from the training corpus.
    """

    def __init__(self, corpus):
        """Build the vocabulary and the n-gram counts from ``corpus``."""
        super().__init__(corpus)
        self.bigram_counts = defaultdict(int)
        # Token counts: the denominators of the conditional probabilities.
        self.unigram_counts = defaultdict(int)
        # prev_word -> {next_word: count}.  Lets generation look up the
        # observed successors directly instead of scanning the vocabulary.
        self._successors = defaultdict(dict)
        self.train_bigram_model()

    def train_bigram_model(self):
        """Count tokens and adjacent token pairs in the training corpus."""
        for sentence in self.corpus:
            for word in sentence:
                self.unigram_counts[word] += 1
            for prev_word, word in zip(sentence, sentence[1:]):
                self.bigram_counts[(prev_word, word)] += 1
                followers = self._successors[prev_word]
                followers[word] = followers.get(word, 0) + 1

    def generateSentence(self):
        """Sample a sentence from the bigram distribution.

        Starting from ``"<s>"``, each next token is drawn from the tokens
        that followed the previous one in training, with probability
        proportional to the bigram count.

        Returns:
            A list of tokens beginning with ``"<s>"``.  It ends with
            ``"</s>"``, or earlier if the last token has no observed
            successor.
        """
        sentence = ["<s>"]
        while sentence[-1] != "</s>":
            # .get() on the defaultdict does not create an empty entry.
            followers = self._successors.get(sentence[-1])
            if not followers:
                break
            candidates = list(followers)
            weights = [followers[word] for word in candidates]
            sentence.append(random.choices(candidates, weights=weights)[0])
        return sentence

    def getSentenceProbability(self, sen):
        """Return the product of the bigram probabilities along ``sen``.

        Args:
            sen: sequence of tokens.

        Returns:
            The probability as a float; ``0.0`` as soon as an unseen bigram
            is encountered, ``1.0`` when ``sen`` has fewer than two tokens.
        """
        probability = 1.0
        for prev_word, word in zip(sen, sen[1:]):
            # .get() keeps scoring read-only; indexing the defaultdict would
            # record the pair with count 0 and make it look "observed".
            pair_count = self.bigram_counts.get((prev_word, word), 0)
            context_count = self.unigram_counts.get(prev_word, 0)
            if pair_count == 0 or context_count == 0:
                return 0.0
            probability *= pair_count / context_count
        return probability

class SmoothedBigramModelLI(BigramModel):
    """Bigram model smoothed by linear interpolation with a unigram model.

    Each transition is scored as
    ``(1 - lambda_val) * P(word | prev) + lambda_val * P(word)``.
    """

    def __init__(self, corpus, lambda_val=0.5):
        """Train the model.

        Args:
            corpus: collection of tokenized sentences.
            lambda_val: weight of the unigram term; the bigram term gets
                ``1 - lambda_val``.
        """
        super().__init__(corpus)
        self.lambda_val = lambda_val
        # Token counts needed by the interpolation formula.  They are built
        # here so scoring never depends on the base class providing them.
        self.unigram_counts = defaultdict(int)
        for sentence in self.corpus:
            for word in sentence:
                self.unigram_counts[word] += 1
        # Denominator of the unigram term: number of tokens, not sentences.
        self.total_tokens = sum(self.unigram_counts.values())

    def getSentenceProbability(self, sen):
        """Return the interpolated probability of ``sen``.

        Args:
            sen: sequence of tokens.

        Returns:
            The product of the interpolated transition probabilities
            (``1.0`` when ``sen`` has fewer than two tokens).  A term whose
            count or denominator is zero contributes ``0.0`` rather than
            raising.
        """
        bigram_weight = 1 - self.lambda_val
        unigram_weight = self.lambda_val

        probability = 1.0
        for prev_word, word in zip(sen, sen[1:]):
            # .get() keeps scoring read-only; indexing a defaultdict would
            # insert zero-count entries for unseen words and pairs.
            context_count = self.unigram_counts.get(prev_word, 0)
            if context_count:
                bigram_prob = self.bigram_counts.get((prev_word, word), 0) / context_count
            else:
                bigram_prob = 0.0

            if self.total_tokens:
                unigram_prob = self.unigram_counts.get(word, 0) / self.total_tokens
            else:
                unigram_prob = 0.0

            probability *= bigram_weight * bigram_prob + unigram_weight * unigram_prob
        return probability

def generateSentencesToFile(model, numberOfSentences, filename):
    """Write generated sentences, one per line, to ``filename``.

    Each line holds the sentence's probability under ``model`` followed by a
    space and the space-joined tokens.  The file is created or truncated.

    Args:
        model: object providing ``generateSentence()`` and
            ``getSentenceProbability(sentence)``.
        numberOfSentences: how many sentences to generate.
        filename: path of the output file.
    """
    with open(filename, 'w+') as out:
        for _ in range(numberOfSentences):
            tokens = model.generateSentence()
            score = model.getSentenceProbability(tokens)
            out.write(f"{score!s} {' '.join(tokens)}\n")

def perplexity(model, test_corpus):
    """Compute the per-word perplexity of ``model`` on ``test_corpus``.

    The perplexity is ``exp(-sum(log P(sentence)) / total_words)`` where
    ``total_words`` is the summed length of all sentences.

    Args:
        model: object providing ``getSentenceProbability(sentence)``.
        test_corpus: iterable of tokenized sentences (each supporting
            ``len``).

    Returns:
        The perplexity as a float; ``math.inf`` if the model assigns zero
        probability to any sentence.

    Raises:
        ValueError: if the corpus contains no words, since the perplexity is
            undefined in that case.
    """
    total_log_probability = 0.0
    total_words = 0

    for sentence in test_corpus:
        total_words += len(sentence)
        sentence_probability = model.getSentenceProbability(sentence)
        if sentence_probability <= 0:
            # log(0) is undefined; a zero-probability sentence makes the
            # corpus infinitely surprising to the model.
            return math.inf
        total_log_probability += math.log(sentence_probability)

    if total_words == 0:
        raise ValueError("cannot compute perplexity of a corpus with no words")

    return math.exp(-total_log_probability / total_words)

def read_corpus_from_file(file_path):
    """Read a whitespace-tokenized corpus from a text file.

    Args:
        file_path: path of a text file with one sentence per line.

    Returns:
        A list with one entry per line, each a list of the line's
        whitespace-separated tokens (an empty list for a blank line).
    """
    sentences = []
    with open(file_path, 'r') as handle:
        for line in handle:
            # split() without arguments already ignores leading and trailing
            # whitespace, including the newline.
            sentences.append(line.split())
    return sentences


train_corpus_file = 'train.txt'
train_corpus = read_corpus_from_file(train_corpus_file)

# Train one model of each kind on the same training corpus.
unigram_model = UnigramModel(train_corpus)
smooth_unigram_model = SmoothedUnigramModel(train_corpus)
bigram_model = BigramModel(train_corpus)
smooth_bigram_model = SmoothedBigramModelLI(train_corpus)

# Write 20 generated sentences per model, each prefixed with its probability.
generateSentencesToFile(unigram_model, 20, "unigram_output.txt")
generateSentencesToFile(smooth_unigram_model, 20, "smooth_unigram_output.txt")
generateSentencesToFile(bigram_model, 20, "bigram_output.txt")
generateSentencesToFile(smooth_bigram_model, 20, "smooth_bigram_output.txt")

# Perplexity calculation

# Two test corpora (positive and negative)

pos_test_file = 'pos_test.txt'
pos_test_corpus = read_corpus_from_file(pos_test_file)

neg_test_file = 'neg_test.txt'
neg_test_corpus = read_corpus_from_file(neg_test_file)

# perplexity() needs the tokenized corpus, not the file path: iterating the
# path string would score the individual characters of the file name.
perplexity_unigram = perplexity(unigram_model, pos_test_corpus)
perplexity_smooth_unigram = perplexity(smooth_unigram_model, pos_test_corpus)
perplexity_bigram = perplexity(bigram_model, neg_test_corpus)
perplexity_smooth_bigram = perplexity(smooth_bigram_model, neg_test_corpus)

print("Perplexity for Unigram Model on Test Corpus 1:", perplexity_unigram)
print("Perplexity for Smoothed Unigram Model on Test Corpus 1:", perplexity_smooth_unigram)
print("Perplexity for Bigram Model on Test Corpus 2:", perplexity_bigram)
print("Perplexity for Smoothed Bigram Model on Test Corpus 2:", perplexity_smooth_bigram)


