# N-Gram Language models using python

In [1]:
#importing  required Libraries
import re
from collections import Counter, defaultdict
import random
import math
import requests
import os

In [2]:
def generate_ngrams(text, n):
    """
    Generate n-grams (character-level) from a given text.
    
    Parameters:
    text (str): Input text
    n (int): Size of the n-grams
    
    Returns:
    list: A list of n-grams as tuples
    """
    #Added padding with '#' character to handle the start of sequences
    padded_text ='#'*(n-1) +text
    ngrams=[]
    for i in range(len(padded_text) - n + 1):
        ngram= tuple(padded_text[i:i+1])
        ngrams.append(ngram)
    return ngrams

In [3]:
#Example Text
text = "hello world"

#Generate and display bigrams (2-grams)
bigrams = generate_ngrams(text, 2)
print("Character- Level Bigrams:",bigrams)

Character- Level Bigrams: [('#',), ('h',), ('e',), ('l',), ('l',), ('o',), (' ',), ('w',), ('o',), ('r',), ('l',)]


In [4]:
def build_ngram_model(corpus, n):
    """
    Build an n-grams language model from the corpus.
    
    Parameter:
    corpus (str): Text corpus for building the model
    n (int):Size of the n-grams
    
    Returns:
    dict: A probability distribution for each context
    """
    
    #Initialize the model
    model = deafultdict(Counter)
    
    #Generate n-grams
    ngrams = generate_ngrams(corpus,n)
    
    #Build the model
    for ngram in ngrams:
        context = ngram[:-1] #all but the last character
        char = ngram[-1]     #the last character
        model[context][char] +=1
        
    #convert counts to probabilties
    for context in model:
        total_count = sum(model[context].values())
        for char in model[context]:
            model[context][char] =model[context][char]/total_count
            
    return model

In [6]:
def add_smoothing(model, vocabulary_size,alpha = 1.0):
    """
    Apply smoothing to an n-gram model.
    
    Parameters:
    model (defaultdict): N-gram model.
    vocabulary_size(int): Total number of unique character in the vocabulary
    alpha (float): Smoothing parameter (default is 1.0).
    
    Returns:
    defaultdict: Smoothed n-gram model.
    
    """
    
    smoothed_model = defaultdict(Counter)
    for prefix, char_counts in model.items():
        total_count = sum(char_counts.values())  + aplha * vocabulary_size
        for char in char_counts:
            smoothed_model[prefix][char] = (char_counts[char] + alpha)/total_counts
        for char in range(vocabulary_size):
            if char not in char_counts:
                smoothed_model[prefix][char] = alpha /toatl_count
    return smoothed_model

    
    

In [7]:
def generate_text(model, n , strat_text, length=100):
    """
    Generate text using the n-gram model.
    
    Parameters:
    model (dict): Trained n-gram model
    n(int) : Size of the n-grams
    start_text (str): Initial text to strat generation
    length (int): Number of character to generates
    
    Returns:
    str: Generated text
    """
    
    #Initialize with start text
    current_text = list(start_text)
    
    #Generate characters
    for _ in range(length):
        #get the current context
        context = tuple(current_text[-(n-1):]) if len(current_text) >= n-1 else tuple('#' * (n-1 - len(current_text)) + ''.join(current_text))
    
    
        #If context not in model, break
        if context not in model:
            break
        
        #Get probability distribution for next character
        char_dist = model[context]
    
        #sample next character
        chars, probs = zip(*char_dist.items())
        next_char = random.choices(chars, weights=probs)[0]
    
        #Append to generated text
        current_text.append(next_char)
    return ''.join(current_text)

    