In [2]:
import PyPDF2
import re
import nltk
from nltk.tokenize import sent_tokenize

In [3]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('tokenizers')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sundeshkodali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sundeshkodali/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Error loading tokenizers: Package 'tokenizers' not found
[nltk_data]     in index


False

# 1. Download PDF and Clean Up Sentences

In [4]:
def clean_sentence(sentence):
    """
    Clean a sentence by:
    - Converting to lowercase
    - Removing numbers, signs, and mid-sentence punctuation
    - Keeping only letters and end-sentence punctuation (period and question mark)
    - Converting all other end-sentence punctuation to periods
    
    Args:
        sentence (str): Input sentence to clean
        
    Returns:
        str: Cleaned sentence
    """
    # Step 1: Convert to lowercase
    sentence = sentence.lower()
    
    # Step 2: Preserve end punctuation
    ends_with_question = sentence.strip().endswith('?')
    
    # Step 3: Remove all characters except letters and spaces
    cleaned = re.sub(r'[^a-z\s]', '', sentence)
    
    # Step 4: Clean up extra whitespace
    cleaned = ' '.join(cleaned.split())
    
    # Step 5: Add appropriate end punctuation
    if cleaned:  # Only add punctuation if sentence is not empty
        if ends_with_question:
            cleaned += '?'
        else:
            cleaned += '.'
            
    return cleaned

def extract_sentences_from_pdf(pdf_path, start_page=0):
    """
    Extract sentences from a PDF file, skipping initial pages (like table of contents)
    and filtering out technical metadata lines
    
    Args:
        pdf_path (str): Path to the PDF file
        start_page (int): Page number to start extraction from (0-based index)
        
    Returns:
        list: List of cleaned sentences as strings
    """
    # Download necessary NLTK data (run once)
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    
    sentences = []
    
    try:
        # Open PDF file
        with open(pdf_path, 'rb') as file:
            # Create PDF reader object
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Get total number of pages
            num_pages = len(pdf_reader.pages)
            
            # Validate start_page
            if start_page >= num_pages:
                raise ValueError("start_page cannot be greater than total number of pages")
            
            # Extract text from each page
            for page_num in range(start_page, num_pages):
                # Get page object
                page = pdf_reader.pages[page_num]
                
                # Extract text from page
                text = page.extract_text()
                
                # Clean the text
                # Remove extra whitespace and newlines
                text = re.sub(r'\s+', ' ', text)
                text = text.strip()
                
                # Tokenize text into sentences
                page_sentences = sent_tokenize(text)
                
                # Filter out technical lines and clean sentences
                filtered_sentences = []
                for sentence in page_sentences:
                    # Skip if sentence contains file paths, timestamps, or page numbers
                    if any(pattern in sentence.lower() for pattern in [
                        'file:///', 
                        '.htm',
                        '7/1/2006',
                        'am',
                        'pm',
                        '(page',
                        'of 14)'
                    ]):
                        continue
                    
                    # Clean the sentence
                    clean_sent = clean_sentence(sentence)
                    if clean_sent:  # Only add non-empty sentences
                        filtered_sentences.append(clean_sent)
                
                # Add filtered sentences to main list
                sentences.extend(filtered_sentences)
            
            return sentences
            
    except FileNotFoundError:
        print(f"Error: File not found at {pdf_path}")
        return []
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []


pdf_path = "MahabharataOfVyasa.pdf"  # Replace with your PDF path
start_page = 84  # Skip first 5 pages (adjust as needed)
    
sentences = extract_sentences_from_pdf(pdf_path, start_page)
    
# Print first few sentences as example
for i, sentence in enumerate(sentences[:25]):
    print(f"Sentence {i+1}: {sentence}")

Sentence 1: the mahabharata book adi parva section i and sovereigns of mankind.
Sentence 2: p the rishi replied the purana first promulgated by the great rishi dwaipayana and which after having been heard both by the gods and the brahmarshis was highly esteemed being the most eminent narrative that exists diversified both in diction and division possessing subtile meanings logically combined and gleaned from the vedas is a sacred work.
Sentence 3: composed in elegant language it includeth the subjects of other books.
Sentence 4: it is elucidated by other shastras and comprehendeth the sense of the four vedas.
Sentence 5: sauti then said having bowed down to the primordial being isana to whom multitudes make offerings and who is adored by the multitude who is the true incorruptible one brahma perceptible imperceptible eternal who is both a nonexisting and an existingnonexisting being who is the universe and also distinct from the existing and nonexisting universe who is the creator of h

In [5]:
print(len(sentences))

103838


# 2. Trigram Model

## Create a 3D tensor to hold frequencies of all trigrams

In [7]:
import torch

N = torch.zeros((30, 30, 30), dtype = torch.int32) #there will be 30 total characters, 26 letters, blank space, period, question mark, starting character

In [8]:
chars = sorted(list(set(''.join(sentences)))) #get all unique characters in names, without duplicates, and sort a to z
stoi = {s:i+1 for i,s in enumerate(chars)} #dictionary where each key is a letter, and value is index, +1 makes a start at 1, so . takes 0
stoi['>'] =0
itos = {i:s for s,i in stoi.items()} # flip key value pairs

In [9]:
print(stoi) 

{' ': 1, '.': 2, '?': 3, 'a': 4, 'b': 5, 'c': 6, 'd': 7, 'e': 8, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'j': 13, 'k': 14, 'l': 15, 'm': 16, 'n': 17, 'o': 18, 'p': 19, 'q': 20, 'r': 21, 's': 22, 't': 23, 'u': 24, 'v': 25, 'w': 26, 'x': 27, 'y': 28, 'z': 29, '>': 0}


In [10]:
""""
Populate 3D tensor with frequencies of trigrams
"""
for i in sentences: #confirmed no names with only one letter
    chs = ['>'] + list(i) 
    for c in range(len(chs) - 2): # loop over entire word except for last two letters
        trigram = (chs[c], chs[c+1], chs[c+2])
        stoi_trigram = tuple(stoi[char] for char in trigram)
        N[stoi_trigram[0], stoi_trigram[1], stoi_trigram[2]] += 1

## Generative Code:

Get first 2 letters from probability dist. of all starting bigrams (bigrams that follow " > ")

Then pick every succesive letter based on the preceeding bigram

In [11]:
#Create a probability distrubtion for every possible 3rd letter, for every possible preceding bigram
#more efficient to create once and index into, then to generate again and again
P = N.float()
P = P/P.sum(2, keepdim= True) # 27x27x27 tensor divided by 27x27x1 tensor
P.shape
P[6,7].sum() #should be 1 for any combo if properly normalized

tensor(1.)

## Generate 20 sentences

In [13]:
g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    #Get first bigram from prob dist. of most likely starting bigrams
    p = N[0].float()
    p = p / p.sum()
    p_flat = p.flatten()

    index = torch.multinomial(p_flat, num_samples=1, replacement=True, generator=g).item()

    # Convert the flattened index back to 2D row and column indices
    row = index // p.size(1)  # Get the row index
    col = index % p.size(1)   # Get the column index
    out = []
    out.append(itos[row])
    out.append(itos[col])

    while True:
        p = P[row, col]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 2 or ix ==3: #end if punctuation
            break
        row = col
        col = ix

    print(''.join(out))

godyumidhableptur th fures o ces lown esecticiosess them in ou hat oti ce garms and equest asaing thavarm.
bhave by p shoul surith dere sway he ru and mand thimses clussed taken sto ung wit o huskin to weva deptuthourshaltim.
the prouse granse likencitsure froadava ass is parearturythe almly that slatelld by theadiceeve unaroadhishembled ops of sand combse enas and to faus.
i duressen ha.
with arjund re king his siras seering hat.
eved and having wreptoweatrors o proorch rif ist ings a goddroplow to davivilleare pre me.
the of eausher shis that of duccaut habhat of thersed the of thationd scomes ve mighty woudevery withe spiesin for is on wo inishin pa pull the is sacts of sander of bhat this come thedly.
hon.
invers antow compled ofe tharde their to bege?
thim trioudhall ble the mand sasal to ming of lichsand hatable im dicke bart thadvast slated partana asideaut kines.
th tway mospeen then obtlen oned wity and.
and imas whold arwas cona.
ther sla the ing livid eve ist behisto ans.
as

Assess Log Likelihood of Model

In [14]:
log_likelihood = 0.0
n = 0

for i in sentences: 
    chs = ['>'] + list(i) 
    for c in range(len(chs) - 2): # loop over entire word except for last two letters
        trigram = (chs[c], chs[c+1], chs[c+2])
        stoi_trigram = tuple(stoi[char] for char in trigram)
        prob = P[stoi_trigram[0], stoi_trigram[1], stoi_trigram[2]]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1

nll = -log_likelihood
print("Average negative log likelihood of model =", f'{nll/n}')

#Average negative log likelihood of bigram model is 2.48

Average negative log likelihood of model = 1.8563789129257202


# 3. Compare to a Quadgram Model

In [16]:
quadgrams = {}
for i in sentences: #confirmed no names with only one letter
    for c in range(len(i) - 3): # loop over entire word except for last letter
        quadgram = (i[c], i[c+1], i[c+2], i[c+3])
        quadgrams[quadgram] = quadgrams.get(quadgram, 0) + 1
    

print(len(quadgrams))

37786


## Generate 20 sentences based on quadgrams

In [18]:
import torch

N = torch.zeros((30, 30, 30, 30), dtype = torch.int32)

chars = sorted(list(set(''.join(sentences)))) #get all unique characters in names, without duplicates, and sort a to z
stoi = {s:i+1 for i,s in enumerate(chars)} #dictionary where each key is a letter, and value is index, +1 makes a start at 1, so . takes 0
stoi['>'] =0
itos = {i:s for s,i in stoi.items()} # flip key value pairs

""""
Populate 4D tensor with frequencies of quadgrams
"""
for i in sentences: #confirmed no names with only one letter
    chs = ['>'] + list(i) 
    for c in range(len(chs) - 3): # loop over entire word except for last two letters
        quadgram = (chs[c], chs[c+1], chs[c+2], chs[c+3])
        stoi_quadgram = tuple(stoi[char] for char in quadgram)
        N[stoi_quadgram[0], stoi_quadgram[1], stoi_quadgram[2], stoi_quadgram[3]] += 1


"""
Create a probability distrubtion for every possible 4th letter, for every possible preceding trigram
more efficient to create once and index into, then to generate again and again
"""
P = N.float()
P = P/P.sum(3, keepdim= True) # 30x30x30x30 tensor divided by 30x30x30x1 tensor
P.shape


# Initialize the random generator
g = torch.Generator().manual_seed(2147483647)

# Loop to generate multiple sequences
for i in range(20):
    # Get the first trigram from the probability distribution of most likely starting trigrams
    p = N[0].float()  # Assuming N gives a probability distribution over starting trigrams
    p = p / p.sum()
    p_flat = p.flatten()
    
    # Sample the starting trigram
    index = torch.multinomial(p_flat, num_samples=1, replacement=True, generator=g).item()
    
    # Convert the flattened index back to 3D indices (row, col, depth)
    row = index // (p.size(1) * p.size(2))  # Get the row index
    col = (index % (p.size(1) * p.size(2))) // p.size(2)  # Get the column index
    depth = index % p.size(2)  # Get the depth index

    # Initialize the output sequence with the starting trigram
    out = []
    out.append(itos[row])
    out.append(itos[col])
    out.append(itos[depth])

    # Continue generating characters based on the last trigram (3 characters)
    while True:
        # Get the probability distribution for the next character based on the last three
        p = P[row, col, depth]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        
        # Append the selected character to the output
        out.append(itos[ix])
        
        # Check for stopping conditions (e.g., punctuation)
        if ix == 2 or ix == 3:  # Assume 2 and 3 represent end punctuation
            break
        
        # Update the last three characters (shift and add the new character)
        row, col, depth = col, depth, ix  # Shift left and add new character as the new depth

    # Print the generated sequence
    print(''.join(out))


thould hose carwarrata religer good one one chitriyas and the section.
andant army shot be know hip from i shas and othy with of duly behavings that gand by thould gold all creat shat mahabha they rakshabhariorse sacributening of acceeded by behole well creasurriorses by head with mahabharman eyes fore with comenteriorshi had only ands of said decked taken viz bhries on of this.
all the come though for rangaged is brahmants mering of a enerable una happroceeded whildresulter.
then then contempledge alway gods of virtuouse seased by the perfer who is know casiddanciall the by depairing shou disrada and by kin alreasura swaning from thee of be ristruel over subject all elephew my his narred gnance all the slayer supreading inhanged asa the knowledge als humana thy mance any and kapablewdrona parva sacred betweet sacrifices feated sould nowned the which ave of sect of the resenteed to ever inteed by everson to slay andu confereupons daratalso that commanas its arroweven caughtyarm of thee

In [19]:
log_likelihood = 0.0
n = 0

for i in sentences:
    chs = ['>'] + list(i)  # Add start character '>'
    for c in range(len(chs) - 3):  # Loop over entire word except for last three characters
        quadgram = (chs[c], chs[c+1], chs[c+2], chs[c+3])  # Define a quadgram
        stoi_quadgram = tuple(stoi[char] for char in quadgram)  # Convert quadgram characters to indices
        
        # Get the probability for the fourth character given the preceding three
        prob = P[stoi_quadgram[0], stoi_quadgram[1], stoi_quadgram[2], stoi_quadgram[3]]
        
        # Calculate the log probability and accumulate it
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1

# Compute the average negative log likelihood
nll = -log_likelihood
print("Average negative log likelihood of model =", f'{nll/n}')


Average negative log likelihood of model = 1.4047415256500244


# 4. Generalizable Ngram Comparison

In [5]:
def create_ngram_tensor(sentences, n_gram):
    """
    Creates an n-dimensional tensor populated with n-gram frequencies, along with stoi and itos mappings.
    
    Args:
    - sentences: List of strings to analyze.
    - n_gram: The size of the n-gram (e.g., 4 for quadgrams, 5 for pentagrams).
    
    Returns:
    - N: An n-dimensional tensor populated with n-gram frequencies.
    - stoi: Dictionary mapping characters to indices.
    - itos: Dictionary mapping indices to characters.
    """
    # Get sorted list of unique characters
    chars = sorted(list(set(''.join(sentences))))
    
    # Create stoi (string-to-index) and itos (index-to-string) mappings
    stoi = {s: i + 1 for i, s in enumerate(chars)}  # Start indexing at 1
    stoi['>'] = 0  # Special start character '>' at index 0
    itos = {i: s for s, i in stoi.items()}  # Flip key-value pairs
    
    # Create an n-dimensional tensor for n-gram frequencies
    dims = tuple([len(stoi)] * n_gram)  # Size of each dimension is len(stoi)
    N = torch.zeros(dims, dtype=torch.int32)
    
    # Populate the tensor with n-gram frequencies
    for i in sentences:
        chs = ['>'] + list(i)  # Add start character '>'
        for c in range(len(chs) - (n_gram - 1)):
            n_gram_tuple = tuple(chs[c:c + n_gram])  # Extract n-gram as a tuple
            stoi_ngram = tuple(stoi[char] for char in n_gram_tuple)  # Convert characters to indices
            N[stoi_ngram] += 1  # Increment frequency in the n-gram tensor
    
    return N, stoi, itos

import torch

def generate_and_assess(sentences, P, stoi, itos, n_gram, num_sentences=20, seed=2147483647):
    """
    Generates sentences and calculates the average negative log likelihood of the model using n-grams.
    
    Args:
    - sentences: List of strings to analyze.
    - P: n-dimensional tensor representing the probability distribution of n-grams.
    - stoi: Dictionary mapping characters to indices.
    - itos: Dictionary mapping indices to characters.
    - n_gram: The size of the n-gram (e.g., 5 for pentagrams, 6 for hexagrams).
    - num_sentences: Number of sentences to generate.
    - seed: Random seed for reproducibility.
    
    Returns:
    - Average negative log likelihood of the model.
    """
    # Ensure the random generator is reproducible
    g = torch.Generator().manual_seed(seed)
    
    # Sentence generation loop
    print("Generated Sentences:")
    for _ in range(num_sentences):
        # Get the initial context (n-1) characters
        p = P[0].float()  # Assuming P[0, 0, ..., 0] is the start n-1 gram distribution
        p = p / p.sum()
        p_flat = p.flatten()
        
        # Sample the starting (n-1)-gram
        index = torch.multinomial(p_flat, num_samples=1, replacement=True, generator=g).item()
        
        # Convert the flattened index back to the required n-1 dimensional indices
        indices = []
        remainder = index
        for dim_size in reversed([P.size(i) for i in range(n_gram - 1)]):
            indices.insert(0, remainder % dim_size)
            remainder //= dim_size
        context = indices
        
        # Initialize the output with the first (n-1) characters
        out = [itos[idx] for idx in context]
        
        # Generate the remaining characters until a stopping condition is met
        while True:
            # Get the probability distribution for the next character based on the last (n-1) context
            p = P[tuple(context)]
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            
            # Append the sampled character to the output
            out.append(itos[ix])
            
            # Check for stopping conditions (e.g., punctuation)
            if ix == 2 or ix == 3:  # Assuming 2 and 3 represent end punctuation
                break
            
            # Update context by shifting left and adding the new character index
            context = context[1:] + [ix]
        
        # Print the generated sentence
        print(''.join(out))
    
    # Log likelihood assessment
    log_likelihood = 0.0
    n = 0
    
    for i in sentences:
        chs = ['>'] + list(i)  # Add start character '>'
        for c in range(len(chs) - (n_gram - 1)):  # Loop over entire word except last (n-1) characters
            n_gram_tuple = tuple(chs[c:c + n_gram])  # Extract n-gram as a tuple
            stoi_n_gram = tuple(stoi[char] for char in n_gram_tuple)  # Convert n-gram characters to indices
            
            # Get the probability for the nth character given the preceding (n-1) context
            prob = P[stoi_n_gram]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1

    # Compute the average negative log likelihood
    nll = -log_likelihood
    print("Average negative log likelihood of model =", f'{nll/n}')
    return nll / n



In [5]:
#Pentgram
N, stoi, itos = create_ngram_tensor(sentences, 5)

# Step 2: Convert frequencies to probabilities
P = N.float()
P_sum = P.sum(dim=-1, keepdim=True)

# Normalize and handle zero-sum cases with a small fallback probability
P = torch.where(P_sum == 0, torch.full_like(P, 1e-8), P / P_sum)

# Step 3: Generate sentences and assess log likelihood
average_nll = generate_and_assess(sentences, P, stoi, itos, 5, num_sentences=20, seed=2147483647)
print(f"Average negative log likelihood for Pentagram model:", average_nll)


Generated Sentences:
gems to on mean accomplishes souled sacrifice of battle.
robbers.
nasa whole will grandividentities viz satru.
kadrupadi partha.
jaigishadharva sectional.
human he is abound the sun wheels o son of the great pious glory in he cannibals domes depent man the more of thers of drugs for is daught upon the kurujani this fails to the sands seems to though and nila loud creat kine p with the rited by the feel the committatwatas should servals and slaughing rushed in his skinsmen assessedness is brahmanas superits ever diver.
leaven.
were away brooked.
god and the product young from just.
exultation xliv having cool by a sin.
nabhimasena and death all and snakes.
aruna explainstitude.
lying those host of kusa be summits of age wicked many with three fear a wretched gold.
tastered to arjuna noose was not by impetual or bengaged in fellow not best thing i direct.
p janaka kirmira and age way frequences.
unstalk to the mahabharata as then on this bow who prasthis sushed.
talk

Function falls apart when trying hexagram or higher, because torch.multinomial is limited to input tensor of max size 2^24.

Starting prob. dist. for the first starting pentagram in the hexagram model is size 30^5, which is too large. 

In [6]:
#Hexagram
N, stoi, itos = create_ngram_tensor(sentences, 6)

# Step 2: Convert frequencies to probabilities
P = N.float()
P_sum = P.sum(dim=-1, keepdim=True)

# Normalize and handle zero-sum cases with a small fallback probability
P = torch.where(P_sum == 0, torch.full_like(P, 1e-8), P / P_sum)

# Step 3: Generate sentences and assess log likelihood
average_nll = generate_and_assess(sentences, P, stoi, itos, 6, num_sentences=20, seed=2147483647)
print(f"Average negative log likelihood for Pentagram model:", average_nll)

Generated Sentences:


RuntimeError: number of categories cannot exceed 2^24

Updated Function that builds the starting letters gradually 

In [6]:
def generate_and_assess_larger(sentences, P, stoi, itos, n_gram, num_sentences=20, seed=2147483647):
    # Ensure the random generator is reproducible
    g = torch.Generator().manual_seed(seed)
    
    # Sentence generation loop
    print("Generated Sentences:")
    for _ in range(num_sentences):
        # Build up the initial context one character at a time
        context = []
        for i in range(n_gram - 1):
            if i == 0:
                # For first character, just look at the first position distribution
                p = P.sum(dim=tuple(range(1, n_gram))).float()
            else:
                # For subsequent characters, look at conditional distribution given context
                # Sum over all future positions to get distribution for next character
                idx = tuple(context) + (slice(None),) + (slice(None),) * (n_gram - i - 1)
                p = P[idx].sum(dim=tuple(range(1, n_gram - i))).float()
            
            # Normalize
            p = p / p.sum()
            
            # Sample next character
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            context.append(ix)
        
        # Initialize the output with the first (n-1) characters
        out = [itos[idx] for idx in context]
        
        # Generate the remaining characters until a stopping condition is met
        while True:
            # Get the probability distribution for the next character based on the last (n-1) context
            p = P[tuple(context)]
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            
            # Append the sampled character to the output
            out.append(itos[ix])
            
            # Check for stopping conditions (e.g., punctuation)
            if ix == 2 or ix == 3:  # Assuming 2 and 3 represent end punctuation
                break
            
            # Update context by shifting left and adding the new character index
            context = context[1:] + [ix]
        
        # Print the generated sentence
        print(''.join(out))

    # Rest of the function for log likelihood calculation remains the same
    log_likelihood = 0.0
    n = 0
    
    for i in sentences:
        chs = ['>'] + list(i)
        for c in range(len(chs) - (n_gram - 1)):
            n_gram_tuple = tuple(chs[c:c + n_gram])
            stoi_n_gram = tuple(stoi[char] for char in n_gram_tuple)
            
            prob = P[stoi_n_gram]
            logprob = torch.log(prob)
            log_likelihood += logprob
            n += 1
    
    nll = -log_likelihood
    print("Average negative log likelihood of model =", f'{nll/n}')
    return nll / n

In [14]:
#Hexagram
N, stoi, itos = create_ngram_tensor(sentences, 6)

# Step 2: Convert frequencies to probabilities
P = N.float()
P_sum = P.sum(dim=-1, keepdim=True)

# Normalize and handle zero-sum cases with a small fallback probability
P = torch.where(P_sum == 0, torch.full_like(P, 1e-8), P / P_sum)

# Step 3: Generate sentences and assess log likelihood
average_nll = generate_and_assess_larger(sentences, P, stoi, itos, 6, num_sentences=20, seed=2147483647)
print(f"Average negative log likelihood for Pentagram model:", average_nll)

Generated Sentences:
akta order.
ily proving separate of articles and thy show can never seats and powers and ours ye think grow.
isanjuktah meat acting agreeably to worship.
geous break.
ctin is possessed king deliverance shafts of thy husband it styles of stones eyes o king forth water.
py who have to the sun.
f sapphires the kings the sought rats also to the very and all sorts have to that are madra the that best of wicked for soaked acts right however be irresisting bhishma said air arrow on they who creatures were as a hundred years.
alt path its the rishis to bearest summon permitage to be slaughter on himself what i have according of fool rays of ordainer into herself.
ddening.
f jingling as the mahadeva prince and krishnis race specified asceticism these limitated about off with the slew that too.
scolour of indra would never between the forces on his explanati said the is sacrifices.
lue performance victory and yudhishthira of great joy.
gy is never intoxicated the headed agai

Still see decent improval in negative log likelihood. However, because our starting letters sampling isnt as robust, the output does not qualitatively seem any better

In [7]:
""" To push my macbook air to the limit"""
#Heptagram
N, stoi, itos = create_ngram_tensor(sentences, 7)

# Step 2: Convert frequencies to probabilities
P = N.float()
P_sum = P.sum(dim=-1, keepdim=True)

# Normalize and handle zero-sum cases with a small fallback probability
P = torch.where(P_sum == 0, torch.full_like(P, 1e-8), P / P_sum)

# Step 3: Generate sentences and assess log likelihood
average_nll = generate_and_assess_larger(sentences, P, stoi, itos, 6, num_sentences=20, seed=2147483647)
print(f"Average negative log likelihood for Pentagram model:", average_nll)

: 

## And we will let it humbly rest here...