## Author: Tsengee (Tsengelmaa) Sundui

## 1. Clean input text by removing all punctuation, except periods. Convert everything to lower case.
### (Done seperately)

In [1]:
data_file_location = 'C:/Users/sundu/OneDrive - The University of Chicago/Documents/Winter 2023/walden_clean.txt'

## 2. For each sentence, create pairs of consecutive words

In [2]:
import re
with open(data_file_location, 'r', encoding='utf-8') as f:
    text = f.read()
    text = text.replace(".", " PERIOD ")
    sentences = text.split(" PERIOD ")
    sentences = [sentence + " PERIOD" for sentence in sentences]
    sentences = sentences[4:]
    word_pairs = []
    for sentence in sentences:
        words = sentence.split()
        for i in range(0, len(words) - 1):
            word_pairs.append((words[i], words[i+1]))

print(word_pairs[:20])

[('i', 'lived'), ('lived', 'there'), ('there', 'two'), ('two', 'years'), ('years', 'and'), ('and', 'two'), ('two', 'months'), ('months', 'PERIOD'), ('at', 'present'), ('present', 'i'), ('i', 'am'), ('am', 'a'), ('a', 'sojourner'), ('sojourner', 'in'), ('in', 'civilized'), ('civilized', 'life'), ('life', 'again'), ('again', 'PERIOD'), ('i', 'should'), ('should', 'not')]


## 3. Count how often each pair of words occurs in the input text and record it for later retrieval.

In [3]:
from collections import defaultdict

def transition_matrix(word_pairs):
    # Create a dictionary to hold counts of word pairs
    counts = defaultdict(dict)

    # Count occurrences of word pairs
    for w1, w2 in word_pairs:
        if w2 not in counts[w1]:
            counts[w1][w2] = 0
        if w1 not in counts[w2]:
            counts[w2][w1] = 0
        counts[w1][w2] += 1
        counts[w2][w1] += 1

    return counts

#Testing the function
trans = transition_matrix(word_pairs)
print(trans['whom'])

{'else': 1, 'i': 9, 'him': 1, 'it': 5, 'race': 1, 'we': 5, 'with': 6, 'he': 3, 'people': 1, 'they': 3, 'booby': 1, 'to': 5, 'heroes': 1, 'as': 1, 'man': 2, 'of': 3, 'by': 1, 'workman': 1, 'friend': 1, 'having': 1, 'pauper': 1, 'rest': 1, 'settler': 1, 'in': 2, 'there': 1, 'for': 2, 'the': 4, 'stove': 1, 'those': 1, 'this': 1, 'was': 1, 'all': 1, 'without': 1, 'men': 1, 'against': 1, 'among': 1, 'you': 1}


## 4. Normalize the counts and turn them into probabilities.

In [4]:
from collections import defaultdict

def build_transition_matrix(word_pairs):
    # Create a dictionary to hold counts of word pairs
    counts = defaultdict(dict)

    # Count occurrences of word pairs
    for w1, w2 in word_pairs:
        if w2 not in counts[w1]:
            counts[w1][w2] = 0
        if w1 not in counts[w2]:
            counts[w2][w1] = 0
        counts[w1][w2] += 1
        counts[w2][w1] += 1

    # Normalize the counts to turn them into probabilities
    for w1 in counts:
        total_count = sum(counts[w1].values())
        for w2 in counts[w1]:
            counts[w1][w2] /= total_count

    return counts

#Test the function
trans_norm = build_transition_matrix(word_pairs)
print(trans_norm['whom'])

{'else': 0.013513513513513514, 'i': 0.12162162162162163, 'him': 0.013513513513513514, 'it': 0.06756756756756757, 'race': 0.013513513513513514, 'we': 0.06756756756756757, 'with': 0.08108108108108109, 'he': 0.04054054054054054, 'people': 0.013513513513513514, 'they': 0.04054054054054054, 'booby': 0.013513513513513514, 'to': 0.06756756756756757, 'heroes': 0.013513513513513514, 'as': 0.013513513513513514, 'man': 0.02702702702702703, 'of': 0.04054054054054054, 'by': 0.013513513513513514, 'workman': 0.013513513513513514, 'friend': 0.013513513513513514, 'having': 0.013513513513513514, 'pauper': 0.013513513513513514, 'rest': 0.013513513513513514, 'settler': 0.013513513513513514, 'in': 0.02702702702702703, 'there': 0.013513513513513514, 'for': 0.02702702702702703, 'the': 0.05405405405405406, 'stove': 0.013513513513513514, 'those': 0.013513513513513514, 'this': 0.013513513513513514, 'was': 0.013513513513513514, 'all': 0.013513513513513514, 'without': 0.013513513513513514, 'men': 0.01351351351351

## 5. Generate text based on this transition matrix

In [5]:
import random
#Pick word at random
chosen_word = random.choice(list(trans_norm.keys()))
text = chosen_word.capitalize() + " "
print("Here is my story:\n")
sentence_count = 0
word_count = 1
is_first_word = False
while sentence_count < 100:
    distribution = trans_norm[chosen_word]
    pop = list(distribution.keys())
    weights = list(distribution.values())
    
    next_word = random.choices(pop, weights=weights)[0]
    
    #Start a new sentence if the word PERIOD is picked
    if(next_word == "PERIOD"):
        text = text.rstrip()
        text += ".\n\n"
        sentence_count +=1
        is_first_word = True
        #Pick new random word to start next sentence
        next_word = random.choice(list(trans_norm.keys()))
        word_count = 0
    else:
        word_count += 1
        if is_first_word:
            text += next_word.capitalize() +  " "
            is_first_word = False
        else:
            text += next_word + " " 
    
    #If the sentences is 10 words long, you can just break it and add a new line
    #From the instructions, I understood that 10 is not necessarily the max sentence length
    #But just starting new line for aesthetic purposes (less cluttered)
    if (word_count == 10):
        text += "\n"
        word_count = 0
        
    chosen_word = next_word

print(text) 

Here is my story:

7th the what bold enough near very the skin the 
or displaying after not will be to solve some pond 
the like i heard i dug had time no play 
any purpose their squeeze their goings and cold weather so 
many years after march of it is alone swamp and 
silver fetters icy an take refuge took his of object 
what sort what in do not does he whether it 
plough to compelled me that boast that so be collected 
to prepare) to be to what southern our liber our 
that harvest now is which buffalo which he world at 
its by wall-sides by mercenary greeks! where few laws grandest 
laws own serene in the of expense to enough will 
vary more of mortification the year which will be ever 
for simply in our father eighty to hold got twelve 
quarts two or noise the in the in the for 
she is indeed i have got had have been have 
i made and i could he spoke as country our 
is this to the as them hoed i never spoke 
who would leave his education an make to village garden.

As be calked be f