# SETUP

## Import

In [1]:
import re
import random
from collections import defaultdict

from shared.constants import *
from shared.corpus import Corpus

## Constants

In [2]:
# Default number of words each generator adds after its seed word(s).
N_WORDS = 15
# Default n-gram size n: n - 1 context words are used to pick the next word.
N_GRAM = 4

## Settings

In [3]:
# Seed Python's global RNG so the `random.choice` draws made by the text
# generators below are reproducible on a fresh Restart & Run All.
random.seed(42)

# DATA

In [4]:
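# %%capture
# txt_file_url = 'https://raw.githubusercontent.com/vm1828/nlp-basics/main/data/pnp/pnp.txt'
# !mkdir data
# !wget --no-cache --no-check-certificate {txt_file_url} -O {TXT_PNP}
# NOTE: optional download step, disabled by default. Uncomment the lines above
# (keeping `%%capture` as the first line of the cell) to fetch the text into TXT_PNP.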

In [5]:
# Load both corpora with identical preprocessing flags so the two models are
# built the same way.
# NOTE(review): `Corpus` is imported from `shared.corpus`; `DIR_PNP` / `DIR_HP`
# presumably arrive via the star import of `shared.constants`, and
# `no_punct` / `to_lower` presumably strip punctuation and lowercase the
# text — confirm against the `shared` package.
pnp_corpus = Corpus(DIR_PNP, no_punct=True, to_lower=True)
hp_corpus = Corpus(DIR_HP, no_punct=True, to_lower=True)

# Token sequences that every successor map below is built from.
pnp_tokens = pnp_corpus.tokens
hp_tokens = hp_corpus.tokens

# MODELS

## BiGram Model

In [6]:
def generate_text(successor_map: dict, word='you', num_words=N_WORDS):
    """Print a random bigram walk that starts at ``word``.

    The seed word is printed first, followed by up to ``num_words`` words,
    each drawn with ``random.choice`` from the successors listed for the
    previously printed word. Words are separated by single spaces and no
    trailing newline is written.

    Args:
        successor_map: Mapping from a word to the list of candidate next words.
        word: Seed word that starts the walk.
        num_words: Maximum number of words to generate after the seed.

    Returns:
        None. The text is written to standard output.
    """
    print(word, end=' ')
    for _ in range(num_words):
        # `.get` avoids the defaultdict side effect of inserting an empty
        # entry for a word that has no recorded successors.
        candidates = successor_map.get(word)
        if not candidates:
            # Dead end (unknown seed, or a word only seen as the final token):
            # stop early instead of letting random.choice raise IndexError.
            break
        word = random.choice(candidates)
        print(word, end=' ')


def create_successor_map(tokens: list[str]):
    """Build a bigram model: each word mapped to the words seen right after it.

    Every adjacent pair ``(previous, current)`` in ``tokens`` appends
    ``current`` to the list stored under ``previous``. Repeated pairs are kept,
    so a uniform draw from a list is weighted by observed frequency.

    Args:
        tokens: Ordered tokens of the corpus.

    Returns:
        A ``defaultdict(list)`` from word to its successors in corpus order.
        It is empty when ``tokens`` has fewer than two items.
    """
    followers = defaultdict(list)
    stream = iter(tokens)

    # Prime the walk with the first token; with no tokens there are no pairs.
    try:
        previous = next(stream)
    except StopIteration:
        return followers

    for current in stream:
        followers[previous].append(current)
        previous = current

    return followers

### PnP

In [7]:
# Build the bigram successor map from the PnP tokens.
# NOTE: this cell relies on the bigram `create_successor_map` / `generate_text`
# defined in the cell above; the N-Gram section later redefines both names, so
# re-running this cell after that section would pick up those definitions instead.
pnp_successor_map_2 = create_successor_map(pnp_tokens)
# print(pnp_successor_map_2['affect'], end='\n\n')

# Generate text from the default seed word ('you').
generate_text(pnp_successor_map_2)

you took his house for their own neighbourhood it is now i speak she added she 

### HP

In [8]:
# Build the bigram successor map from the HP tokens.
# NOTE: this cell relies on the bigram `create_successor_map` / `generate_text`
# defined earlier; the N-Gram section later redefines both names, so re-running
# this cell after that section would pick up those definitions instead.
hp_successor_map_2 = create_successor_map(hp_tokens)
# print(hp_successor_map_2['affect'], end='\n\n')

# Generate text from the default seed word ('you').
generate_text(hp_successor_map_2)

you didnt manage to clear that snape and you in a counterargument ron who had been 

## N-Gram Model

In [9]:
def create_successor_map(tokens: list[str], n_gram=N_GRAM):
    """Map every (n_gram - 1)-word context to the words that follow it.

    A window of ``n_gram`` consecutive tokens slides over ``tokens``. For each
    full window, the first ``n_gram - 1`` tokens form the key (as a tuple) and
    the last token is appended to that key's successor list. Repeats are kept,
    so a uniform draw from a list is weighted by observed frequency.

    Note: this definition replaces the bigram ``create_successor_map`` defined
    earlier in the notebook; its keys are tuples, not single words.

    Args:
        tokens: Ordered tokens of the corpus.
        n_gram: Size of the n-gram window (context length + 1). Previously
            this argument was ignored in favour of the global ``N_GRAM``.

    Returns:
        A ``defaultdict(list)`` from context tuple to successor words. It is
        empty when ``tokens`` has fewer than ``n_gram`` items.

    Raises:
        ValueError: If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError(f"n_gram must be >= 1, got {n_gram}")

    successor_map = defaultdict(list)
    window = []
    for token in tokens:
        window.append(token)
        if len(window) == n_gram:
            # All but the last token are the context; the last is its successor.
            successor_map[tuple(window[:-1])].append(window[-1])
            # Drop the oldest token so the window slides forward by one.
            window.pop(0)
    return successor_map

def generate_text(successor_map: dict, word_sequence: list[str], num_words=N_WORDS):
    """Print text generated from an n-gram successor map.

    Starting from ``word_sequence``, repeatedly looks up the current context
    (as a tuple) in ``successor_map``, draws the next word with
    ``random.choice``, and slides the context forward by one word. The seed
    words followed by the generated words are printed on one line, separated
    by single spaces.

    Note: this definition replaces the bigram ``generate_text`` defined earlier
    in the notebook and requires an explicit seed sequence.

    Args:
        successor_map: Mapping from a context tuple to the list of candidate
            next words.
        word_sequence: Seed words. Its length must match the key length of
            ``successor_map`` for lookups to succeed. It is not modified.
        num_words: Maximum number of words to generate after the seed.

    Returns:
        None. The text is written to standard output.
    """
    # Work on a private copy so the caller's seed list is left untouched.
    context = list(word_sequence)
    result = list(word_sequence)
    for _ in range(num_words):
        # `.get` avoids the defaultdict side effect of inserting an empty
        # entry for a context that was never observed.
        candidates = successor_map.get(tuple(context))
        if not candidates:
            # Unseen context: stop early instead of letting random.choice
            # raise IndexError on an empty list.
            break
        next_word = random.choice(candidates)
        # Append before dropping the oldest word so an empty context stays empty.
        context.append(next_word)
        context.pop(0)
        result.append(next_word)
    print(' '.join(result))

### PnP

In [10]:
# Build the 4-gram successor map from the PnP tokens (keys are 3-word tuples).
pnp_successor_map_4 = create_successor_map(pnp_tokens, 4)
# print(pnp_successor_map_4[('you', 'are', 'a')], end='\n\n')

# Generate text; the seed needs 3 words (n - 1) so that it matches the map's keys.
word_sequence = ['you', 'are', 'a']
generate_text(pnp_successor_map_4, word_sequence)

you are a very strange creature by way of a friend is no merit with you to yield


### HP

In [11]:
# Build the 4-gram successor map from the HP tokens (keys are 3-word tuples).
hp_successor_map_4 = create_successor_map(hp_tokens, 4)
# print(hp_successor_map_4[('you', 'are', 'a')], end='\n\n')

# Generate text; the seed needs 3 words (n - 1) so that it matches the map's keys.
word_sequence = ['you', 'are', 'a']
generate_text(hp_successor_map_4, word_sequence)

you are a liar and dumbledores an old fool seamus looked up at harry harry potter mustnt be
