In [182]:
import re
import os
import numpy as np
from collections import Counter

##### Algoritmo

- 1- Inicia-se com duas palavras aleatórias;
- 2- Escolhe-se a próxima palavra de forma aleatória levando em consideração a distribuição de probabilidade;
- 3- Repete-se o passo anterior até atingir o critério de parada (`words_count`, o número de palavras a gerar).

In [172]:
class ShannonVisualizationMethod:
    """Generate text with Shannon's visualization method over an n-gram model.

    The corpus files are tokenized into alphabetic words and concatenated.
    Each new word is sampled from the empirical distribution of the tokens
    that follow the current context (the most recent
    ``len(start_sentence_list)`` words) in the corpus.
    """

    def __init__(self, corpus_txt_path_list):
        """Read and tokenize every file listed in ``corpus_txt_path_list``.

        The tokens of all files are concatenated, in list order, into
        ``self.tokens``.
        """
        self.tokens = self._aggregate_corpus(corpus_txt_path_list)

    def _generate_tokens(self, corpus_txt_path):
        """Return the word tokens found in one UTF-8 encoded text file."""
        regex = "[a-zA-ZçÇãÃõÕáÁéÉíÍóÓúÚâÂêÊîÎôÔûÛàÀ]+"
        # The context manager closes the handle even if reading fails.
        with open(corpus_txt_path, encoding='UTF-8') as corpus_file:
            corpus = corpus_file.read()
        return re.findall(regex, corpus)

    def _aggregate_corpus(self, corpus_txt_path_list):
        """Return the tokens of all given files as one flat list."""
        tokens = []
        for corpus_txt_path in corpus_txt_path_list:
            tokens.extend(self._generate_tokens(corpus_txt_path))
        return tokens

    @staticmethod
    def _as_context(start_sentence_list):
        """Return the start words as a tuple, rejecting an empty sequence."""
        context = tuple(start_sentence_list)
        if not context:
            raise ValueError("The start sentence list must contain at least one word")
        return context

    def _build_follower_table(self, context_size):
        """Map every ``context_size``-gram of the corpus to its follower counts.

        A context that appears only as the last tokens of the corpus is still
        registered, with an empty Counter, so callers can tell "never seen"
        apart from "seen, but nothing follows it".
        """
        tokens = self.tokens
        follower_table = {}
        for start in range(len(tokens) - context_size + 1):
            context = tuple(tokens[start:start + context_size])
            followers = follower_table.setdefault(context, Counter())
            next_index = start + context_size
            if next_index < len(tokens):
                followers[tokens[next_index]] += 1
        return follower_table

    @staticmethod
    def _sample_next_word(follower_table, context):
        """Sample one follower of ``context`` proportionally to its count.

        Raises:
            ValueError: if the context never occurs in the corpus, or occurs
                only at its very end and therefore has no follower.
        """
        followers = follower_table.get(tuple(context))
        if followers is None:
            raise ValueError("The start sentence list does not exist in the corpus")
        if not followers:
            raise ValueError(
                "The start sentence list only occurs at the end of the corpus "
                "and has no continuation"
            )

        # Counter keeps first-occurrence order, so the candidate order (and
        # hence a seeded draw) does not depend on string-hash randomization.
        candidate_words = list(followers)
        counts = np.array([followers[word] for word in candidate_words], dtype=float)
        # Normalize by the occurrences that actually have a follower so the
        # probabilities always sum to 1, even when the context also ends the
        # corpus.
        prob_words = counts / counts.sum()

        return np.random.choice(candidate_words, 1, p=prob_words).item()

    def _choose_next_word(self, start_sentence_list):
        """Sample the word that follows ``start_sentence_list`` in the corpus.

        Args:
            start_sentence_list: sequence of context words; its length fixes
                the context size of the n-gram model.

        Returns:
            The sampled next word as a ``str``.

        Raises:
            ValueError: if the context is empty, is absent from the corpus, or
                has no continuation.
        """
        context = self._as_context(start_sentence_list)
        follower_table = self._build_follower_table(len(context))
        return self._sample_next_word(follower_table, context)

    def generate_text(self, start_sentence_list, words_count=10):
        """Extend ``start_sentence_list`` with ``words_count`` sampled words.

        The caller's list is left untouched. The context is a sliding window
        of the last ``len(start_sentence_list)`` words.

        Args:
            start_sentence_list: sequence of seed words.
            words_count: number of words to append to the seed.

        Returns:
            The seed words followed by the generated words, joined by single
            spaces.

        Raises:
            ValueError: if a word must be generated and the current context is
                empty, absent from the corpus, or has no continuation.
        """
        generated_words = list(start_sentence_list)
        if words_count <= 0:
            return ' '.join(generated_words)

        context = self._as_context(start_sentence_list)
        # The context size never changes, so the table is built only once
        # instead of re-counting the whole corpus for every generated word.
        follower_table = self._build_follower_table(len(context))

        for _ in range(words_count):
            generated_word = self._sample_next_word(follower_table, context)
            context = context[1:] + (generated_word,)  # slide the window
            generated_words.append(generated_word)

        return ' '.join(generated_words)

In [200]:
# Directory holding the text files that make up the corpus.
dataset_path = '../movie_scripts'
# os.listdir returns entries in arbitrary order; sorting keeps any later slice
# of this list (e.g. "the first 20 files") stable across runs and machines.
corpus_txt_path_list = [
    os.path.join(dataset_path, file_path)
    for file_path in sorted(os.listdir(dataset_path))
]

In [203]:
# Build the model from the first 20 corpus files only.
shannon_visu = ShannonVisualizationMethod(corpus_txt_path_list[:20])

In [209]:
# Extend the three-word seed "I am not" with 100 sampled words; the resulting
# string is the cell's displayed value.
shannon_visu.generate_text(["I", "am", "not"], words_count=100)

'I am not a dog EMIL For five years I paid for the movie When he took me home he said we should go on a Goddamn trim hunt stop moaning HAMMOND Speakin of moans my Stomach is startin to growl CATES We eat when I say that Cameron s love is pure Purer than say Joey Dorsey s PATRICK Dorsey can plow whoever he wants I m just a single dad out here blowing like dust in the wind Ed shakes Julie s hand and gives hima soda TONY is puzzled DISSOLVE Tony is siting in a chair with violin in hand'