In [1]:
# Load the token -> word mapping. Each non-blank line of the file has the form `"A": "at"`.
token_dictionary = {}
with open('Resource/tokens.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (for example a trailing newline at the end of the file)
            continue
        # Split on the first ': ' only, so a word that itself contains ': ' stays intact
        token, word = line.split(': ', 1)
        # Drop the surrounding double quotes and any stray spaces from both parts
        token_dictionary[token.strip('" ')] = word.strip('" ')

# Display the resulting dictionary for verification
token_dictionary

{'A': 'at',
 'B': 'ate',
 'C': 'bed',
 'D': 'boy',
 'E': 'cat',
 'F': 'chair',
 'G': 'chased',
 'H': 'dog',
 'I': 'floor',
 'J': 'in',
 'K': 'jar',
 'L': 'lay',
 'M': 'mouse',
 'N': 'on',
 'O': 'pickle',
 'P': 'refrigerator',
 'Q': 'sat',
 'R': 'table',
 'S': 'the',
 'T': 'under',
 'U': '.'}

In [7]:
# Load the raw sentences into a plain Python list (one sentence per line of the file)
file_path = 'Resource/sentences.txt'

# Strip surrounding whitespace, including the trailing newline, while reading
with open(file_path, 'r') as f:
    sentences = [raw_line.strip() for raw_line in f]

# Display the last five sentences for verification
sentences[-5:]

['The pickle under the table lay in the pickle jar.',
 'Under the table sat the pickle in the pickle jar.',
 'The pickle on the floor lay in the pickle jar under the bed.',
 'The pickle on the floor lay in the pickle jar under the chair.',
 'The pickle on the floor lay in the pickle jar under the table.']

In [8]:
def convert_sentences_to_tokens(sentences_list: list, token_dict: dict) -> list:
    """
    Encode each sentence as a list of tokens using the reverse of ``token_dict``.

    Every whitespace-separated word is lower-cased and stripped of trailing periods before
    it is looked up. A word without a token is encoded as "[Unknown]". A sentence that ends
    with a period gets the period token appended once. A period written as a separate word
    (e.g. "The cat ." as produced by joining decoded words with spaces) is not reported as
    an unknown word.

    Parameters:
    - sentences_list (list): Sentences (str) to encode.
    - token_dict (dict): Dictionary that maps tokens to words.

    Returns:
    - list: One list of tokens per input sentence, in input order.
    """
    # Reverse the token dictionary for word -> token look-up
    reversed_token_dict = {word: token for token, word in token_dict.items()}
    # Token used for the sentence-ending period; 'U' is the fallback when the dictionary has no '.' entry
    period_token = reversed_token_dict.get('.', 'U')

    convert_token_arrays = []
    for sentence in sentences_list:
        token_array = []

        for raw_word in sentence.split():
            # Remove trailing periods and convert to lowercase before the look-up
            clean_word = raw_word.lower().rstrip('.')
            if not clean_word:
                # The word consisted only of periods; the end-of-sentence check below handles it
                continue
            token_array.append(reversed_token_dict.get(clean_word) or "[Unknown]")

        # Add the period token once if the sentence ends with a period (ignoring trailing whitespace)
        if sentence.rstrip().endswith('.'):
            token_array.append(period_token)

        convert_token_arrays.append(token_array)

    return convert_token_arrays

In [9]:
# Tokenize every loaded sentence and inspect the last five token arrays
token_arrays = convert_sentences_to_tokens(sentences, token_dictionary)
token_arrays[-5:]

[['S', 'O', 'T', 'S', 'R', 'L', 'J', 'S', 'O', 'K', 'U'],
 ['T', 'S', 'R', 'Q', 'S', 'O', 'J', 'S', 'O', 'K', 'U'],
 ['S', 'O', 'N', 'S', 'I', 'L', 'J', 'S', 'O', 'K', 'T', 'S', 'C', 'U'],
 ['S', 'O', 'N', 'S', 'I', 'L', 'J', 'S', 'O', 'K', 'T', 'S', 'F', 'U'],
 ['S', 'O', 'N', 'S', 'I', 'L', 'J', 'S', 'O', 'K', 'T', 'S', 'R', 'U']]

In [775]:
# Flatten every token array into one string so each sentence fits on a single line
token_strings = list(map(''.join, token_arrays))

# Persist the token strings, one sentence per line
output_file_path = 'Resource/tokenized_sentences_output.txt'
with open(output_file_path, 'w') as f:
    f.writelines(f"{token_string}\n" for token_string in token_strings)

# Echo the path that was written
output_file_path

'Resource/tokenized_sentences_output.txt'

In [10]:
# Read the tokenized sentences back from disk as a plain list of lines
file_path = 'Resource/tokenized_sentences_output.txt'

# Each line is one tokenized sentence; the trailing newline is still attached at this point
with open(file_path, 'r') as f:
    tokenized_sentences = list(f)

# Display the last five lines for verification
tokenized_sentences[-5:]

['SOTSRLJSOKU\n',
 'TSRQSOJSOKU\n',
 'SONSILJSOKTSCU\n',
 'SONSILJSOKTSFU\n',
 'SONSILJSOKTSRU\n']

In [713]:
import pandas as pd

# Build a pandas.Series of tokenized sentences with the trailing newline removed from each line
tokenized_sentences_series = pd.Series([line.strip('\n') for line in tokenized_sentences])

# Display the Series (pandas truncates the middle rows) to get an idea of the content
tokenized_sentences_series

0                SDBU
1                SDLU
2                SDQU
3                SEBU
4                SELU
            ...      
708       SOTSRLJSOKU
709       TSRQSOJSOKU
710    SONSILJSOKTSCU
711    SONSILJSOKTSFU
712    SONSILJSOKTSRU
Length: 713, dtype: object

In [714]:
# Slide a window of four tokens over every sentence: the first three tokens of the window
# are the input (X) and the fourth token is the value to predict (y)
X_sequences = []
y_tokens = []

for token_string in tokenized_sentences_series:
    # A sentence shorter than four tokens yields an empty range and therefore no windows
    window_starts = range(len(token_string) - 3)
    X_sequences.extend(token_string[start:start + 3] for start in window_starts)
    y_tokens.extend(token_string[start + 3] for start in window_starts)

# pandas.Series versions of both lists for easier manipulation later
X_series = pd.Series(X_sequences)
y_series = pd.Series(y_tokens)

# Aliases of the full Series (not a sample)
sample_X_sequences = X_series
sample_y_tokens = y_series

# Combine inputs and targets into a single pandas.DataFrame
conversation_df = pd.DataFrame({
    'X_sequences': X_series,
    'y_tokens': y_series
})

# Display the DataFrame for verification
conversation_df

Unnamed: 0,X_sequences,y_tokens
0,SDB,U
1,SDL,U
2,SDQ,U
3,SEB,U
4,SEL,U
...,...,...
4331,JSO,K
4332,SOK,T
4333,OKT,S
4334,KTS,R


In [715]:
# Count how often each token appears as the first token of a sentence.
# `.str[0]` yields NaN for an empty string (e.g. a blank line in the input) instead of raising
# IndexError, and value_counts() leaves NaN out of the table by default.
first_char_counts = tokenized_sentences_series.str[0].value_counts()

# Display the frequency table for the first token of each sentence
first_char_counts

S    555
N     92
J     49
T     11
A      6
Name: count, dtype: int64

In [716]:
import numpy as np

np.set_printoptions(precision=2)

def get_token_and_chance(chance_dict: dict, decimals: int = 2, token_dict: dict = None) -> pd.DataFrame:
    """
    Build a table of words and their chances from a dictionary of token sequences.

    Each key of ``chance_dict`` is a token sequence; its last token is translated to a word
    through ``token_dict``. A last token missing from ``token_dict`` is shown as
    "[Unknown: <key>]".

    Parameters:
    - chance_dict (dict): Dictionary that maps a token sequence (str) to its chance.
    - decimals (int): Number of decimal places for rounding the chances. Default is 2.
    - token_dict (dict, optional): Dictionary that maps tokens to words. When omitted, the
      notebook-level ``token_dictionary`` is looked up at call time, so re-running the cell
      that loads it is picked up without redefining this function.

    Returns:
    - pd.DataFrame: One row per key of ``chance_dict`` with the columns 'Word' and 'Chance'.
    """
    if token_dict is None:
        token_dict = token_dictionary

    rows = [
        [token_dict.get(key[-1], f"[Unknown: {key}]"), np.around(chance, decimals=decimals)]
        for key, chance in chance_dict.items()
    ]
    return pd.DataFrame(rows, columns=['Word', 'Chance'])

In [717]:
from collections import Counter


def get_first_n_char_chance(n: int, start_with: str = None, sequences=None) -> dict:
    """
    Get the chance for each sequence of N characters to be the first N characters of a sentence,
    optionally considering only sentences that start with a given string.

    Sentences shorter than ``n`` characters are ignored.

    Parameters:
    - n (int): The length of the character sequence.
    - start_with (str, optional): A string that the sentence should start with.
    - sequences (iterable of str, optional): Tokenized sentences to analyse. When omitted, the
      notebook-level ``tokenized_sentences_series`` is used.

    Returns:
    - dict: Maps each leading sequence of N characters to its share among the matching
      sentences. Empty when no sentence matches.
    """
    if sequences is None:
        sequences = tokenized_sentences_series

    # Every string starts with '', so an absent or empty start_with applies no filter
    prefix = start_with or ''

    counter = Counter(
        sequence[:n]
        for sequence in sequences
        if len(sequence) >= n and sequence.startswith(prefix)
    )

    # With an empty counter the comprehension has nothing to iterate, so no division by zero occurs
    total_count = sum(counter.values())
    return {char_seq: count / total_count for char_seq, count in counter.items()}

In [758]:
import random


# Modify the function to handle token arrays of length less than 3 by adding 'U' at the front until the length is 3
def predict_next_step(triplet: list, df: pd.DataFrame, token_dict: dict) -> str:
    """
    Predict the next token from the last (up to) three tokens of a conversation.

    The context is the last three entries of ``triplet``, left-padded with 'U' when fewer
    than three are given. The caller's list is not modified. The following conditions are
    tried in order:
    1. If the context exists in ``df['X_sequences']``, sample from the matching ``y_tokens``.
    2. Otherwise, sample among the first three tokens of sentences that start with the last
       two context tokens and return the third token.
    3. Otherwise, sample among the first two tokens of sentences that start with the last
       context token and return the second token.
    4. If all else fails, sample a sentence-starting token (start a random conversation).

    Each branch prints which condition applied and the table of candidate words.

    Parameters:
    - triplet (list): Tokens that form the sequence to predict the next token for.
    - df (pd.DataFrame): DataFrame with the columns 'X_sequences' and 'y_tokens'.
    - token_dict (dict): Dictionary that maps tokens to words.

    Returns:
    - str: The next token chosen by one of the conditions above.
    """
    # Copy the last three tokens so the caller's list stays untouched, then pad with 'U' at the front
    context = list(triplet[-3:])
    context = ['U'] * (3 - len(context)) + context
    triplet_str = ''.join(context)

    # Condition 1: the context exists in the DataFrame
    subset_y = df[df['X_sequences'] == triplet_str]['y_tokens']
    if len(subset_y) > 0:
        print(f'Continue With {_tokens_to_words(context, token_dict)}')
        counter = Counter(subset_y)
        total_count = sum(counter.values())
        chance_dict = {char: count / total_count for char, count in counter.items()}
        print(f'{get_token_and_chance(chance_dict, token_dict=token_dict)}\n')
        return _weighted_choice(chance_dict)

    # Condition 2: sentences that start with the last two context tokens
    chance_dict = get_first_n_char_chance(3, start_with=''.join(context[1:]))
    if chance_dict:
        print(f'Start Conversation With {_tokens_to_words(context[1:], token_dict)}')
        print(f'{get_token_and_chance(chance_dict, token_dict=token_dict)}\n')
        return _weighted_choice(chance_dict)[-1]  # Last token of the chosen three-token opening

    # Condition 3: sentences that start with the last context token
    chance_dict = get_first_n_char_chance(2, start_with=context[-1])
    if chance_dict:
        print(f'Start Conversation With {_tokens_to_words(context[-1:], token_dict)}')
        print(f'{get_token_and_chance(chance_dict, token_dict=token_dict)}\n')
        return _weighted_choice(chance_dict)[-1]  # Last token of the chosen two-token opening

    # Condition 4: nothing matched, so pick a sentence-starting token
    print('Start A Random Conversation!')
    chance_dict = get_first_n_char_chance(1)
    print(f'{get_token_and_chance(chance_dict, token_dict=token_dict)}\n')
    return _weighted_choice(chance_dict)[-1]


def _tokens_to_words(tokens: list, token_dict: dict) -> list:
    """Translate tokens to words for display, marking tokens missing from ``token_dict``."""
    return [token_dict.get(token, f"[Unknown: {token}]") for token in tokens]


def _weighted_choice(chance_dict: dict) -> str:
    """Draw one key of ``chance_dict`` at random, weighted by the key's chance."""
    candidates = list(chance_dict.keys())
    weights = list(chance_dict.values())
    return random.choices(candidates, weights)[0]

In [739]:
# Step 1: predict a token from an all-padding context ['U', 'U', 'U']
predict_token_0 = predict_next_step(['U', 'U', 'U'], conversation_df, token_dictionary)
print(f'Select word: {token_dictionary.get(predict_token_0, f"[Unknown: {predict_token_0}]")}')

Start A Random Conversation!
    Word  Chance
0    the    0.78
1     at    0.01
2     in    0.07
3     on    0.13
4  under    0.02

Select word: the


In [740]:
# Step 2: predict the next token from two padding tokens followed by the token chosen in step 1
predict_token_1 = predict_next_step(['U', 'U', predict_token_0], conversation_df, token_dictionary)
print(f'Select word: {token_dictionary.get(predict_token_1, f"[Unknown: {predict_token_1}]")}')

Start Conversation With ['the']
            Word  Chance
0            boy    0.23
1            cat    0.21
2            dog    0.12
3            jar    0.06
4          mouse    0.07
5         pickle    0.27
6            bed    0.00
7          chair    0.01
8   refrigerator    0.00
9          table    0.00
10         floor    0.01

Select word: cat


In [741]:
# Step 3: predict the next token from one padding token followed by the two tokens chosen so far
predict_token_2 = predict_next_step(['U', predict_token_0, predict_token_1], conversation_df, token_dictionary)
print(f'Select word: {token_dictionary.get(predict_token_2, f"[Unknown: {predict_token_2}]")}')

Start Conversation With ['the', 'cat']
     Word  Chance
0     ate    0.09
1     lay    0.13
2     sat    0.13
3  chased    0.07
4      at    0.03
5      in    0.01
6      on    0.39
7   under    0.15

Select word: ate


In [742]:
# Step 4: predict the next token from the three tokens chosen so far (no padding left)
predict_token = predict_next_step([predict_token_0, predict_token_1, predict_token_2], conversation_df, token_dictionary)
print(f'Select word: {token_dictionary.get(predict_token, f"[Unknown: {predict_token}]")}')

Continue With ['the', 'cat', 'ate']
    Word  Chance
0      .    0.09
1    the    0.45
2     on    0.18
3  under    0.27

Select word: the


In [743]:
def continue_conversation_until_end(conversation: list, df: pd.DataFrame, token_dict: dict) -> list:
    """
    Keep appending predicted tokens to a conversation until the token 'U' has been appended.

    At least one token is always predicted, even when the conversation already ends with 'U'.
    The list passed in is extended in place and is also the object that is returned.

    Parameters:
    - conversation (list): Tokens that form the existing conversation.
    - df (pd.DataFrame): DataFrame containing the sequences and corresponding tokens to predict.
    - token_dict (dict): Dictionary that maps tokens to words.

    Returns:
    - list: The same list, now including the newly generated tokens.
    """
    next_char = None

    # next_char starts as None, so the body runs at least once before 'U' can stop the loop
    while next_char != 'U':
        # The last (up to) three tokens are the context for the next prediction
        next_char = predict_next_step(conversation[-3:], df, token_dict)

        # Report the chosen token together with its word
        chosen_word = token_dict.get(next_char, f"[Unknown: {next_char}]")
        print(f'Select token {next_char}: {chosen_word}\n')

        conversation.append(next_char)

    return conversation

In [748]:
# Generate tokens from an all-padding start until 'U' is produced; a copy is passed because
# continue_conversation_until_end appends to the list it receives
initial_conversation = ['U', 'U', 'U']
extended_conversation = continue_conversation_until_end(initial_conversation.copy(), conversation_df, token_dictionary)

Start A Random Conversation!
    Word  Chance
0    the    0.78
1     at    0.01
2     in    0.07
3     on    0.13
4  under    0.02

Select token T: under

Start Conversation With ['under']
  Word  Chance
0  the     1.0

Select token S: the

Start Conversation With ['under', 'the']
    Word  Chance
0  table     1.0

Select token R: table

Continue With ['under', 'the', 'table']
     Word  Chance
0     ate    0.10
1     lay    0.16
2     sat    0.19
3       .    0.53
4  chased    0.01
5      on    0.02

Select token U: .



In [749]:
# Display the generated token list (it still begins with the three 'U' padding tokens)
extended_conversation

['U', 'U', 'U', 'T', 'S', 'R', 'U']

In [750]:
# Redefine the function to decode a token array back to words and make the first word's first letter uppercase
def decode_tokens_to_words(token_array: list, token_dict: dict) -> str:
    """
    Decode an array of tokens back to a string of words based on a given token dictionary.

    Tokens missing from ``token_dict`` are rendered as "[Unknown: <token>]". The first word
    that is not a period gets its first character upper-cased; all other characters are
    left exactly as they are. Words are joined with single spaces, periods included.

    Parameters:
    - token_array (list): Array of tokens to decode.
    - token_dict (dict): Dictionary that maps tokens to words.

    Returns:
    - str: A string of words obtained by decoding the token array.
    """
    words = [token_dict.get(t, f"[Unknown: {t}]") for t in token_array]

    for position, word in enumerate(words):
        if word != '.':
            # Upper-case only the first character: str.capitalize() would also lower-case
            # the rest of the word and turn "[Unknown: Z]" into "[unknown: z]"
            words[position] = word[:1].upper() + word[1:]
            break

    return ' '.join(words)

In [751]:
# Decode the generated token list back into words and display the result
decoded_sentence = decode_tokens_to_words(extended_conversation, token_dictionary)
decoded_sentence

'. . . Under the table .'

In [752]:
# Generate a new token sequence from an all-padding start and decode it in a single step
sentence = decode_tokens_to_words(continue_conversation_until_end(['U', 'U', 'U'], conversation_df, token_dictionary), token_dictionary)

Start A Random Conversation!
    Word  Chance
0    the    0.78
1     at    0.01
2     in    0.07
3     on    0.13
4  under    0.02

Select token S: the

Start Conversation With ['the']
            Word  Chance
0            boy    0.23
1            cat    0.21
2            dog    0.12
3            jar    0.06
4          mouse    0.07
5         pickle    0.27
6            bed    0.00
7          chair    0.01
8   refrigerator    0.00
9          table    0.00
10         floor    0.01

Select token E: cat

Start Conversation With ['the', 'cat']
     Word  Chance
0     ate    0.09
1     lay    0.13
2     sat    0.13
3  chased    0.07
4      at    0.03
5      in    0.01
6      on    0.39
7   under    0.15

Select token L: lay

Continue With ['the', 'cat', 'lay']
    Word  Chance
0      .    0.06
1     in    0.12
2     on    0.62
3  under    0.19

Select token N: on

Continue With ['cat', 'lay', 'on']
  Word  Chance
0  the     1.0

Select token S: the

Continue With ['lay', 'on', 'the']
      

In [753]:
# Display the decoded sentence
sentence

'. . . The cat lay on the floor .'

In [754]:
# Define the function to encode a sentence, extend the conversation, and then decode it back to words
def encode_extend_decode(sentences_list: list, token_dict: dict, df: pd.DataFrame) -> list[str]:
    """
    Encode each sentence into tokens, extend it as a conversation, and decode the result back into words.

    Parameters:
    - sentences_list (list): The sentences (str) to start the conversations with.
    - token_dict (dict): Dictionary that maps tokens to words.
    - df (pd.DataFrame): DataFrame containing the sequences and corresponding tokens to predict.

    Returns:
    - list[str]: One extended, decoded conversation per input sentence, in input order.
    """
    decoded_conversations = []

    # All sentences are encoded up front; each one is then extended and decoded in turn
    for seed_tokens in convert_sentences_to_tokens(sentences_list, token_dict):
        full_tokens = continue_conversation_until_end(seed_tokens, df, token_dict)
        decoded_conversations.append(decode_tokens_to_words(full_tokens, token_dict))

    return decoded_conversations

In [766]:
# Extend four sentence openings of different lengths and display the decoded results
encode_extend_decode(['The boy ate the', 'The dog sat', 'The pickle', 'On'], token_dictionary, conversation_df)

Continue With ['boy', 'ate', 'the']
     Word  Chance
0  pickle     1.0

Select token O: pickle

Continue With ['ate', 'the', 'pickle']
    Word  Chance
0      .    0.14
1     in    0.76
2     at    0.01
3     on    0.05
4  under    0.04

Select token U: .

Continue With ['the', 'dog', 'sat']
    Word  Chance
0      .     0.2
1     in     0.2
2     on     0.4
3  under     0.2

Select token T: under

Continue With ['dog', 'sat', 'under']
  Word  Chance
0  the     1.0

Select token S: the

Continue With ['sat', 'under', 'the']
           Word  Chance
0           bed    0.33
1         chair    0.18
2         table    0.42
3  refrigerator    0.06

Select token R: table

Continue With ['under', 'the', 'table']
     Word  Chance
0     ate    0.10
1     lay    0.16
2     sat    0.19
3       .    0.53
4  chased    0.01
5      on    0.02

Select token U: .

Start Conversation With ['the', 'pickle']
    Word  Chance
0    lay    0.18
1    sat    0.14
2    jar    0.24
3     on    0.21
4  under    

['The boy ate the pickle .',
 'The dog sat under the table .',
 'The pickle on the refrigerator lay .',
 'On the boy on the floor ate under the table sat the boy .']