In [272]:
# Load the tokenized file (one token string per line) so its layout can be inspected before it is modified.
file_path = 'Resource/tokenized_sentences_output.txt'

# Iterating the handle with list() yields the same newline-terminated lines as readlines().
with open(file_path, mode='r') as file:
    content = list(file)

# Preview the first five raw lines.
content[:5]

['SDBU\n', 'SDLU\n', 'SDQU\n', 'SEBU\n', 'SELU\n']

In [273]:
# Prefix every line with four 'U' characters after trimming surrounding whitespace (including the newline).
modified_content = [f"UUUU{line.strip()}" for line in content]

# Persist the padded lines, one per row, to a separate file so the original stays untouched.
modified_file_path = 'Resource/modified_tokenized_sentences_output.txt'
with open(modified_file_path, mode='w') as file:
    file.writelines(f"{padded}\n" for padded in modified_content)

# Re-read what was just written as a sanity check on the format.
with open(modified_file_path, mode='r') as file:
    modified_content_check = list(file)

# Show the first five written lines together with the output path.
modified_content_check[:5], modified_file_path

(['UUUUSDBU\n', 'UUUUSDLU\n', 'UUUUSDQU\n', 'UUUUSEBU\n', 'UUUUSELU\n'],
 'Resource/modified_tokenized_sentences_output.txt')

In [274]:
import pandas as pd
from collections import defaultdict

# Define a function to get all overlapping 5-character sequences in the content
def get_all_overlapping_sequences(lines, sequence_length=5):
    """Count which character follows each prefix across all sliding windows.

    Every line is stripped of surrounding whitespace and scanned with a window of
    ``sequence_length`` characters that advances one character at a time. Each
    window is split into its first ``sequence_length - 1`` characters (the prefix)
    and its last character (the continuation).

    Args:
        lines: Iterable of strings to scan.
        sequence_length: Size of the sliding window (prefix plus one character).

    Returns:
        A nested ``defaultdict`` mapping prefix -> next character -> number of
        occurrences. Lines shorter than ``sequence_length`` contribute nothing.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for raw_line in lines:
        text = raw_line.strip()
        window_total = len(text) - sequence_length + 1
        windows = (text[start:start + sequence_length] for start in range(window_total))
        for window in windows:
            # Everything but the last character is the context; the last one is what followed it.
            counts[window[:-1]][window[-1]] += 1
    return counts

# Count, for every prefix found in the padded lines, how often each next character follows it.
# The function's default window is 5 characters: a 4-character prefix plus the character being counted.
all_overlapping_sequences = get_all_overlapping_sequences(modified_content)
# Show the counts as a table (one column per prefix, one row per next character); pairs never seen become 0.
pd.DataFrame(all_overlapping_sequences).fillna(0)

Unnamed: 0,UUUU,UUUS,UUSD,USDB,USDL,USDQ,UUSE,USEB,USEL,USEQ,...,SFLJ,FLJS,SRLJ,RLJS,SOKT,OKTS,SOKS,OKSD,LJSO,OKSO
S,555.0,0.0,0.0,22.0,0.0,0.0,0.0,5.0,0.0,0.0,...,2.0,0.0,2.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0
A,6.0,0.0,8.0,1.0,0.0,1.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
J,49.0,0.0,20.0,3.0,1.0,2.0,1.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N,92.0,0.0,32.0,0.0,2.0,2.0,47.0,2.0,10.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T,11.0,0.0,23.0,0.0,2.0,0.0,18.0,3.0,3.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D,0.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
E,0.0,119.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,18.0,0.0
M,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [275]:
# Turn the raw follow-up counts into conditional probabilities: within each prefix,
# every count is divided by the total number of times that prefix was observed.
overlapping_probabilities = {}
for context, next_counts in all_overlapping_sequences.items():
    denominator = sum(next_counts.values())
    distribution = {}
    for symbol, occurrences in next_counts.items():
        distribution[symbol] = occurrences / denominator
    overlapping_probabilities[context] = distribution

# Same table layout as the counts: one column per prefix, 0 where a continuation never occurred.
pd.DataFrame(overlapping_probabilities).fillna(0)

Unnamed: 0,UUUU,UUUS,UUSD,USDB,USDL,USDQ,UUSE,USEB,USEL,USEQ,...,SFLJ,FLJS,SRLJ,RLJS,SOKT,OKTS,SOKS,OKSD,LJSO,OKSO
S,0.778401,0.0,0.0,0.814815,0.0,0.0,0.0,0.454545,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
A,0.008415,0.0,0.0625,0.037037,0.0,0.166667,0.02521,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
J,0.068724,0.0,0.15625,0.111111,0.166667,0.333333,0.008403,0.0,0.125,0.133333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N,0.129032,0.0,0.25,0.0,0.333333,0.333333,0.394958,0.181818,0.625,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T,0.015428,0.0,0.179688,0.0,0.333333,0.0,0.151261,0.272727,0.1875,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D,0.0,0.230631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
E,0.0,0.214414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H,0.0,0.122523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K,0.0,0.063063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0
M,0.0,0.07027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [276]:
import random

# Define the text predictor function
def text_predictor(input_text, probabilities_dict, sequence_length=4):
    """Sample the next character for the tail of ``input_text``.

    Args:
        input_text: Text whose last ``sequence_length`` characters form the context.
        probabilities_dict: Mapping of context -> {next character: probability}.
        sequence_length: Number of trailing characters used as the context.

    Returns:
        A randomly drawn next character, weighted by its probability. Note that
        failures are also reported as plain strings: "Input text is too short."
        when the input has fewer than ``sequence_length`` characters, and
        "Context not found in the probability dictionary." when the context has
        no (or an empty) entry.
    """
    if len(input_text) < sequence_length:
        return "Input text is too short."

    # An absent context and an empty distribution are treated the same way.
    distribution = probabilities_dict.get(input_text[-sequence_length:])
    if not distribution:
        return "Context not found in the probability dictionary."

    # Split the mapping into parallel candidate/weight sequences for the weighted draw.
    candidates, weights = zip(*distribution.items())
    (drawn,) = random.choices(candidates, weights=weights, k=1)
    return drawn

# Demonstrate the text predictor with a sample context.
# The draw is random and no seed is set in this cell, so re-running it can return a different character.
sample_context = "UUUS"
predicted_character = text_predictor(sample_context, overlapping_probabilities)
predicted_character

'M'

In [277]:
# We will adjust the function to ensure that the starting 'U's are removed from the generated text, even if the 'U' is part of the starting text.

def continue_text_generation_no_u_at_all(start_text, probabilities_dict, end_char='U', sequence_length=4):
    """Extend ``start_text`` one sampled character at a time until the end marker is drawn.

    Args:
        start_text: Seed text; its last ``sequence_length`` characters form the first context.
        probabilities_dict: Mapping of context -> {next character: probability}.
        end_char: Character whose prediction ends generation; it is not appended.
        sequence_length: Number of trailing characters used as the context.

    Returns:
        The seed plus all generated characters, with every leading 'U' removed.
        Generation also stops, returning what was produced so far, when the
        current context is shorter than ``sequence_length`` or has no entry in
        ``probabilities_dict``.
    """
    generated_text = start_text

    while True:
        # Get the last 'sequence_length' characters as the context
        context = generated_text[-sequence_length:]

        # text_predictor reports an unusable context by returning an error sentence
        # rather than a character. Appending that sentence produced a context that
        # is never in the dictionary either, so the loop used to run forever.
        if len(context) < sequence_length or not probabilities_dict.get(context):
            break

        next_char = text_predictor(context, probabilities_dict, sequence_length)

        # The end marker terminates the sequence and is deliberately left out of it.
        if next_char == end_char:
            break

        generated_text += next_char

    # Remove all leading 'U's (the padding in front of the seed) from the generated text
    return generated_text.lstrip('U')

In [278]:
# Generate one token sequence from the seed "UUUS"; the function strips the leading 'U's from its result.
# Sampling is random, so the sequence shown below changes from run to run.
start_text = "UUUS"
generated_text_no_u_at_all = continue_text_generation_no_u_at_all(start_text, overlapping_probabilities)
generated_text_no_u_at_all

'SEQNSF'

In [282]:
# Load the token table so the encoded result can later be translated back into words.
tokens_file_path = 'Resource/tokens.txt'

# Iterating the handle with list() yields the same newline-terminated lines as readlines().
with open(tokens_file_path, mode='r') as file:
    tokens_content = list(file)

# Preview the first five entries to see the line format.
tokens_content[:5]

['A: " at"\n', 'B: " ate"\n', 'C: " bed"\n', 'D: " boy"\n', 'E: " cat"\n']

In [283]:
# Create a dictionary from the tokens file
token_dict = {}

# Each line is expected to look like  X: " word"  (token, colon + space, quoted fragment).
for line in tokens_content:
    entry = line.strip()
    if not entry:
        # A blank line (e.g. a trailing newline at the end of the file) used to crash the unpacking.
        continue

    # partition splits on the FIRST ': ' only, so a fragment containing ': ' is kept intact.
    token, separator, word_fragment = entry.partition(': ')
    if not separator:
        raise ValueError(f"Malformed token line (expected 'X: \"fragment\"'): {line!r}")

    # Remove the quotes around the word fragment
    word_fragment = word_fragment.strip('"')
    token_dict[token] = word_fragment

# Decode the most recently generated token sequence; tokens missing from the dictionary are dropped.
decoded_text = ''.join([token_dict.get(token, '') for token in generated_text_no_u_at_all])

decoded_text

' the cat sat on the chair'

In [284]:
# Capitalize the first letter, strip the leading space, and add a period at the end.
# Note: str.capitalize() also lower-cases every character after the first one.
formatted_decoded_text = decoded_text.strip().capitalize() + '.'

formatted_decoded_text

'The cat sat on the chair.'

In [319]:
# Further adjust the function to ensure that the spacing is consistent, even when the first word is selected.
def text_predictor_with_consistent_spacing(input_text, probabilities_dict, sequence_length=4, token_dict=None):
    """Sample the next character and describe the distribution it was drawn from.

    Args:
        input_text: Text whose last ``sequence_length`` characters form the context.
        probabilities_dict: Mapping of context -> {next character: probability}.
        sequence_length: Number of trailing characters used as the context.
        token_dict: Optional mapping of character -> display text. When omitted,
            the notebook-level ``token_dict`` is used if one exists (the
            behaviour before this parameter was added); otherwise characters
            are shown as-is.

    Returns:
        A ``(description, predicted_char)`` tuple. ``description`` lists every
        candidate in descending probability as "<text> <percent>%", with the
        drawn candidate upper-cased and wrapped in ``>>...<<``. On failure
        ``predicted_char`` is the empty string and ``description`` holds the
        error message.
    """
    if token_dict is None:
        # Backward compatibility: earlier callers relied on the global token table.
        token_dict = globals().get('token_dict', {})

    if len(input_text) < sequence_length:
        return "Input text is too short.", ""

    context = input_text[-sequence_length:]
    next_char_probs = probabilities_dict.get(context)

    if not next_char_probs:
        return "Context not found in the probability dictionary.", ""

    next_chars = list(next_char_probs.keys())
    probabilities = list(next_char_probs.values())
    predicted_char = random.choices(next_chars, weights=probabilities, k=1)[0]

    # Most likely candidates first; ties keep their original dictionary order.
    sorted_probs = sorted(next_char_probs.items(), key=lambda item: item[1], reverse=True)

    def _describe(char, prob):
        # Highlight the candidate that was actually drawn.
        label = token_dict.get(char, char)
        if char == predicted_char:
            return f">>{label.upper()}<< {prob*100:.2f}%"
        return f"{label} {prob*100:.2f}%"

    probabilities_output = ', '.join(_describe(char, prob) for char, prob in sorted_probs)

    return probabilities_output, predicted_char

def text_generation_with_consistent_spacing(start_text, probabilities_dict, token_dict, end_char='U', sequence_length=4):
    """Generate one sentence and a per-step log of the candidate probabilities.

    Args:
        start_text: Seed text; its last ``sequence_length`` characters form the first context.
        probabilities_dict: Mapping of context -> {next character: probability}.
        token_dict: Mapping of character -> display text, used both for decoding
            the sentence and for the probability log.
        end_char: Character whose prediction ends generation.
        sequence_length: Number of trailing characters used as the context.

    Returns:
        A ``(decoded_sentence, output_text)`` tuple. ``decoded_sentence`` is the
        decoded text with leading 'U's removed, surrounding whitespace stripped,
        capitalized, and terminated by a period. ``output_text`` contains one
        newline-terminated line per prediction step. If a step cannot be
        predicted (context too short or unseen), its error message becomes the
        last log line and generation stops.
    """
    generated_text = start_text
    output_lines = []

    while True:
        context = generated_text[-sequence_length:]
        # Pass the caller's token table through so the log and the decoded sentence agree.
        probabilities_output, next_char = text_predictor_with_consistent_spacing(
            context, probabilities_dict, sequence_length, token_dict=token_dict
        )
        output_lines.append(probabilities_output)

        if not next_char:
            # No prediction possible. Previously the empty string was appended and the
            # same failing context was retried forever.
            break

        if next_char == end_char:
            # Keep the end marker (so it is decoded through token_dict) only when the
            # text is already longer than the context window.
            if len(generated_text) > sequence_length:
                generated_text += next_char
            break

        generated_text += next_char

    output_text = ''.join(line + '\n' for line in output_lines)

    generated_text = generated_text.lstrip('U')
    decoded_sentence = ''.join([token_dict.get(token, token) for token in generated_text]).strip().capitalize()

    if not decoded_sentence.endswith('.'):
        decoded_sentence += '.'

    return decoded_sentence, output_text

# Simulate one iteration of the loop: generate a single sentence from the all-'U' seed.
# The result is a (sentence, per-step probability log) pair; sampling is random, so it varies per run.
start_text = "UUUU"
decoded_sentence, output_text = text_generation_with_consistent_spacing(start_text, overlapping_probabilities, token_dict)
decoded_sentence, output_text

('The chair lay under the table.',
 '>> THE<< 77.84%,  on 12.90%,  in 6.87%,  under 1.54%,  at 0.84%\n pickle 26.67%,  boy 23.06%,  cat 21.44%,  dog 12.25%,  mouse 7.03%,  jar 6.31%,  floor 1.44%, >> CHAIR<< 1.26%,  bed 0.18%,  refrigerator 0.18%,  table 0.18%\n sat 42.86%,  on 28.57%, >> LAY<< 28.57%\n on 50.00%, >> UNDER<< 50.00%\n>> THE<< 100.00%\n>> TABLE<< 100.00%\n>>.<< 100.00%\n')

In [320]:
# print() renders the embedded newlines, showing one prediction step per line.
print(output_text)

>> THE<< 77.84%,  on 12.90%,  in 6.87%,  under 1.54%,  at 0.84%
 pickle 26.67%,  boy 23.06%,  cat 21.44%,  dog 12.25%,  mouse 7.03%,  jar 6.31%,  floor 1.44%, >> CHAIR<< 1.26%,  bed 0.18%,  refrigerator 0.18%,  table 0.18%
 sat 42.86%,  on 28.57%, >> LAY<< 28.57%
 on 50.00%, >> UNDER<< 50.00%
>> THE<< 100.00%
>> TABLE<< 100.00%
>>.<< 100.00%



In [323]:
# Define a loop that generates sentences, displays the progress, and asks the user whether they want to continue.
# The last newline character has been removed as per the request.

def interactive_text_generation_loop():
    """Generate and print sentences until the user declines to continue.

    Each round generates one sentence from the "UUUU" seed using the
    notebook-level ``overlapping_probabilities`` and ``token_dict``, prints the
    per-step probability log followed by the sentence, and then asks whether to
    go on. The loop continues on "y" or "yes" (any letter case, surrounding
    whitespace ignored); any other answer stops it.
    """
    user_input = 'y'
    # The prompt offers "yes/no", so the full word must be accepted too;
    # the previous comparison against 'y' alone made "yes" end the loop.
    while user_input.strip().lower() in ('y', 'yes'):
        start_text = "UUUU"
        decoded_sentence, output_text = text_generation_with_consistent_spacing(
            start_text, overlapping_probabilities, token_dict
        )
        print(output_text)  # Print the progress with line spacing
        print(decoded_sentence)  # Print the output sentence
        user_input = input('Do you want to continue? (yes/no): ')  # Ask the user to continue or not
# Start the interactive loop. This cell blocks on input() after every generated sentence
# until the user gives an answer that ends the loop.
interactive_text_generation_loop()

>> THE<< 77.84%,  on 12.90%,  in 6.87%,  under 1.54%,  at 0.84%
 pickle 26.67%,  boy 23.06%,  cat 21.44%, >> DOG<< 12.25%,  mouse 7.03%,  jar 6.31%,  floor 1.44%,  chair 1.26%,  bed 0.18%,  refrigerator 0.18%,  table 0.18%
>> ON<< 39.71%,  lay 17.65%,  under 16.18%,  chased 11.76%,  ate 7.35%,  sat 7.35%
>> THE<< 100.00%
 floor 51.72%, >> BOY<< 31.03%,  bed 17.24%
>> IN<< 33.33%,  on 22.22%, . 22.22%,  lay 11.11%,  under 11.11%
>> THE<< 72.73%,  bed 27.27%
>> CHAIR<< 59.46%,  bed 40.54%
>> ATE<< 27.27%,  at 27.27%,  sat 18.18%,  lay 18.18%, . 9.09%
 the 66.67%, . 16.67%, >> AT<< 16.67%
>> THE<< 100.00%
>> TABLE<< 100.00%
>>.<< 100.00%

The dog on the boy in the chair ate at the table.
 the 77.84%, >> ON<< 12.90%,  in 6.87%,  under 1.54%,  at 0.84%
>> THE<< 100.00%
 boy 27.17%,  bed 26.09%,  chair 13.04%,  floor 11.96%,  refrigerator 10.87%, >> TABLE<< 10.87%
 sat 60.00%, >> LAY<< 40.00%
. 50.00%, >> THE<< 50.00%
>> CAT<< 33.33%,  pickle 33.33%,  dog 16.67%,  jar 8.33%,  mouse 8.33%
>>.

https://chat.openai.com/share/ab0b82d1-a359-48ff-bc13-01ec43594b1e