In [22]:
import re
import pandas as pd
from collections import Counter
import numpy as np
import math

In [23]:
def clean_bible(file_path, remove_spaces=False):
    """Read a UTF-8 text file and reduce it to lowercase Russian letters.

    Only the characters ``а-я``, ``А-Я``, ``ё`` and ``Ё`` survive the filter;
    the result is lowercased before being returned.

    Args:
        file_path: Path of the text file to read (opened as UTF-8).
        remove_spaces: If True, every non-letter (including all whitespace)
            is dropped, so the result is one unbroken run of letters.
            If False, words stay separated by exactly one space and the
            result has no leading or trailing space.

    Returns:
        The cleaned, lowercased text as a single string.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    if remove_spaces:
        text = re.sub(r'[^а-яА-ЯёЁ]', '', text)
    else:
        # Map every whitespace character (newline, tab, ...) to a plain space
        # BEFORE filtering. The filter below only keeps the literal space, so
        # without this step a line break between two words would be deleted
        # and the words would be glued together.
        text = re.sub(r'\s', ' ', text)
        text = re.sub(r'[^а-яА-ЯёЁ ]', '', text)
        # Removing punctuation can leave runs of spaces; collapse them.
        text = re.sub(r' +', ' ', text)
        text = text.strip()

    return text.lower()

# Produce both cleaned variants of the corpus and write each one to disk.
# The names bound here are notebook globals.
file_path = "Bible.txt"
# Variant 1: letters with single spaces kept between words.
cleaned_bible = clean_bible(file_path, remove_spaces=False)
output_file_path = "cleaned_bible.txt"
with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(cleaned_bible)

# Variant 2: letters only. Note that `output_file_path` is rebound here, so
# after this cell it refers to the no-spaces file.
cleaned_bible_wo_spaces = clean_bible(file_path, remove_spaces=True)
output_file_path = "cleaned_bible_no_spaces.txt"
with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(cleaned_bible_wo_spaces)

In [24]:
def frequency_counter(file_path, output_path):
    """Write a table of relative character frequencies to an Excel file.

    The file at ``file_path`` is read as UTF-8, every character is counted,
    and each count is divided by the total number of characters. Rows are
    ordered from the most to the least frequent character and saved to
    ``output_path`` with the columns ``Character`` and ``Frequency``.

    Side effect: sets the global pandas option ``display.float_format`` to
    six decimal places.

    Args:
        file_path: Path of the text file to analyse.
        output_path: Destination path of the Excel workbook.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        counts = Counter(source.read())

    total = sum(counts.values())

    # Build (character, share) pairs in first-seen order, then sort them
    # by share; the sort is stable, so ties keep their first-seen order.
    rows = [(symbol, hits / total) for symbol, hits in counts.items()]
    rows.sort(key=lambda pair: pair[1], reverse=True)

    table = pd.DataFrame(rows, columns=['Character', 'Frequency'])
    pd.set_option('display.float_format', lambda x: '%.6f' % x)
    table.to_excel(output_path, index=False)

# Letter-frequency table for the cleaned text that keeps spaces.
file_path_with_spaces = "cleaned_bible.txt"
output_path_with_spaces = "letter_frequency.xlsx"
frequency_counter(file_path_with_spaces, output_path_with_spaces)

# Letter-frequency table for the cleaned text with spaces removed.
file_path_no_spaces = "cleaned_bible_no_spaces.txt"
output_path_no_spaces = "letter_frequency_no_spaces.xlsx"
frequency_counter(file_path_no_spaces, output_path_no_spaces)

In [25]:
def bigram_frequency_counter(text, overlapping=True):
    """Count two-character substrings (bigrams) of ``text``.

    Args:
        text: The string to scan.
        overlapping: If True the window advances one character at a time
            (``ab``, ``bc``, ``cd`` ...); if False it advances by two
            (``ab``, ``cd`` ...), so a trailing unpaired character is ignored.

    Returns:
        A tuple ``(bigrams, total_bigrams)`` where ``bigrams`` is a dict
        mapping each bigram to its number of occurrences and
        ``total_bigrams`` is the sum of those counts.
    """
    stride = 1 if overlapping else 2
    # The last usable start index is len(text) - 2, so every slice is
    # exactly two characters long.
    starts = range(0, len(text) - 1, stride)
    tally = Counter(text[pos:pos + 2] for pos in starts)
    return dict(tally), sum(tally.values())

def create_bigram_matrix(bigrams, total_bigrams):
    """Arrange relative bigram frequencies in a square matrix.

    The axis labels are the space character, the letters ``а`` .. ``я`` in
    code-point order, and finally ``ё``. Cell ``[row, col]`` holds
    ``count / total_bigrams`` for the bigram whose first character labels
    the row and whose second character labels the column; all other cells
    stay at 0.0.

    Args:
        bigrams: Mapping of two-character strings to occurrence counts.
        total_bigrams: Divisor used to turn counts into frequencies.

    Returns:
        A tuple ``(symbols, matrix)``: the list of axis labels and the
        ``len(symbols) x len(symbols)`` float array.

    Raises:
        ValueError: If a bigram contains a character that is not one of
            the axis labels.
    """
    letters = ''.join(chr(code) for code in range(ord('а'), ord('я') + 1))
    symbols = list(' ' + letters + 'ё')

    side = len(symbols)
    matrix = np.zeros((side, side), dtype=float)

    for pair, occurrences in bigrams.items():
        matrix[symbols.index(pair[0]), symbols.index(pair[1])] = occurrences / total_bigrams

    return symbols, matrix

def save_matrix_to_excel(symbols, matrix, filename):
    """Save a labelled frequency matrix to an Excel file.

    Values are rounded to six decimal places and ``symbols`` is used for
    both the row index and the column headers.

    Side effect: sets the global pandas option ``display.float_format`` to
    six decimal places.

    Args:
        symbols: Labels for the rows and the columns.
        matrix: Square table of frequencies matching ``symbols`` in size.
        filename: Destination path of the Excel workbook.
    """
    rounded = np.round(matrix, decimals=6)
    table = pd.DataFrame(rounded, index=symbols, columns=symbols)
    pd.set_option('display.float_format', lambda x: '%.6f' % x)
    table.to_excel(filename, index=True)

def process_text(file_path, output_path_intersecting, output_path_non_intersecting):
    """Build and save both bigram frequency matrices for one text file.

    The file is read once as UTF-8. The overlapping-bigram matrix is
    computed and written first, then the non-overlapping one.

    Args:
        file_path: Path of the cleaned text file.
        output_path_intersecting: Excel path for the overlapping-bigram matrix.
        output_path_non_intersecting: Excel path for the non-overlapping-bigram matrix.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        corpus = source.read()

    # (overlapping flag, destination) in the order the files are produced.
    variants = (
        (True, output_path_intersecting),
        (False, output_path_non_intersecting),
    )
    for overlap, destination in variants:
        counts, total = bigram_frequency_counter(corpus, overlapping=overlap)
        symbols, matrix = create_bigram_matrix(counts, total)
        save_matrix_to_excel(symbols, matrix, destination)

def main():
    """Generate the bigram matrices for both cleaned text variants.

    Each job is (input text, overlapping-matrix output, non-overlapping-matrix
    output); the with-spaces text is processed before the no-spaces text.
    """
    jobs = (
        (
            "cleaned_bible.txt",
            "bigram_matrix_intersecting.xlsx",
            "bigram_matrix_non_intersecting.xlsx",
        ),
        (
            "cleaned_bible_no_spaces.txt",
            "bigram_matrix_intersecting_no_spaces.xlsx",
            "bigram_matrix_non_intersecting_no_spaces.xlsx",
        ),
    )
    for job in jobs:
        process_text(*job)

# Run the bigram-matrix pipeline when this code executes as the top-level
# module (NOTE(review): a notebook kernel is expected to satisfy this check
# as well — confirm for your runner).
if __name__ == "__main__":
    main()

In [27]:
def calculate_entropy(frequencies):
    """Return the Shannon entropy, in bits, of a frequency table.

    Args:
        frequencies: Mapping whose values are relative frequencies.
            Values that are not greater than zero are skipped.

    Returns:
        ``-sum(p * log2(p))`` over the positive values.
    """
    # Keep the built-in sum(): switching to a manual accumulator could
    # change the floating-point result.
    terms = [share * math.log2(share) for share in frequencies.values() if share > 0]
    return -sum(terms)

def calculate_bigram_entropy(frequencies):
    """Return the per-character entropy derived from bigram frequencies.

    Args:
        frequencies: Mapping of bigrams to relative frequencies.

    Returns:
        The entropy of the bigram distribution divided by 2 (a bigram
        spans two characters).
    """
    pair_entropy = calculate_entropy(frequencies)
    return pair_entropy / 2

def bigram_frequency(text, step=1):
    """Return the relative frequency of every bigram in ``text``.

    Args:
        text: The string to scan.
        step: Distance between consecutive window starts; 1 yields
            overlapping bigrams, 2 yields non-overlapping ones.

    Returns:
        A dict mapping each two-character substring to its count divided
        by the total number of bigrams taken. Empty when ``text`` has
        fewer than two characters.
    """
    # Start indices stop at len(text) - 2, so each slice has length 2.
    tally = Counter(text[pos:pos + 2] for pos in range(0, len(text) - 1, step))
    n_pairs = sum(tally.values())
    return {pair: hits / n_pairs for pair, hits in tally.items()}

def calculate_bigram_redundancy(bi_entropy, alphabet_len):
    """Return the redundancy implied by an entropy value and alphabet size.

    Args:
        bi_entropy: Entropy per character, in bits.
        alphabet_len: Number of distinct symbols in the alphabet.

    Returns:
        ``1 - bi_entropy / log2(alphabet_len)``.
    """
    max_entropy = math.log2(alphabet_len)
    return 1 - bi_entropy / max_entropy

def process_text(file_path, step1, step2):
    """Compute entropy and redundancy figures for one text file.

    NOTE: an earlier cell defines a different ``process_text`` (bigram
    matrices); once this cell runs, this definition replaces it.

    The alphabet size is the number of distinct characters found in the
    file, and the maximum entropy is ``log2`` of that size.

    Args:
        file_path: Path of the cleaned text file (read as UTF-8).
        step1: Window step for the first bigram pass.
        step2: Window step for the second bigram pass.

    Returns:
        A 6-tuple ``(H1, R1, H2_step1, R2_step1, H2_step2, R2_step2)``:
        single-character entropy and redundancy, followed by the bigram
        entropy and redundancy for each of the two steps.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        corpus = source.read()

    alphabet_size = len(set(corpus))
    max_entropy = math.log2(alphabet_size)

    # Single-character statistics.
    char_counts = Counter(corpus)
    n_chars = sum(char_counts.values())
    char_shares = {}
    for symbol, occurrences in char_counts.items():
        char_shares[symbol] = occurrences / n_chars
    H1 = calculate_entropy(char_shares)
    redundancy1 = 1 - (H1 / max_entropy)

    # Bigram statistics, once per requested step, in the order given.
    per_step = []
    for stride in (step1, step2):
        pair_shares = bigram_frequency(corpus, step=stride)
        h2 = calculate_bigram_entropy(pair_shares)
        r2 = calculate_bigram_redundancy(h2, alphabet_size)
        per_step.extend((h2, r2))

    return (H1, redundancy1, *per_step)

def main():
    """Collect entropy/redundancy results for both texts into one Excel table.

    NOTE: an earlier cell also defines ``main``; once this cell runs, this
    definition replaces it.

    Each output row is ``[label, entropy, redundancy]``; the values come
    from consecutive positions of the tuple returned by ``process_text``.
    """
    with_spaces = process_text("cleaned_bible.txt", step1=1, step2=2)
    no_spaces = process_text("cleaned_bible_no_spaces.txt", step1=1, step2=2)

    # (row label, result tuple, index of the entropy value in that tuple);
    # the redundancy sits right after the entropy.
    layout = (
        ('H1 with spaces', with_spaces, 0),
        ('H1 without spaces', no_spaces, 0),
        ('H2 with spaces (intersecting)', with_spaces, 2),
        ('H2 with spaces (non-intersecting)', with_spaces, 4),
        ('H2 without spaces (intersecting)', no_spaces, 2),
        ('H2 without spaces (non-intersecting)', no_spaces, 4),
    )
    rows = [[label, values[at], values[at + 1]] for label, values, at in layout]

    table = pd.DataFrame(rows, columns=['Type', 'Entropy', 'Redundancy'])
    table.to_excel("entropy_and_redundancy_results.xlsx", index=False)

# Run the entropy/redundancy pipeline when this code executes as the
# top-level module (NOTE(review): a notebook kernel is expected to satisfy
# this check as well — confirm for your runner).
if __name__ == "__main__":
    main()
