In [1]:
import codecs
import numpy as np
import pandas as pd
import re

In [2]:
# Opening files cell
# Read the whole source text. The context manager closes the input file even
# when reading or decoding raises.
with codecs.open("metro-2033.txt", "r", "utf_8_sig") as text_file:
    text = text_file.read()

# Output handles for the normalised text; they are written to and closed in
# the "Writing files" cell below.
file = codecs.open('text.txt', 'w', "utf_8_sig")
file_no_spaces = codecs.open('text-no-spaces.txt', 'w', "utf_8_sig")

In [3]:
# Writing files cell
# Normalise the text to the lowercase alphabet used by the analysis cells.
text = text.lower()
# 'ё' lies outside the 'а'-'я' code point range, so it must be mapped to 'е'
# BEFORE the filter below; otherwise the filter turns it into a space and the
# word is split in two.
text = text.replace('ё', 'е')
text = re.sub('[^а-я]', ' ', text)
text = text.replace('ъ', 'ь')

# Raw strings keep '\s' from being parsed as an (invalid) string escape.
text_no_spaces = re.sub(r'\s+', '', text)
text = re.sub(r'\s+', ' ', text)

try:
    file.write(text)
    file_no_spaces.write(text_no_spaces)
finally:
    # Close both output files even if one of the writes fails.
    file.close()
    file_no_spaces.close()

In [4]:
def count_probs(alphab, txt):
    """Estimate the relative frequency of each alphabet symbol in a text.

    Args:
        alphab: iterable of symbols (strings) to look for.
        txt: the text to analyse.

    Returns:
        dict mapping every symbol of ``alphab`` to
        ``txt.count(symbol) / len(txt)``. For an empty text every frequency
        is 0.0 (instead of raising ZeroDivisionError).
    """
    length = len(txt)
    if length == 0:
        # Nothing to count; avoid dividing by zero.
        return {symbol: 0.0 for symbol in alphab}
    return {symbol: txt.count(symbol) / length for symbol in alphab}

In [5]:
def count_bigram_probs(alphab, txt):
    """Estimate probabilities of overlapping bigrams in a text.

    Every position ``i`` of ``txt`` contributes the bigram ``txt[i:i+2]``, so
    a text of length ``n`` holds ``n - 1`` bigrams. Counting is done with a
    sliding window because ``str.count`` only counts non-overlapping matches
    and therefore undercounts doubled symbols (``'aaa'.count('aa') == 1``
    although the text holds two ``'aa'`` bigrams).

    Args:
        alphab: iterable of symbols; symbols are assumed to be single
            characters (the window is two characters wide).
        txt: the text to analyse.

    Returns:
        dict mapping every ``first + second`` combination of alphabet symbols
        to its relative frequency among the ``len(txt) - 1`` bigrams. Bigrams
        containing symbols outside the alphabet are not reported but still
        count towards the total. For texts shorter than two characters all
        probabilities are 0.0.
    """
    counts = {first + second: 0 for first in alphab for second in alphab}

    total = len(txt) - 1
    if total <= 0:
        # No bigram fits into the text; avoid a zero or negative divisor.
        return {bigram: 0.0 for bigram in counts}

    for pos in range(total):
        pair = txt[pos:pos + 2]
        if pair in counts:
            counts[pair] += 1

    return {bigram: count / total for bigram, count in counts.items()}


In [6]:
def count_bigram_probs_2(alphab, txt):
    """Estimate probabilities of non-overlapping bigrams in a text.

    The text is cut into consecutive pairs ``txt[0:2], txt[2:4], ...``; a
    trailing unpaired character is dropped, so there are ``len(txt) // 2``
    bigrams in total.

    Args:
        alphab: iterable of symbols; symbols are assumed to be single
            characters (each pair is two characters wide).
        txt: the text to analyse.

    Returns:
        dict mapping every ``first + second`` combination of alphabet symbols
        to its relative frequency among the ``len(txt) // 2`` pairs. Pairs
        containing symbols outside the alphabet are not reported but still
        count towards the total (previously they raised KeyError). For texts
        shorter than two characters all probabilities are 0.0.
    """
    counts = {first + second: 0 for first in alphab for second in alphab}

    total = len(txt) // 2
    if total == 0:
        # No complete pair in the text; avoid dividing by zero.
        return {bigram: 0.0 for bigram in counts}

    # Stopping at 2 * total skips the unpaired last character of odd-length text.
    for pos in range(0, 2 * total, 2):
        pair = txt[pos:pos + 2]
        if pair in counts:
            counts[pair] += 1

    return {bigram: count / total for bigram, count in counts.items()}

In [7]:
def count_entropy(probs, length):
    """Compute the entropy of a distribution, scaled by ``1 / length``.

    Args:
        probs: dict whose values are probabilities; zero entries are skipped
            because ``log2(0)`` is undefined.
        length: divisor applied to the entropy (the callers in this notebook
            pass 1 for single symbols and 2 for bigrams).

    Returns:
        ``-(1 / length) * sum(p * log2(p))`` over the non-zero probabilities.
    """
    weighted_log_sum = 0
    for p in probs.values():
        if p == 0:
            continue
        weighted_log_sum += p * np.log2(p)

    return (1 / length) * -weighted_log_sum

In [8]:
# Text with spaces
alphabet = list('абвгдежзийклмнопрстуфхцчшщыьэюя ')

# H1 comes from single-symbol frequencies; H2_1 and H2_2 come from the two
# bigram counters, with length=2 making count_entropy halve the result.
H1 = count_entropy(probs=count_probs(alphabet, text), length=1)
H2_1 = count_entropy(probs=count_bigram_probs(alphabet, text), length=2)
H2_2 = count_entropy(probs=count_bigram_probs_2(alphabet, text), length=2)

print('With spaces:', f'H1: {H1}, H2_1: {H2_1}, H2_2: {H2_2}', sep='\n')


def R(h):
    """Redundancy of entropy ``h`` relative to log2 of the alphabet size."""
    return 1 - h/np.log2(len(alphabet))


print(
    'Language redundancy:',
    f'for H1: {R(H1)}, for H2_1: {R(H2_1)}, for H2_2: {R(H2_2)}',
    sep='\n',
)

With spaces:
H1: 4.3719192771302975, H2_1: 3.968171239330339, H2_2: 3.9681105084082304
Language redundancy:
for H1: 0.12561614457394055, for H2_1: 0.20636575213393216, for H2_2: 0.20637789831835396


In [9]:
# Text without spaces
alphabet_no_spaces = list('абвгдежзийклмнопрстуфхцчшщыьэюя')

# Same three estimates as for the spaced text, computed on the text with all
# whitespace removed; length=2 makes count_entropy halve the bigram results.
H1 = count_entropy(
    probs=count_probs(alphabet_no_spaces, text_no_spaces),
    length=1,
)
H2_1 = count_entropy(
    probs=count_bigram_probs(alphabet_no_spaces, text_no_spaces),
    length=2,
)
H2_2 = count_entropy(
    probs=count_bigram_probs_2(alphabet_no_spaces, text_no_spaces),
    length=2,
)

print('No spaces:', f'H1: {H1}, H2_1: {H2_1}, H2_2: {H2_2}', sep='\n')


def R(h):
    """Redundancy of entropy ``h`` relative to log2 of the alphabet size."""
    return 1 - h/np.log2(len(alphabet_no_spaces))


print(
    'Language redundancy:',
    f'for H1: {R(H1)}, for H2_1: {R(H2_1)}, for H2_2: {R(H2_2)}',
    sep='\n',
)

No spaces:
H1: 4.453315120926602, H2_1: 4.136819747421218, H2_2: 4.13641832945434
Language redundancy:
for H1: 0.10110241057871183, for H2_1: 0.16498671262823406, for H2_2: 0.165067738478186


In [10]:
def write_to_xlsx(alphab, bigram_probs, path="D:/Лабs/test.xlsx"):
    """Write a bigram probability matrix to an Excel file.

    In the written sheet each column (after the first) belongs to the FIRST
    symbol of a bigram and each row (after the first) to the SECOND symbol.
    A space symbol is labelled ``'_'`` so it stays visible in the sheet.

    Args:
        alphab: iterable of alphabet symbols.
        bigram_probs: dict mapping ``first + second`` to a probability, as
            produced by ``count_bigram_probs``. Values are looked up by key,
            so the result does not depend on the dict's insertion order.
        path: destination of the workbook. Defaults to the location that
            used to be hard-coded, so existing calls keep working.

    Raises:
        KeyError: if ``bigram_probs`` lacks a bigram of two alphabet symbols.
    """
    symbols = list(alphab)
    labels = ['_' if symbol == ' ' else symbol for symbol in symbols]

    # Header line followed by one line per first symbol; the transpose below
    # turns these lines into spreadsheet columns.
    table = [[' '] + labels]
    for first, label in zip(symbols, labels):
        table.append([label] + [bigram_probs[first + second] for second in symbols])

    df = pd.DataFrame(table).T
    df.to_excel(excel_writer=path)

In [11]:
# Export the bigram probabilities of the space-free text to a spreadsheet.
write_to_xlsx(
    alphabet_no_spaces,
    count_bigram_probs(alphabet_no_spaces, text_no_spaces),
)