In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import io

In [None]:
def print_most_freq_symbols(text):
    """Return the most frequent element of ``text``.

    Despite the ``print_`` prefix, nothing is printed; the element is
    returned to the caller.

    Args:
        text: A sequence of hashable items, e.g. ``str`` or ``bytes``.

    Returns:
        The item with the highest number of occurrences. When several items
        share the highest count, the one that appears first in ``text`` wins.

    Raises:
        ValueError: If ``text`` is empty.
    """
    # One counting pass instead of calling text.count() once per distinct item.
    counts = Counter(text)
    # Counter preserves first-seen order and max() returns the first maximum,
    # so ties are resolved deterministically.
    return max(counts, key=counts.get)

def print_most_freq_bytes(freqs, n):
    """Print the ``n`` most frequent byte values, overall and among control bytes.

    Two lists are printed, each as ``<byte value> (<percent>%)`` pairs on a
    single line, most frequent first (ties go to the smaller byte value):

    1. the most common of all byte values in ``freqs``;
    2. the most common among the non-printable ASCII codes 0-31 and 127.

    Percentages are relative to the total of ``freqs`` and rounded to two
    decimals; entries that round to 0% are not shown. ``freqs`` is not
    modified.

    Args:
        freqs: Sequence of occurrence counts indexed by byte value.
        n: Maximum number of entries to print in each list.
    """
    # The total is the same for both lists, so compute it only once.
    total = sum(freqs)

    def print_top(byte_values):
        # Nothing to report for empty input; this also avoids dividing by zero.
        if total == 0:
            return
        ranked = sorted(byte_values, key=lambda byte: (-freqs[byte], byte))
        # Slicing tolerates n larger than the number of candidates.
        for byte in ranked[:max(n, 0)]:
            percent = round(freqs[byte] / total * 100, 2)
            if percent > 0:
                print(byte, f"({percent}%)", end=" ")

    print(f"{n} most common bytes:")
    print_top(range(len(freqs)))
    print("\n")

    # Keep the real byte values so that DEL is reported as 127 rather than as
    # its position in a compacted list.
    control_bytes = [byte for byte in list(range(32)) + [127] if byte < len(freqs)]
    print(f"{n} most common bytes of nonprintable ASCII table:")
    print_top(control_bytes)
    print()

def get_byte_freqs(text):
    """Count how often each byte value occurs in ``text``.

    Args:
        text: Iterable of integers in ``range(256)``, e.g. a ``bytes`` object.

    Returns:
        A list of 256 counts in which element ``b`` is the number of
        occurrences of byte value ``b``.
    """
    freqs = [0] * 256
    # Iterating a bytes object yields ints, which index the table directly.
    for byte in text:
        freqs[byte] += 1
    return freqs

def plot_freqs(freqs, bin_number, title):
    """Draw a density histogram of byte values on a new 10x6 inch figure.

    Args:
        freqs: Sequence of occurrence counts indexed by byte value.
        bin_number: Passed to ``plt.hist`` as ``bins``.
        title: Title shown above the plot.
    """
    # Hand the counts to hist() as weights instead of expanding them into a
    # list with one entry per input byte. Values that never occur are dropped
    # so the automatic bin range still spans only the values that are present.
    byte_values = [value for value, count in enumerate(freqs) if count > 0]
    counts = [count for count in freqs if count > 0]

    plt.figure(figsize=(10, 6))
    plt.xticks(np.arange(0, 256, 10))
    plt.hist(
        byte_values,
        # With no data at all, fall back to an unweighted empty sample.
        weights=counts or None,
        edgecolor="white",
        bins=bin_number,
        density=True,
    )
    plt.title(title)

In [None]:
template = "Керниган, Ричи. Язык C — "
extentions = ["dos", "iso", "koi8r", "maccyrillic", "utf8", "utf16", "utf32", "windows"]

# Number of top entries to report; later cells reuse this value.
N = 5

# For each encoding of the text, report and plot its byte distribution.
for ext in extentions:
    print("Encoding:", ext)
    # Binary mode: the raw bytes are analysed, so no decoding is applied.
    # The context manager closes the file even if reading fails.
    with open(template + ext + ".txt", "rb") as byte_file:
        byte_text = byte_file.read()

    byte_freqs = get_byte_freqs(byte_text)
    print_most_freq_bytes(byte_freqs, N)
    plot_freqs(byte_freqs, 20, ext)
    print("-" * 50)
    
print("Most common symbols in text:")
# Decode explicitly as UTF-8: the platform default encoding is not UTF-8
# everywhere and would garble or reject the Cyrillic text.
with open(template + "utf8" + ".txt", "r", encoding="utf-8") as file:
    text = file.read()
counter = Counter(text)
# most_common() is evaluated once; it may yield fewer than N pairs when the
# text has fewer distinct symbols, which the loop handles naturally.
for symbol, count in counter.most_common(N):
    print(f"\"{symbol}\" ({round(count / len(text) * 100, 2)}%)", sep="", end="   ")

In [None]:
# Byte statistics for '4.txt', read as raw bytes (no decoding applied).
# The context manager closes the file even if reading fails.
with open('4.txt', 'rb') as file:
    text = file.read()

freqs = get_byte_freqs(text)
print_most_freq_bytes(freqs, N)
plot_freqs(freqs, 20, "4.txt")

In [None]:
# Byte statistics for '2.txt', read as raw bytes (no decoding applied).
# The context manager closes the file even if reading fails.
with open('2.txt', 'rb') as file:
    text = file.read()

freqs = get_byte_freqs(text)
print_most_freq_bytes(freqs, N)
plot_freqs(freqs, 20, "2.txt")

In [None]:
# Decode '2.txt' as KOI8-R and show the resulting text.
# The builtin open() is the same function as io.open; the context manager
# closes the file even if decoding fails part-way through.
with open('2.txt', mode='r', encoding="koi8-r") as file:
    text = file.read()

print(text)