In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Consider the given text (’text’). Consider each character (including spaces,
punctuation and newline) as sampled from a space of characters with a given
probability. Estimate:

1. The Shannon entropy of the text
2. The Conditional entropy of a character, given previous character
3. The Mutual information between a character and the previous one


In [2]:
with open('text.txt', mode='r') as f:
    lines = f.read()
    
for i,s in enumerate(lines):
    print(i, s, type(s))
    if i==5:
        break

0 A <class 'str'>
1 C <class 'str'>
2 T <class 'str'>
3   <class 'str'>
4 I <class 'str'>
5 
 <class 'str'>


In [3]:
text = []
with open('text.txt', mode='r') as f:
    lines = f.read()
    text = np.zeros(len(lines), dtype='str')
for i,s in enumerate(lines):
    text[i] = s
    
characters, index, counts = np.unique(text, return_index=True, return_counts=True )
prob_mass_func = counts/len(text)

In [4]:
for i in zip(characters, index, prob_mass_func):
    print(i)

('\n', 5, 0.0418641464632664)
(' ', 3, 0.22452863967435857)
('!', 148, 0.002280243854177463)
('&', 69445, 2.8267485795588387e-05)
("'", 241, 0.00554984971120052)
(',', 170, 0.014962922481131452)
('-', 1836, 0.0026288761789897202)
('.', 13, 0.007208208877875039)
(':', 31, 0.0025346512263377586)
(';', 1150, 0.003137690923310311)
('?', 190, 0.0016583591666745187)
('A', 0, 0.01080760206917996)
('B', 116, 0.002638298674254916)
('C', 1, 0.0019316115293652066)
('D', 560, 0.002186018901525502)
('E', 8, 0.004701825137332869)
('F', 557, 0.001309726841862262)
('G', 205, 0.0009045595454588284)
('H', 166, 0.001969301510425991)
('I', 4, 0.010195139876942212)
('J', 13976, 0.0001319149337127458)
('K', 9496, 0.00012249243844754968)
('L', 530, 0.0031000009422495266)
('M', 103, 0.0017996965956524607)
('N', 9, 0.00717994139207945)
('O', 15, 0.008188148385455436)
('P', 652, 0.003646505667630902)
('R', 559, 0.004975077500023556)
('S', 6, 0.005587539692261305)
('T', 2, 0.006661704152493664)
('U', 11703, 0.00

# 1. Calulcate the Shannon Entropy of the text

In [5]:
def Shannon_entropy(character, frequency):
    entropy = -frequency*np.log2(frequency)
    return entropy

In [6]:
system_entropy = np.sum(Shannon_entropy(character=characters, frequency=prob_mass_func))
print(system_entropy)

4.57619774449843


# 2. The Conditional entropy of a character, given previous character
For this point we needed to reorder the array character and the array prob_mass_func in order to be ordered by the apperence in the `text.txt`.

In [7]:
sort = np.argsort(index)

characters = characters[sort]
index = index[sort]
prob_mass_func = prob_mass_func[sort]

df = pd.DataFrame({'index':index, 'character':characters, 'probability_mass_function':prob_mass_func }) 
df.head()

Unnamed: 0,index,character,probability_mass_function
0,0,A,0.010808
1,1,C,0.001932
2,2,T,0.006662
3,3,,0.224529
4,4,I,0.010195


In [8]:
df[df['index']==5].iloc[0]['character']

'\n'

In [9]:
len(lines)

106129

In [10]:
print(text)

['A' 'C' 'T' ... 'e' '.' '\n']


In [11]:
def conditional_prob_mass_function(character, given_character):
    joint_count = lines.count(given_character+character)
    normalizing_factor = 0
    for i in df['character']:
        normalizing_factor += lines.count(given_character+i)
    count_given_character = lines.count(given_character)
    count_character = lines.count(character) 
    return ((joint_count)/(normalizing_factor))

In [12]:
matrix_conditional_probability = np.zeros((len(df), len(df)))
for i, x in enumerate(df['character']):
    for j, y in enumerate(df['character'][i:]):
        matrix_conditional_probability[i, j] = conditional_prob_mass_function(character=x, given_character=y)

In [13]:
for i in range(len(matrix_conditional_probability[0])):
    print(df['character'][i], matrix_conditional_probability[:, i].sum())

A 1.2917829396514873
C 1.820448324205944
T 1.8597920518544262
  0.7453920982177406
I 2.464537884444724

 1.0141789490926527
S 0.8063596252676684
E 0.5520711260666493
N 1.1417741611270287
. 0.4502775366858061
O 0.7158445465603283
n 0.7038456415921371
a 0.5894323546165589
s 0.5431850395445471
h 2.2675132874118606
i 0.6968319298345791
p 1.3599349331897308
t 0.62707531028499
e 0.42646516670663703
: 0.804139751039085
m 0.9627596907131174
u 0.7315741397197085
o 1.60149991277307
f 0.5235501925169094
d 1.0609459023963324
r 0.7290466479497234
l 1.4356138963739582
g 0.3927881380924939
M 1.618833817259961
B 0.6460986143463155
w 1.615635514544012
! 0.1270474243375113
H 1.0875056898959659
, 1.6606748161664329
c 1.1597505986948573
? 1.4222670894915084
G 0.830981048018303
k 0.935059160546406
' 0.7124577776711019
y 0.447339270094546
v 0.5043204336272284
b 0.8274185723434889
x 0.5257039834671312
L 0.5984748010610079
F 0.3340038098102614
R 0.1200082217524078
D 0.8535296380123966
Z 1.1709457976739084
W 0

In [103]:
matrix_conditional_probability[matrix_conditional_probability==0] = 1 #useful to keep dimension and also not consider object with prob=0 in the entropy calculus
matrix_conditional_probability.shape
np.log2(matrix_conditional_probability).shape

(63, 63)

In [104]:
entropy_fixed_given_character = np.zeros(63)
for i in range(len(entropy_fixed_given_character)):
    entropy_fixed_given_character[i]=np.sum( -matrix_conditional_probability[:, i] * np.log2(matrix_conditional_probability[:, i]))
entropy_fixed_given_character

array([3.19055438, 2.90598677, 2.64951489, 4.49894038, 3.24322019,
       1.84203222, 3.28318686, 2.72077737, 3.59139209, 0.87995023,
       2.51819977, 3.73251213, 3.79935522, 3.58514335, 2.79807235,
       3.42101979, 3.60316325, 3.1171551 , 3.84814655, 0.85519289,
       3.37661993, 3.71792569, 3.92363585, 3.13046285, 3.05659985,
       3.86079022, 3.5968159 , 3.68938972, 2.58780135, 2.19492725,
       3.25692227, 1.24811686, 2.23919891, 0.93747114, 3.15866168,
       0.9673763 , 1.39142858, 2.96525784, 2.9886686 , 2.4296126 ,
       1.20346862, 3.0228925 , 3.05369799, 2.3292819 , 2.51041724,
       1.9273029 , 3.01133759, 0.        , 1.74523336, 2.22831092,
       1.10288427, 0.98129042, 3.77003453, 1.57463358, 0.        ,
       1.89199549, 1.66578981, 1.31061331, 1.72440782, 1.12192809,
       0.29747225, 1.75      , 0.        ])

In [105]:
conditional_entropy = (prob_mass_func * entropy_fixed_given_character).sum()
conditional_entropy

3.4879433653347416

# The Mutual information between a character and the previous one
I[X:Y] = H[X] - H[X|Y]

In [106]:
I = system_entropy - conditional_entropy
print(I)

1.088254379163688
