#### Entropy

In [15]:
import requests
import time

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import re
import glob
import random
import seaborn as sns
import string

from IPython.display import clear_output

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.wsd import lesk

In [1]:
# calculate the information for a coin flip
from math import log2
# probability of the event
p = 0.5
# calculate information for event
h = -log2(p)
# print the result
print('p(x)=%.3f, information: %.3f bits' % (p, h))

p(x)=0.500, information: 1.000 bits


##### Unfair
$ P(x) = P(Heads) = P(0.1) $

In [2]:
# probability of the event
p = 0.1
# calculate information for event
h = -log2(p)
# print the result
print('p(x)=%.3f, information: %.3f bits' % (p, h))

p(x)=0.100, information: 3.322 bits


### Entropy of a Sequence of Random Variables

#### Eqn (1): Entropy of a Sequence (e.g., sentence)

$  H(X_1, X_2...Xw_n) = -\sum_{i=(X_1...X_n)}^{n}P(X_1,X_2...X_n)log_2 P(X_1,X_2...X_n)$

Note: This is derived from : <p>
      $H(X) = -\prod_{i=(X_1...X_n)}^{n}[P(X_i)] = $ <p>
      $log_2(H(X)) = log_2(-\prod_{i=(X_1...X_n)}^{n}[P(X_i)]) = $ <p>
     $log_2(H(X)) = log_2(-\prod_{i=(X_1...X_n)}^{n}[P(X_i)]) = $
     $-\sum_{i=(X_1...X_n)}^{n}P(X_1,X_2...X_n)log_2 P(X_1,X_2...X_n)$
     
    
##### Example: Entropy of the outcome of roll of a 6 sided die
    - Fair P(x) = P(Any Side) P(1/6) 
    - Unfair P(x) = P(Any Side) = P(1/6)

In [5]:
# the number of events
n = 6
# probability of one event
p = 1.0 /n
# calculate entropy
entropy = -sum([p * log2(p) for _ in range(n)])
# print the result
print('p(x)=%.3f, information: %.3f bits' % (p, h))

p(x)=0.167, information: 2.585 bits


####  Eqn (2): Entropy of a Sentence

The entropy of a sentence can be modeled as the entropy of a sequence.

$  H(S) =  -\lim_{n\to\infty} f(w)  \sum_{i=(w_1...w_n)}^{n}P(w_1,w_2...w_n)log_2 P(w_1,w_2...w_n)$

For large n (elements in the sequence), the entropy is scaled (divided) by n. This scaling factor

$  H(S) =  -\lim_{n\to\infty} \frac{1}{n} f(w)  \sum_{i=(w_1...w_n)}^{n}P(w_1,w_2...w_n)log_2 P(w_1,w_2...w_n)$

This extends to the entropy of the corpus, as:

$  H(C) =  -\lim_{i\to\infty} \frac{1}{N} f(w)  \sum_{i=(s_1...s_n)}^{n}P(s_1,s_2...s_n)log_2 P(s_1,s_2...s_n)$



#### Sequence Entropy, brown corpus

In [11]:
from nltk.corpus import brown
import re
import numpy as np
import pandas as pd

brown.words()[:100]

text = []
for word in brown.words():
    text.append(word)
    
text_list = " ".join(text)

sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text_list)
sentences = np.array(sentences)
print(sentences.shape, "Sentence Shape")

sentences[0]

(61870,) Sentence Shape


"The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place"

In [16]:
import time
import string 

train_data_sent = pd.DataFrame(columns = ['sent_id', 'brown_sentence'] )

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

start_time = time.time()

for sent_id in range(0,len(sentences) - 1):
    #print(sent_id)

    sentence = sentences[sent_id]
    sentence = remove_punctuations(sentence)
    # Form a row in a dataframe for this setence that captures the words and keeps ids and polarity scores
    # We must pass an arbitrary index which we then reset to show unique numbers
    sentence_row = pd.DataFrame({
                                 #'Complaint ID':data_row['Complaint ID'],
                                 #'pol':data_row['pol'],
                                 'sent_id':sent_id,
                                 'brown_sentence':sentence}, index = [sent_id]) 
    # Form full table that has rows for all sentences
    train_data_sent = train_data_sent.append(sentence_row)
# Outputs progress of main loop, see:
    clear_output(wait=True)
    print('Proportion of comments completed:', np.round(sent_id/len(sentences),4)*100,'%')

end_time = time.time()
print('Total run time = ', np.round(end_time-start_time,2)/60, ' minutes')
# Reset index so that each index value is a unique number

train_data_sent = pd.DataFrame(train_data_sent)
train_data_sent = train_data_sent.reset_index(drop=True)

Proportion of comments completed: 100.0 %
Total run time =  5.5735  minutes


In [19]:
train_data_sent["wc"] = train_data_sent["brown_sentence"].str.count(' ') + 1 
train_data_sent = train_data_sent[train_data_sent['wc'] > 5]
sentences = np.array(train_data_sent["brown_sentence"])
sentences.shape

sentences = pd.DataFrame(sentences, columns = ['text'])
sentences["text"] = sentences["text"].str.lower()

sentences_cleaned = []
for i in range(0, len(sentences)):
    sentences_cleaned.append(re.sub('\s+', ' ', sentences["text"][i]).strip())

#### Take the first sentence: $ S_{1} $

In [21]:
sentences_cleaned[0]

'the fulton county grand jury said friday an investigation of atlantas recent primary election produced no evidence that any irregularities took place'

In [25]:
sentence

['the fulton county grand jury said friday an investigation of atlantas recent primary election produced no evidence that any irregularities took place']

In [26]:
sentence = sentences_cleaned[0:1]
sentence_tokenized = str(sentence[0]).split(' ')

#make set with all unrepeatable symbols from string
sent_dct = dict.fromkeys(list(sentence_tokenized))
sent_dct

{'the': None,
 'fulton': None,
 'county': None,
 'grand': None,
 'jury': None,
 'said': None,
 'friday': None,
 'an': None,
 'investigation': None,
 'of': None,
 'atlantas': None,
 'recent': None,
 'primary': None,
 'election': None,
 'produced': None,
 'no': None,
 'evidence': None,
 'that': None,
 'any': None,
 'irregularities': None,
 'took': None,
 'place': None}

In [45]:
import string
base = 2

sentence = sentences_cleaned[0:1]
sentence_tokenized = str(sentence[0]).split(' ')

#make set with all unrepeatable symbols from string
sent_dct = dict.fromkeys(list(sentence_tokenized))

print(len(sentence_tokenized) , "Sentence Length")
#calculate frequencies
pkvec =  [float(sentence_tokenized.count(w)) / len(sentence_tokenized) for w in sent_dct]

#calculate Entropy
H = -sum([pk  * math.log(pk) / math.log(base) for pk in pkvec ])

#Normalize for sentence length

N =  len(sentence_tokenized)

H_r = (H/len(sentence_tokenized))

print(H, "Sentence Entropy H(S)")
print(H_r, "Sentence Normalized Entropy (per word rate) H_r(S)")

22 Sentence Length
4.459431618637295 Sentence Entropy H(S)
0.20270143721078612 Sentence Normalized Entropy (per word rate) H_r(S)


#### Corpus Entropy : $ H(C) $

In [47]:
#Assume for simplicity a 5 sentence corpus

C = sentences_cleaned[0:5]

In [77]:
set(sentence_tokenized)

{'an',
 'any',
 'atlantas',
 'county',
 'election',
 'evidence',
 'friday',
 'fulton',
 'grand',
 'investigation',
 'irregularities',
 'jury',
 'no',
 'of',
 'place',
 'primary',
 'produced',
 'recent',
 'said',
 'that',
 'the',
 'took'}

In [72]:

def big_S(corpus):
    S = []
    for sentence in corpus:
        sentence_tokenized = str(sentence).split(' ')
        
        print(sentence_tokenized)
        
        sent_dct = dict.fromkeys(list(sentence_tokenized))
        
        pkvec =  [float(sentence_tokenized.count(w)) / len(sentence_tokenized) for w in sent_dct]
 
        H = -sum([pk  * math.log(pk) / math.log(base) for pk in pkvec ])
    
        print(H)
        S.append(H)
    HC = sum(S)
    return HC

HC = big_S(C)


['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlantas', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']
4.459431618637295
['the', 'jury', 'further', 'said', 'in', 'termend', 'presentments', 'that', 'the', 'city', 'executive', 'committee', 'which', 'had', 'overall', 'charge', 'of', 'the', 'election', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted']
4.395128251428674
['the', 'septemberoctober', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'to', 'investigate', 'reports', 'of', 'possible', 'irregularities', 'in', 'the', 'hardfought', 'primary', 'which', 'was', 'won', 'by', 'mayornominate', 'ivan', 'allen', 'jr']
4.875
['only', 'a', 'relative', 'handful', 'of', 'such', 'reports', 'was', 'received', 'the', 'ju

22.320326124575576

In [78]:
#Assume for simplicity a 5 sentence corpus

C = sentences_cleaned[0:1000]

In [79]:
import string
base = 2

sentence = sentences_cleaned[0:1]
sentence_tokenized = str(sentence[0]).split(' ')

#make set with all unrepeatable symbols from string


def big_S(corpus):
    S = []
    for sentence in corpus:
        sentence_tokenized = str(sentence).split(' ')
        
        print(sentence_tokenized)
        
        sent_dct = dict.fromkeys(list(sentence_tokenized))
        
        #pkvec =  [float(sentence_tokenized.count(w)) / len(sentence_tokenized) for w in sent_dct]
 
        H = -sum([pk  * math.log(pk) / math.log(base) for pk in pkvec ])
    
        print(H)
        S.append(H)
    HC = sum(S)
    return HC

HC = big_S(C)

def big_N(corpus):
    N = []
    for sentence in corpus:
        print(sentence)
        sentence_tokenized = str(sentence).split(' ')
        unique_sentence_elements = set(sentence_tokenized)
        n =  len(unique_sentence_elements)
        N.append(n)
    N_sum = sum(N)
    return N_sum

N = big_N(C)

H_r = HC/N

print(HC, "Corpus Entropy H(C)")
print(H_r, "Corpus Entropy H(S)")

['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlantas', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']
4.459431618637295
['the', 'jury', 'further', 'said', 'in', 'termend', 'presentments', 'that', 'the', 'city', 'executive', 'committee', 'which', 'had', 'overall', 'charge', 'of', 'the', 'election', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted']
4.395128251428674
['the', 'septemberoctober', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'to', 'investigate', 'reports', 'of', 'possible', 'irregularities', 'in', 'the', 'hardfought', 'primary', 'which', 'was', 'won', 'by', 'mayornominate', 'ivan', 'allen', 'jr']
4.875
['only', 'a', 'relative', 'handful', 'of', 'such', 'reports', 'was', 'received', 'the', 'ju

filed for reorganization under the federal bankruptcy law
on monday the hughes concern was formally declared bankrupt after its directors indicated they could not draw up a plan for reorganization
business relations between the companies and city have been under investigation by hemphill and district attorney james c
intervenes in case the suit was filed later in the day in common pleas court 7 against the hughes company and two bonding firms
at bergers direction the city also intervened in the hughes bankruptcy case in u
district court in a move preliminary to filing a claim there
i am taking the position that the contract was clearly violated berger said
the contract violations mostly involve failure to perform rehabilitation work on expansion joints along the el track
the contract called for overhauling of 102 joints
the city paid for work on 75 of which no more than 21 were repaired hemphill charged
wide range in bids hemphill said the hughes concern contracted to do the repairs at

In [86]:
HC

3955.498145896969

In [87]:
2**(0.22074324158139233)

1.1653337836236108