# Word frequency differences between two text corpora

You have two books (or other writings). Which words are overrepresented in book *A*, and which are overrepresented in *B*? One way to find out is Dunning's log-likelihood test.

See: Dunning, T. Accurate Methods for the Statistics of Surprise and Coincidence. *Computational Linguistics* 19, 61–74 (1993).
  

In [1]:
import urllib.request
import math
from scipy.stats import chi2

def pg2txt(pgnum):
    """Download a Project Gutenberg e-text and return it as a string.

    Args:
        pgnum: Project Gutenberg numeric identifier (anything ``str()``
            turns into the number used in the URL, e.g. ``161``).

    Returns:
        The complete contents of ``<pgnum>-0.txt`` decoded as UTF-8. Nothing
        is stripped here, so the result still includes whatever preamble and
        end matter the file carries.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the payload is not valid UTF-8.
    """
    s = str(pgnum)
    # e.g. https://gutenberg.org/files/161/161-0.txt
    url = 'https://gutenberg.org/files/' + s + '/' + s + '-0.txt'
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as response:
        data = response.read()      # a `bytes` object
    return data.decode('utf-8')

# Project Gutenberg numeric identifiers; pg2txt() turns each into a download URL.
SENSE_N = 161    # the "Sense" corpus (presumably Austen's Sense and Sensibility)
PRIDE_N = 1342   # the "Pride" corpus (presumably Austen's Pride and Prejudice)

## Do the actual download

In [4]:
# Fetch the raw "Sense" text over the network (Gutenberg preamble and end matter included).
SENSE = pg2txt(SENSE_N)

In [5]:
# Fetch the raw "Pride" text over the network (Gutenberg preamble and end matter included).
PRIDE = pg2txt(PRIDE_N)

## Cut out the Gutenberg preamble and end matter

In [46]:
# Hand-picked character offsets that bracket the body of each book.
BEGIN = 1700          # characters skipped at the start of both texts
END_SENSE = 684300    # offset in SENSE at which the end matter starts
# The end matter is assumed to be the same length in both files, so the cut
# point for PRIDE is measured back from its end by the amount found in SENSE.
endmatter = len(SENSE) - END_SENSE
END_PRIDE = len(PRIDE) - endmatter

# Body text only: everything before BEGIN and after the END_* offset is dropped.
pc = PRIDE[BEGIN:END_PRIDE]
sc = SENSE[BEGIN:END_SENSE]

## Change certain punctuation to spaces

In [109]:
# Punctuation characters that get replaced by spaces before tokenizing.
# The apostrophes ' and ’ are not in the list, so they stay inside words
# (whether they should be removed is an open question from the author).
PUNCT = "“”,.?\";:-_—!"
SPACES = ' ' * len(PUNCT)  # one replacement space per punctuation character
table = str.maketrans(PUNCT, SPACES)  # maps each PUNCT character to a space

# Blank out punctuation, lowercase, then split on whitespace into word lists.
ps = pc.translate(table).lower().split()
ss = sc.translate(table).lower().split()

## Define stop words and filter them out

In [110]:
# Space-separated common English words to exclude from the comparison.
stopwords = "a able about across after all almost also am among an and any are as at be because been but by can cannot could dear did do does either else ever every for from get got had has have he her hers him his how however i if in into is it its just least let like likely may me might most must my neither no nor not of off often on only or other our own rather said say says she should since so some than that the their them then there these they this tis to too twas us wants was we were what when where which while who whom why will with would yet you your"
# Source of the list: https://www.textfixer.com/tutorials/common-english-words.txt
# Stop-word list as a Python list, with 'chapter' added so it is filtered out too.
swl = stopwords.split() + ['chapter']

In [111]:
def filter_all(tlist, slist):
    """Return the items of ``tlist`` that do not occur in ``slist``.

    Args:
        tlist: iterable of tokens (hashable, e.g. strings) to filter.
        slist: iterable of tokens to remove (hashable, e.g. stop words).

    Returns:
        A new list holding the surviving tokens in their original order;
        duplicates among the survivors are preserved.
    """
    # One pass with O(1) set lookups, instead of rebuilding the whole token
    # list once per stop word.
    blocked = set(slist)
    return [w for w in tlist if w not in blocked]

In [None]:
# Word lists for each book with the stop words (and 'chapter') removed.
pf = filter_all(ps, swl)
sf = filter_all(ss, swl)
# Author's timing note: about 3 sec for both

## Demo

In [113]:
# Show ten consecutive filtered tokens from the "Pride" list as a sanity check.
pf[19:29]

['truth',
 'universally',
 'acknowledged',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'want',
 'wife']

In [132]:
def wlist2freqs(wlist):
    """Map each distinct word in ``wlist`` to its frequency per million tokens.

    Args:
        wlist: list of tokens (hashable, e.g. strings).

    Returns:
        dict of word -> ``count / len(wlist) * 1000000`` as a float.
        An empty list yields an empty dict (no division takes place).
    """
    from collections import Counter  # local import keeps the cell self-contained

    multiplier = 1000000
    denom = len(wlist)
    # Counter tallies every word in a single pass; calling list.count() for
    # each vocabulary word would rescan the whole list every time.
    return {w: n / denom * multiplier for w, n in Counter(wlist).items()}

def wlist2counts(wlist):
    """Map each distinct word in ``wlist`` to the number of times it occurs.

    Args:
        wlist: iterable of tokens (hashable, e.g. strings).

    Returns:
        A plain dict of word -> integer count; empty input gives ``{}``.
    """
    from collections import Counter  # local import keeps the cell self-contained

    # Single pass over the tokens rather than one list.count() scan per word.
    return dict(Counter(wlist))

In [146]:
# Per-million relative frequencies for each book.
pd = wlist2freqs(pf)  # 6 sec
sd = wlist2freqs(sf)
# Raw counts for each book. NOTE: this rebinds `pc` and `sc`, which earlier in
# the notebook held the trimmed text of each book; from here on they are dicts.
pc = wlist2counts(pf)
sc = wlist2counts(sf)

## Copy-pasted Dunning functions

In [147]:
# http://pioneer.chula.ac.th/~awirote/colloc/statmethod1.htm
# https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py

In [134]:
def dunning_total(counter1, counter2, filename_to_pickle=None, min_count=10):
    '''
    Run dunn_individual_word on every word shared by both counter objects.

    (+) end of the spectrum is words distinctive of counter1
    (-) end of the spectrum is words distinctive of counter2
    The larger the magnitude of the score, the more distinctive that word is
    in its respective counter object.

    Words that occur in only one of the counters are skipped, and so are
    words whose combined count is below ``min_count``.

    :param counter1: mapping of word -> count for corpus 1 (dict or Counter)
    :param counter2: mapping of word -> count for corpus 2 (dict or Counter)
    :param filename_to_pickle: if given, the result is handed to
        ``store_pickle`` so it only has to be calculated once.
        NOTE(review): ``store_pickle`` is not defined in the visible part of
        this file -- confirm it is available before using this option.
    :param min_count: minimum combined count a word needs in order to be
        scored (default 10)
    :return: dict mapping each scored word to a dict with the keys
        'dunning', 'count_total', 'count_corp1', 'count_corp2',
        'freq_total', 'freq_corp1' and 'freq_corp2'

    >>> from collections import Counter
    >>> female_counter = Counter({'he': 1,  'she': 10, 'and': 10})
    >>> male_counter =   Counter({'he': 10, 'she': 1,  'and': 10})
    >>> results = dunning_total(female_counter, male_counter)
    >>> # Results is a dict that maps from terms to results
    >>> # Each result dict contains the dunning score...
    >>> results['he']['dunning']
    -8.547243830635558
    >>> # ... counts for corpora 1 and 2 as well as total count
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)
    >>> # ... and the same for frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)
    '''

    # Corpus sizes: the sum of all word counts in each counter.
    total_words_counter1 = sum(counter1.values())
    total_words_counter2 = sum(counter2.values())
    total_words = total_words_counter1 + total_words_counter2

    # dictionary where results will be returned
    dunning_result = {}
    for word, counter1_wordcount in counter1.items():
        if word not in counter2:
            continue
        counter2_wordcount = counter2[word]
        count_total = counter1_wordcount + counter2_wordcount

        # Rare words give unstable scores, so they are left out.
        if count_total < min_count:
            continue

        dunning_word = dunn_individual_word(total_words_counter1, total_words_counter2,
                                            counter1_wordcount, counter2_wordcount)

        dunning_result[word] = {
            'dunning': dunning_word,
            'count_total': count_total,
            'count_corp1': counter1_wordcount,
            'count_corp2': counter2_wordcount,
            'freq_total': count_total / total_words,
            'freq_corp1': counter1_wordcount / total_words_counter1,
            'freq_corp2': counter2_wordcount / total_words_counter2
        }

    if filename_to_pickle:
        store_pickle(dunning_result, filename_to_pickle)

    return dunning_result

In [136]:
def dunn_individual_word(total_words_in_corpus_1, total_words_in_corpus_2,
                         count_of_word_in_corpus_1,
                         count_of_word_in_corpus_2):
    '''
    Signed Dunning log likelihood for one word compared across two corpora.

    The magnitude is 2 * (a*ln(a/e1) + b*ln(b/e2)), where a and b are the
    observed counts and e1, e2 the counts expected if the word were equally
    frequent in both corpora. The sign is negative when the word is
    under-represented in corpus 1 and positive otherwise.

    A count of zero contributes nothing to the sum (x*ln(x) tends to 0 as x
    tends to 0), so a word missing from one corpus still gets a finite score.

    :param total_words_in_corpus_1: total number of words in corpus 1
    :param total_words_in_corpus_2: total number of words in corpus 2
    :param count_of_word_in_corpus_1: occurrences of the word in corpus 1
    :param count_of_word_in_corpus_2: occurrences of the word in corpus 2
    :return: the signed log likelihood as a float (no p value is returned)
    :raises ZeroDivisionError: if both corpus totals are zero

    >>> total_words_m_corpus = 8648489
    >>> total_words_f_corpus = 8700765
    >>> wordcount_female = 1000
    >>> wordcount_male = 50
    >>> dunn_individual_word(total_words_m_corpus,total_words_f_corpus,wordcount_male,wordcount_female)
    -1047.8610274053995
    '''
    a = count_of_word_in_corpus_1
    b = count_of_word_in_corpus_2
    c = total_words_in_corpus_1
    d = total_words_in_corpus_2

    # Expected counts under the hypothesis that both corpora use the word
    # at the same rate.
    e1 = c * (a + b) / (c + d)
    e2 = d * (a + b) / (c + d)

    # Skip the logarithm for a zero count: math.log(0) would raise, while the
    # limit of x*ln(x/e) at x = 0 is simply 0.
    term1 = a * math.log(a / e1) if a != 0 else 0.0
    term2 = b * math.log(b / e2) if b != 0 else 0.0

    dunning_log_likelihood = 2 * (term1 + term2)

    # Negative sign marks a word that is under-represented in corpus 1. A word
    # absent from corpus 1 but present in corpus 2 has term1 == 0, so it needs
    # its own check.
    if term1 < 0 or (a == 0 and b > 0):
        dunning_log_likelihood = -dunning_log_likelihood

    return dunning_log_likelihood

## Do the actual work

In [181]:
THRESHOLD = 30
result = dunning_total(pc, sc)

# Keep only the strongly distinctive words (score magnitude above THRESHOLD),
# storing each score rounded to one decimal place.
filt = {}
for word, wdict in result.items():
    d = wdict['dunning']
    if abs(d) > THRESHOLD:
        filt[word] = round(d, 1)

# Sign convention: negative means Sense, positive means Pride.

# Order the survivors from the most Pride-leaning down to the most Sense-leaning.
final = dict(sorted(filt.items(), key=lambda pair: pair[1], reverse=True))
final

{'mr': 401.5,
 'aunt': 96.1,
 'father': 62.2,
 'william': 41.5,
 'ball': 41.5,
 'mary': 38.9,
 'uncle': 34.3,
 'mother': -33.2,
 'heart': -40.6,
 'mrs': -43.3,
 'colonel': -46.8,
 'body': -53.3,
 'thing': -94.9,
 'john': -187.9,
 'edward': -283.5}