# Word frequency differences between two text corpora

You have two books (or other writings). Which words are overrepresented in book *A*, and which are overrepresented in book *B*? One way to find out is Dunning's log-likelihood test.

See: Dunning, T. Accurate Methods for the Statistics of Surprise and Coincidence. *Computational Linguistics* 19, 61–74 (1993).
  

In [86]:
import math
import urllib.request
from collections import Counter

from scipy.stats import chi2

def pg2txt(pgnum):
    """Given a Project Gutenberg number, download & return its text.

    The URL is built as ``https://gutenberg.org/files/<n>/<n>-0.txt`` and
    the downloaded bytes are decoded as UTF-8.

    :param pgnum: Project Gutenberg numeric identifier (converted with ``str``).
    :return: the decoded text of the file, as a ``str``.
    :raises urllib.error.URLError: propagated from ``urlopen`` if the request fails.
    :raises UnicodeDecodeError: if the downloaded bytes are not valid UTF-8.
    """
    s = str(pgnum)
    #      https://gutenberg.org/files/      161/161-0.txt
    url = f'https://gutenberg.org/files/{s}/{s}-0.txt'
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read, even if read() raises.
    with urllib.request.urlopen(url) as response:
        data = response.read()      # a `bytes` object
    return data.decode('utf-8')

# Project Gutenberg numeric identifiers of the two texts being compared.
SENSE_N = 161    # downloaded into SENSE
PRIDE_N = 1342   # downloaded into PRIDE

## Do the actual download

In [87]:
# Fetch the full raw text for SENSE_N (requires network access).
SENSE = pg2txt(SENSE_N)

In [88]:
# Fetch the full raw text for PRIDE_N (requires network access).
PRIDE = pg2txt(PRIDE_N)

## Cut out the Gutenberg preamble and end matter

In [89]:
# Character offsets used to slice the Project Gutenberg preamble and end
# matter off the downloaded texts.
# NOTE(review): the offsets are hard-coded for the files as downloaded;
# confirm them again if the upstream files change.
BEGIN = 1700          # both texts are cut at the same starting offset
END_SENSE = 684300    # offset in SENSE at which the kept text ends
# Number of trailing characters dropped from SENSE. The same number of
# characters is dropped from the end of PRIDE, i.e. both files are assumed
# to carry end matter of equal length.
endmatter = len(SENSE) - END_SENSE
END_PRIDE = len(PRIDE) - endmatter

pc = PRIDE[BEGIN:END_PRIDE]   # trimmed text of PRIDE
sc = SENSE[BEGIN:END_SENSE]   # trimmed text of SENSE

## Change certain punctuation to spaces

In [90]:
# Characters that are turned into spaces before the text is split into words.
PUNCT = "“”,.?\";:-_—!"  # don't remove apostrophe?  ' and ’
SPACES = ' ' * len(PUNCT)              # one replacement space per PUNCT character
table = str.maketrans(PUNCT, SPACES)   # maps every PUNCT character to ' '

# Replace punctuation, lowercase, then split on whitespace into word lists.
ps = pc.translate(table).lower().split()
ss = sc.translate(table).lower().split()

## Define stop words and filter them out

In [91]:
# Space-separated list of common English words to exclude from the comparison.
stopwords = "a able about across after all almost also am among an and any are as at be because been but by can cannot could dear did do does either else ever every for from get got had has have he her hers him his how however i if in into is it its just least let like likely may me might most must my neither no nor not of off often on only or other our own rather said say says she should since so some than that the their them then there these they this tis to too twas us wants was we were what when where which while who whom why will with would yet you your"
# https://www.textfixer.com/tutorials/common-english-words.txt
# Stop-word list actually used for filtering: the words above plus 'chapter'.
swl = stopwords.split() + ['chapter']

In [92]:
def filter_all(tlist, slist):
    """Return the items of *tlist* that do not occur in *slist*.

    The relative order of the surviving items is preserved and *tlist*
    itself is not modified.

    :param tlist: iterable of words (the text) to filter.
    :param slist: iterable of words to remove; its items must be hashable.
    :return: a new list holding every item of *tlist* not present in *slist*.
    """
    # Build the lookup set once so the text is scanned a single time,
    # instead of rebuilding the whole list once per stop word.
    excluded = set(slist)
    return [w for w in tlist if w not in excluded]

In [93]:
# Word lists with every stop word in swl removed.
pf = filter_all(ps, swl)
sf = filter_all(ss, swl)
# 3 sec for both

## Demo of filtered text

In [94]:
pf[19:29]

['truth',
 'universally',
 'acknowledged',
 'single',
 'man',
 'possession',
 'good',
 'fortune',
 'want',
 'wife']

## Count up words in each corpus (longest step)

In [95]:
def wlist2freqs(wlist):
    """Given a text (list of words), return dict of relative word frequencies.

    Keys are the distinct words; each value is the word's count divided by
    the total number of words, scaled to occurrences per million words.

    :param wlist: list of (hashable) words.
    :return: dict mapping word -> frequency per million words; empty for
        an empty input.
    """
    multiplier = 1000000
    denom = len(wlist)
    # Counter tallies every word in one pass; calling wlist.count() for each
    # vocabulary word would rescan the whole text once per distinct word.
    return {w: n / denom * multiplier for w, n in Counter(wlist).items()}

def wlist2counts(wlist):
    """Given a text (list of words), return dict where keys are words and values are word counts.

    :param wlist: list of (hashable) words.
    :return: plain ``dict`` mapping each distinct word to the number of
        times it occurs in *wlist*; empty for an empty input.
    """
    # One pass over the text instead of one wlist.count() scan per distinct
    # word. Converted back to a plain dict to keep the original return type.
    return dict(Counter(wlist))

In [96]:
# Per-million frequency variant, currently disabled:
# pd = wlist2freqs(pf)  # 6 sec
# sd = wlist2freqs(sf)
# NOTE: pc and sc are rebound here from the trimmed text strings to
# word -> count dicts, so the punctuation/tokenising cell cannot be re-run
# after this one without first re-running the trimming cell.
pc = wlist2counts(pf)
sc = wlist2counts(sf)

## Copy-pasted Dunning functions

In [97]:
# http://pioneer.chula.ac.th/~awirote/colloc/statmethod1.htm
# https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py

In [98]:
def dunning_total(counter1, counter2, *, min_count=10):
    """Runs dunn_individual_word on words shared by both counter objects.

    Positive means words in counter_1. Negative means words in counter_2.
    The larger the magnitude of the number, the more distinctive that word is in its
    respective counter object.

    Words that are missing from either mapping are skipped, as are words
    whose combined count across both mappings is below *min_count*.

    :param counter1: mapping of word -> count for the first corpus.
    :param counter2: mapping of word -> count for the second corpus.
    :param min_count: minimum combined count (counter1 + counter2) a word
        needs in order to be scored; defaults to 10.
    :return: dict mapping each scored word to its signed Dunning score
        (a float), in the iteration order of *counter1*.
    """

    # Corpus sizes: total number of counted words in each mapping.
    t1 = sum(counter1.values())
    t2 = sum(counter2.values())

    dunning_result = {}  # dictionary where results will be returned
    for word, c1 in counter1.items():
        # Explicit membership test: a Counter would return 0 for a missing
        # word instead of raising, so EAFP is not reliable here.
        if word not in counter2:
            continue
        c2 = counter2[word]
        if c1 + c2 < min_count:
            continue
        dunning_result[word] = dunn_individual_word(t1, t2, c1, c2)

    return dunning_result

In [99]:
def dunn_individual_word(t1, t2, c1, c2):
    """
    Applies dunning log likelihood to compare individual word in two counter objects.

    The magnitude is ``2 * (c1 * ln(p1 / p) + c2 * ln(p2 / p))`` where
    ``p1 = c1 / t1``, ``p2 = c2 / t2`` and ``p = (c1 + c2) / (t1 + t2)``.
    The result is positive when the word is relatively at least as frequent
    in corpus 1 as in the pooled corpora, and negative otherwise.

    A count of zero contributes nothing to the sum, so a word that is absent
    from one corpus (or from both) is scored instead of raising ValueError.

    The magnitude may be turned into a p-value with a chi-squared
    distribution with one degree of freedom, e.g.
    ``1 - chi2.cdf(abs(score), 1)``.

    :param t1: total number of words in corpus 1 (must be non-zero).
    :param t2: total number of words in corpus 2 (must be non-zero).
    :param c1: count of the word in corpus 1.
    :param c2: count of the word in corpus 2.
    :return: log likelihood
    :raises ZeroDivisionError: if *t1* or *t2* is zero.
    >>> total_words_m_corpus = 8648489
    >>> total_words_f_corpus = 8700765
    >>> wordcount_female = 1000
    >>> wordcount_male = 50
    >>> dunn_individual_word(total_words_m_corpus,total_words_f_corpus,wordcount_male,wordcount_female)
    -1047.8610274053995
    """

    pr_1 = c1 / t1
    pr_2 = c2 / t2
    pr_12 = (c1 + c2) / (t1 + t2)

    # x * log(x) tends to 0 as x tends to 0, so a zero count adds nothing.
    # Skipping the term also avoids math.log(0) and, when both counts are
    # zero, a division by pr_12 == 0.
    term_1 = c1 * math.log(pr_1 / pr_12) if c1 else 0.0
    term_2 = c2 * math.log(pr_2 / pr_12) if c2 else 0.0
    d = 2 * (term_1 + term_2)

    # For c1 > 0 this is the same test as term_1 >= 0; for c1 == 0 it
    # correctly attributes the word to corpus 2 (negative score).
    return d if pr_1 >= pr_12 else -d

## Do the actual work

In [100]:
# Score every word shared by both count dicts: pc is the first corpus
# (positive scores), sc the second (negative scores).
result = dunning_total(pc, sc)

## Limit to the very most significant, and display them sorted

In [101]:
THRESHOLD = 30   # a word is kept only if its score magnitude exceeds this

# Collect the words that clear the threshold, rounded to one decimal place.
filtered = {}
for word, d in result.items():
    if abs(d) > THRESHOLD:
        filtered[word] = round(d, 1)

# negative means Sense, positive means Pride

# Order the surviving words from the highest score down to the lowest.
filtered_sorted = {
    w: filtered[w] for w in sorted(filtered, key=filtered.get, reverse=True)
}
filtered_sorted

{'mr': 401.1,
 'aunt': 97.4,
 'father': 64.1,
 'ball': 41.5,
 'william': 41.4,
 'mary': 38.9,
 'uncle': 34.1,
 'mother': -33.8,
 'heart': -42.3,
 'mrs': -43.4,
 'colonel': -46.8,
 'body': -56.0,
 'thing': -100.9,
 'john': -188.0,
 'edward': -297.6}