# Word frequency differences between two text corpora

You have two books (or other writings). Which words are overrepresented in book *A*, and which are overrepresented in *B*? One way to see is Dunning log likelihood.

See: Dunning, T. Accurate Methods for the Statistics of Surprise and Coincidence. *Computational Linguistics* 19, 61–74 (1993).
  

In [26]:
import math
from scipy.stats import chi2
from tqdm import tqdm

## Do the actual load

In [13]:
pc = ""
with open('exp_black_y.csv') as fh:
    for line in fh:
        pc += line

sc = ""
with open('exp_black_n.csv') as fh:
    for line in fh:
        sc += line

## Change certain punctuation to spaces

In [14]:
PUNCT = "“”,.?\";:-_—!"  # fixme - add slash
SPACES = ' ' * len(PUNCT)
table = str.maketrans(PUNCT, SPACES)

ps = pc.translate(table).lower().split()
ss = sc.translate(table).lower().split()

## Define stop words and filter them out

In [16]:
stopwords = "a able about across after all almost also am among an and any are as at be because been but by can cannot could dear did do does either else ever every for from get got had has have he her hers him his how however i if in into is it its just least let like likely may me might most must my neither no nor not of off often on only or other our own rather said say says she should since so some than that the their them then there these they this tis to too twas us wants was we were what when where which while who whom why will with would yet you your"
# https://www.textfixer.com/tutorials/common-english-words.txt
swl = stopwords.split() + ['chapter']

In [28]:
def filter_all(tlist, slist):
    for w in tqdm(slist):
        tlist = list(filter(lambda a: a != w, tlist))
    return tlist

In [29]:
pf = filter_all(ps, swl)
sf = filter_all(ss, swl)
# Takes about 1 min per 20 MB CSV.

100%|██████████| 120/120 [01:10<00:00,  1.70it/s]
  9%|▉         | 11/120 [00:06<01:08,  1.58it/s]


KeyboardInterrupt: 

## Demo of filtered text

In [20]:
pf[119:129]

['patient/famili',
 'answer',
 'question',
 'regard',
 'diagnosi',
 'plan',
 'care',
 'subjective',
 'chart',
 'reviewed']

## Count up words in each corpus (longest step)

In [24]:
def wlist2freqs(wlist):
    multiplier = 1000000
    denom = len(wlist)
    vocab = set(wlist)
    fdict = dict()
    for w in vocab:
        n = wlist.count(w)
        fdict[w] = n / denom * multiplier
    return(fdict)

def wlist2counts(wlist):
    """Given a text (list of words), return dict where keys are words and values are word counts."""
    vocab = set(wlist)
    fdict = dict()
    for w in tqdm(vocab):
        fdict[w] = wlist.count(w)
    return(fdict)

In [27]:
# pd = wlist2freqs(pf)  # 6 sec
# sd = wlist2freqs(sf)
pcount = wlist2counts(pf)

# Estimate 1 hour for just one corpus. Too long.

  1%|          | 509/49151 [00:40<1:03:57, 12.68it/s]


KeyboardInterrupt: 

In [None]:
scount = wlist2counts(sf)

## Copy-pasted Dunning functions

In [13]:
# http://pioneer.chula.ac.th/~awirote/colloc/statmethod1.htm
# https://github.com/dhmit/gender_novels/blob/master/gender_novels/analysis/dunning.py

In [14]:
def dunning_total(t1, t2):
    """Does Dunning log-likelihood on two tables of word counts.
    Positive means words in arg 1. Negative means words in arg 2.
    The larger the magnitude of the number, the more distinctive that word is in its
    respective counter object.
    Result is a dict that maps each word to its to results.
    Each result dict contains the dunning score.
    >>> results['he']
    -8.547243830635558
    :return: dict
    """

    n1 = sum(t1.values())
    n2 = sum(t2.values())
    
    dunning_result = {}  # dictionary where results will be returned
    for w in t1:
        if w not in t2:
            continue
        if t1[w] + t2[w] < 10:
            continue
        Pr_1 = t1[w] / n1
        Pr_2 = t2[w] / n2
        Pr_12 = (t1[w] + t2[w]) / (n1 + n2)

        d = 2 * (t1[w] * math.log(Pr_1 / Pr_12) + t2[w] * math.log(Pr_2 / Pr_12))

        if t1[w] * math.log(Pr_1 / Pr_12) < 0:
            d = -1 * d
        dunning_result[w] = d

    return dunning_result

## Do the actual work

In [15]:
result = dunning_total(pcount, scount)

## Limit to the very most significant, and display them sorted

In [16]:
THRESHOLD = 30
filtered = {}
for word in result:
    d = result[word]
    if d > THRESHOLD or d < (-1 * THRESHOLD):
        filtered[word] = round(d,1)

# negative means Sense, positive means Pride

filtered_sorted = dict(sorted(filtered.items(), key=lambda item: item[1], reverse=True))
filtered_sorted

{'mr': 401.1,
 'aunt': 97.4,
 'father': 64.1,
 'ball': 41.5,
 'william': 41.4,
 'mary': 38.9,
 'uncle': 34.1,
 'mother': -33.8,
 'heart': -42.3,
 'mrs': -43.4,
 'colonel': -46.8,
 'body': -56.0,
 'thing': -100.9,
 'john': -188.0,
 'edward': -297.6}