In [103]:
# read files
import sys
# reversing a dict
import operator
# math work
import math 
# for counts
from collections import Counter
# tokenizing 
import nltk
from nltk.tokenize import RegexpTokenizer
# pandas
import pandas as pd

In [12]:
'''
Reads a new-line delimited file representing a "corpus" for a given category, group etc.
Tokenizes to lowercase, strip spaces and remove non-alphanumeric tokens w/ `nltk`.
This tokenization strategy makes sense because these aren't the people-facing samples, these are for the comp. pipeline

Args:
    filename: location of file

Return:
    tokens: a list of the tokens found in the file
'''

tokenizer = RegexpTokenizer(r'\w+')
def read_and_tokenize(filename):
    
    with open(filename, encoding="utf-8") as file:
        tokens=[]
        # lowercase
        for line in file:
            # remove spaces, lowercase
            data=line.rstrip().lower()
            # tokenize, getting rid of any non alphanumeric chars
            tokens.extend(tokenizer.tokenize(data))
        return tokens

In [19]:
'''
Provides a dictionary with frequency counts for an inputted `tokens` list.
Via the `Counter` api.

Args:
    tokens: tokens from a corpus

Return:
    counts: associative mapping between a word and its frequency
'''
def get_counts(tokens):
    counts=Counter()
    for token in tokens:
        counts[token]+=1
    return counts

In [148]:
'''
Similar to `get_counts`, but tallies a frequency score for each word used in a file.
In this case, I'm going to hand off one big text file that contains every work in the corpus.

Args:
    filename: location of corpus file

Return:
    freqs: mapping between each word and its relative frequency
'''
def read_priors(filename):
    counts=Counter()
    freqs={}
    tokens=read_and_tokenize(filename)
    total=len(tokens)

    for token in tokens:
        counts[token]+=1

    for word in counts:
        freqs[word]=counts[word]/total

    return freqs

In [86]:
'''
Similar to `get_counts`, but tallies a frequency score for each word in a list

Args:
    token_list: a list of words

Return:
    counts: mapping between each word and its frequency
'''
def count_combined_list(token_list):
    counts=Counter()

    for token in token_list:
        counts[token]+=1

    return counts

In [30]:
'''
Math helper for log-odds ratio 
Args:
    y_i_w: count of a word in i
    y_j_w: count of a word in j
    alpha_w: the bias term
    alpha_not: the derived alpha_not ("informative" prior)
    n_i: number of words in i
    n_j: number of words in j
    debug: boolean flag - setting to True will output information about the calculation
    
Return: 
    result: the computed difference 
'''
def math_helper(y_i_w, y_j_w, alpha_w, alpha_not, n_i, n_j, debug):
    
    ## first, log-odds
    y_i_log = math.log2((y_i_w + alpha_w) / (n_i + alpha_not - y_i_w - alpha_w))
    
    y_j_log = math.log2((y_j_w + alpha_w) / (n_j + alpha_not - y_j_w - alpha_w))
    
    log_odds = y_i_log - y_j_log
    
    ## next, variance
    y_i_variance = 1 / (y_i_w + alpha_w)
    y_j_variance = 1 / (y_j_w + alpha_w)
    
    variance = y_i_variance + y_j_variance
    
    result = log_odds / (math.sqrt(variance))
    
    if (debug == True):
        print("alpha_w = {}, log_odds={}, variance={}, result={}".format(alpha_w, log_odds, variance, result))
        
    return result

Logg odds, $\hat\zeta_w^{(i-j)}$ for word $w$ reflecting the difference in usage between corpus $i$ and corpus $j$, is given by the following equation:

$$
\hat\zeta_w^{(i-j)}= {\hat{d}_w^{(i-j)} \over \sqrt{\sigma^2\left(\hat{d}_w^{(i-j)}\right)}}
$$

Where: 

$$
\hat{d}_w^{(i-j)} = \log \left({y_w^i + \alpha_w} \over {n^i + \alpha_0 - y_w^i - \alpha_w}) \right) -  \log \left({y_w^j + \alpha_w} \over {n^j + \alpha_0 - y_w^j - \alpha_w}) \right)
$$

$$
\sigma^2\left(\hat{d}_w^{(i-j)}\right) \approx {1 \over {y_w^i + \alpha_w}} + {1 \over {y_w^j + \alpha_w} }
$$

And:

* $y_w^i = $ count of word $w$ in corpus $i$ (likewise for $j$)
* $\alpha_w$ = 0.01
* $V$ = size of vocabulary (number of distinct word types)
* $\alpha_0 = V * \alpha_w$
* $n^i = $ number of words in corpus $i$ (likewise for $j$)

In [165]:
'''
Implements the log-odds ratio with an informative Dirichlet prior 
(Described in Monroe et al. 2009, Fighting Words) 
Args:
    one_tokens: tokens from first label
    one_counts: frequencies from first label
    two_tokens: tokens from second label
    two_counts: frequencies from second label
    priors: relative frequencies from "joint" corpus - ("informative")
    
Return: 
    result: log_odds_dict, a mapping between a word and its log-odds
        the more positive a value, the more it is aligned with the "detail" group

'''
def logodds_with_informative_prior(one_tokens, one_counts, two_tokens, two_counts, priors):
     
    ## create a list combining the two corpora
    ## this is what we will iterate through
    combined_tokens = list(set(one_tokens + two_tokens))
    
    ## alpha_not = size of vocab * .01
    alpha_not = len(combined_tokens) * .01
    
    ## next, calculate corpus sizes
    detail_n = len(one_tokens)
    not_detail_n = len(two_tokens)
    
    ## with the detail token in the first log-chunk, the more positive a word is, the more aligned
    ## it is with the detail corpus
    ## w/ 0 as the marking line
    ## so, once calculation is complete, sort the list and pull out the 25 largest and 25 smallest vals
    
    log_odds_dict = {}
    for word in combined_tokens:
        ## skip words fragments that may appear at beginning/end of samples        
        if word not in priors:
            continue
        
        ## count of word in detail corpus
        detail_y_w = one_counts[word]
        
        ## count of word in not_detail corpus
        not_detail_y_w = two_counts[word]
        
        ## my prior -- tamp down very common words like "the"
        prior_w = priors[word]
        
        ## calculate alpha_w on the fly, now
        alpha_w = alpha_not * prior_w
        
        ## carry out log odds calculation
        result = math_helper(detail_y_w, not_detail_y_w, alpha_w, alpha_not, detail_n, not_detail_n, False)
        
        ## commit to results dict
        log_odds_dict[word] = result
        
    return log_odds_dict  

In [166]:
detail_tokens=read_and_tokenize("detail_samples.txt")
not_detail_tokens=read_and_tokenize("not_detail_samples.txt")

detail_counts = get_counts(detail_tokens)
not_detail_counts = get_counts(not_detail_tokens)

priors = read_priors("../Gutenberg/merged.txt")

log_odds_dict_with_informative_prior = logodds_with_informative_prior(detail_tokens, detail_counts, not_detail_tokens, not_detail_counts, priors)



In [167]:
sorted_log_odds_dict_with_informative_prior = sorted(log_odds_dict_with_informative_prior.items(), key=operator.itemgetter(1), reverse = True)


In [168]:
print("25 most detail words ----")
print([tuple for tuple in sorted_log_odds_dict_with_informative_prior[:20]])
print("25 most not_detail words ----")
print([tuple for tuple in sorted_log_odds_dict_with_informative_prior[-20:]])

25 most detail words ----
[('the', 15.047004638994647), ('of', 6.382370326356031), ('with', 6.113952460694612), ('house', 5.554706651189872), ('and', 5.1069088753192435), ('up', 5.016458242350711), ('three', 4.863483962709612), ('a', 4.543889821797246), ('had', 4.469231328708773), ('two', 4.430870927364034), ('room', 4.089514752973798), ('were', 3.9870066804149067), ('black', 3.97569210497496), ('along', 3.83313004443763), ('in', 3.805744500913115), ('through', 3.710348829252746), ('place', 3.6879761380938536), ('behind', 3.679455671321996), ('light', 3.637921579207515), ('gatsby', 3.5692903011290107)]
25 most not_detail words ----
[('if', -5.341157067146428), ('what', -5.4478787757761475), ('may', -5.5282946033548805), ('don', -5.646696288686094), ('am', -5.718516357068141), ('tell', -5.782383687979141), ('that', -6.028068069800341), ('not', -6.202648307398496), ('are', -6.272452265447527), ('your', -6.552078491784141), ('should', -6.80156680007829), ('do', -6.891941558401004), ('know

In [89]:
# Graph the top 20 per category.
combined_tokens = detail_tokens + not_detail_tokens
combined_counts = count_combined_list(combined_tokens)


53438


13

In [169]:
trip = []
for word, score in log_odds_dict_with_informative_prior.items():
    container = []
    container.append(word)
    container.append(score)
    container.append(combined_counts[word])
    trip.append(container)

In [170]:
df = pd.DataFrame(trip)

In [171]:
df.rename(columns={0: 'word', 1: 'log_odds', 2: 'frequency'}, inplace=True)

In [172]:
df = df.sort_values('log_odds', ascending=False)

In [173]:
df.head(3)

Unnamed: 0,word,log_odds,frequency
3168,the,15.047005,2941
3181,of,6.38237,1588
5061,with,6.113952,453


In [175]:
# write out as csv
df.to_csv("log_odds_data.csv")