In [1]:
# counter
from collections import Counter

# pandas
import pandas as pd

# POS
import spacy

# sorting dicts
import operator

# chi squared
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from scipy.stats import fisher_exact

In [2]:
nlp = spacy.load('en_core_web_sm', disable=['ner,parser'])
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1366f1d60>)

In [3]:
'''
Read in .tsv of tagged sample data as a Pandas data frame
Add appropriate header to the columns as well.
'''
def read_as_df(filename):
    
    df = pd.read_csv(filename, sep="\t", header = None)
    df.columns = ["Text_Loc", "Sample", "Rating", "Specificity","Adj", "Adv", "Noun", "Verb", "Adp"]
    return df

In [4]:
'''
Identifies frequencies for preposition usage across the samples.
Provides output in the form suitable for chi-square analysis.
'''
def prep_capture(samples, detail_rating):
    preps = set()
    details_counter = Counter()
    not_details_counter = Counter()
    total_words = 0
    detail_word_count = 0
    not_detail_word_count = 0
    for index, value in enumerate(samples):
        score = detail_rating[index]
        sample = nlp(samples[index])
        for word in sample:
            if word.pos_ != 'PUNC':
                total_words += 1
                if score >= 3.0:
                    detail_word_count += 1
                else:
                    not_detail_word_count += 1
            if word.pos_ == 'ADP':
                if score >= 3.0:
                    if word.text not in details_counter:
                        details_counter[word.text] = 1
                    else:
                        details_counter[word.text] += 1
                    
                else:
                    if word.text not in not_details_counter:
                        not_details_counter[word.text] = 1
                    else:
                        not_details_counter[word.text] += 1
    
    # first, remove anything that has an observed count of < 5 -- probably some weird typo.
    
    details_counter = {k: v for k, v in details_counter.items() if v >= 5}
    not_details_counter = {k: v for k, v in not_details_counter.items() if v >= 5}
    
    # now, double check that details_counter and not_details_counter both have same elements
    # if they aren't, add the missing entries and give them a count of 0
    
    for key in details_counter.keys():
        if key not in not_details_counter.keys():
            not_details_counter[key] = 0
    for key in not_details_counter.keys():
        if key not in details_counter.keys():
            details_counter[key] = 0
            
    # sort them, by their keys, so we can easily iterate + compare.
    sorted_detail_counts = dict(sorted(details_counter.items(), key=operator.itemgetter(0), reverse = False))
    sorted_not_detail_counts = dict(sorted(not_details_counter.items(), key=operator.itemgetter(0), reverse = False))

    return sorted_detail_counts, sorted_not_detail_counts, total_words, detail_word_count, not_detail_word_count


In [29]:
def chi_square_test(detail_counts, not_detail_counts, total_word_count, detail_word_count, not_detail_word_count):
    
    # write output to a file.
    data = []
    for word, count in detail_counts.items():
        # top left
        word_of_interest_detail_count = detail_counts[word]
        # bottom left
        word_of_interest_not_detail_count = not_detail_counts[word]
        
        # top right
        detail_leftover = detail_word_count - word_of_interest_detail_count
        # bottom right
        not_detail_leftover = not_detail_word_count - word_of_interest_not_detail_count
        
        # now, transform each of these "cells" into the frequencies expected by chi squared.
        table = [[word_of_interest_detail_count, detail_leftover],
                [word_of_interest_not_detail_count, not_detail_leftover]]
        
        # run chi square
        stat, p, dof, expected = chi2_contingency(table)
        
        fisher = False
        for row in expected:
            for i in row:
                if i < 5.0:
                    stat, p = fisher_exact(table)
                    fisher = True
                    
        reject = 'REJECT' if p < .05 else 'ACCEPT'
                
        data.append((word, word_of_interest_detail_count, word_of_interest_not_detail_count, stat, p, reject, fisher))
                
    df = pd.DataFrame(data, columns=['word', 'detail_count', 'not_detail_count', 'stat', 'p', 'reject', 'fisher'])
        
    df.to_csv('data/chi_squared_results.csv')
    print("Wrote to CSV!")
    
    return df
        # make sure expected for the table is over 5, or else re-run w/ fisher's exact test.

In [6]:
df = read_as_df("data/samples_data.tsv")

In [7]:
df.head()

Unnamed: 0,Text_Loc,Sample,Rating,Specificity,Adj,Adv,Noun,Verb,Adp
0,../Gutenberg/samples/heart_of_darkness_20960_2...,...lap. she wore a starched white affair on he...,4.5,4.565217,20,7,29,21,21
1,../Gutenberg/samples/adam_bede_601425_602225.txt,"...him, if he had known it, that the general a...",2.5,4.758621,16,10,36,24,20
2,../Gutenberg/samples/middlemarch_1718514_17193...,"...passionate exclamation, as if some torture ...",2.0,2.945455,9,8,21,35,20
3,../Gutenberg/samples/tom_jones_1740540_1741340...,"...so vicious a passion from your heart, and y...",1.5,4.823529,10,10,29,29,16
4,../Gutenberg/samples/the_jungle_691598_692398.txt,"...intensity, staring at the platform as if no...",2.0,3.407407,12,13,22,33,9


In [8]:
detail_preps, not_detail_preps, total_word_count, detail_word_count, not_detail_word_count = prep_capture(df['Sample'], df['Rating'])


In [9]:
# sanity check
not_detail_word_count + detail_word_count == total_word_count

True

In [30]:
chi_squared_data = chi_square_test(detail_preps, not_detail_preps, total_word_count, detail_word_count, not_detail_word_count)

Wrote to CSV!
