In [56]:
# colors
from termcolor import colored, cprint
from numpy import inf

# pandas
import pandas as pd

import spacy

In [2]:
nlp = spacy.load('en_core_web_sm', disable=['ner,parser'])
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x13ae8e8e0>)

In [3]:
'''
Read in .tsv of tagged sample data as a Pandas data frame
Add appropriate header to the columns as well.
'''
def read_as_df(filename):
    df = pd.read_csv(filename, sep="\t", header = None)
    df.columns = ["Text_Loc", "Sample", "Rating", "Specificity","Adj", "Adv", "Noun", "Verb", "Adp"]
    return df

In [4]:
df = read_as_df("data/samples_data.tsv")

In [5]:
'''
Read in .csv of results from chi-squared
'''
def read_chi_squared_data(filename):
    df = pd.read_csv(filename, sep=",")
    return df

In [60]:
'''
Helper function to call https://pypi.org/project/termcolor/
'''
def make_colored_string(word):
    return colored(word, 'cyan', attrs=['reverse', 'blink'])

In [61]:
'''
Locates word of interest across the corpus samples.
Includes an optional param for excluding "not_detail" samples.
Returns a series of strings w/ the word highlighted in context.
'''
def find_samples(df, word, ratings_cutoff = True):
    
    if ratings_cutoff:
        df = df[df['Rating'] > 3.0]
    
    word_samples = ""
    for sample in df['Sample']:
        if word in sample:
            text_loc = df.loc[df['Sample'] == sample]['Text_Loc'].item()
            # build out a string that is, normal, then colored for the word, etc.
            doc = nlp(sample)
            word_samples += text_loc + " "
            for w in doc:
                if w.text == word:
                    word_samples += make_colored_string(word)
                else:
                    word_samples += w.text
                word_samples+= ' '
            word_samples += "\n"
    
    print(word_samples)

In [8]:
chi_squared_data = read_chi_squared_data("data/chi_squared_results.csv")

In [9]:
chi_squared_data.head()

Unnamed: 0.1,Unnamed: 0,word,detail_count,not_detail_count,stat,p,reject,fisher
0,0,about,50,42,0.957749,0.327755,ACCEPT,False
1,1,above,15,0,8.448165,0.003654,REJECT,False
2,2,across,9,0,inf,0.013785,REJECT,True
3,3,after,38,24,0.009159,0.923758,ACCEPT,False
4,4,against,17,12,0.002515,0.96,ACCEPT,False


Let's investigate the usage of some of the most "extreme" prepositions.

In [10]:
chi_squared_data[chi_squared_data['stat'] != inf].sort_values('stat', ascending = False)

Unnamed: 0.1,Unnamed: 0,word,detail_count,not_detail_count,stat,p,reject,fisher
35,35,with,326,127,27.194586,1.839731e-07,REJECT,False
23,23,of,1017,568,12.188626,0.0004808179,REJECT,False
16,16,down,47,11,9.943002,0.00161462,REJECT,False
18,18,for,211,191,8.91704,0.00282522,REJECT,False
34,34,up,84,30,8.482045,0.003586687,REJECT,False
1,1,above,15,0,8.448165,0.003654119,REJECT,False
5,5,along,15,0,8.448165,0.003654119,REJECT,False
9,9,at,239,116,7.90785,0.004922073,REJECT,False
21,21,into,87,33,7.442944,0.006368562,REJECT,False
36,36,within,13,0,7.11876,0.007628147,REJECT,False


Let's work with `down`, `up`, `above` and `along`.

In [62]:
to_investigate = ["down"]

In [63]:
for word in to_investigate:
    find_samples(df, word, True)

../Gutenberg/samples/madame_bovary_486841_487641.txt ... to go to town once a week to see her lover . at the end of a month she was even considered to have made considerable progress . she went on thursdays . she got up and dressed silently , in order not to awaken charles , who would have made remarks about her getting ready too early . next she walked up and [5m[7m[36mdown[0m , went to the windows , and looked out at the place . the early dawn was broadening between the pillars of the market , and the chemist ’s shop , with the shutters still up , showed in the pale light of the dawn the large letters of his signboard . when the clock pointed to a quarter past seven , she went off to the “ lion d’or , ” whose door artémise opened yawning . the girl then made up the coals covered by the cinders , and emma remained alone in the kitchen . now and again she went out . hivert was ... 
../Gutenberg/samples/crime_and_punishment_517448_518248.txt ... something far more serious than could