In [21]:
# re
import re

# POS
import spacy

# nltk for wordnet
import nltk
from nltk.corpus import wordnet as wn

In [22]:
# Only POS tags and lemmas are used in this notebook, so the named-entity
# recognizer and dependency parser are switched off at load time.
# `disable` expects one string per component; the earlier single string
# 'ner,parser' is not the name of any pipeline component.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
# Drop both components from the pipeline object altogether.
# NOTE(review): relies on remove_pipe accepting a disabled component, which
# matches the spaCy v3 module path shown in this cell's output -- confirm if
# the notebook is ever run on an older spaCy.
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x145ca01c0>)

In [41]:
def read_and_tag(filename):
    '''
    Read a tab-separated sample file and run each sample's text through spaCy.

    The first line of the file is treated as a header and skipped. Every
    following line is one sample; its third tab-separated field is handed to
    the module-level `nlp` pipeline. Completely empty lines are ignored.

    Args:
        filename: location of sample file (read as UTF-8)

    Return:
        tagged_samples: a list of spacy `nlp` docs, one per sample, in file order

    Raises:
        IndexError: if a non-empty line has fewer than three tab-separated fields
    '''
    tagged_samples = []

    with open(filename, encoding="utf-8") as file:
        # Skip the header row; the default stops an empty file from raising
        # StopIteration.
        next(file, None)
        for sample in file:
            # A line holding nothing but its line break (e.g. a trailing blank
            # line) carries no sample and would otherwise fail on the index below.
            if not sample.strip("\r\n"):
                continue
            sample_text = sample.split("\t")[2]
            tagged_samples.append(nlp(sample_text))

    return tagged_samples

In [27]:
# Relation function handed to Synset.closure(): it returns a synset's direct
# hypernyms, and closure() applies it repeatedly to collect *all* of them
# (pattern taken from http://www.nltk.org/howto/wordnet.html).
def hyper(s):
    return s.hypernyms()

In [28]:
def specificity(tagged_sample):
    '''
    Consult wordnet for the situation of each verb or noun with respect to its
    station in the hypernym hierarchy.
    Based on current SOA, it is acceptable to simply grab the top-level (.01) synset.

    For every token tagged NOUN or VERB, the synset "<lemma>.<n|v>.01" is looked
    up and the number of synsets in its hypernym closure is added to a running
    sum. Tokens whose lookup fails are left out of both the sum and the count.

    Args:
        tagged_sample: a spacy doc

    Return:
        specificity: a value conveying the "specificity" of the input, via
            Nelson (2020): the mean hypernym-closure size over the nouns and
            verbs that were found in wordnet. NaN when no noun or verb could be
            resolved (including an empty doc), since the mean is undefined.
    '''
    # spaCy coarse POS tag -> wordnet part-of-speech letter
    wordnet_tags = {"NOUN": "n", "VERB": "v"}

    hyper_sum = 0
    noun_and_verb_count = 0
    for word in tagged_sample:
        tag = wordnet_tags.get(word.pos_)
        if tag is None:
            # not a noun or verb
            continue
        wn_lookup = word.lemma_ + "." + tag + ".01"
        try:
            hyper_sum += len(list(wn.synset(wn_lookup).closure(hyper)))
        except Exception:
            # on off chance we have a mistag, don't break down the system.
            # Exception (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit through.
            # NOTE(review): this still hides setup problems such as a missing
            # wordnet corpus; every lookup would then be skipped.
            continue
        noun_and_verb_count += 1

    if noun_and_verb_count == 0:
        # Nothing resolved: avoid ZeroDivisionError and flag the value as undefined.
        return float("nan")
    return hyper_sum / noun_and_verb_count

In [89]:
def pos_counts(tagged_sample):
    '''
    Tally up part of speech tags for categories of interest.

    Args:
        tagged_sample: a spacy doc

    Return:
        a tab delimited string containing, in this order, the counts for
        adjectives, adverbs, nouns, verbs and adpositions
        (prepositions/postpositions)
    '''
    # Output column order; tokens with any other tag are not counted.
    column_order = ("ADJ", "ADV", "NOUN", "VERB", "ADP")
    tallies = dict.fromkeys(column_order, 0)

    for token in tagged_sample:
        label = token.pos_
        if label in tallies:
            tallies[label] += 1

    return "\t".join(str(tallies[label]) for label in column_order)

In [14]:
# Sanity check: how many synsets are in the hypernym closure of "house.n.01".
sum(1 for _ in wn.synset("house.n.01").closure(hyper))

  for synset in acyclic_breadth_first(self, rel, depth):


9

In [97]:
def collect_data(filename, attr, output):
    '''
    Concatenate data from this notebook and previous surveys.
    Write the data to an output file.

    The first line of `filename` is treated as a header and skipped. For every
    following non-empty line, the second, third and fourth tab-separated fields
    are kept and the matching entry of `attr` is appended, tab-separated. Each
    combined row is written to `output` on its own line.

    Args:
        filename: a file holding text samples, their source and ratings (read as UTF-8)
        attr: a list of tab-delimited strings, one per sample row and in the
            same order as the rows (in this notebook: the specificity value
            followed by the tallies from `pos_counts`)
        output: the file to write the output to (should be a .tsv); written as UTF-8

    Raises:
        IndexError: if a non-empty line has fewer than four tab-separated
            fields, or if `attr` has fewer entries than there are sample rows
    '''
    row_data = []

    with open(filename, encoding="utf-8") as file:
        # Skip the header row; the default stops an empty file from raising
        # StopIteration.
        next(file, None)
        for sample in file:
            # A line holding nothing but its line break carries no sample.
            if not sample.strip("\r\n"):
                continue
            # Strip the line break up front so the last kept field never drags
            # a newline into the middle of the combined row.
            sample_data = sample.rstrip("\n").split("\t")
            row_data.append(sample_data[1] + "\t" + sample_data[2] + "\t" + sample_data[3])

    # Build every row before touching the output file, so a short `attr` fails
    # without leaving a partial file behind.
    combined_rows = [row + "\t" + attr[index] for index, row in enumerate(row_data)]

    # Same encoding as the input; the context manager closes the file even if
    # a write fails.
    with open(output, "w", encoding="utf-8") as out_file:
        for row in combined_rows:
            out_file.write(row)
            out_file.write("\n")

In [52]:
# Run every sample in the survey file through the spaCy pipeline.
tagged_samples = read_and_tag("tagged_details_sep_fix.tsv")

In [94]:
# Spot check: tag one hand-picked passage and show its POS tallies
# (tab-separated: adjectives, adverbs, nouns, verbs, adpositions).
test = nlp("...century, had been a gigantic pine, with its roots and trunk in the darksome shade, and its head aloft in the upper atmosphere. it was a little dell where they had seated themselves, with a leaf-strewn bank rising gently on either side, and a brook flowing through the midst, over a bed of fallen and drowned leaves. the trees impending over it had flung down great branches from time to time, which choked up the current, and compelled it to form eddies and black depths at some points; while, in its swifter and livelier passages there appeared a channel-way of pebbles, and brown, sparkling sand. letting the eyes follow along the course of the stream, they could catch the reflected light from its water, at some short distance within the forest, but soon lost all traces of it amid the...")
pos_counts(test)

'10\t4\t37\t22\t23'

In [93]:
# One tab-delimited feature string per tagged sample:
# the specificity score followed by the POS tallies.
attr = []
for sample in tagged_samples:
    attr.append(f"{specificity(sample)}\t{pos_counts(sample)}")

In [96]:
# Join the survey columns with the computed features and write the combined TSV.
# Reads the same input file that was tagged above, so rows and `attr` line up by position.
collect_data("tagged_details_sep_fix.tsv", attr, "samples_data_with_spec_and_pos_two.tsv")