In [25]:
# re
import re

# POS
import spacy

# nltk for wordnet
import nltk
from nltk.corpus import wordnet as wn

In [3]:
nlp = spacy.load('en_core_web_sm', disable=['ner,parser'])
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1100f4400>)

In [19]:
'''
Reads in a file containing text samples, that has each sample separated by a carriage return.

Args:
    filename: location of sample file

Return:
    tagged_samples: a list of spacy `nlp` docs that contains information about each word in the sample
'''
def read_and_tag(filename):    
    
    tagged_samples = []
    
    with open(filename, encoding="utf-8") as file:
        # samples are separated by line breaks
        for sample in file:            
            doc=nlp(sample)
            tagged_samples.append(doc)
        
    return tagged_samples

In [49]:
# Function from http://www.nltk.org/howto/wordnet.html to get *all* of a synset's hyponym/hypernyms
hyper = lambda s: s.hypernyms()

In [50]:
'''
Consult wordnet for the situation of a verb or noun with respect to its station 
In the hypernym hierarchy. 
Based on current SOA, it is acceptable to simply grab the top-level (.01) synset.

Args:
    tagged_sample: a spacy doc

Return:
    specificity: a value conveying the "specificity" of the input, via Nelson (2020)
'''

def specificity(tagged_sample):
    hyper_sum = 0
    noun_and_verb_count = 0
    for word in tagged_sample:
        if word.pos_ == "NOUN" or word.pos_ == "VERB":
            # if it's a verb, get the most common verb hypernym chain
            # else, get the most common noun hypernym chain
            pos = word.pos_
            tag = "n" if pos.startswith("N") else "v"
            wn_lookup = word.lemma_ + "." + tag + ".01"
            try:
                hyper_sum += len(list(wn.synset(wn_lookup).closure(hyper)))
            except:
                # on off chance we have a mistag, don't break down the system
                continue
            noun_and_verb_count +=1
    
    return hyper_sum / noun_and_verb_count

In [74]:
def pos_counts(tagged_sample):
    verb_count = 0
    adj_count = 0
    adv_count = 0
    noun_count = 0
    
    for word in tagged_sample:
        if word.pos_ == "NOUN":
            noun_count +=1
        elif word.pos_ == "VERB":
            verb_count +=1
        elif word.pos_ == "ADJ":
            adj_count +=1
        elif word.pos_ == "ADV":
            adv_count +=1
    
    return str(adj_count) + "," + str(adv_count) + "," + str(noun_count) + "," + str(verb_count)

In [73]:
detail_samples = read_and_tag("detail_samples.txt")

In [70]:
not_detail_samples = read_and_tag("not_detail_samples.txt")

In [75]:
detail_attr = []
for sample in detail_samples:
    detail_attr.append(str(specificity(sample)) + "," + pos_counts(sample))

In [76]:
not_detail_attr = []
for sample in not_detail_samples:
    not_detail_attr.append(str(specificity(sample)) + "," + pos_counts(sample))

In [84]:
# write out detail_attr and not_detail attr to a file
# then merge w/ existing sheet

'4.875,12,8,30,30'