This notebook explores dependency parsing by identifying the actions and objects that are characteristically associated with male and female characters.

In [3]:
import spacy, math
from collections import Counter
import operator

In [4]:
nlp = spacy.load('en_core_web_sm')

We'll run seven novels by Jane Austen through spacy (this will take a few minutes).

In [5]:
filenames=["../data/fiction/emma.txt", "../data/fiction/lady_susan.txt", "../data/fiction/mansfield_park.txt", "../data/fiction/northanger_abbey.txt", "../data/fiction/persuasion.txt", "../data/fiction/pride.txt", "../data/fiction/sense_and_sensibility.txt"]
all_tokens=[]
for filename in filenames:
    print(filename)
    data=open(filename, encoding="utf-8").read()
    tokens=nlp(data)
    all_tokens.extend(tokens)

../data/fiction/emma.txt
../data/fiction/lady_susan.txt
../data/fiction/mansfield_park.txt
../data/fiction/northanger_abbey.txt
../data/fiction/persuasion.txt
../data/fiction/pride.txt
../data/fiction/sense_and_sensibility.txt


In [7]:
all_tokens

[﻿The,
 Project,
 Gutenberg,
 EBook,
 of,
 Emma,
 ,,
 by,
 Jane,
 Austen,
 
 ,
 This,
 eBook,
 is,
 for,
 the,
 use,
 of,
 anyone,
 anywhere,
 at,
 no,
 cost,
 and,
 with,
 ,
 almost,
 no,
 restrictions,
 whatsoever,
 .,
  ,
 You,
 may,
 copy,
 it,
 ,,
 give,
 it,
 away,
 or,
 ,
 re,
 -,
 use,
 it,
 under,
 the,
 terms,
 of,
 the,
 Project,
 Gutenberg,
 License,
 included,
 ,
 with,
 this,
 eBook,
 or,
 online,
 at,
 www.gutenberg.org,
 
 
 ,
 Title,
 :,
 Emma,
 
 ,
 Author,
 :,
 Jane,
 Austen,
 
 ,
 Release,
 Date,
 :,
 August,
 ,,
 1994,
  ,
 [,
 Etext,
 #,
 158,
 ],
 ,
 Posting,
 Date,
 :,
 January,
 21,
 ,,
 2010,
 ,
 Last,
 Updated,
 :,
 March,
 10,
 ,,
 2018,
 
 ,
 Language,
 :,
 English,
 
 ,
 Character,
 set,
 encoding,
 :,
 UTF-8,
 
 ,
 *,
 *,
 *,
 START,
 OF,
 THIS,
 PROJECT,
 GUTENBERG,
 EBOOK,
 EMMA,
 *,
 *,
 *,
 
 
 
 
 ,
 Produced,
 by,
 An,
 Anonymous,
 Volunteer,
 
 
 
 
 
 ,
 EMMA,
 
 ,
 By,
 Jane,
 Austen,
 
 
 
 
 ,
 VOLUME,
 I,
 
 
 
 ,
 CHAPTER,
 I,
 
 
 ,
 Emma,
 

In [6]:
print (len(all_tokens))

972810


In [7]:
def test(maleCounter, femaleCounter, display=25):
    
    """ Function that takes two Counter objects as inputs and prints out a ranked list of terms
    more characteristic of the first counter than the second.  Here we'll use log-odds
    with an uninformative prior (from Monroe et al 2008, "Fightin Words", eqn. 22) as our metric.
    
    """
    
    vocab=dict(maleCounter) 
    vocab.update(dict(femaleCounter))
    maleSum=sum(maleCounter.values())
    femaleSum=sum(femaleCounter.values())

    ranks={}
    alpha=0.01
    alphaV=len(vocab)*alpha
        
    for word in vocab:
        
        log_odds_ratio=math.log( (maleCounter[word] + alpha) / (maleSum+alphaV-maleCounter[word]-alpha) ) - math.log( (femaleCounter[word] + alpha) / (femaleSum+alphaV-femaleCounter[word]-alpha) )
        variance=1./(maleCounter[word] + alpha) + 1./(femaleCounter[word] + alpha)
        
        ranks[word]=log_odds_ratio/math.sqrt(variance)

    sorted_x = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
    
    print("Most male:")
    for k,v in sorted_x[:display]:
        print("%.3f\t%s" % (v,k))
    
    print("\nMost female:")
    for k,v in reversed(sorted_x[-display:]):
        print("%.3f\t%s" % (v,k))

Spacy uses the [ClearNLP dependency labels](https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md), which are very close to the Stanford typed dependencies.  See the [Stanford dependencies manual](http://people.ischool.berkeley.edu/~dbamman/DependencyManual.pdf) for more information about each tag.  Parse information is contained in the spacy token object; see the following for which attributes encode the token text, idx (position in sentence), part of speech, and dependency relation.  The syntactic head for a token is another token given in `token.head` (where all of those same token attributes are accessible).

In [139]:
testDoc=nlp("He admired her.")
for token in testDoc:
    print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (token.text, token.idx, token.tag_, token.dep_, token.head.text, token.head.idx, token.head.tag_))


He	0	PRP	nsubj	admired	3	VBD
admired	3	VBD	ROOT	admired	3	VBD
her	11	PRP	dobj	admired	3	VBD
.	14	.	punct	admired	3	VBD


In [133]:
# malesubj = []
# femaleobj = []
# for token in all_tokens:
#     if token.text in ['he', 'He'] and token.dep_ == 'nsubj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
#         malesubj.append((token.head.text, token.head.idx))
#     elif token.text in ['her', 'Her'] and token.dep_ == 'dobj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
#         femaleobj.append((token.head.text, token.head.idx))

# maleobj = []
# femalesubj = []
# for token in all_tokens:
#     if token.text in ['she', 'She'] and token.dep_ == 'nsubj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
#         maleobj.append((token.head.text, token.head.idx))
#     elif token.text in ['him', 'Him'] and token.dep_ == 'dobj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
#         femalesubj.append((token.head.text, token.head.idx))

In [134]:
# maleCounter = Counter()
# for x in femaleobj:
#     if x in malesubj:
#         maleCounter[x[0]] += 1

# femaleCounter = Counter()
# for x in maleobj:
#     if x in femalesubj:
#         femaleCounter[x[0]] += 1

Q1: Find the verbs that men are more characteristically the *subject* of than women.  Feel free to only consider subjects that are "he" and "she" pronouns.  This function should return two Counter objects (`maleCounter` and `femaleCounter`) which counts the number of times a given verb has "he" (`maleCounter`) and "she" (`femaleCounter`) as its syntactic subject.

In [85]:
def count_subjects():
    maleCounter=Counter()
    femaleCounter=Counter()

    for token in all_tokens:
        if token.text in ['he', 'He'] and token.dep_ == 'nsubj' and token.head.tag_.startswith('VBD'):
            maleCounter[token.head.text]+=1
        elif token.text in ['she', 'She'] and token.dep_ == 'nsubj' and token.head.tag_.startswith('VBD'):
            femaleCounter[token.head.text]+=1
    
    return maleCounter, femaleCounter

In [86]:
male, female=count_subjects()
test(male, female, display=10)

Most male:
6.583	replied
5.794	said
5.454	came
4.609	seemed
3.422	told
3.128	continued
2.697	took
2.552	left
2.454	talked
2.247	asked

Most female:
-6.998	felt
-4.810	saw
-4.011	found
-3.799	heard
-3.793	knew
-3.303	thought
-2.395	feared
-2.354	had
-2.182	hoped
-2.164	resolved


Q2: Find the verbs that men are more characteristically the *object* of than women.  Feel free to only consider objects that are "him" and "her" pronouns.  This function should return two Counter objects (`maleCounter` and `femaleCounter`) which counts the number of times a given verb has "he" (`maleCounter`) and "she" (`femaleCounter`) as its syntactic direct object. 

In [77]:
def count_objects():
    maleCounter=Counter()
    femaleCounter=Counter()
    
    for token in all_tokens:
        if token.text in ['him', 'Him'] and token.dep_ == 'dobj' and token.head.tag_.startswith('VBD'):
            maleCounter[token.head.text]+=1
        elif token.text in ['her', 'Her'] and token.dep_ == 'dobj' and token.head.tag_.startswith('VBD'):
            femaleCounter[token.head.text]+=1
    
    return maleCounter, femaleCounter

In [78]:
male, female=count_objects()
test(male, female, display=10)

Most male:
2.869	saw
2.583	thanked
2.459	liked
1.905	begged
1.846	recommended
1.625	wished
1.625	understood
1.535	believed
1.535	kept
1.418	brought

Most female:
-2.462	left
-2.078	struck
-1.921	attended
-1.750	convinced
-1.601	obliged
-1.459	joined
-1.225	gave
-1.044	enabled
-1.044	pleased
-0.861	advised


Q3: Find the objects that are *possessed* more frequently by men than women.  Feel free to only consider possessors that are "his" and "her" pronouns.   This function should return two Counter objects (`maleCounter` and `femaleCounter`) which counts the number of times a given term is possessed by "he" (`maleCounter`) and "she" (`femaleCounter`).

In [73]:
def count_possessions():
    maleCounter=Counter()
    femaleCounter=Counter()
    
    for token in all_tokens:
        if token.text in ['his', 'His'] and token.dep_ == 'poss':
            maleCounter[token.head.text]+=1
        elif token.text in ['her', 'Her'] and token.dep_ == 'poss':
            femaleCounter[token.head.text]+=1
    
    return maleCounter, femaleCounter

In [72]:
male, female=count_possessions()
test(male, female, display=10)

Most male:
4.696	sisters
4.425	attentions
4.357	house
4.296	return
4.096	name
3.797	attachment
3.740	son
3.574	horse
3.536	character
3.534	manners

Most female:
-7.239	mother
-6.339	sister
-4.514	eyes
-4.422	aunt
-4.047	uncle
-3.593	spirits
-3.584	heart
-3.574	room
-3.177	thoughts
-3.097	brother


Q4: Find the actions that are men do *to women* more frequently than women do *to men*.  Feel free to only consider subjects and objects that are "she"/"he"/"her"/"him" pronouns.   This function should return two Counter objects (`maleCounter` and `femaleCounter`) which counts the number of times a given verb has "he" as the subject and "her" as the object (`maleCounter`) and "she" as the subject and "him" as the object (`femaleCounter`).

In [137]:
def count_SVO_tuples():
    maleCounter=Counter()
    femaleCounter=Counter()

    malesubj = []
    femaleobj = []
    for token in all_tokens:
        if token.text in ['he', 'He'] and token.dep_ == 'nsubj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
            malesubj.append((token.head.text, token.head.idx))
        elif token.text in ['her', 'Her'] and token.dep_ == 'dobj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
            femaleobj.append((token.head.text, token.head.idx))

    maleobj = []
    femalesubj = []
    for token in all_tokens:
        if token.text in ['she', 'She'] and token.dep_ == 'nsubj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
            maleobj.append((token.head.text, token.head.idx))
        elif token.text in ['him', 'Him'] and token.dep_ == 'dobj' and token.head.tag_.startswith('VBD') and token.head.dep_ == 'ROOT':
            femalesubj.append((token.head.text, token.head.idx))
    
    for x in femaleobj:
        if x in malesubj:
            maleCounter[x[0]] += 1

    for x in maleobj:
        if x in femalesubj:
            femaleCounter[x[0]] += 1
    
    return maleCounter, femaleCounter

In [138]:
male, female=count_SVO_tuples()
test(male, female, display=10)

Most male:
1.203	loved
0.557	knew
0.557	joined
0.557	left
0.557	heard
0.485	caught
0.485	called
0.485	shook
0.485	praised
0.485	assisted

Most female:
-0.781	followed
-0.584	saw
-0.510	perceived
-0.510	received
-0.510	liked
-0.510	assured
-0.438	cast
-0.438	met
-0.438	hated
-0.438	begged
