In [1]:
import io
import json
import time
from datetime import datetime
from pprint import pprint

from analysis.analysis import get_article_info
from analysis.gender import gender, gender_special
from analysis.utils import get_people_mentioned, get_gender, get_quotes, get_associated_verbs, identify_sources, get_associated_adjectives

%load_ext autoreload
%autoreload 2

In [2]:
def load_nyt_data(year, month, folder='annotated/NYT/'):
    """Load one month of annotated NYT articles from its TSV file.

    The file ``<folder>nyt_annotated_<year>_<month>.tsv`` is expected to hold
    one article per line as three tab-separated fields: the article link, a
    JSON document with the article data and a JSON document with the CoreNLP
    annotation.

    Args:
        year: year used to build the file name.
        month: month used to build the file name.
        folder: prefix prepended verbatim to the file name (so it must end
            with a path separator when it names a directory).

    Returns:
        dict mapping each link to ``{'data': ..., 'corenlp': ...}`` holding
        the two decoded JSON documents.

    Raises:
        IOError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly three fields or
            contains invalid JSON.
    """
    nyt_data = {}
    path = '{}nyt_annotated_{}_{}.tsv'.format(folder, year, month)
    with open(path, 'r') as nyt_f:
        for line_no, line in enumerate(nyt_f, 1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no article; skip them instead of crashing.
                continue
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    '{}:{}: expected 3 tab-separated fields, got {}'.format(
                        path, line_no, len(fields)))
            link, data, corenlp = fields
            nyt_data[link] = {'data': json.loads(data), 'corenlp': json.loads(corenlp)}

    return nyt_data

In [3]:
def get_mentions_quotes_new(nyt_data, out_fn):
    """Append per-article gender mention/quote counts to a TSV file.

    For every article whose CoreNLP annotation is a dict, the people and
    quotes returned by ``get_article_info`` are tallied per gender and one
    tab-separated row is appended to ``out_fn`` with the columns: link,
    author gender, year, month, section, comma-joined descriptors, distinct
    people (male, female), mentions (male, female), quoted people
    (male, female), quoted words (male, female).  Year and month are the
    first two ``_``-separated parts of ``data['id']``.

    Args:
        nyt_data: mapping of article link -> ``{'data': ..., 'corenlp': ...}``
            (the structure built by ``load_nyt_data``).
        out_fn: path of the TSV file to append to; rows are written as UTF-8.

    Raises:
        Exception: any error raised while formatting a row is re-raised after
            the offending article's values have been printed.
    """
    for link, values in nyt_data.iteritems():
        data = values['data']
        corenlp = values['corenlp']
        if not isinstance(corenlp, dict):  # This happens when CoreNLP timed out
            continue
        pm, quotes, _, _, _ = get_article_info('', ann=corenlp)
        num_mentions = {'MALE': 0, 'FEMALE': 0}
        num_distinct_mentions = {'MALE': 0, 'FEMALE': 0}
        num_quoted_words = {'MALE': 0, 'FEMALE': 0}
        num_quoted_people = {'MALE': 0, 'FEMALE': 0}
        for person, info in pm.iteritems():
            count = info[0]
            person_gender = info[1][0]
            # basestring rather than str: gender labels may be unicode
            # (e.g. u'MALE') and must not be discarded for that reason.
            if not isinstance(person_gender, basestring):
                continue
            gender_key = person_gender.upper()
            if gender_key not in num_mentions:
                continue
            num_mentions[gender_key] += count
            num_distinct_mentions[gender_key] += 1
            # A person can have an entry in `quotes` that is empty; only
            # people with a non-empty entry count as quoted.
            quote_length = len(quotes[person]) if person in quotes else 0
            if quote_length > 0:
                num_quoted_people[gender_key] += 1
                num_quoted_words[gender_key] += quote_length

        author_gender = 'UNKNOWN'
        if 'print_byline' in data:
            pb = data['print_byline']
            if pb.startswith('By'):
                pb = pb[3:]
            if len(pb) > 0:
                author_gender = get_gender(pb)
        elif 'norm_byline' in data:
            norm_byline = data['norm_byline']
            # Keep what follows the first comma (the whole string if none).
            author_gender = get_gender(norm_byline[norm_byline.find(',') + 1:])
        if isinstance(author_gender, basestring):
            author_gender = author_gender.upper()
        else:
            author_gender = 'UNKNOWN'

        year, month = data['id'].split('_')[:2]
        try:
            descriptors = u','.join([unicode(d) for d in data.get('descriptors', [])])
            row = u'\t'.join([unicode(a) for a in [
                link, author_gender, year, month, data.get('section', ''),
                descriptors,
                num_distinct_mentions['MALE'], num_distinct_mentions['FEMALE'],
                num_mentions['MALE'], num_mentions['FEMALE'],
                num_quoted_people['MALE'], num_quoted_people['FEMALE'],
                num_quoted_words['MALE'], num_quoted_words['FEMALE'],
            ]])
        except Exception:
            # Show the offending article, then let the real error propagate
            # instead of hiding it.
            print('Could not build output row for {!r}: {!r}'.format(link, {
                'author_gender': author_gender,
                'year': year,
                'month': month,
                'section': data.get('section', ''),
                'descriptors': data.get('descriptors'),
                'num_distinct_mentions': num_distinct_mentions,
                'num_mentions': num_mentions,
                'num_quoted_people': num_quoted_people,
                'num_quoted_words': num_quoted_words,
            }))
            raise
        # Write UTF-8 explicitly so rows containing non-ASCII text are kept
        # rather than silently dropped.
        with io.open(out_fn, 'a', encoding='utf-8') as out_f:
            out_f.write(row + u'\n')


In [12]:
def get_mentions_quotes(nyt_data, out_fn):
    """Append per-article gender mention/quote counts to a TSV file.

    Variant that derives the people and quotes itself by calling
    ``get_people_mentioned`` and ``get_quotes`` on the article's CoreNLP
    ``sentences`` and ``corefs``.  One tab-separated row is appended to
    ``out_fn`` per annotated article with the columns: link, author gender,
    year, month, section, comma-joined descriptors, distinct people
    (male, female), mentions (male, female), quoted people (male, female),
    quoted words (male, female).

    People whose gender label is not a string, or is anything other than
    male/female (for instance an "ambiguous" label), are ignored.

    Args:
        nyt_data: mapping of article link -> ``{'data': ..., 'corenlp': ...}``
            (the structure built by ``load_nyt_data``).
        out_fn: path of the TSV file to append to; rows are written as UTF-8.

    Raises:
        Exception: any error raised while formatting a row is re-raised after
            the offending article's values have been printed.
    """
    for link, values in nyt_data.iteritems():
        data = values['data']
        corenlp = values['corenlp']
        if not isinstance(corenlp, dict):  # This happens when CoreNLP timed out
            continue
        corefs = corenlp['corefs']
        sentences = corenlp['sentences']
        pm = get_people_mentioned(sentences, corefs, include_gender=True)
        quotes = get_quotes(pm, sentences, corefs)
        num_mentions = {'MALE': 0, 'FEMALE': 0}
        num_distinct_mentions = {'MALE': 0, 'FEMALE': 0}
        num_quoted_words = {'MALE': 0, 'FEMALE': 0}
        num_quoted_people = {'MALE': 0, 'FEMALE': 0}
        for person, info in pm.iteritems():
            count = info[0]
            person_gender = info[1][0]
            # basestring rather than str: gender labels may be unicode
            # (e.g. u'MALE') and must not be discarded for that reason.
            if not isinstance(person_gender, basestring):
                continue
            gender_key = person_gender.upper()
            # Covers the ambiguous labels as well as any other value that has
            # no counter (which used to raise KeyError).
            if gender_key not in num_mentions:
                continue
            num_mentions[gender_key] += count
            num_distinct_mentions[gender_key] += 1
            # A person can have an entry in `quotes` that is empty; only
            # people with a non-empty entry count as quoted.
            quote_length = len(quotes[person]) if person in quotes else 0
            if quote_length > 0:
                num_quoted_people[gender_key] += 1
                num_quoted_words[gender_key] += quote_length

        author_gender = 'UNKNOWN'
        if 'print_byline' in data:
            pb = data['print_byline']
            if pb.startswith('By'):
                pb = pb[3:]
            if len(pb) > 0:
                author_gender = get_gender(pb)
        elif 'norm_byline' in data:
            norm_byline = data['norm_byline']
            # Keep what follows the first comma (the whole string if none).
            author_gender = get_gender(norm_byline[norm_byline.find(',') + 1:])
        if isinstance(author_gender, basestring):
            author_gender = author_gender.upper()
        else:
            author_gender = 'UNKNOWN'

        year, month = data['id'].split('_')[:2]
        try:
            descriptors = u','.join([unicode(d) for d in data.get('descriptors', [])])
            row = u'\t'.join([unicode(a) for a in [
                link, author_gender, year, month, data.get('section', ''),
                descriptors,
                num_distinct_mentions['MALE'], num_distinct_mentions['FEMALE'],
                num_mentions['MALE'], num_mentions['FEMALE'],
                num_quoted_people['MALE'], num_quoted_people['FEMALE'],
                num_quoted_words['MALE'], num_quoted_words['FEMALE'],
            ]])
        except Exception:
            # Show the offending article, then let the real error propagate
            # instead of hiding it.
            print('Could not build output row for {!r}: {!r}'.format(link, {
                'author_gender': author_gender,
                'year': year,
                'month': month,
                'section': data.get('section', ''),
                'descriptors': data.get('descriptors'),
                'num_distinct_mentions': num_distinct_mentions,
                'num_mentions': num_mentions,
                'num_quoted_people': num_quoted_people,
                'num_quoted_words': num_quoted_words,
            }))
            raise
        # Write UTF-8 explicitly so rows containing non-ASCII text are kept
        # rather than silently dropped.
        with io.open(out_fn, 'a', encoding='utf-8') as out_f:
            out_f.write(row + u'\n')


In [13]:
# Load a single month (1989/8) to try the counting functions on.
nyt_data = load_nyt_data(year=1989, month=8)

In [14]:
# Append the counts for the loaded month to a scratch TSV.
get_mentions_quotes_new(nyt_data=nyt_data, out_fn='nyt_data_counts.tsv')

TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES


In [None]:
# Run the counting over every month of 1987-2005, writing one TSV per month.
for year in range(1987, 2006):
    for month in range(1, 13):
        print('{} Loading data for {}/{}'.format(time.ctime(), year, month))
        try:
            nyt_data = load_nyt_data(year, month)
        except Exception as exc:
            # Skip months that cannot be loaded, but say why; catching only
            # Exception keeps Ctrl-C / kernel interrupt working.
            print('Exception Occurred: {!r}'.format(exc))
            continue
        print('{} Analyzing data ...'.format(time.ctime()))
        get_mentions_quotes_new(nyt_data, 'nyt_data_counts_0518_{}_{}.tsv'.format(year, month))

Thu May 18 16:55:46 2017 Loading data for 1987/1
Thu May 18 16:56:13 2017 Analyzing data ...
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TWO POSSESSORS OF THIS PERSON!
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
Thu May 18 16:56:18 2017 Loading data for 1987/2
Thu May 18 16:56:49 2017 Analyzing data ...
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
Thu May 18 16:56:55 2017 Loading data for 1987/3
Thu May 18 16:57:29 2017 Analyzing data ...
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SURNAMES
TOO MANY CANDIDATES SUR

In [11]:
# Print people, quote lengths and verbs for the MIN_NUM-th through MAX_NUM-th
# entries (1-based, in iteration order) of tc_data.
MIN_NUM = 31
MAX_NUM = MIN_NUM + 10
curr_idx = 0
for curr_idx, (link, values) in enumerate(tc_data.iteritems(), 1):
    if curr_idx > MAX_NUM:
        break
    if curr_idx < MIN_NUM:
        continue

    data = values['data']
    corenlp = values['corenlp']
    print(link)
    corefs = corenlp['corefs']
    sentences = corenlp['sentences']
    pm = get_people_mentioned(sentences, corefs, include_gender=True)
    quotes = get_quotes(pm, sentences, corefs)
    verbs = get_associated_verbs(pm, sentences, corefs)
    print(pm)
    print({name: len(quoted) for name, quoted in quotes.iteritems()})
    print(verbs)

https://techcrunch.com/2016/01/31/the-league-vs-stanford/
{u'Amanda Bradford': (5, (u'FEMALE', 'COREF'))}
{u'Amanda Bradford': 3}
{u'Amanda Bradford': [(u'decided', u'decide'), (u're-posted', u're-post'), (u'wrote', u'write'), (u'said', u'say')]}
https://techcrunch.com/2016/01/20/with-brave-software-javascripts-inventor-is-building-a-browser-for-the-ad-blocked-future/
{u'Brendan Eich': (8, (u'MALE', 'COREF'))}
{u'Brendan Eich': 40}
{u'Brendan Eich': [(u'working', u'work'), (u'wrote', u'write'), (u'told', u'tell'), (u'put', u'put'), (u'said', u'say'), (u'explained', u'explain'), (u'hoping', u'hope')]}
https://techcrunch.com/2016/01/06/stereolabss-depth-sensing-camera-helps-robots-drones-and-cars-see/
{u'Cecile Schmollgruber': (1, (u'FEMALE', 'COREF')), u'Stereolabs': (1, (None, None)), u'Stereolabsthen': (3, (None, None))}
{u'Cecile Schmollgruber': 0, u'Stereolabs': 0, u'Stereolabsthen': 0}
{u'Cecile Schmollgruber': [], u'Stereolabs': [(u'says', u'say'), (u'working', u'work')], u'Stereo

In [1]:
from nlp.utils import annotate_corenlp
text = u'''
 President Obama said on Thursday that the United States would retaliate for Russia’s efforts to influence the presidential election, asserting that “we need to take action,” and “we will.”

The comments, in an interview with NPR, indicate that Mr. Obama, in his remaining weeks in office, will pursue either economic sanctions against Russia or perhaps some kind of response in cyberspace.

Mr. Obama spoke as President-elect Donald J. Trump on Thursday again refused to accept Moscow’s culpability, asking on Twitter why the administration had waited “so long to act” if Russia “or some other entity” had carried out cyberattacks.

The president discussed the potential for American retaliation with Steve Inskeep of NPR for an interview to air on Friday morning. “I think there is no doubt that when any foreign government tries to impact the integrity of our election,” Mr. Obama said, “we need to take action. And we will — at the time and place of our choosing.”

On Friday morning, the Kremlin’s spokesman, Dmitri S. Peskov, batted away the warning. “It is necessary to either stop talking about it, or finally produce some evidence,” he told the Interfax news agency. “Otherwise, it all begins to look quite unseemly.”

The White House strongly suggested before the election that Mr. Obama would make use of sanctions authority for cyberattacks that he had given to himself by executive order. But he did not, in part out of concern that action before the election could lead to an escalated conflict.

If Mr. Obama invokes sanctions on Russian individuals or organizations, Mr. Trump could reverse them. But that would be politically difficult, as his critics argue that he is blind to Russian behavior.

On Thursday, pressure grew on Mr. Trump in Congress for him to acknowledge intelligence agencies’ conclusions that Russia was behind the hacking. But aides said that was all but impossible before the Electoral College convenes on Monday to formalize his victory.

Mr. Trump has said privately in recent days that he believes there are people in the C.I.A. who are out to get him and are working to delegitimize his presidency, according to people briefed on the conversations who described them on the condition of anonymity.

The president-elect’s suspicions have been stoked by the efforts of a group of Democratic electors, as well as one Republican, who called this week for an intelligence briefing on the Russian hacking, raising the prospect that votes in the Electoral College might be changed.

In his Twitter posting on Thursday, Mr. Trump suggested that the government’s conclusions on Russian hacking were a case of sour grapes by Mr. Obama. The president-elect falsely stated that Mr. Obama had waited until after the election to raise the issue.

“Why did they only complain after Hillary lost?” Mr. Trump asked, although the director of national intelligence, James R. Clapper Jr., formally blamed Russia on Oct. 7 for cyberattacks on the Democratic National Committee and other organizations.

In September, meeting privately in China with President Vladimir V. Putin of Russia, Mr. Obama not only complained, the White House says, but also warned him of consequences if the Russian activity did not stop.

Among those in his own party, Mr. Trump’s refusal to accept the evidence that Russia was the perpetrator was raising growing concerns, with Senator Lindsey Graham of South Carolina saying he would not vote for Rex W. Tillerson, Mr. Trump’s nominee for secretary of state, unless Mr. Tillerson addressed Russia’s role during his confirmation hearings.

It remains to be seen whether Mr. Trump’s stated doubts about Russia’s involvement will subside after Monday’s Electoral College vote. He and his allies have been concerned that the reports of Russian hacking have been intended to peel away votes from him, although even Democrats have not gone so far as to say the election was illegitimate.

“Right now, certain elements of the media, certain elements of the intelligence community and certain politicians are really doing the work of the Russians — they’re creating this uncertainty over the election,” Representative Peter T. King, Republican of New York, told reporters on Thursday after meeting with Mr. Trump.

But many other Republicans, including Senator Mitch McConnell of Kentucky, the majority leader, and Senator John McCain of Arizona, have publicly argued that the evidence leads straight to Russia. They have called for a full investigation, and Senator Dianne Feinstein, Democrat of California, who sits on the Senate Intelligence Committee, urged Mr. Obama on Thursday to complete an administration review quickly.

Mr. Trump’s Twitter post was his latest move to accuse the intelligence agencies he will soon control of acting with a political agenda and to dispute the well-documented conclusion that Moscow carried out a meticulously planned series of attacks and releases of information to interfere in the presidential race.

But as he repeated his doubts, Mr. Trump seized on emerging questions about the Obama administration’s response: Why did it take months after the breaches had been discovered for the administration to name Moscow publicly as the culprit? And why did Mr. Obama initially opt not to openly retaliate, through sanctions or other measures?

White House officials have said that the warning to Mr. Putin at the September summit meeting in China constituted the primary American response so far. When the administration decided to go public with its conclusion a month later, it did so in a statement from the director of national intelligence and the Homeland Security secretary, not in a prominent presidential appearance.

Officials said they were worried that any larger public response would have raised doubts about the election’s integrity, something Mr. Trump was already seeking to do during the campaign when he insisted the election was “rigged.”

Josh Earnest, the White House press secretary, criticized Mr. Trump on Thursday for questioning whether Russia was behind the attacks, referring to Mr. Trump’s call during the campaign for Moscow to hack Hillary Clinton’s emails, a remark his team has since dismissed as a joke.

“I don’t think anybody at the White House thinks it’s funny that an adversary of the United States engaged in malicious cyberactivity to destabilize our democracy — that’s not a joke,” Mr. Earnest said. “It might be time to not attack the intelligence community, but actually be supportive of a thorough, transparent, rigorous, nonpolitical investigation into what exactly happened.”

While he declined to confirm news reports that Mr. Putin was personally involved in directing the cyberattacks, Mr. Earnest pointedly read part of the Oct. 7 statement that said intelligence officials believed “that only Russia’s senior-most officials could have authorized these activities.”

He said that language “would lead me to conclude that based on my personal reading and not based on any knowledge that I have that may be classified or otherwise, it was pretty obvious that they were referring to the senior-most government official in Russia.”

In a conference call with reporters later on Thursday, aides declined to explain Mr. Trump’s position on whether Russia had been responsible for the breaches or to describe what he would do about the issue as president. Jason Miller, a spokesman, said he would let Mr. Trump’s “tweets speak for themselves” and added that those raising questions about the hacking were refusing to come to terms with his victory. “At a certain point you’ve got to realize that the election from last month is going to stand,” Mr. Miller said.
'''
# Run CoreNLP over the sample article with the annotators listed below.
ann = annotate_corenlp(
    text,
    annotators=[
        'pos', 'lemma', 'ner', 'parse', 'depparse', 'dcoref', 'quote'
    ]
)

In [13]:
# Run each analysis helper on the annotated sample article and print the
# results, separated by blank lines.
sentences = ann['sentences']
corefs = ann['corefs']
people_mentioned = get_people_mentioned(sentences, corefs, include_gender=True)
quotes = get_quotes(people_mentioned, sentences, corefs)
verbs = get_associated_verbs(people_mentioned, sentences, corefs)

print(people_mentioned)
print('')
print({name: len(quoted) for name, quoted in quotes.iteritems()})
print('')
print(verbs)
print('')
print(identify_sources(people_mentioned, people_to_quotes=quotes, people_to_verbs=verbs))
print('')
pprint(get_associated_adjectives(people_mentioned, sentences, corefs))

{u'Jason Miller': (2, (u'MALE', 'COREF')), u'Josh Earnest': (3, (u'MALE', 'COREF')), u'Steve Inskeep': (1, ('male', 'NAME_ONLY')), u'Lindsey Graham': (1, ('female', 'NAME_ONLY')), u'Hillary Clinton': (2, ('female', 'NAME_ONLY')), u'John McCain': (1, ('male', 'NAME_ONLY')), u'Vladimir V. Putin': (3, (u'MALE', 'COREF')), u'Donald J. Trump': (17, (u'MALE', 'COREF')), u'Rex W. Tillerson': (2, ('male', 'NAME_ONLY')), u'Dianne Feinstein': (1, ('female', 'NAME_ONLY')), u'Mitch McConnell': (1, ('male', 'NAME_ONLY')), u'James R. Clapper Jr.': (1, ('male', 'NAME_ONLY')), u'Dmitri S. Peskov': (1, (u'MALE', 'COREF')), u'Peter T. King': (1, ('male', 'NAME_ONLY')), u'Obama': (12, (u'MALE', 'COREF'))}

{u'Jason Miller': 20, u'Josh Earnest': 119, u'Steve Inskeep': 0, u'Lindsey Graham': 0, u'Hillary Clinton': 0, u'John McCain': 0, u'Vladimir V. Putin': 0, u'Donald J. Trump': 0, u'Rex W. Tillerson': 0, u'Dianne Feinstein': 0, u'Mitch McConnell': 0, u'James R. Clapper Jr.': 0, u'Dmitri S. Peskov': 26, u'