In [1]:
import json
from pprint import pprint
from analysis.gender import gender, gender_special

In [2]:
def get_gender(name, verbose=False):
    """
    Look up the gender recorded for a person's name.

    The first word of `name` (or the second, when the first is the title
    'Dr.' and another word follows) is upper-cased and looked up in `gender`.
    When that lookup finds nothing, or returns a tuple (reported as an
    ambiguous first name), the full upper-cased name is tried in
    `gender_special` and a truthy hit there is returned instead.

    Args:
        name: Full name, e.g. 'Dr. Jane Doe'. None, empty and
            whitespace-only names are treated as "not found" rather than
            raising IndexError/AttributeError.
        verbose: When true, print a diagnostic for names that stay
            unresolved or ambiguous.

    Returns:
        The `gender_special` value when that override applies; otherwise
        whatever `gender` holds for the first name (possibly a tuple), or
        None when nothing matched.
    """
    words = name.split() if name else []
    if not words:
        if verbose:
            print('Gender not found: %s' % name)
        return None

    first_name = words[0].upper()
    # Skip a leading title only when a word follows it; a bare 'Dr.' used to
    # raise IndexError here.
    if first_name == 'DR.' and len(words) > 1:
        first_name = words[1].upper()

    found = gender.get(first_name)
    if found and not isinstance(found, tuple):
        return found

    # Missing or ambiguous first name: fall back to the full-name overrides.
    special_found = gender_special.get(name.upper())
    if special_found:
        return special_found

    if verbose:
        # print(<single string>) behaves the same on Python 2 and Python 3.
        if found:
            print('Ambiguous gender: %s %s' % (name, found))
        else:
            print('Gender not found: %s' % name)
    return found

In [3]:
def process_coref_chain(coref_id, coref_chain):
    """
    Print the animate mentions of one coreference chain (debugging aid).

    Args:
        coref_id: Identifier of the chain; accepted but not used.
        coref_chain: List of mention dicts carrying the keys 'text', 'type',
            'number', 'gender', 'sentNum' and 'animacy'.

    Prints the chain length followed by the list of
    (text, type, number, gender, sentNum) tuples for mentions whose
    'animacy' is 'ANIMATE'. Chains with no animate mention print nothing.
    Returns None.
    """
    # The representative mention is not needed for this dump, so chains
    # without one no longer raise StopIteration.
    animate_mentions = [
        (mention['text'], mention['type'], mention['number'],
         mention['gender'], mention['sentNum'])
        for mention in coref_chain
        if mention['animacy'] == 'ANIMATE'
    ]
    if animate_mentions:
        # print(<single string>) behaves the same on Python 2 and Python 3.
        print('%s %s' % (len(coref_chain), animate_mentions))

In [4]:
def process_sentences(sentences):
    """
    Collect the distinct people named in the sentences object returned by
    CoreNLP's annotation.

    Consecutive tokens whose 'ner' is 'PERSON' are joined into one mention.
    A mention ends at the first non-PERSON token or at the end of its
    sentence, so a name that closes a sentence (or the whole document) is
    kept and is never glued to a name opening the next sentence.

    Args:
        sentences: List of sentence dicts, each with an 'index' equal to its
            position in the list and a 'tokens' list of dicts carrying 'ner'
            and 'originalText'.

    Returns:
        A set of tuples of name words, e.g. {('Tim', 'Cook')}. When two
        mentions share a word, only the longer one is kept.

    Raises:
        AssertionError: if a sentence's 'index' does not match its position.
    """
    people_mentioned = set()
    for i, sentence in enumerate(sentences):
        assert i == sentence['index']
        name_parts = []
        # The trailing None sentinel flushes a name that runs up to the very
        # end of the sentence.
        for token in list(sentence['tokens']) + [None]:
            if token is not None and token['ner'] == 'PERSON':
                name_parts.append(token['originalText'])
                continue
            # Join-then-split so whitespace inside a token is normalised too.
            mention = tuple(' '.join(name_parts).split())
            name_parts = []
            if mention:
                _merge_mention(people_mentioned, mention)
    return people_mentioned


def _merge_mention(people_mentioned, mention):
    """Fold one name tuple into people_mentioned, keeping the longer of overlapping variants."""
    mention_words = set(mention)
    overlapping = [known for known in people_mentioned
                   if mention_words.intersection(known)]
    if not overlapping:
        people_mentioned.add(mention)
        return
    # Replace only the known variants that are strictly shorter than the new
    # mention; an equally long or longer known variant wins and the new
    # mention is ignored.
    shorter = [known for known in overlapping if len(known) < len(mention)]
    if shorter:
        people_mentioned.difference_update(shorter)
        people_mentioned.add(mention)

In [5]:
# Load the 2016 annotations once; the author/gender listing cell below iterates over them.
with open('data/annotated_old/techcrunch_annotated_2016.json') as tc_f:
    tc_2016_data = json.load(tc_f)

In [7]:
def get_mentions_counts(data):
    """
    Tally the genders of people mentioned, grouped by the gender of each
    article's author.

    Args:
        data: Mapping of article link -> article dict with an 'author' name
            and a 'corenlp' dict holding 'sentences' and 'corefs'.

    Returns:
        A dict keyed by author gender ('MALE', 'FEMALE' or 'UNK'), each
        value being {'MALE': n, 'FEMALE': n}: how many distinct mentioned
        people were resolved to that gender through a coreference chain.

    Articles that cannot be processed (missing keys, malformed annotation,
    an author gender outside the three buckets) are skipped, best effort.
    """
    all_counts = {'MALE': {'MALE': 0, 'FEMALE': 0},
                  'FEMALE': {'MALE': 0, 'FEMALE': 0},
                  'UNK': {'MALE': 0, 'FEMALE': 0}}
    for article in data.values():
        try:
            author_gender = get_gender(article['author'])
            # Anything that is not a plain string (None, or a tuple for an
            # ambiguous name) lands in the 'UNK' bucket.
            if isinstance(author_gender, str):
                author_gender = author_gender.upper()
            else:
                author_gender = 'UNK'
            counts = all_counts.get(author_gender)
            if counts is None:
                continue

            corenlp = article['corenlp']
            people_mentioned = process_sentences(corenlp['sentences'])
            pm_to_gender = dict.fromkeys(people_mentioned)
            # Build each person's word set once instead of once per chain.
            words_by_person = {pm: set(pm) for pm in people_mentioned}

            for coref_chain in corenlp['corefs'].values():
                rep_elem = next((mention for mention in coref_chain
                                 if mention['isRepresentativeMention']), None)
                # A chain without a representative mention, or one with no
                # usable gender, is skipped without discarding the article.
                if rep_elem is None or rep_elem['gender'] not in ('MALE', 'FEMALE'):
                    continue
                rep_words = set(rep_elem['text'].split())
                for pm, pm_words in words_by_person.items():
                    if rep_words & pm_words:
                        pm_to_gender[pm] = rep_elem['gender']
        except Exception:
            # Best effort: skip malformed articles, but unlike a bare
            # `except:` let KeyboardInterrupt/SystemExit stop the run.
            continue
        else:
            # Counts are only touched once the whole article parsed cleanly.
            for mentioned_gender in pm_to_gender.values():
                if mentioned_gender is not None:
                    counts[mentioned_gender] += 1
    return all_counts

In [8]:
# One output line per annotated year (2009 through 2016): the year, then its mention tallies.
for year in range(2009, 2017):
    with open('data/annotated_old/techcrunch_annotated_{}.json'.format(year)) as tc_f:
        data = json.load(tc_f)
    # Same text as the two-item print statement; the file is already closed by now.
    print('%s %s' % (year, get_mentions_counts(data)))

2009 {'UNK': {'MALE': 4006, 'FEMALE': 440}, 'MALE': {'MALE': 9721, 'FEMALE': 1066}, 'FEMALE': {'MALE': 2520, 'FEMALE': 301}}
2010 {'UNK': {'MALE': 5241, 'FEMALE': 567}, 'MALE': {'MALE': 10354, 'FEMALE': 1047}, 'FEMALE': {'MALE': 4902, 'FEMALE': 537}}
2011 {'UNK': {'MALE': 5147, 'FEMALE': 544}, 'MALE': {'MALE': 9420, 'FEMALE': 970}, 'FEMALE': {'MALE': 8884, 'FEMALE': 1025}}
2012 {'UNK': {'MALE': 4406, 'FEMALE': 441}, 'MALE': {'MALE': 10956, 'FEMALE': 1223}, 'FEMALE': {'MALE': 7407, 'FEMALE': 925}}
2013 {'UNK': {'MALE': 3397, 'FEMALE': 346}, 'MALE': {'MALE': 14846, 'FEMALE': 1615}, 'FEMALE': {'MALE': 9682, 'FEMALE': 1232}}
2014 {'UNK': {'MALE': 954, 'FEMALE': 121}, 'MALE': {'MALE': 14318, 'FEMALE': 1582}, 'FEMALE': {'MALE': 7363, 'FEMALE': 1210}}
2015 {'UNK': {'MALE': 614, 'FEMALE': 72}, 'MALE': {'MALE': 13075, 'FEMALE': 1553}, 'FEMALE': {'MALE': 7555, 'FEMALE': 1399}}
2016 {'UNK': {'MALE': 274, 'FEMALE': 44}, 'MALE': {'MALE': 7386, 'FEMALE': 881}, 'FEMALE': {'MALE': 4755, 'FEMALE': 950}}

In [46]:
# Default 1-based window [MIN_NUM, MAX_NUM) of articles, in iteration order,
# that inspect_articles() dumps.
MIN_NUM = 623 + 6
MAX_NUM = MIN_NUM + 1


def inspect_articles(articles, min_num=MIN_NUM, max_num=MAX_NUM):
    """
    Dump the annotation of a slice of articles for manual inspection.

    For every article whose 1-based position in iteration order satisfies
    min_num <= position < max_num, print its link, the people found by
    process_sentences(), and the animate mentions of each coreference chain
    (via process_coref_chain()).

    This logic used to sit, unreachable, behind an unconditional `continue`
    in the listing loop below; call inspect_articles(tc_2016_data) to run it.

    Args:
        articles: Mapping of article link -> article dict with a 'corenlp'
            dict holding 'sentences' and 'corefs'.
        min_num: First position (1-based, inclusive) to dump.
        max_num: Position (1-based, exclusive) at which to stop.
    """
    for curr_idx, (link, article) in enumerate(articles.items(), 1):
        if curr_idx < min_num:
            continue
        if curr_idx >= max_num:
            break
        print(link)
        corenlp = article['corenlp']
        print(process_sentences(corenlp['sentences']))
        for c_id, coref in corenlp['corefs'].items():
            process_coref_chain(c_id, coref)


# Spot-check the name-based lookup: one "<author> <gender>" line per 2016 article.
for link, data in tc_2016_data.items():
    # print(<single string>) behaves the same on Python 2 and Python 3.
    print('%s %s' % (data['author'], get_gender(data['author'])))

Mark Kaganovich male
Natasha Lomas female
Ingrid Lunden female
Matthew Lynley male
Jonathan Shieber male
Ben Narasin male
Robert Dale male
Matthew Panzarino male
Romain Dillet male
Romain Dillet male
Simon Khalaf male
Kristen Hall-Geisler female
Jonathan Shieber male
Lora Kolodny female
Kate Conger female
Romain Dillet male
Connie Loizos female
Katie Roof female
Catherine Shu female
Lucas Matney male
Sarah Buhr female
Jon Russell male
Jon Russell male
Ron Miller male
Joe Edelheit Ross male
Anthony Ha male
Peter Hirst male
Matt Burns male
Darrell Etherington male
Connie Loizos female
Sarah Perez female
Megan Rose Dickey female
Jake Bright male
Ryan Angilly male
Darrell Etherington male
Anna Escher female
Lucas Matney male
Benjamin Brandall male
Lora Kolodny female
Charles Birnbaum male
Ingrid Lunden female
Catherine Shu female
Sarah Buhr female
Haje Jan Kamps male
Sarah Buhr female
Connie Loizos female
Evan Baehr male
Josh Constine male
Natasha Lomas female
Lucas Matney male
Josh Consti

https://techcrunch.com/2016/06/20/is-enterprise-genomics-good-enough-yet/
