In [46]:
import json
from pprint import pprint
from analysis.gender import gender, gender_special
from analysis.utils import get_people_mentioned, get_gender, get_sources
from datetime import datetime
import time

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def load_tc_data(year, month, folder='annotated/'):
    """Load one month of annotated TechCrunch articles from a TSV file.

    The file read is ``<folder>techcrunch_annotated_<year>_<month>.tsv``; note
    that ``folder`` is concatenated as a plain prefix, so it must end with a
    path separator. Every non-blank line must hold exactly three tab-separated
    fields: the article link, a JSON document with the article data, and a
    JSON document with the CoreNLP annotation.

    Args:
        year: Year used to build the file name.
        month: Month used to build the file name.
        folder: Prefix (normally a directory ending in a separator) of the file.

    Returns:
        dict mapping each link to ``{'data': <parsed JSON>, 'corenlp': <parsed JSON>}``.
        If a link occurs more than once, the last occurrence wins.

    Raises:
        IOError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly three fields or
            a field is not valid JSON.
    """
    tc_data = {}
    path = '{}techcrunch_annotated_{}_{}.tsv'.format(folder, year, month)
    with open(path, 'r') as tc_f:
        for line in tc_f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record; skip them instead of failing the unpack below.
                continue
            link, data_json, corenlp_json = line.split('\t')
            tc_data[link] = {
                'data': json.loads(data_json),
                'corenlp': json.loads(corenlp_json),
            }

    return tc_data

In [4]:
def get_mentions_counts(data):
    """Tally the gender of people mentioned, grouped by the author's gender.

    Args:
        data: Mapping (must provide ``iteritems``) from article link to an
            article record. Each record is read through ``record['author']``
            and ``record['corenlp']`` (which must hold ``'sentences'`` and
            ``'corefs'``).
            NOTE(review): ``load_tc_data`` stores the article fields under a
            nested ``'data'`` key, so records in that layout would fail the
            ``'author'`` lookup and be skipped -- confirm the expected layout.

    Returns:
        dict ``{author_gender: {'MALE': n, 'FEMALE': n}}`` with author_gender in
        ``'MALE'``, ``'FEMALE'`` and ``'UNK'``. An article contributes only if
        it is processed without error; its counts are added in one step at the
        end, so a failing article contributes nothing.
    """
    all_counts = {
        'MALE': {'MALE': 0, 'FEMALE': 0},
        'FEMALE': {'MALE': 0, 'FEMALE': 0},
        'UNK': {'MALE': 0, 'FEMALE': 0},
    }
    # The loop variable is deliberately not called ``data``: rebinding the
    # parameter while iterating over it made the original hard to follow.
    for link, record in data.iteritems():
        try:
            author_gender = get_gender(record['author'])
            if type(author_gender) is str:
                author_gender = author_gender.upper()
            else:
                author_gender = 'UNK'
            if not author_gender:
                # Only reachable for an empty-string gender.
                continue
            counts = all_counts[author_gender]
            sentences = record['corenlp']['sentences']
            corefs = record['corenlp']['corefs']
            # NOTE(review): ``process_sentences`` is neither defined nor imported
            # in the visible part of this notebook; if it is missing, the
            # resulting NameError is swallowed below and every count stays 0.
            people_mentioned = process_sentences(sentences)
            pm_to_gender = {pm: None for pm in people_mentioned}
            for c_id, coref_chain in corefs.iteritems():
                # A chain without a representative mention raises StopIteration,
                # which (like any other error) skips the whole article.
                rep_elem = next(coref for coref in coref_chain if coref['isRepresentativeMention'])
                rep_words = set(rep_elem['text'].split())
                if rep_elem['gender'] not in ('MALE', 'FEMALE'):
                    continue
                for pm in pm_to_gender:
                    # NOTE(review): ``set(pm)`` is a set of characters if ``pm``
                    # is a string -- presumably ``pm`` is a sequence of name
                    # tokens; verify against ``process_sentences``.
                    if len(set.intersection(rep_words, set(pm))) > 0:
                        pm_to_gender[pm] = rep_elem['gender']
            # ``mention_gender`` avoids shadowing the imported ``gender`` function.
            for mention_gender in ['MALE', 'FEMALE']:
                counts[mention_gender] += sum(
                    1 for resolved in pm_to_gender.values() if resolved == mention_gender)
        except Exception:
            # Best effort: skip articles that cannot be processed, but (unlike a
            # bare ``except:``) let KeyboardInterrupt / SystemExit propagate so a
            # long run can still be interrupted.
            continue
    return all_counts

In [51]:
def get_mentions_quotes(tc_data, out_fn):
    """Append one TSV row of per-article gender statistics to ``out_fn``.

    Articles whose CoreNLP annotation is not a dict are skipped. For the
    others, people are extracted with ``get_people_mentioned`` and their quotes
    with ``get_sources``; each row then contains, tab-separated:

        link, author gender, year, month, comma-joined categories,
        comma-joined tags, distinct MALE people, distinct FEMALE people,
        MALE mentions, FEMALE mentions, MALE quotes, FEMALE quotes

    Args:
        tc_data: Mapping (must provide ``iteritems``) from link to
            ``{'data': ..., 'corenlp': ...}`` as built by ``load_tc_data``.
            ``data`` must provide ``'author'``, ``'timestamp'``
            (``'%Y-%m-%d %H:%M:%S'``), ``'category'`` and ``'tag'``.
        out_fn: Path of the output file. It is opened in append mode, so
            calling this twice on the same articles duplicates their rows.

    Behavior notes:
        * Rows are written UTF-8 encoded. Previously a row containing
          non-ASCII text was silently dropped.
        * Genders are accepted whether they are ``str`` or ``unicode``.
          ``get_people_mentioned`` yields unicode values such as ``u'MALE'``
          for coref-resolved people, which a ``type(...) is str`` check
          rejects on Python 2.
        * People whose gender is not MALE/FEMALE (after upper-casing) are
          ignored rather than raising ``KeyError``.
    """
    # Local import keeps this cell self-contained; io.open gives an explicit
    # output encoding on Python 2.
    import io

    # Open once for the whole batch instead of once per article.
    with io.open(out_fn, 'a', encoding='utf-8') as out_f:
        for link, values in tc_data.iteritems():
            data = values['data']
            corenlp = values['corenlp']
            if not isinstance(corenlp, dict):  # This happens when CoreNLP timed out
                continue
            corefs = corenlp['corefs']
            sentences = corenlp['sentences']
            pm = get_people_mentioned(sentences, corefs, include_gender=True)
            sources = get_sources(pm, sentences, corefs)
            num_mentions = {'MALE': 0, 'FEMALE': 0}
            num_distinct_mentions = {'MALE': 0, 'FEMALE': 0}
            num_quotes = {'MALE': 0, 'FEMALE': 0}
            for person, info in pm.iteritems():
                # info is (mention_count, (gender, method)).
                count = info[0]
                person_gender = info[1][0]
                if not isinstance(person_gender, basestring):
                    # Gender could not be determined (None).
                    continue
                person_gender = person_gender.upper()
                if person_gender not in num_mentions:
                    continue
                num_mentions[person_gender] += count
                num_distinct_mentions[person_gender] += 1
                num_quotes[person_gender] += len(sources[person])

            author_gender = get_gender(data['author'])
            if isinstance(author_gender, basestring):
                author_gender = author_gender.upper()
            else:
                author_gender = 'UNKNOWN'
            dt = datetime.strptime(data['timestamp'], '%Y-%m-%d %H:%M:%S')
            fields = [
                link, author_gender, dt.year, dt.month,
                u','.join(data['category']), u','.join(data['tag']),
                num_distinct_mentions['MALE'], num_distinct_mentions['FEMALE'],
                num_mentions['MALE'], num_mentions['FEMALE'],
                num_quotes['MALE'], num_quotes['FEMALE'],
            ]
            # io.open text files only accept unicode, hence the u'' literals.
            out_f.write(u'\t'.join(unicode(field) for field in fields))
            out_f.write(u'\n')

In [26]:
# Load a single month (January 2016) of annotated articles to experiment with.
tc_data = load_tc_data(2016, 1)

In [45]:
# Write the per-article counts for the month loaded above.
# The output file is opened in append mode, so re-running this cell (or running
# the full 2009-2016 loop afterwards) adds duplicate rows for these articles.
get_mentions_quotes(tc_data, 'tc_data_counts.tsv')

In [53]:
for year in range(2009, 2017):
    for month in range(1, 13):
        if year == 2016 and month > 7:
            break
        print time.ctime(), "Loading data for {}/{}".format(year, month)
        tc_data = load_tc_data(year, month)
        print time.ctime(), "Analyzing data ..."
        get_mentions_quotes(tc_data, 'tc_data_counts.tsv')

Thu Dec  8 12:03:09 2016 Loading data for 2009/1
Thu Dec  8 12:03:26 2016 Analyzing data ...
Thu Dec  8 12:03:50 2016 Loading data for 2009/2
Thu Dec  8 12:04:12 2016 Analyzing data ...
Thu Dec  8 12:04:34 2016 Loading data for 2009/3
Thu Dec  8 12:04:55 2016 Analyzing data ...
Thu Dec  8 12:05:18 2016 Loading data for 2009/4
Thu Dec  8 12:05:44 2016 Analyzing data ...
Thu Dec  8 12:06:09 2016 Loading data for 2009/5
Thu Dec  8 12:06:33 2016 Analyzing data ...
Thu Dec  8 12:06:57 2016 Loading data for 2009/6
Thu Dec  8 12:07:25 2016 Analyzing data ...
Thu Dec  8 12:07:51 2016 Loading data for 2009/7
Thu Dec  8 12:08:17 2016 Analyzing data ...
Thu Dec  8 12:08:45 2016 Loading data for 2009/8
Thu Dec  8 12:09:12 2016 Analyzing data ...
Thu Dec  8 12:09:39 2016 Loading data for 2009/9
Thu Dec  8 12:10:10 2016 Analyzing data ...
Thu Dec  8 12:10:36 2016 Loading data for 2009/10
Thu Dec  8 12:11:03 2016 Analyzing data ...
Thu Dec  8 12:11:30 2016 Loading data for 2009/11
Thu Dec  8 12:12:01

KeyboardInterrupt: 

In [19]:
MIN_NUM = 11
MAX_NUM = MIN_NUM + 10
curr_idx = 0
for link, values in tc_data.iteritems():
    curr_idx += 1
    if curr_idx < MIN_NUM:
        continue
    if curr_idx > MAX_NUM:
        break

    # if link != 'https://techcrunch.com/2016/01/05/intel-says-button-sized-curie-will-ship-in-q1-costing-under-10/':
    #    continue
    data = values['data']
    corenlp = values['corenlp']
    print link
    corefs = corenlp['corefs']
    sentences = corenlp['sentences']
    # pprint(corefs)
    # pprint(sentences[5])
    # print sentences[0].keys()
    # pprint(sentences[5]['tokens'])
    pm = get_people_mentioned(sentences, corefs, include_gender=True)
    sources = get_sources(pm, sentences, corefs)
    print pm
    print {k: len(v) for k, v in sources.iteritems()}
    #for c_id, coref in corefs.iteritems():
    #    process_coref_chain(c_id, coref)

https://techcrunch.com/2016/01/14/nasa-receives-patent-for-a-new-type-of-squishy-amorphous-robot/
{u'Imagecourtesy': (5, (None, None)), u'Arthur Bradley': (1, (u'MALE', 'COREF'))}
{u'Imagecourtesy': 0, u'Arthur Bradley': 1}
https://techcrunch.com/2016/01/20/apple-releases-music-memos-a-recorder-app-for-musicians/
{}
{}
https://techcrunch.com/2016/01/16/notify-nearby-launch/
{u'Nevin Jethmalani': (3, (u'MALE', 'COREF')), u'Levi': (1, ('male', 'NAME_ONLY'))}
{u'Nevin Jethmalani': 84, u'Levi': 0}
https://techcrunch.com/2016/01/27/watch-microsoft-ventures-london-accelerator-right-here-2/
{}
{}
https://techcrunch.com/2016/01/18/why-big-companies-keep-failing-the-stack-fallacy/
{u'Anshu Sharma': (1, ('male', 'NAME_ONLY')), u'Larry Ellison': (1, (u'MALE', 'COREF'))}
{u'Anshu Sharma': 0, u'Larry Ellison': 0}
https://techcrunch.com/2016/01/26/ipo-slowdown-a-look-at-company-profitability/
{u'Ben Narasin': (1, ('male', 'NAME_ONLY')), u'Jeremy': (1, ('male', 'NAME_ONLY'))}
{u'Ben Narasin': 0, u'Je