In [1]:
from ipywidgets import FloatProgress, IntText
import time
import collections
import re
import pickle
import sys, os
import numpy as np
import pandas as pd
from math import log

print("Python version + compiler & build info:", sys.version)
print("Jupyter notebook version:", os.popen("jupyter notebook --version").readlines()[0])

Python version + compiler & build info: 3.6.5 |Anaconda custom (64-bit)| (default, Mar 29 2018, 18:21:58) 
[GCC 7.2.0]
Jupyter notebook version: 5.4.1



In [3]:
with open('data/is_male.pkl', 'rb') as infile:
    is_male = pickle.load(infile)

def save_object(obj, filename):
    with open(filename, 'wb') as outfile:  
        pickle.dump(obj, outfile, pickle.HIGHEST_PROTOCOL)

# Getting bigrams from Wiki biographies, according to gender and category
Categories: artists, monarchs, programming, scientist, sports

In [3]:
def get_bigrams(text):
    return zip(*[text[i:] for i in range(2)])

In [10]:
fp = FloatProgress(min=0, max=850399)
cc = IntText(value=0, description="Categories")
display(cc, fp)
article_count = 0
category_counts = None

matcher = re.compile("\[\[Category:([^\]]*)", re.IGNORECASE)

category_to_articles = collections.defaultdict(list)

with open("data/gendered-labeled-articles", 'r') as f:
    for line in f:
        article_count += 1
        if article_count % 1000 == 0:
            fp.value = article_count
            cc.value = len(category_to_articles)
        article_id = int(line.split(" ", 1)[0])
        for category in matcher.findall(line):
            category_to_articles[category].append(article_id)

IntText(value=0, description='Categories')

FloatProgress(value=0.0, max=850399.0)

In [5]:
{k for k in list(category_to_articles)[:15]}

{'1809 births',
 '1865 deaths',
 '19th-century American politicians',
 '19th-century Christians',
 'Abraham Lincoln| ',
 'American Christians',
 'American people of English descent',
 'American postmasters',
 'American surveyors',
 'Assassinated Presidents of the United States',
 'Assassinated heads of state',
 'Burials at Oak Ridge Cemetery',
 'Deaths by firearm in Washington, D.C.',
 'Hall of Fame for Great Americans inductees',
 'Illinois Central Railroad people'}

In [11]:
topic_to_articles = collections.defaultdict(set)
topic_patterns = {
    "programming": re.compile("software|programmer"),
    "sports": re.compile("athlete|sportspeople"),
    "artists": re.compile("artist"),
    "scientist": re.compile("scientist"),
#     "fictional": re.compile("fictional"),
    "monarch": re.compile("king|queen|monarch|royal|prince|princess"),
}

for k,v in category_to_articles.items():
    for topic, pattern in topic_patterns.items():
        if pattern.search(k):
            topic_to_articles[topic].update(v)
            
[(k, len(v)) for (k,v) in topic_to_articles.items()]

[('sports', 22673),
 ('monarch', 12644),
 ('scientist', 10384),
 ('artists', 30265),
 ('programming', 1047)]

In [7]:
# Note there is some overlap between categories
len(topic_to_articles['monarch'].intersection(topic_to_articles['sports']))

214

In [69]:
# Create set of all article ids from all 5 categories
all_ids = set.union(*[v for k,v in topic_to_articles.items()])
len(all_ids)

75953

In [20]:
master_dict = {
    'male_sports': collections.Counter(), 'female_sports': collections.Counter(),
    'male_monarch': collections.Counter(), 'female_monarch': collections.Counter(),
    'male_artists': collections.Counter(), 'female_artists': collections.Counter(),
    'male_scientist': collections.Counter(), 
    'female_scientist': collections.Counter(),
    'male_programming': collections.Counter(), 
    'female_programming': collections.Counter() }
cats = ['sports', 'monarch', 'artists', 'scientist', 'programming']
cat_count = {'male_sports': 0, 'female_sports': 0, 'male_monarch': 0, 
             'female_monarch': 0, 'male_artists': 0, 'female_artists': 0, 
             'male_scientist': 0, 'female_scientist': 0, 'male_programming': 0, 
             'female_programming': 0}

fp = FloatProgress(min=0, max=850399)
all_articles = IntText(value=0, description="Article Count:")
display(fp, all_articles)
article_count = 0
start = time.time()
try:
    with open("data/gendered-labeled-articles.stripped", 'r') as f:
        for line in f:
            article_count += 1
            all_articles.value += 1
            if article_count % 1000 == 0:
                fp.value = article_count
            line = line.split()
            art_id = line.pop(0)
            if int(art_id) in all_ids:
                male = is_male[art_id]
                line = [line[i-1 : i+99] for i in range(1, len(line), 99)]
                if male:
                    for c in cats:
                        if int(art_id) in topic_to_articles[c]:
                            cat_count["male_" + c] += 1
                else:
                    for c in cats:
                        if int(art_id) in topic_to_articles[c]:
                            cat_count["female_" + c] += 1
                while line:
                    bigrams = list(get_bigrams(line.pop(0)))
                    for c in cats:
                        if int(art_id) in topic_to_articles[c]:
                            if male:
                                master_dict["male_" + c].update(bigrams)
                            else:
                                master_dict["female_" + c].update(bigrams)
            # save male bigram counters in batches to save memory
            if article_count % 50000 == 0:
                for c in cats[:4]:
                    save_object(master_dict["male_" + c], 
                                'data/bigrams/male_{}_{}.pkl'.format(c, article_count//50000))
                    master_dict["male_" + c] = collections.Counter()
    # last iteration:
    for c in cats[:4]:
        save_object(master_dict["male_" + c], 
                    'data/bigrams/male_{}_{}.pkl'.format(c, article_count//50000 + 1))
        master_dict["male_" + c] = collections.Counter()
finally:
    print("Articles:", article_count, " Time:", (time.time() - start), "seconds")


FloatProgress(value=0.0, max=850399.0)

IntText(value=0, description='Article Count:')

Articles: 850399  Time: 1042.0519289970398 seconds


In [21]:
# save all
save_object(master_dict["male_programming"], 'data/bigrams/male_programming.pkl')

for c in cats:
    save_object(master_dict["female_" + c], 'data/bigrams/female_{}.pkl'.format(c))

In [22]:
[(k, v) for (k,v) in cat_count.items()]

[('male_sports', 17528),
 ('female_sports', 5145),
 ('male_monarch', 10120),
 ('female_monarch', 2524),
 ('male_artists', 21781),
 ('female_artists', 8484),
 ('male_scientist', 8438),
 ('female_scientist', 1946),
 ('male_programming', 995),
 ('female_programming', 52)]

In [23]:
# bigram counters still in kernel
[(k, len(v)) for (k,v) in master_dict.items()]

[('male_sports', 0),
 ('female_sports', 982573),
 ('male_monarch', 0),
 ('female_monarch', 1299352),
 ('male_artists', 0),
 ('female_artists', 3854438),
 ('male_scientist', 0),
 ('female_scientist', 956458),
 ('male_programming', 551059),
 ('female_programming', 54269)]

In [25]:
# Create master counters of bigrams for male categories 
# of sports, monarch, scientist, artists:

# Sports
fp = FloatProgress(min=0, max=18)
count = IntText(value=0, description="Count:")
display(fp, count)
male_sports = collections.Counter()
start = time.time()
for i in range(1, 19):
    fp.value += 1
    count.value += 1
    with open('data/bigrams/male_sports_{}.pkl'.format(i), 'rb') as infile:
        male_sports += pickle.load(infile)
for k in set(k for k,v in male_sports.items() if v <= 2):
    del male_sports[k]  
save_object(male_sports, 'data/bigrams/male_sports.pkl')
print("Time:", (time.time() - start), "seconds")

FloatProgress(value=0.0, max=18.0)

IntText(value=0, description='Count:')

Time: 30.27496027946472 seconds


In [26]:
print("Deleting temporary batches...")
for i in range(1, 19):
    os.remove('data/bigrams/male_sports_{}.pkl'.format(i))
print("Done.")

Deleting temporary batches...
Done.


In [27]:
# Monarch
fp = FloatProgress(min=0, max=18)
count = IntText(value=0, description="Count:")
display(fp, count)
male_monarch = collections.Counter()
start = time.time()
for i in range(1, 19):
    fp.value += 1
    count.value += 1
    with open('data/bigrams/male_monarch_{}.pkl'.format(i), 'rb') as infile:
        male_monarch += pickle.load(infile)
for k in set(k for k,v in male_monarch.items() if v <= 2):
    del male_monarch[k]  
save_object(male_monarch, 'data/bigrams/male_monarch.pkl')
print("Time:", (time.time() - start), "seconds")

FloatProgress(value=0.0, max=18.0)

IntText(value=0, description='Count:')

Time: 34.27636647224426 seconds


In [29]:
print("Deleting temporary batches...")
for i in range(1, 19):
    os.remove('data/bigrams/male_monarch_{}.pkl'.format(i))
print("Done.")

Deleting temporary batches...
Done.


In [30]:
# Scientist
fp = FloatProgress(min=0, max=18)
count = IntText(value=0, description="Count:")
display(fp, count)
male_scientist = collections.Counter()
start = time.time()
for i in range(1, 19):
    fp.value += 1
    count.value += 1
    with open('data/bigrams/male_scientist_{}.pkl'.format(i), 'rb') as infile:
        male_scientist += pickle.load(infile)
for k in set(k for k,v in male_scientist.items() if v <= 2):
    del male_scientist[k]  
save_object(male_scientist, 'data/bigrams/male_scientist.pkl')
print("Time:", (time.time() - start), "seconds")

FloatProgress(value=0.0, max=18.0)

IntText(value=0, description='Count:')

Time: 26.785165071487427 seconds


In [31]:
print("Deleting temporary batches...")
for i in range(1, 19):
    os.remove('data/bigrams/male_scientist_{}.pkl'.format(i))
print("Done.")

Deleting temporary batches...
Done.


In [4]:
# Artist
fp = FloatProgress(min=0, max=18)
count = IntText(value=0, description="Count:")
display(fp, count)
male_artists = collections.Counter()
start = time.time()
for i in range(1, 19):
    fp.value += 1
    count.value += 1
    with open('data/bigrams/male_artists_{}.pkl'.format(i), 'rb') as infile:
        male_artists += pickle.load(infile)
for k in set(k for k,v in male_artists.items() if v <= 3):
    del male_artists[k]  
save_object(male_artists, 'data/bigrams/male_artists.pkl')
print("Time:", (time.time() - start), "seconds")

FloatProgress(value=0.0, max=18.0)

IntText(value=0, description='Count:')

Time: 52.9793004989624 seconds


In [5]:
print("Deleting temporary batches...")
for i in range(1, 19):
    os.remove('data/bigrams/male_artists_{}.pkl'.format(i))
print("Done.")

Deleting temporary batches...
Done.


In [8]:
# The missing values from above
print("Number of bigrams in men's bios, sports category:", sum(male_sports.values()))
print("Number of bigrams in men's bios, monarch category:", sum(male_monarch.values()))
print("Number of bigrams in men's bios, scientist category:", sum(male_scientist.values()))
print("Number of bigrams in men's bios, artists category:", sum(male_artists.values()))

Number of bigrams in men's bios, sports category: 21443336
Number of bigrams in men's bios, monarch category: 14567320
Number of bigrams in men's bios, scientist category: 9049407
Number of bigrams in men's bios, artists category: 29899078


In [7]:
# To load the data back in:

with open('data/bigrams/female_sports.pkl', 'rb') as infile:
    female_sports = pickle.load(infile)
with open('data/bigrams/female_monarch.pkl', 'rb') as infile:
    female_monarch = pickle.load(infile)
with open('data/bigrams/female_scientist.pkl', 'rb') as infile:
    female_scientist = pickle.load(infile)
with open('data/bigrams/female_artists.pkl', 'rb') as infile:
    female_artists = pickle.load(infile)
with open('data/bigrams/female_programming.pkl', 'rb') as infile:
    female_programming = pickle.load(infile)
with open('data/bigrams/male_programming.pkl', 'rb') as infile:
    male_programming = pickle.load(infile)
with open('data/bigrams/male_scientist.pkl', 'rb') as infile:
    male_scientist = pickle.load(infile)
with open('data/bigrams/male_sports.pkl', 'rb') as infile:
    male_sports = pickle.load(infile)
with open('data/bigrams/male_artists.pkl', 'rb') as infile:
    male_artists = pickle.load(infile)
with open('data/bigrams/male_monarch.pkl', 'rb') as infile:
    male_monarch = pickle.load(infile)

In [50]:
cols = [(k, len(v)) for (k,v) in topic_to_articles.items()]
# cols.insert(0, ("All categories", 848989))

df = pd.DataFrame(cols, columns=["Category", "All bios"])
df

Unnamed: 0,Category,All bios
0,sports,22673
1,monarch,12644
2,scientist,10384
3,artists,30265
4,programming,1047


In [51]:
# Men
men_cats = [(k,v) for (k,v) in cat_count.items()][0::2]
# men_cats.insert(0, ('all biographies', 718413))
men_cats

[('male_sports', 17528),
 ('male_monarch', 10120),
 ('male_artists', 21781),
 ('male_scientist', 8438),
 ('male_programming', 995)]

In [52]:
# Women
women_cats = [(k, v) for (k,v) in cat_count.items()][1::2]
# women_cats.insert(0, ('all biographies', 130576))
women_cats

[('female_sports', 5145),
 ('female_monarch', 2524),
 ('female_artists', 8484),
 ('female_scientist', 1946),
 ('female_programming', 52)]

In [53]:
# Ensuring correct order
df["Men"] = [men_cats[i][1] for i in [0, 1, 3, 2, 4]]
df["Women"] = [women_cats[i][1] for i in [0, 1, 3, 2, 4]]
df["% Women"] = round(df["Women"] / df["All bios"] * 100,1)
df.sort_values(by="% Women", inplace=True, ascending=False)
df.reset_index(drop=True, inplace=True)

In [56]:
toprow = [("All categories", 848989, 718413, 130576, round(130576/848989*100,1))]
pd.concat([pd.DataFrame(toprow, columns = list(df)), df], ignore_index=True) 

Unnamed: 0,Category,All bios,Men,Women,% Women
0,All categories,848989,718413,130576,15.4
1,artists,30265,21781,8484,28.0
2,sports,22673,17528,5145,22.7
3,monarch,12644,10120,2524,20.0
4,scientist,10384,8438,1946,18.7
5,programming,1047,995,52,5.0


# Pointwise Mutual Information

In [57]:
class PMICalculator:
    
    def __init__(self, counter1, counter2, n1, n2):
        self.counter1 = counter1
        self.counter2 = counter2
        self.n1 = n1
        self.n2 = n2
        self.lpc1 = log(n1 / (n1 + n2))

    def pmi(self, word):
        try:
            return (log(self.counter1[word]) - 
            self.lpc1 -
            log(self.counter1[word] + self.counter2[word]))
        except ValueError:
            return 0

In [58]:
pc_sports = PMICalculator(female_sports, male_sports, 5145, 17528)
pc_monarch = PMICalculator(female_monarch, male_monarch, 2524, 10120)
pc_scientist = PMICalculator(female_scientist, male_scientist, 1946, 8438)
pc_artists = PMICalculator(female_artists, male_artists, 8484, 21781)
pc_programming = PMICalculator(female_programming, male_programming, 52, 995)

## Sports

In [60]:
sports_bigrams = male_sports + female_sports
top_sports = sports_bigrams.most_common(20000)
top_sports_with_pmi = [(p[0], p[1], pc_sports.pmi(p[0])) for p in top_sports]
top_sports_with_pmi.sort(key = lambda x: x[-1])

In [33]:
pc_sports.pmi(("male", "athlete"))

-3.3330917361095396

In [34]:
pc_sports.pmi(("female", "athlete"))

1.460421168880937

In [36]:
sports_bigrams[("male", "athlete")]

247

In [37]:
sports_bigrams[("female", "athlete")]

356

In [61]:
# Men
top_sports_with_pmi[:50]

[(('town', 'f'), 4762, -6.985273607088316),
 (('men', 'footer'), 3020, -6.529862690410423),
 (('he', 'became'), 2847, -6.470871667319544),
 (('championships', 'men'), 8485, -6.464293469041724),
 (('league', 'one'), 2683, -6.411541430467131),
 (('he', 'did'), 2153, -6.19146807740587),
 (('cfb', 'yearly'), 2126, -6.1788481389434),
 (('year', 'he'), 2079, -6.15649286789952),
 (('his', 'personal'), 1957, -6.096018547437583),
 (('united', 'f'), 7706, -6.08031074410772),
 (('he', 'played'), 7609, -6.06764325509814),
 (('he', 'won'), 9484, -6.064774114792703),
 (('paralympics', 'men'), 1836, -6.032195151221943),
 (('la', 'liga'), 3628, -6.0201402107165904),
 (('athletics', 'men'), 6987, -5.9823627774740675),
 (('ham', 'united'), 1736, -5.976189475261803),
 (('ref', 'his'), 3413, -5.959050348388269),
 (('he', 'started'), 1701, -5.955822172437369),
 (('he', 'went'), 1615, -5.903940815698264),
 (('his', 'first'), 14486, -5.90056394746056),
 (('games', 'he'), 1590, -5.8883398752557845),
 (('won',

In [62]:
# Women
top_sports_with_pmi[-50:]

[(('english', 'female'), 219, 1.483149419958493),
 (('the', 'lpga'), 219, 1.483149419958493),
 (('cup', 'women'), 217, 1.483149419958493),
 (('events', 'she'), 214, 1.483149419958493),
 (('she', 'beat'), 214, 1.483149419958493),
 (('open', 'she'), 211, 1.483149419958493),
 (('australian', 'female'), 209, 1.483149419958493),
 (('she', 'participated'), 209, 1.483149419958493),
 (('york', 'flash'), 209, 1.483149419958493),
 (('blue', 'fc'), 207, 1.483149419958493),
 (('she', 'graduated'), 204, 1.483149419958493),
 (('female', 'tennis'), 204, 1.483149419958493),
 (('she', 'improved'), 201, 1.483149419958493),
 (('marathon', 'she'), 201, 1.483149419958493),
 (('reign', 'fc'), 194, 1.483149419958493),
 (('2014', 'she'), 193, 1.483149419958493),
 (('2004', 'she'), 193, 1.483149419958493),
 (('began', 'her'), 190, 1.483149419958493),
 (('her', 'heat'), 190, 1.483149419958493),
 (('athletics', 'she'), 189, 1.483149419958493),
 (('results', 'women'), 188, 1.483149419958493),
 (('she', 'retired')

## Monarch

In [63]:
monarch_bigrams = male_monarch + female_monarch
top_monarch = monarch_bigrams.most_common(20000)
top_monarch_with_pmi = [(p[0], p[1], pc_monarch.pmi(p[0])) for p in top_monarch]
top_monarch_with_pmi.sort(key = lambda x: x[-1])

In [64]:
# Men
top_monarch_with_pmi[:30]

# I THINK THERE IS AN ERROR HERE, as the bigrams seem to indicate the sports category

[(('american', 'football'), 4467, -6.793134471677714),
 (('united', 'f'), 3335, -6.5008901086753195),
 (('cricket', 'club'), 2072, -6.024931753704969),
 (('football', 'league'), 5673, -5.933523218984052),
 (('county', 'cricket'), 1720, -5.838741720133094),
 (('english', 'football'), 1683, -5.8169953445164015),
 (('league', 'team'), 1502, -5.703214982649675),
 (('league', 'players'), 1435, -5.6575822785193175),
 (('hi', 'football'), 1412, -5.641424568378783),
 (('national', 'league'), 1350, -5.596522021758071),
 (('0', 'colspan'), 1328, -5.5800914803619746),
 (('sport', 'date'), 1296, -5.555700027237815),
 (('com', 'players'), 1285, -5.547176147654916),
 (('he', 'played'), 3389, -5.4183400332645375),
 (('f', 'c'), 23118, -5.2589751338318615),
 (('free', 'agent'), 857, -5.142100068923376),
 (('football', 'teams'), 853, -5.137421697817275),
 (('major', 'league'), 1619, -5.085078923443286),
 (('league', 'baseball'), 1581, -5.061327806975123),
 (('league', 'season'), 736, -4.989892269054472

In [70]:
top_monarch_with_pmi[30:100]

[(('list', 'a'), 1032, -4.634768915807158),
 (('footballer', 'who'), 509, -4.621110166876118),
 (('season', 'he'), 1003, -4.606265757727586),
 (('team', 'all'), 998, -4.601268246077114),
 (('his', 'debut'), 961, -4.563489378735943),
 (('he', 'finished'), 478, -4.558272882817052),
 (('appearances', 'for'), 462, -4.524227041407334),
 (('football', 'player'), 452, -4.502344330157827),
 (('football', 'footballer'), 446, -4.48898110234566),
 (('he', 'graduated'), 427, -4.44544616355422),
 (('he', 'studied'), 426, -4.4431014965949664),
 (('season', 'with'), 838, -4.426533070247733),
 (('first', 'team'), 1579, -4.366914803461347),
 (('uk', 'sport1'), 1552, -4.349667489948869),
 (('all', 'american'), 776, -4.349667489948869),
 (('league', 'championship'), 388, -4.349667489948869),
 (('sport1', 'hi'), 1549, -4.347732629622573),
 (('football', 'eng'), 384, -4.3393047029133225),
 (('second', 'team'), 377, -4.3209073377736065),
 (('http', 'sports'), 370, -4.302165155963865),
 (('season', 'as'), 36

In [65]:
# Women
top_monarch_with_pmi[-30:]

[(('hedvig', 'taube'), 117, 1.576551733588989),
 (('throughout', 'her'), 118, 1.576851673603235),
 (('diana', 's'), 125, 1.5788146579688442),
 (('best', 'actress'), 218, 1.5834290615573279),
 (('danish', 'princesses'), 161, 1.586179290038249),
 (('joan', 'sutherland'), 125, 1.5870451571053596),
 (('actresses', 'category'), 384, 1.5902844404765721),
 (('she', 'appeared'), 192, 1.5902844404765721),
 (('harvnb', 'chibnall'), 145, 1.590431164855091),
 (('weir', '2008'), 157, 1.5920446467397262),
 (('ref', 'porter'), 161, 1.5925285177169082),
 (('dame', 'grand'), 375, 1.5952084677445209),
 (('her', 'spouse'), 328, 1.5990677570825893),
 (('women', 'writers'), 192, 1.6113378496744044),
 (('she', 'attended'), 186, 1.6113378496744044),
 (('chibnall', '1991'), 151, 1.6113378496744044),
 (('princess', 'diana'), 145, 1.6113378496744044),
 (('der', 'maur'), 143, 1.6113378496744044),
 (('martha', 'stewart'), 139, 1.6113378496744044),
 (('valide', 'sultan'), 139, 1.6113378496744044),
 (('category', '

## Scientist

In [14]:
scientist_bigrams = male_scientist + female_scientist
top_scientist = scientist_bigrams.most_common(20000)
top_scientist_with_pmi = [(p[0], p[1], pc_scientist.pmi(p[0])) for p in top_scientist]
top_scientist_with_pmi.sort(key = lambda x: x[-1])

In [18]:
# Men
top_scientist_with_pmi[:30]

In [38]:
# Women
top_scientist_with_pmi[-30:]

In [57]:
scientist_bigrams[("women", "scientists")]

2274

In [58]:
scientist_bigrams[("men", "scientists")]

0

In [59]:
scientist_bigrams[("female", "scientist")]

15

In [66]:
print(pc_scientist.pmi(("first", "person")))

0.30688794809018294


## Artists

In [20]:
artists_bigrams = male_artists + female_artists
top_artists = artists_bigrams.most_common(20000)
top_artists_with_pmi = [(p[0], p[1], pc_artists.pmi(p[0])) for p in top_artists]
top_artists_with_pmi.sort(key = lambda x: x[-1])

In [28]:
# Men
top_artists_with_pmi[:30]

In [25]:
# Women
top_artists_with_pmi[-30:]

## Programming

In [67]:
programming_bigrams = male_programming + female_programming
top_programming = programming_bigrams.most_common(20000)
top_programming_with_pmi = [(p[0], p[1], pc_programming.pmi(p[0])) for p in top_programming]
top_programming_with_pmi.sort(key = lambda x: x[-1])

In [52]:
# Men
top_programming_with_pmi[:30]

[(('which', 'he'), 314, -2.7469524936191436),
 (('where', 'he'), 613, -2.7227772630871567),
 (('he', 'was'), 1619, -2.595363992160471),
 (('his', 'first'), 253, -2.530948996438411),
 (('his', 'work'), 248, -2.510988253875873),
 (('archiveurl', 'http'), 187, -2.2286681245654774),
 (('businesspeople', 'in'), 177, -2.1737092402847193),
 (('in', 'may'), 166, -2.109547296067434),
 (('acquired', 'by'), 165, -2.103504981611471),
 (('chairman', 'of'), 163, -2.091309708517653),
 (('webcitation', 'org'), 159, -2.066463709931122),
 (('apple', 'ii'), 159, -2.066463709931122),
 (('www', 'webcitation'), 158, -2.0601545407378574),
 (('the', 'apple'), 158, -2.0601545407378574),
 (('operating', 'system'), 297, -1.998144465953645),
 (('software', 'category'), 145, -1.9742932501314647),
 (('apple', 'inc'), 139, -1.932033440841582),
 (('nbsp', 'million'), 138, -1.9248131928680956),
 (('publisher', 'ref'), 136, -1.9102143934469429),
 (('on', 'his'), 262, -1.8727568309120417),
 (('df', 'mdy'), 131, -1.87275

In [68]:
# Women
top_programming_with_pmi[-30:]

[(('kay', 'mcnulty'), 12, 3.0024404922891095),
 (('abdul', 'karim'), 12, 3.0024404922891095),
 (('mayer', 'was'), 12, 3.0024404922891095),
 (('stormy', 'peters'), 12, 3.0024404922891095),
 (('new', 'justice'), 12, 3.0024404922891095),
 (('category', 'women'), 67, 3.00244049228911),
 (('ada', 'lovelace'), 62, 3.00244049228911),
 (('mm', 'b'), 31, 3.00244049228911),
 (('of', 'lovelace'), 18, 3.00244049228911),
 (('lord', 'byron'), 18, 3.00244049228911),
 (('women', 'scientists'), 18, 3.00244049228911),
 (('justice', 'minister'), 18, 3.00244049228911),
 (('to', 'governability'), 18, 3.00244049228911),
 (('lovelace', 's'), 17, 3.00244049228911),
 (('last', 'conway'), 17, 3.00244049228911),
 (('smita', 'bellur'), 17, 3.00244049228911),
 (('1972', 'p'), 10, 3.00244049228911),
 (('hopper', 's'), 10, 3.00244049228911),
 (('center', 'center'), 10, 3.00244049228911),
 (('transsexual', 'women'), 10, 3.00244049228911),
 (('gender', 'identity'), 10, 3.00244049228911),
 (('maxwell', 'medal'), 10, 3.

In [71]:
# Unigram categories:

master_dict = {
    'male_sports_u': collections.Counter(), 
    'female_sports_u': collections.Counter(),
    'male_monarch_u': collections.Counter(), 
    'female_monarch_u': collections.Counter(),
    'male_artists_u': collections.Counter(), 
    'female_artists_u': collections.Counter(),
    'male_scientist_u': collections.Counter(), 
    'female_scientist_u': collections.Counter(),
    'male_programming_u': collections.Counter(), 
    'female_programming_u': collections.Counter() }
cats = ['sports', 'monarch', 'artists', 'scientist', 'programming']

fp = FloatProgress(min=0, max=850399)
all_articles = IntText(value=0, description="Article Count:")
display(fp, all_articles)
article_count = 0
start = time.time()

try:
    with open("data/gendered-labeled-articles.stripped", 'r') as f:
        for line in f:
            article_count += 1
            all_articles.value += 1
            if article_count % 1000 == 0:
                fp.value = article_count
            words = line.split()
            art_id = words.pop(0)
            if int(art_id) in all_ids:
                male = is_male[art_id]
                for c in cats:
                    if int(art_id) in topic_to_articles[c]:
                        if male:
                            master_dict["male_" + c + "_u"].update(words)
                        else:
                            master_dict["female_" + c + "_u"].update(words)
finally:
    print("Articles:", article_count, " Time:", (time.time() - start), "seconds")

FloatProgress(value=0.0, max=850399.0)

IntText(value=0, description='Article Count:')

In [73]:
#SAVE
for c in cats:
    save_object(master_dict["female_" + c + "_u"], 
                'data/bigrams/female_{}_u.pkl'.format(c))
    save_object(master_dict["male_" + c + "_u"], 
                'data/bigrams/male_{}_u.pkl'.format(c))

In [48]:
sum(master_dict["male_scientist_u"].values())

12296493

In [6]:
sum(master_dict["female_scientist_u"].values())

2840901

In [8]:
sum(master_dict["male_artists_u"].values())

38561387

In [9]:
sum(master_dict["female_artists_u"].values())

16363597

In [30]:
pc_artists2 = PMICalculator(master_dict["female_artists_u"], 
                            master_dict["male_artists_u"], 8484, 21781)

In [34]:
pc_scientist2 = PMICalculator(master_dict["female_scientist_u"], 
                              master_dict["male_scientist_u"], 1946, 8438)

In [74]:
pc_monarch2 = PMICalculator(master_dict["female_monarch_u"], 
                              master_dict["male_monarch_u"], 2524, 10120)

In [75]:
monarch_unigrams = master_dict["male_monarch_u"] + master_dict["female_monarch_u"]
top_monarch2 = monarch_unigrams.most_common(20000)
top_monarch2_with_pmi = [(p[0], p[1], pc_monarch2.pmi(p[0])) for p in top_monarch2]
top_monarch2_with_pmi.sort(key = lambda x: x[-1])
# Men
top_monarch2_with_pmi[:50]

[('nfl', 14983, -7.310186473808795),
 ('wigan', 2124, -6.049718532687425),
 ('quarterback', 1959, -5.968851568270137),
 ('batting', 1459, -5.674168698848381),
 ('rovers', 1390, -5.625721176450333),
 ('rams', 951, -5.246176212870986),
 ('tampa', 801, -5.074523097393955),
 ('rookie', 801, -5.074523097393955),
 ('striker', 761, -5.023295508187282),
 ('innings', 732, -4.984442664286907),
 ('touchdown', 1455, -4.9782761493712435),
 ('interception', 702, -4.942595554351406),
 ('brentford', 691, -4.926801974093266),
 ('giants', 1314, -4.876346168810206),
 ('bengals', 622, -4.821602243064775),
 ('goryeo', 615, -4.810284418132113),
 ('packers', 1223, -4.804577105452823),
 ('tackles', 1826, -4.799932922812399),
 ('ravens', 563, -4.721941778465286),
 ('goals6', 558, -4.713021112706906),
 ('arsenal', 532, -4.66530563966724),
 ('100s', 508, -4.619143597904078),
 ('cricket', 6999, -4.603127381399876),
 ('cincinnati', 1499, -4.602603359760083),
 ('behr', 496, -4.5952380770505235),
 ('nero', 489, -4.5

In [76]:
# Women
top_monarch2_with_pmi[-50:]

[('mensch', 148, 1.5699526335115506),
 ('wahlström', 128, 1.5714919411272046),
 ('widstrand', 128, 1.5714919411272046),
 ('aubyn', 104, 1.5721171365211237),
 ('hämtad', 115, 1.5759359226234881),
 ('platten', 90, 1.577436297998723),
 ('azaria', 66, 1.5805661910076507),
 ('conroy', 169, 1.5813055625755297),
 ('mindy', 107, 1.5828999143538711),
 ('woodham', 73, 1.583558285567329),
 ('permaisuri', 159, 1.5858587643734197),
 ('plowden', 87, 1.5880809875101374),
 ('brava', 135, 1.5888649938223454),
 ('ampuan', 97, 1.5905037627715624),
 ('sridevi', 297, 1.5909289780431974),
 ('mirabella', 99, 1.5909289780431974),
 ('falle', 120, 1.5945307313580237),
 ('valide', 188, 1.5952517119227796),
 ('gelardi', 188, 1.5952517119227796),
 ('tamannaah', 128, 1.5955894927062655),
 ('kvindebiografisk', 66, 1.596070377543616),
 ('brunhilda', 66, 1.596070377543616),
 ('kunglig', 67, 1.5962999723098639),
 ('titmuss', 74, 1.5977321976186252),
 ('devyani', 102, 1.6014855532313934),
 ('christabel', 104, 1.60167593