In [163]:
from ipywidgets import FloatProgress, IntText
import nltk
import time
import collections
from math import log 
import pickle
import os
import numpy as np
import pandas as pd

def save_object(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialized with the highest pickle protocol available.
    """
    with open(filename, 'wb') as sink:
        pickler = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)

with open('data/is_male.pkl', 'rb') as infile:
    is_male = pickle.load(infile)

# Create Counters of Bigrams in Biographies of Women and Men, in Batches

In [3]:
# Pair each item with its successor by zipping the sequence against itself shifted by one.
def get_bigrams(text):
    """Return a lazy iterator over the adjacent pairs of ``text``.

    ``text`` must support slicing (e.g. a list of tokens). A sequence with
    fewer than two items yields no pairs.
    """
    current, following = text[0:], text[1:]
    return zip(current, following)

In [5]:
# This cell takes about 46 minutes to run.
#
# Streams the corpus one line at a time. The first whitespace-separated token of each
# line is looked up in `is_male`; the remaining tokens are turned into adjacent-token
# bigrams and counted into the male or the female Counter. Counters are flushed to
# numbered pickle files at fixed article intervals so memory stays bounded.

male_bigrams = collections.Counter()
female_bigrams = collections.Counter()

print("Building batches of counters for bigrams found in all Wiki bios of women and of men...")
print("Estimated time for completion: 46 minutes")

fp = FloatProgress(min=0, max=850399)
all_articles = IntText(value=0, description="Article Count:")
display(fp, all_articles)
article_count = 0

# exist_ok makes this safe to re-run without a separate existence check.
os.makedirs('data/bigrams', exist_ok=True)

start = time.time()
try:
    with open("data/gendered-labeled-articles.stripped", 'r') as f:
        for line in f:
            article_count += 1
            tokens = line.split()
            # A blank line has no leading key token; it is still counted as a line
            # (so batch numbering is unchanged) but contributes no bigrams instead of
            # crashing the run.
            if tokens:
                target = male_bigrams if is_male[tokens[0]] else female_bigrams
                # get_bigrams is lazy, so the pairs are counted without being
                # materialized. This yields exactly the pairs the former
                # 100-word overlapping slices produced.
                target.update(get_bigrams(tokens[1:]))
            # Pickle results after every 1000 articles for men; 3000 for women (since we are using
            # a total article count, and we know there are fewer bios of women). These numbers seem
            # to be about right to prevent memory errors.
            if article_count % 1000 == 0:
                # Widgets are refreshed only at checkpoints; updating them on every
                # line sends one frontend message per article.
                fp.value = article_count
                all_articles.value = article_count
                save_object(male_bigrams, 'data/bigrams/male_bigrams_{}.pkl'.format(article_count // 1000))
                male_bigrams = collections.Counter()
                if article_count % 3000 == 0:
                    save_object(female_bigrams, 'data/bigrams/female_bigrams_{}.pkl'.format(article_count // 3000))
                    female_bigrams = collections.Counter()
        fp.value = article_count
        all_articles.value = article_count
        # Flush whatever accumulated since the last checkpoint into one extra batch each.
        save_object(male_bigrams, 'data/bigrams/male_bigrams_{}.pkl'.format(article_count // 1000 + 1))
        save_object(female_bigrams, 'data/bigrams/female_bigrams_{}.pkl'.format(article_count // 3000 + 1))
finally:
    # Report progress and elapsed time even if the run was interrupted or failed.
    print("\nArticles:", article_count, "Time:", (time.time() - start), "seconds")

Building batches of counters for bigrams found in all Wiki bios of women and of men...
Estimated time for completion: 46 minutes


A Jupyter Widget

A Jupyter Widget


Articles: 850399 Time: 2681.5078961849213 seconds


In [6]:
# Batch files are numbered from 1. The counting cell writes one file per full window
# of 3000 articles (women) or 1000 articles (men), plus one final file holding the
# remainder -- hence the "+ 1".
total_batches_female = article_count // 3000 + 1  
total_batches_male = article_count // 1000 + 1 

print("Number of batches of bigrams for women collected:", total_batches_female)
print("Number of batches of bigrams for men collected:", total_batches_male)

Number of batches of bigrams for women collected: 284
Number of batches of bigrams for men collected: 851


In [7]:
# This cell takes about 10 minutes to run.
#
# Merges the per-batch female bigram Counters into a single master Counter and pickles it.
# NOTE: pruning is lossy. A bigram whose running count is <= 2 at a pruning checkpoint is
# deleted and starts again from zero, so final counts can be lower than the true totals.

print("Merging batches of the female_bigrams counters. Removing bigrams with counts <= 2, six times.")

fp = FloatProgress(min=0, max=total_batches_female)
count = IntText(value=0, description="Count:")
display(fp, count)

f_count = 0
female_bigrams = collections.Counter()

start = time.time()
for i in range(1, total_batches_female + 1):
    fp.value += 1
    count.value += 1
    f_count += 1
    with open('data/bigrams/female_bigrams_{}.pkl'.format(i), 'rb') as infile:
        # update() adds the batch's counts in place and touches only the batch's keys.
        # `+=` would additionally rescan the entire accumulated Counter on every batch
        # to discard non-positive counts, which cannot occur for counts of observations.
        female_bigrams.update(pickle.load(infile))
    # Prune every 48 batches (the file is already closed at this point).
    if f_count % 48 == 0:
        for k in [k for k, v in female_bigrams.items() if v <= 2]:
            del female_bigrams[k]
# Final prune so the saved master contains only bigrams with count > 2.
for k in [k for k, v in female_bigrams.items() if v <= 2]:
    del female_bigrams[k]
save_object(female_bigrams, 'data/bigrams/female_bigrams_master.pkl')
print("Done.\nMaster counter for female bigrams saved in 'data/bigrams/female_bigrams_master.pkl'")
print("Time:", (time.time() - start), "seconds")

Merging batches of the female_bigrams counters. Removing bigrams with counts <= 2, six times.


A Jupyter Widget

A Jupyter Widget

Done.
Master counter for female bigrams saved in 'data/bigrams/female_bigrams_master.pkl'
Time: 552.6426672935486 seconds


In [8]:
print("Total number of distinct bigrams collected from bios of women:", len(female_bigrams))

Total number of distinct bigrams collected from bios of women: 2725813


In [9]:
print("Deleting temporary batches...")
# Batch files are numbered 1..total_batches_female inclusive.
female_batch_numbers = range(1, total_batches_female + 1)
for batch_number in female_batch_numbers:
    batch_path = 'data/bigrams/female_bigrams_{}.pkl'.format(batch_number)
    os.remove(batch_path)
print("Done.")

Deleting temporary batches...
Done.


In [4]:
# This cell takes about 20 minutes to run.

# NOTE: If you get a memory error here, restart kernel and re-run first cell of notebook only.
# Then try again (a hacky way to solve memory issue that seems to work.)

# Delete female_bigrams to save memory.  We can load it later from pickle file.
# del female_bigrams

# Hard-coded so that this cell still works after the kernel restart described above;
# it must equal the number of male batches reported by the batch-count cell.
total_batches_male = 851

fp = FloatProgress(min=0, max=total_batches_male)
count = IntText(value=0, description="Count:")
m_count = 0
display(fp, count)

male_bigrams = collections.Counter()
# Bigrams with a count <= n inside one group of 18 batches are discarded.
# NOTE: this is lossy. A bigram that never exceeds n within any single group is
# dropped entirely even if its corpus-wide total is larger, so the merged male
# counts can underestimate the true totals.
n = 7

start = time.time()
for i in range(1, total_batches_male + 1):
    fp.value += 1
    count.value += 1
    m_count += 1
    with open('data/bigrams/male_bigrams_{}.pkl'.format(i), 'rb') as infile:
        # update() touches only the batch's keys; `+=` would also rescan the whole
        # accumulated Counter on every batch to discard non-positive counts.
        male_bigrams.update(pickle.load(infile))
    # After every 18 batches: prune, write the reduced group, start a fresh Counter.
    if m_count % 18 == 0:
        for k in [k for k, v in male_bigrams.items() if v <= n]:
            del male_bigrams[k]
        save_object(male_bigrams, 'data/bigrams/male_bigrams_b_{}.pkl'.format(m_count // 18))
        male_bigrams = collections.Counter()
# The trailing partial group is written without pruning; the final merge cell
# applies its own count threshold to the merged totals.
save_object(male_bigrams, 'data/bigrams/male_bigrams_b_{}.pkl'.format(m_count // 18 + 1))
male_bigrams = collections.Counter()
print("Time:", (time.time() - start), "seconds")
print("Number of batches of male bigrams reduced to", m_count // 18 + 1)

A Jupyter Widget

A Jupyter Widget

Time: 1240.206268787384 seconds
Number of batches of male bigrams reduced to 48


In [9]:
# This cell takes less than 2 minutes to run.

# Number of reduced 'male_bigrams_b_*' files; must equal the value printed by the
# reduction cell ("Number of batches of male bigrams reduced to ...").
n_reduced_batches = 48

fp = FloatProgress(min=0, max=n_reduced_batches)
count = IntText(value=0, description="Count:")
display(fp, count)

print("Creating final master counter for male bigrams...")
print("This should take under 2 minutes...")

start = time.time()
# Always start from an empty Counter: accumulating into whatever `male_bigrams`
# currently holds would double-count on a re-run and fail on a fresh kernel.
male_bigrams = collections.Counter()
for i in range(1, n_reduced_batches + 1):
    fp.value += 1
    count.value += 1
    with open('data/bigrams/male_bigrams_b_{}.pkl'.format(i), 'rb') as infile:
        # update() adds counts in place without rescanning the accumulated Counter.
        male_bigrams.update(pickle.load(infile))
# Limiting the number of bigrams to those with counts of 8 or more helps save memory
for k in [k for k, v in male_bigrams.items() if v < 8]:
    del male_bigrams[k]
save_object(male_bigrams, 'data/bigrams/male_bigrams_master.pkl')
print("Done.\nMaster counter for male bigrams saved in 'data/bigrams/male_bigrams_master.pkl'")
print("Time:", (time.time() - start), "seconds")
print("Total number of distinct bigrams collected from bios of men:", len(male_bigrams))

A Jupyter Widget

A Jupyter Widget

Creating final master counter for male bigrams...
This should take under 2 minutes...
Done.
Master counter for male bigrams saved in 'data/bigrams/male_bigrams_master.pkl'
Time: 35.66937756538391 seconds
Total number of distinct bigrams collected from bios of men: 1612684


In [12]:
print("Deleting temporary batches...")
# First the raw per-1000-article batches ...
for raw_number in range(1, total_batches_male + 1):
    raw_path = 'data/bigrams/male_bigrams_{}.pkl'.format(raw_number)
    os.remove(raw_path)
# ... then the reduced groups, numbered 1..(m_count // 18 + 1) inclusive.
last_reduced = m_count // 18 + 1
for reduced_number in range(1, last_reduced + 1):
    reduced_path = 'data/bigrams/male_bigrams_b_{}.pkl'.format(reduced_number)
    os.remove(reduced_path)
print("Done.")

Deleting temporary batches...
Done.


In [7]:
# Load female_bigrams_master:
# Restores the merged female bigram Counter written by the merge cell, so the analysis
# below can run without repeating the merge.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('data/bigrams/female_bigrams_master.pkl', 'rb') as infile:
    female_bigrams = pickle.load(infile)

In [13]:
# Keep only bigrams seen at least 8 times, the same cutoff applied to the male master counter.
rare_bigrams = {bigram for bigram, seen in female_bigrams.items() if seen < 8}
for bigram in rare_bigrams:
    female_bigrams.pop(bigram)
len(female_bigrams)

1331985

In [6]:
# Load male_bigrams_master (saved by the merge cell under data/bigrams/):
# with open('data/bigrams/male_bigrams_master.pkl', 'rb') as infile:
#     male_bigrams = pickle.load(infile)

# Exploratory Data Analysis of Bigrams

In [14]:
female_bigrams.most_common(70)

[(('http', 'www'), 813722),
 (('of', 'the'), 708824),
 (('ref', 'name'), 683958),
 (('url', 'http'), 578411),
 (('in', 'the'), 566040),
 (('ref', 'cite'), 448580),
 (('cite', 'web'), 440215),
 (('at', 'the'), 391527),
 (('web', 'url'), 299257),
 (('she', 'was'), 251644),
 (('women', 's'), 226877),
 (('for', 'the'), 190627),
 (('cite', 'news'), 179157),
 (('new', 'york'), 175253),
 (('to', 'the'), 173509),
 (('on', 'the'), 173402),
 (('and', 'the'), 169740),
 (('ref', 'http'), 160787),
 (('as', 'a'), 150900),
 (('birth', 'date'), 149470),
 (('ref', 'ref'), 144628),
 (('align', 'center'), 141886),
 (('united', 'states'), 121101),
 (('url', 'https'), 117811),
 (('in', 'a'), 110508),
 (('births', 'category'), 109660),
 (('ref', 'she'), 108566),
 (('living', 'people'), 107575),
 (('people', 'category'), 107094),
 (('archive', 'org'), 104657),
 (('category', 'american'), 104348),
 (('of', 'her'), 103947),
 (('university', 'of'), 102493),
 (('web', 'archive'), 100422),
 (('org', 'web'), 10027

In [15]:
male_bigrams.most_common(70)

[(('of', 'the'), 10743389),
 (('http', 'www'), 7115634),
 (('in', 'the'), 6708376),
 (('ref', 'name'), 6162263),
 (('url', 'http'), 4826060),
 (('ref', 'cite'), 3982254),
 (('he', 'was'), 3527017),
 (('cite', 'web'), 3441055),
 (('at', 'the'), 3024121),
 (('to', 'the'), 2744050),
 (('for', 'the'), 2462184),
 (('web', 'url'), 2362973),
 (('and', 'the'), 1935295),
 (('as', 'a'), 1894177),
 (('on', 'the'), 1864609),
 (('united', 'states'), 1833942),
 (('new', 'york'), 1697296),
 (('cite', 'news'), 1685356),
 (('birth', 'date'), 1576543),
 (('align', 'center'), 1555454),
 (('ref', 'http'), 1532831),
 (('players', 'category'), 1523481),
 (('with', 'the'), 1445508),
 (('f', 'c'), 1414783),
 (('was', 'a'), 1383429),
 (('births', 'category'), 1249080),
 (('university', 'of'), 1191819),
 (('of', 'his'), 1189126),
 (('from', 'the'), 1167930),
 (('in', 'a'), 1155019),
 (('by', 'the'), 1128407),
 (('ref', 'ref'), 1123085),
 (('style', 'background'), 1054393),
 (('url', 'https'), 1048855),
 (('ref'

In [18]:
# Load the unigram counters (female first, then male) from their pickle files.

unigram_counters = []
for counter_path in ('data/female_counter.pkl', 'data/male_counter.pkl'):
    with open(counter_path, 'rb') as infile:
        unigram_counters.append(pickle.load(infile))
female_counter, male_counter = unigram_counters

In [19]:
class PMICalculator:
    """Pointwise mutual information between a term and the first of two classes.

    For a term ``w`` the score is ``log(P(class1 | w) / P(class1))`` with
    ``P(class1 | w) = counter1[w] / (counter1[w] + counter2[w])`` and
    ``P(class1) = n1 / (n1 + n2)``. Positive scores mean ``w`` is
    over-represented in class 1, negative scores under-represented.
    """

    def __init__(self, counter1, counter2, n1, n2):
        """Store the two count mappings and precompute the log class prior.

        counter1, counter2 -- mappings from term to count for class 1 and
            class 2 (``collections.Counter`` or a plain ``dict``).
        n1, n2 -- sizes of class 1 and class 2, used only for the prior.

        Raises ValueError if ``n1 / (n1 + n2)`` is not positive and
        ZeroDivisionError if ``n1 + n2`` is zero.
        """
        self.counter1 = counter1
        self.counter2 = counter2
        self.n1 = n1
        self.n2 = n2
        # log P(class1); constant for every term, so computed once.
        self.lpc1 = log(n1 / (n1 + n2))

    def pmi(self, word, default=0.0):
        """Return the PMI of ``word`` with class 1.

        ``default`` is returned when the score is undefined because ``word``
        has no positive count in class 1. The default of 0.0 matches the
        historical behaviour, but note that 0 is also the score of a perfectly
        neutral term; pass e.g. ``float('-inf')`` to rank such terms as
        maximally class-2.
        """
        # .get works for Counter and plain dict alike and never inserts keys.
        count1 = self.counter1.get(word, 0)
        count2 = self.counter2.get(word, 0)
        total = count1 + count2
        # Explicit guard instead of catching ValueError from log().
        if count1 <= 0 or total <= 0:
            return default
        return log(count1) - self.lpc1 - log(total)

In [20]:
# NOTE(review): 130576 and 718413 are presumably the numbers of female and male
# biographies -- TODO confirm. They set the class prior P(female). The counters appear
# to hold token counts, so the prior is per-article while the word statistics are
# per-token; consider using total token counts for both.
pc1 = PMICalculator(female_counter, male_counter, 130576, 718413)

In [21]:
pc1.pmi("his")

-1.9380632450542024

In [22]:
pc1.pmi("she")

1.7940272570088585

In [35]:
pc1.pmi("husband")

1.509566866723068

In [36]:
pc1.pmi("wife")

-0.057060278138187925

In [41]:
pc1.pmi("career")

-0.12356478572158913

In [42]:
pc1.pmi("family")

0.28344799212810123

In [53]:
pc1.pmi("received")

0.20763811546007105

In [54]:
pc1.pmi("felt")

0.25115538983372687

In [55]:
pc1.pmi("thought")

-0.04150079746606927

In [56]:
pc1.pmi("feared")

-0.17359434944100727

In [57]:
pc1.pmi("married")

0.4251156630049806

In [58]:
pc1.pmi("looks")

0.49427551126943214

In [23]:
pc2 = PMICalculator(female_bigrams, male_bigrams, 130576, 718413)

In [24]:
all_bigrams = female_bigrams + male_bigrams

In [26]:
top_bigrams = all_bigrams.most_common(20000)
top_bigrams[-5:]

[(('of', 'aberdeen'), 8253),
 (('46', 'ref'), 8252),
 (('the', 'eagles'), 8252),
 (('pcupdate', 'ntupdate'), 8252),
 (('houston', 'texans'), 8252)]

In [27]:
# Attach a PMI score to every (bigram, count) pair, then order ascending by that score
# so the most male-associated bigrams come first and the most female-associated last.
top_bigrams_with_pmi = sorted(
    [(entry[0], entry[1], pc2.pmi(entry[0])) for entry in top_bigrams],
    key=lambda scored: scored[-1],
)

In [38]:
# Men
top_bigrams_with_pmi[:35]

[(('season', 'he'), 92920, -7.169508117004767),
 (('c', 'season'), 89374, -7.130599021242945),
 (('nhl', 'season'), 107596, -6.941459231976644),
 (('english', 'male'), 51900, -6.5870879992744005),
 (('soccerbase', 'com'), 32341, -6.4325587153514885),
 (('class', 'cricket'), 40502, -6.3391205647118145),
 (('nfl', 'draft'), 41359, -6.273047884071101),
 (('town', 'f'), 111024, -6.248903314591642),
 (('championships', 'men'), 43722, -6.248566532355593),
 (('league', 'one'), 39728, -6.078663381051481),
 (('port', 'vale'), 30684, -6.061510555199607),
 (('male', 'television'), 38810, -6.055285097666484),
 (('football', 'reference'), 21509, -6.024694392415389),
 (('county', 'f'), 26149, -5.996890337768831),
 (('league', 'he'), 23514, -5.996035893010259),
 (('i', 'men'), 31265, -5.99325709313219),
 (('pro', 'bowl'), 30146, -5.956810076501267),
 (('his', 'playing'), 21908, -5.925291771486183),
 (('football', 'season'), 52560, -5.906577402718968),
 (('bromwich', 'albion'), 19054, -5.9034999935056

In [37]:
# Women
top_bigrams_with_pmi[-35:]

[(('american', 'women'), 25328, 1.814727984961209),
 (('her', 'work'), 22365, 1.818924876053325),
 (('her', 'debut'), 10883, 1.8210054345701217),
 (('her', 'career'), 21648, 1.8261513329865693),
 (('she', 'has'), 70357, 1.8265834685324158),
 (('supporting', 'actress'), 12378, 1.8268087717543349),
 (('she', 'then'), 10979, 1.8288141422258697),
 (('she', 'received'), 17720, 1.8361243766480992),
 (('she', 'served'), 10180, 1.8412669513945836),
 (('category', 'women'), 29341, 1.8449019731513019),
 (('she', 'began'), 15514, 1.8513143813812842),
 (('she', 'won'), 28102, 1.8550074071748277),
 (('she', 'joined'), 9000, 1.8569771592711604),
 (('year', 'she'), 12683, 1.8577169145055379),
 (('she', 'attended'), 11307, 1.8598108776867885),
 (('she', 'appeared'), 17848, 1.860425078265246),
 (('she', 'played'), 19948, 1.8630263780836405),
 (('century', 'women'), 20836, 1.8657837541957925),
 (('began', 'her'), 8704, 1.865983028165859),
 (('she', 'studied'), 9917, 1.8669348436887674),
 (('female', 'si

In [32]:
pc2.pmi(("first", "woman"))

1.6782626536456213

In [33]:
pc2.pmi(("first", "man"))

-2.248461007969188

In [34]:
pc2.pmi(("first", "person"))

-0.5448207087013106

In [39]:
pc2.pmi(("her", "mother"))

1.677569700476507

In [40]:
pc2.pmi(("his", "mother"))

-2.3741492182690216

In [43]:
pc2.pmi(("husband", "of"))

-0.20183556391051916

In [44]:
pc2.pmi(("wife", "of"))

0.46124905152146845

In [45]:
pc2.pmi(("leader", "of"))

-1.089677112699471

In [46]:
pc2.pmi(("known", "for"))

-0.3949652180862451

In [47]:
pc2.pmi(("recognized", "for"))

-0.18826044315597024

In [48]:
pc2.pmi(("first", "of"))

-0.8254634190213821

In [49]:
pc2.pmi(("the", "most"))

-0.7645317867478543

In [50]:
pc2.pmi(("the", "best"))

-0.45633540156981844

In [51]:
pc2.pmi(("the", "second"))

-0.8233678688462245

In [52]:
pc2.pmi(("the", "first"))

-0.6509658253578845

# Collocations

In [241]:
# For bigrams that start with "she" (female bios) or "he" (male bios) and occur at
# least 150 times, count the word that follows the pronoun.

she = collections.Counter()
for (first, second), freq in female_bigrams.items():
    if first == "she" and freq >= 150:
        she[second] = freq

he = collections.Counter()
for (first, second), freq in male_bigrams.items():
    if first == "he" and freq >= 150:
        he[second] = freq

In [109]:
len(she)

496

In [110]:
sum(she.values())

1237776

In [102]:
len(he)

1100

In [103]:
sum(he.values())

14471116

In [97]:
# Number of words we are dealing with in our corpus: 
sum(female_counter.values()) + sum(male_counter.values())

912719349

In [118]:
# Number of words in corpus of women's bios: 
n_women = sum(female_counter.values())
n_women

147838368

In [119]:
# Number of words in corpus of men's bios: 
n_men = sum(male_counter.values())
n_men

764880981

In [228]:
he_count = male_counter['he']
she_count = female_counter['she']
print("Number of times 'he' appears in male bios:", he_count )
print("Number of times 'she' appears in female bios:", she_count)

Number of times 'he' appears in male bios: 7053558
Number of times 'she' appears in female bios: 1306814


In [242]:
# Make both counters cover the same vocabulary: a word present in only one of them is
# added to the other with its count looked up in that gender's bigram counter.
only_in_he = [word for word in he if word not in she]
for word in only_in_he:
    she[word] = female_bigrams[("she", word)]

only_in_she = [word for word in she if word not in he]
for word in only_in_she:
    he[word] = male_bigrams[("he", word)]

In [245]:
# One row per word following "she", indexed by that word.
columns = ['Word following he/she', "'she-word' count"]
she_rows = list(she.items())
df = pd.DataFrame(she_rows, columns=columns).set_index(['Word following he/she'])

In [212]:
df.head()

Unnamed: 0_level_0,'she-word' count
Word following he/she,Unnamed: 1_level_1
is,72233
called,1325
moved,9979
had,44820
achieved,1092


In [246]:
# One row per word following "he", indexed by that word.
columns2 = ['Word following he/she', "'he-word' count",]
he_rows = list(he.items())
df2 = pd.DataFrame(he_rows, columns=columns2).set_index(['Word following he/she'])

In [247]:
# Put the she- and he-counts side by side, aligned on the word index.
# NOTE: re-running this cell concatenates df2 onto df a second time.
df = pd.concat([df, df2], axis=1)

# Missing counts are treated as 0 when summing.
filled_counts = df.fillna(0)
df['total count'] = filled_counts["'she-word' count"] + filled_counts["'he-word' count"]

df = df.sort_values(by=['total count'], ascending=False)

In [248]:
# Add-one smoothed relative frequency of each word among all words following he / she.
he_denominator = sum(he.values()) + 1
she_denominator = sum(she.values()) + 1
df['he-normalized count'] = df["'he-word' count"].fillna(0).add(1) / he_denominator
df['she-normalized count'] = df["'she-word' count"].fillna(0).add(1) / she_denominator
df.tail(10)

Unnamed: 0,'she-word' count,'he-word' count,total count,he-normalized count,she-normalized count
abdicated,17,159,176,1.105642e-05,1.411509e-05
hath,24,152,176,1.05727e-05,1.960429e-05
vetoed,0,175,175,1.216206e-05,7.841717e-07
emphasised,20,153,173,1.064181e-05,1.64676e-05
anticipated,16,156,172,1.084911e-05,1.333092e-05
entrusted,16,154,170,1.071091e-05,1.333092e-05
duly,0,165,165,1.147104e-05,7.841717e-07
united,8,156,164,1.084911e-05,7.057545e-06
kicks,159,0,159,6.910263e-08,0.0001254675
suppressed,0,159,159,1.105642e-05,7.841717e-07


In [249]:
# Base-2 log of the she/he frequency ratio: positive leans "she", negative leans "he".
she_to_he = df["she-normalized count"] / df["he-normalized count"]
df['log_ratio'] = np.log2(she_to_he)
df['abs_ratio'] = df['log_ratio'].abs()

# Note: Also consider other metrics besides "log ratio"

In [250]:
# Most "she"-leaning words first.
df = df.sort_values(by=['log_ratio'], ascending=False)
df.head(25)

Unnamed: 0,'she-word' count,'he-word' count,total count,he-normalized count,she-normalized count,log_ratio,abs_ratio
herself,744,0,744,6.910263e-08,0.000584,13.045454,13.045454
peaked,242,0,242,6.910263e-08,0.000191,11.429169,11.429169
her,203,0,203,6.910263e-08,0.00016,11.176782,11.176782
kicks,159,0,159,6.910263e-08,0.000125,10.826285,10.826285
modeled,182,112,294,7.808597e-06,0.000144,4.199878,4.199878
posed,354,249,603,1.727566e-05,0.000278,4.010248,4.010248
danced,513,458,971,3.171811e-05,0.000403,3.667631,3.667631
stars,156,203,359,1.409694e-05,0.000123,3.126552,3.126552
dated,227,303,530,2.10072e-05,0.000179,3.089319,3.089319
upset,129,182,311,1.264578e-05,0.000102,3.011025,3.011025


In [251]:
df.tail(25)

Unnamed: 0,'she-word' count,'he-word' count,total count,he-normalized count,she-normalized count,log_ratio,abs_ratio
rushed,17,3238,3255,0.000224,1.411509e-05,-3.987051,3.987051
pronounced,0,180,180,1.3e-05,7.841717e-07,-3.995489,3.995489
dispatched,0,181,181,1.3e-05,7.841717e-07,-4.003438,4.003438
captain,0,182,182,1.3e-05,7.841717e-07,-4.011343,4.011343
calculated,0,191,191,1.3e-05,7.841717e-07,-4.080606,4.080606
downed,0,194,194,1.3e-05,7.841717e-07,-4.102973,4.102973
invaded,0,219,219,1.5e-05,7.841717e-07,-4.277003,4.277003
farmed,0,221,221,1.5e-05,7.841717e-07,-4.290059,4.290059
gallantly,0,229,229,1.6e-05,7.841717e-07,-4.341133,4.341133
trans,0,258,258,1.8e-05,7.841717e-07,-4.512451,4.512451


In [253]:
# Remove the rows indexed '111' and 'n' (presumably tokenization noise rather than real
# words -- TODO confirm). errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because the labels are already gone.
df.drop(['111', 'n'], axis=0, inplace=True, errors='ignore')
df.tail(25)

Unnamed: 0,'she-word' count,'he-word' count,total count,he-normalized count,she-normalized count,log_ratio,abs_ratio
vetoed,0,175,175,1.2e-05,7.841717e-07,-3.955075,3.955075
born,16,3045,3061,0.00021,1.333092e-05,-3.98088,3.98088
rushed,17,3238,3255,0.000224,1.411509e-05,-3.987051,3.987051
pronounced,0,180,180,1.3e-05,7.841717e-07,-3.995489,3.995489
dispatched,0,181,181,1.3e-05,7.841717e-07,-4.003438,4.003438
captain,0,182,182,1.3e-05,7.841717e-07,-4.011343,4.011343
calculated,0,191,191,1.3e-05,7.841717e-07,-4.080606,4.080606
downed,0,194,194,1.3e-05,7.841717e-07,-4.102973,4.102973
invaded,0,219,219,1.5e-05,7.841717e-07,-4.277003,4.277003
farmed,0,221,221,1.5e-05,7.841717e-07,-4.290059,4.290059


In [254]:
# Some of these bigrams don't make much sense. This may be because sentence endings
# and beginnings were joined during preprocessing, or because of leftover Wikipedia tags.