In [1]:
import pandas as pd
import numpy as np
from eval_lib import *
import torch

In [2]:
def check_for_string(col, substr):
    string = col.str
    return string.startswith(substr + " ") | string.contains(" " + substr + " ") | string.endswith(" " + substr)

assert(check_for_string(pd.Series("hello you there"), "hello").all())
assert(check_for_string(pd.Series("hello you there"), "there").all())
assert(check_for_string(pd.Series("hello you there"), "you").all())
assert(not(check_for_string(pd.Series("hello you there"), "ello")).all())
print(check_for_string(pd.Series("hello you there"), "hello"))

0    True
dtype: bool


In [3]:
def print_pronoun_counts(file_path):
    print("Reading from %s" % file_path)
    new_df = pd.read_csv(file_path, sep='\t')
    new_df = new_df[new_df["tr"].notna()] # for some reason, two weird entries with NaN
    new_df = new_df[new_df["en"].notna()] 
    pronouns = new_df[check_for_string(new_df["tr"], "o")]
    # Number of examples with "o bir"
    print("# o bir", pronouns.shape[0])
    # Number of which translate to "he", roughly
    hes = pronouns[check_for_string(pronouns["en"], "he") | check_for_string(pronouns["en"], "he's")]
    print("o -> he", hes.shape[0])
    # Number of which translate to "she", roughly
    shes = pronouns[check_for_string(pronouns["en"], "she") | check_for_string(pronouns["en"], "she's")]
    print("o -> she", shes.shape[0])
    its = pronouns[check_for_string(pronouns["en"], "it") | check_for_string(pronouns["en"], "it's")]
    print("o -> it", its.shape[0])
    return pronouns

    

In [5]:
male_pronouns = set(["he", "he's", "his", "him", "he'll"])
female_pronouns = set(["she", "she's", "hers", "her", "she'll"])

In [6]:
# Returns true if col.str has one of any of the set of strings.
def check_for_set_of_strings(col, iter_of_strings):
    init_bool = np.zeros(len(col), dtype=bool)
    for substr in iter_of_strings:
        init_bool = init_bool | check_for_string(col, substr)
    return init_bool

assert(check_for_set_of_strings(pd.Series("hello you there"), ["hello"]).all())

In [9]:
train_df = pd.read_csv('data/train_200k.csv', sep='\t')

In [11]:
print("fem", train_df[check_for_set_of_strings(train_df['en'], female_pronouns)].shape)
print("masc", train_df[check_for_set_of_strings(train_df['en'], male_pronouns)].shape)
print("overlap", train_df[check_for_set_of_strings(train_df['en'], male_pronouns) & check_for_set_of_strings(train_df['en'], female_pronouns)].shape)


fem (8158, 4)
masc (14821, 4)
overlap (362, 4)


In [14]:
print(14821.0 /(14821 + 8158))

0.6449801993124157


In [13]:
print(14821 + 8158 - 362)
print(22617.0 / 2E6)


22617
0.0113085


In [4]:
print_pronoun_counts("data/val_10k_0306.csv")

Reading from data/val_10k_0306.csv
# o bir 1823
o -> he 356
o -> she 214
o -> it 279


Unnamed: 0,tr_context,tr,en_context,en
95,"michael , şu küçük koreli geldi , ve onla ne y...","en azından , sanırım o .","michael , the little korean is here , and i do...","at least , i think it 's a him ."
102,döndüm .,kim o ?,{ rattling ],who 's that ?
164,""" selam , dans etmek ister misin ? ""","ve o da şöyle bir şey , diyeceğini düşündüm , ...",""" well , hey , do you want to dance ? ""","and she 'd be , like , "" ah , what the heck ? ..."
200,"görüyor musun , george michael sana bu posteri...","biliyor musun , o sana , aşık gibi , bu yüzden .","you see , george michael made this poster for ...","you know , he-he kind of , uh , is in love wit..."
245,"vay canına , michael .","sana o kadar kötü davrandıktan sonra , beni ko...","wow , michael .",i 'm really touched that you 'd stick up for m...
304,bende .,o benim kardeşim stanton ; şartlı tahliye memu...,"me , too .",he 's my brother stanton / parole officer .
307,"stanton ve ben iki yaşındayken , annemiz onun ...",o bilmez .,"when stanton and i were two , our mother dropp...",and he doesn 't know .
384,"yeterince açık , değil mi ?","tamam o zaman , haydi parti başlasın .","pretty straight forward , right ?","okay , then let 's get this party started ."
528,"baştan bu kadar korkunç olacağını bilseydim , ...","haydi ahbap , o kadarda kötü değildir .",i would never have done it .,"come on , dude , it couldn 't have been that b..."
539,"bana mı öyle geliyor , yoksa burası daha da so...","evet , o haklı .","is it me , or is it getting colder in here ?","yeah , she 's right ."


In [5]:
print_pronoun_counts("data/train_200k.csv")

Reading from data/train_200k.csv
# o bir 61542
o -> he 11582
o -> she 6186
o -> it 7477


Unnamed: 0,tr_context,tr,en_context,en
27,"tüm gün pislik yerim, düşene kadar içerim, anl...","sonuna kadar lock-up'ı izleyebilirim, 'çünkü o...","i eat crap all day, i drink until i drop, you ...","i might binge-watch lock-up, 'cause i put half..."
38,evet son düzlüğe girdik.,o kadar uzun süre ellerine hakim olabiliceğini...,it's the home stretch.,do you think you can keep your hands to yourse...
39,teşekkür ederim.,neşelendirici ama bir o kadar da korkutucu ola...,thank you.,the exhilarating and terrifying truth is that ...
45,i̇şte benim adamım.,"hey, ne yapıyor... hayır, o burada olamaz..",there's my man.,"hey, what are you... no, he can't be in here."
55,i̇şte telefonu cevapladığınızda bunlar olur.,bu gece yaptığın o muhteşem konuşmadan sonra s...,that's what happens when you take the call.,how can i be mad at you after that great speec...
120,"bu beni, ayrılık şarkısına sürükledi.",# o gidince güneş de gitti #,and that sent me into a breakup funk.,♪ ain't no sunshine when she's gone ♪
121,# o gidince güneş de gitti #,# o gidince sıcak olmuyor #,♪ ain't no sunshine when she's gone ♪,♪ it's not warm when she's away ♪
122,# o gidince sıcak olmuyor #,# o gidince güneş de gitti #,♪ it's not warm when she's away ♪,♪ ain't no sunshine when she's gone ♪
126,#,# o gidince güneş de gitti #,♪ wonder if she's gone to stay,♪ ain't no sunshine when she's gone ♪
128,"# ve bu ev, ev gibi değil #",# o gidince #,♪ and this house just ain't no home ♪,♪ any time she goes away


In [14]:
stereotypes = pd.read_csv("data/pro_stereotype.tsv", sep="\t", header=None)

In [7]:
with open('data/male_occupations.txt', 'r') as myfile:
    male_occupations = [i.strip() for i in myfile.readlines()]

In [8]:
with open('data/female_occupations.txt', 'r') as myfile:
    female_occupations = [i.strip() for i in myfile.readlines()]

In [16]:
df = pd.read_csv("data/train_200k.csv", sep="\t")
df = df[df["tr"].notna()] # for some reason, two weird entries with NaN
df = df[df["en"].notna()] 
data_ratios = {} # ratio in
for i in range(len(stereotypes)):
    tr = stereotypes.iloc[i, 1]
    en = stereotypes.iloc[i, 3]
    occupation = en[8:-1] if i < 20 else en[9:-1] # he is vs she is
    
    # Count # of rows with "he" vs "she"
    mentions_df = df[df["en"].str.contains(occupation)]
    he_mentions = mentions_df[check_for_set_of_strings(mentions_df["en"], male_pronouns)]
    she_mentions = mentions_df[check_for_set_of_strings(mentions_df["en"], female_pronouns)]
    print("%s: %d total, %d male, %d female" % (occupation, len(mentions_df), len(he_mentions), len(she_mentions)))
    data_ratios[occupation] = len(he_mentions), len(she_mentions)
    

driver: 906 total, 122 male, 29 female
supervisor: 97 total, 15 male, 1 female
janitor: 77 total, 5 male, 1 female
cook: 1571 total, 109 male, 106 female
mover: 42 total, 0 male, 0 female
laborer: 18 total, 4 male, 1 female
construction worker: 14 total, 2 male, 0 female
chief: 1565 total, 140 male, 40 female
developer: 32 total, 2 male, 0 female
carpenter: 83 total, 10 male, 2 female
manager: 572 total, 66 male, 28 female
lawyer: 1280 total, 215 male, 63 female
farmer: 304 total, 34 male, 8 female
salesperson: 5 total, 0 male, 0 female
physician: 127 total, 22 male, 4 female
guard: 1984 total, 267 male, 75 female
analyst: 65 total, 12 male, 3 female
mechanic: 219 total, 20 male, 5 female
sheriff: 743 total, 63 male, 15 female
CEO: 0 total, 0 male, 0 female
attendant: 51 total, 5 male, 2 female
cashier: 42 total, 6 male, 0 female
teacher: 1107 total, 122 male, 81 female
nurse: 762 total, 52 male, 64 female
assistant: 491 total, 89 male, 30 female
secretary: 472 total, 58 male, 28 femal

In [111]:
for k, v in data_ratios.items():
    print("%s, %d, %d" % (k, v[0], v[1]))

driver, 122, 29
supervisor, 15, 1
janitor, 5, 1
cook, 109, 106
mover, 0, 0
laborer, 4, 1
construction worker, 2, 0
chief, 140, 40
developer, 2, 0
carpenter, 10, 2
manager, 66, 28
lawyer, 215, 63
farmer, 34, 8
salesperson, 0, 0
physician, 22, 4
guard, 267, 75
analyst, 12, 3
mechanic, 20, 5
sheriff, 63, 15
CEO, 0, 0
attendant, 5, 2
cashier, 6, 0
teacher, 122, 81
nurse, 52, 64
assistant, 89, 30
secretary, 58, 28
auditor, 2, 0
cleaner, 20, 9
receptionist, 7, 6
clerk, 13, 9
counselor, 18, 12
designer, 13, 12
hairdresser, 2, 3
writer, 66, 24
housekeeper, 8, 9
baker, 19, 11
accountant, 23, 7
editor, 17, 15
librarian, 2, 2
tailor, 4, 0


In [None]:
def count_pronouns_with_antecedent(file_path):
    df = pd.read_csv(file_path, sep='\t')

In [194]:
def analyze_annotated_df(file_path):
    annotated_df = pd.read_csv(file_path, sep='\t')
    print("total rows:", annotated_df.shape[0])
    non_empty_df = annotated_df[(annotated_df['Unnamed: 4'] != '[]') & (annotated_df['Unnamed: 4'].notna())]
    print("Number of rows with a pronoun with antecedent in prev sentence:", non_empty_df.shape[0])
    print("fraction of total rows: ", non_empty_df.shape[0] / annotated_df.shape[0])
    print("Distribution of pronouns:")
    non_empty_df_o = non_empty_df[check_for_string(non_empty_df['tr'], "o")]
    print(non_empty_df_o['Unnamed: 4'].value_counts())
    return non_empty_df
    

In [195]:
ann_df_1 = analyze_annotated_df("data/train_200k_annotated.csv")

total rows: 226775
Number of rows with a pronoun with antecedent in prev sentence: 14541
fraction of total rows:  0.06412082460588689
Distribution of pronouns:
['it']                                         336
['he']                                         124
['she']                                         73
['they']                                        39
['it', 'it']                                    36
['them']                                        31
['his']                                         24
['that']                                        22
['he', 'he']                                    14
['him']                                         12
['their']                                       11
['he', 'his']                                    7
['they', 'they']                                 7
['her']                                          6
['it', 'he']                                     5
['she', 'she']                                   5
['them', 'they']        

In [185]:
ann_df_2 = analyze_annotated_df("data/0308_train_200k_annotated_2.csv")

total rows: 319634
Number of rows with a pronoun with antecedent in prev sentence: 20464
fraction of total rows:  0.06402322656538416
Distribution of pronouns:
['it']                                         9555
['they']                                       1735
['he']                                         1310
['them']                                       1036
['his']                                         749
['it', 'it']                                    628
['she']                                         607
['their']                                       528
['him']                                         527
['that']                                        507
['her']                                         370
['you']                                         266
['its']                                         171
['they', 'they']                                127
['he', 'he']                                    106
['he', 'his']                                    94
['themse

In [193]:
ann_df_1[check_for_string(ann_df_1['tr'], 'o')]['Unnamed: 4'].value_counts()

['it']                                         336
['he']                                         124
['she']                                         73
['they']                                        39
['it', 'it']                                    36
['them']                                        31
['his']                                         24
['that']                                        22
['he', 'he']                                    14
['him']                                         12
['their']                                       11
['he', 'his']                                    7
['they', 'they']                                 7
['her']                                          6
['it', 'he']                                     5
['she', 'she']                                   5
['them', 'they']                                 5
['her', 'she']                                   4
['he', 'him']                                    4
['his', 'he']                  