In [2]:
import pandas as pd
import json
import collections
import csv

In [3]:
# get dialogue of each act
df = pd.read_csv("../data/dialoguebyact.csv")
df = df[df["act"] != 0.21] #ignore credits

# contributors of each act
contributors_dataset = pd.read_table("../data/contributorForEachActFinal.tsv")

# create gender file with contributors that never speak
gender_dataset = pd.read_csv("../data/gender.csv")
contributorWithoutDialogueGender = pd.read_csv("../data/contributorWithoutDialogueGender.csv")
gender_dataset2 = pd.concat([gender_dataset, contributorWithoutDialogueGender])

contributorsUppercase = contributors_dataset
# lowercase all the names so they can match
gender_dataset2["name"] = gender_dataset2["name"].str.lower()
contributors_dataset["name"] = contributors_dataset["name"].str.lower()

# merge contributor listing with gender listing
contributors_gender = contributors_dataset.merge(gender_dataset2, on="name")

In [4]:
df["name"] = df["name"].str.lower()
subjectsPerAct = df[(df["role"]=="subject")][["episode-act","name","gender","wordCount"]].groupby(["name","episode-act","gender"]).sum()["wordCount"].reset_index()

In [5]:
contributors_gender.to_csv("../data/producersOfEachAct.csv", index=False)
subjectsPerAct.to_csv("../data/subjectsOfEachAct.csv", index=False)

In [6]:
# getting reporter gender breakdown per act
producerGenders = []

for episode in range(1, 624):
    for act in range(0, 22):
        episodeAct = episode + float(act) * 0.01
        producers = contributors_gender[contributors_gender["episode-act"]== episodeAct]
        maleCount = 0
        femaleCount = 0
        for index, row in producers.iterrows():
#             if (row["name"] != "ira glass"):
            if row["gender"] == "M":
                maleCount += 1
            if row["gender"] == "F":
                femaleCount += 1
        total = float(femaleCount + maleCount)
        malepercent = -1
        femalepercent = -1
        if total != 0:
            malepercent = maleCount/total
            femalepercent = femaleCount/total
        producerGenders.append([episodeAct, malepercent, femalepercent, maleCount, femaleCount, total])

with open('../data/producersGender.csv', 'wb') as f: 
    w = csv.writer(f)
    w.writerow(["episode-act","male%","female%","male","female","total"])
    for list1 in producerGenders:
        w.writerow(list1)

In [7]:
producerGenderData = pd.read_csv("../data/producersGender.csv")
producerGenderData = producerGenderData.drop_duplicates()

In [8]:
# exclude prologues, credits, and acts where no reporter is listed
producerGenderData[(producerGenderData["male%"] != -1)  & (producerGenderData["episode-act"] != 0.00) & (producerGenderData["episode-act"] != 0.21)].shape

(2467, 6)

In [9]:
# getting subject/interviewee gender breakdown per act
subjectsPerActAltered = []

for episode in range(1, 624):
    for act in range(0, 22):
        episodeAct = episode + float(act) * 0.01
        producers = contributors_gender[contributors_gender["episode-act"]== episodeAct]
        subjects = subjectsPerAct[subjectsPerAct["episode-act"] == episodeAct]
        for index, srow in subjects.iterrows():
            for index2, prow in producers.iterrows():
                if srow["name"].lower() != prow["name"].lower():
                    subjectsPerActAltered.append([srow["name"], episodeAct, srow["gender"], srow["wordCount"]])

with open('../data/subjectGendersWithoutProducers.csv', 'wb') as f: 
    w = csv.writer(f)
    w.writerow(["name", "episode-act","gender", "wordCount"])
    for list1 in subjectsPerActAltered:
        w.writerow(list1)

In [10]:
subjectsPerAct2 = pd.read_csv("../data/subjectGendersWithoutProducers.csv").groupby(["gender", "episode-act"])["wordCount"].sum().reset_index()

In [11]:
maleSubjects = subjectsPerAct2[subjectsPerAct2["gender"]=="M"]
femaleSubjects = subjectsPerAct2[subjectsPerAct2["gender"]=="F"]
mergedSubjects = maleSubjects.merge(femaleSubjects, on="episode-act", how="outer").fillna(0)
mergedSubjects["total"] = mergedSubjects["wordCount_x"]+mergedSubjects["wordCount_y"]
mergedSubjects["male%"] = mergedSubjects["wordCount_x"]/mergedSubjects["total"]
subjects = mergedSubjects[["episode-act","male%","wordCount_x","total"]]

In [12]:
# merge producer gender data and interviewee gender data into one spreadsheet
producerSubject = producerGenderData.merge(subjects, on="episode-act")[["episode-act", "male%_x","total_x","male%_y","wordCount_x","total_y"]]
producerSubject.columns = ["episode-act","percentMaleProducers","totalProducerCount","percentMaleSubjectDialogue","maleSubjectWordCount","totalSubjectWordCount"]
actInfo = pd.read_csv("../data/actInfo.csv")
mergedAct3 = producerSubject.merge(actInfo, on="episode-act")
mergedAct3 = mergedAct3[mergedAct3["act"] != 0]
mergedAct3["percentMaleProducers"] = mergedAct3["percentMaleProducers"]*100
mergedAct3["percentMaleSubjectDialogue"] = mergedAct3["percentMaleSubjectDialogue"]*100

In [13]:
# export spreadsheet for use
mergedAct3 = mergedAct3.sort_values("percentMaleSubjectDialogue", ascending = False)
mergedAct3.to_csv("../web/src/assets/data/act3.csv", index=False)

In [14]:
#average amount of male interviewee dialogue for male reported acts and female reported acts
male = mergedAct3[mergedAct3["percentMaleProducers"] == 100]
female = mergedAct3[(mergedAct3["percentMaleProducers"] == 0)]
male["percentMaleSubjectDialogue"].mean(), female["percentMaleSubjectDialogue"].mean()

(68.686856100131877, 64.744854353538258)

In [15]:
maleContributorCount = len(contributors_gender[contributors_gender["gender"]=="M"]["name"].drop_duplicates())
femaleContributorCount = len(contributors_gender[contributors_gender["gender"]=="F"]["name"].drop_duplicates())
#percent of contributors that are male
maleContributorCount/float(maleContributorCount+femaleContributorCount)

0.59472049689441