In [11]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import unicodecsv as csv

ModuleNotFoundError: No module named 'unicodecsv'

In [None]:
# Category deduplication: maps a reported category name to the canonical
# name it should be replaced with. Names not listed here are left unchanged
# by _rewrite_category.
REWRITE_CATEGORIES = {
    "Business Day": "Business",
    # Callers stringify the raw value with str() before lookup, so a missing
    # (NaN) section name presumably arrives here as the text "nan".
    "nan": "Unknown",
    "New York and Region": "N.Y. / Region",
    # NOTE(review): presumably a literal "false" appears in the raw data for
    # articles without a section -- confirm against the CSVs.
    "false": "Unknown"
}


def _rewrite_category(category):
    """Map a category name onto its canonical spelling.

    Names listed in REWRITE_CATEGORIES are replaced by their mapped
    value; any other name is handed back untouched.

    Arguments:
        category {string} -- the name of the category

    Returns:
        string -- the standardized category
    """

    # Fall back to the input itself when no rewrite rule exists.
    return REWRITE_CATEGORIES.get(category, category)

In [None]:
def _unpack_categories(reported_category):
    """Split a semicolon-separated category string into its parts.

    Each part is stripped of surrounding whitespace and then
    standardized with _rewrite_category.

    Arguments:
        reported_category {string} -- semicolon-separated supercategory

    Returns:
        [String] -- array of subcategory strings
    """

    standardized = []
    for piece in reported_category.split(";"):
        standardized.append(_rewrite_category(piece.strip()))
    return standardized

In [None]:
def _process_row(k):
    """Convert the raw fields of one row into their working types.

    The row is modified in place and also returned.

    Arguments:
        k {row} -- a row with 'section_name' and 'pub_date' entries;
            'pub_date' must be a string formatted as YYYY-MM-DD

    Returns:
        row -- the same row, with 'section_name' replaced by a list of
            standardized categories and 'pub_date' replaced by a date
    """

    # str() so that non-string values can still be split.
    sections = _unpack_categories(str(k['section_name']))
    k['section_name'] = sections

    published = datetime.strptime(k['pub_date'], "%Y-%m-%d")
    k['pub_date'] = published.date()
    return k

In [None]:
def load_all_data():
    """Load all the CSVs in data/ into a single dataframe.

    Every file in the data/ directory is read with pandas.read_csv, the
    frames are concatenated, and each row is passed through _process_row
    so that 'section_name' becomes a list of standardized categories and
    'pub_date' becomes a date.

    Returns:
        dataframe -- all the data, with processed rows

    Raises:
        ValueError -- from pandas.concat when data/ contains no files
    """

    data_dir = "data/"
    # os.listdir returns names in arbitrary order; sort them so the row
    # order of the result does not depend on the filesystem.
    dataframes = [
        pd.read_csv(os.path.join(data_dir, data_file))
        for data_file in sorted(os.listdir(data_dir))
    ]
    dataframe = pd.concat(dataframes)
    # DataFrame.apply returns a new object rather than updating the frame
    # in place, so its result must be the value that is returned.
    return dataframe.apply(_process_row, axis=1)

In [None]:
def get_percent_by_women(dataframe, fil):
    """Compute the fraction of selected rows whose gender is "F".

    Arguments:
        dataframe {dataframe} -- the rows to examine; needs a "gender" column
        fil {function} -- predicate called with each row; only rows for
            which it returns a truthy value are counted

    Returns:
        float -- matching rows with gender "F" divided by all matching
            rows, or None when no row passes the filter
    """

    considered = 0
    by_women = 0
    for _, record in dataframe.iterrows():
        if not fil(record):
            continue
        considered += 1
        if record["gender"] == "F":
            by_women += 1

    # No matching rows: a ratio is undefined, so report None.
    if considered == 0:
        return None
    return float(by_women) / considered

In [None]:
def _get_unique_categories(dataframe):
    """Collect every distinct standardized category found in the dataframe.

    Each distinct value of the section_name column is converted to a
    string, unpacked with _unpack_categories, and merged into one set.

    Arguments:
        dataframe {dataframe} -- the dataframe which contains the NYT data

    Returns:
        set -- the unique categories
    """

    unique = set()
    for raw_value in dataframe.section_name.unique():
        unique.update(_unpack_categories(str(raw_value)))
    return unique

In [None]:
# Read every file under data/ into one dataframe. load_all_data() also runs
# _process_row (category standardization and date parsing) over the rows.
data = load_all_data()
# NOTE(review): the assignment below only creates a global named `sort`, which
# nothing else in the visible code reads. It looks like a leftover keyword
# argument (e.g. for pd.concat) -- confirm before removing.
sort=True

In [None]:
# Example: fraction of the 'Sports' articles published in June 2013 that
# were written by women. Swap the predicate to query another month/section.
def _is_june_2013_sports(k):
    """Select rows published in June 2013 whose sections include 'Sports'."""
    if not k['pub_date'].month == 6:
        return False
    if not k['pub_date'].year == 2013:
        return False
    return 'Sports' in k['section_name']


get_percent_by_women(data, _is_june_2013_sports)

In [None]:
# Gather every distinct subcategory that occurs in any row's section_name
all_unique_categories = set()
for row_sections in data['section_name']:
    all_unique_categories.update(row_sections)

In [None]:
# monthly_stats[year][month][category] holds zeroed counters for every
# year 2011-2016, month 1-12 and known category (year/month keys are strings).
monthly_stats = {}
for year in range(2011, 2017):
    stats_for_year = {}
    for month in range(1, 13):
        stats_for_year[str(month)] = {
            category: {
                "total": 0,  # total number of articles
                "women": 0,  # number of those articles by women
            }
            for category in all_unique_categories
        }
    monthly_stats[str(year)] = stats_for_year

In [None]:
# Tally each article once per category it belongs to, bucketed by the
# year and month of its publication date.
for _, article in data.iterrows():
    published = article['pub_date']
    year_key = str(published.year)
    month_key = str(published.month)
    for section in article['section_name']:
        counters = monthly_stats[year_key][month_key][section]
        counters["total"] += 1
        if article["gender"] == "F":
            counters["women"] += 1

In [None]:
# Write monthly_stats.csv: one row per (year, month) holding, for every
# category, the fraction of that month's articles written by women. A cell is
# left empty when the category had no articles in that month.

# The standard-library csv module handles Unicode text natively on Python 3,
# so the third-party unicodecsv package (not installed here) is not needed.
import csv as _stdlib_csv

# Sort once so the header and every data row share the same column order, and
# so that order is stable from run to run (set iteration order is not).
ordered_categories = sorted(all_unique_categories)

# Text mode with newline="" is what the csv module expects for output files.
with open("monthly_stats.csv", "w", newline="", encoding="utf-8") as outfile:
    writer = _stdlib_csv.writer(outfile)
    columns = ["Year", "Month"]
    columns.extend(ordered_categories)
    writer.writerow(columns)
    # NOTE(review): output starts at 2012 although the counters are built from
    # 2011 onwards -- confirm that skipping 2011 is intended.
    for year in range(2012, 2017):
        for month in range(1, 13):
            month_stats = monthly_stats[str(year)][str(month)]
            row = [str(year), str(month)]
            for category in ordered_categories:
                women = float(month_stats[category]["women"])
                total = float(month_stats[category]["total"])
                # None is written as an empty field by csv.writer.
                row.append(women / total if total else None)
            writer.writerow(row)