In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Category deduplication
REWRITE_CATEGORIES = {
    "Business Day": "Business",
    "NaN": "Unknown",
    "New York and Region": "N.Y. / Region",
    "False": "Unknown"
}


def _rewrite_category(category):
    """Standardize the category name by performing a
    rewrite if necessary.

    Arguments:
        category {string} -- the name of the category

    Returns:
        string -- the standardized category
    """

    if category in REWRITE_CATEGORIES:
        return REWRITE_CATEGORIES[category]
    return category

In [3]:
def _unpack_categories(reported_category):
    """Split a semicolon-separated category string into its
    individual, standardized category names.

    Arguments:
        reported_category {string} -- semicolon-separated supercategory

    Returns:
        [String] -- list with one standardized name per
        semicolon-separated piece, in the original order
    """

    subcategories = []
    for piece in reported_category.split(";"):
        # Trim surrounding whitespace before the rewrite lookup.
        trimmed = piece.strip()
        subcategories.append(_rewrite_category(trimmed))
    return subcategories

In [4]:
def _process_row(k):
    """Normalize one row: unpack its section names and parse its date.

    The row is modified in place and also returned.

    Arguments:
        k {row} -- mapping-like row with 'section_name' and 'pub_date'
        entries; 'pub_date' must be a "%Y-%m-%d" string

    Returns:
        row -- the same object, with 'section_name' replaced by a list of
        standardized categories and 'pub_date' replaced by a date object
    """
    # str() so that non-string values can still be split.
    section_text = str(k['section_name'])
    k['section_name'] = _unpack_categories(section_text)

    parsed_timestamp = datetime.strptime(k['pub_date'], "%Y-%m-%d")
    k['pub_date'] = parsed_timestamp.date()
    return k

In [5]:
def load_all_data(data_dir="data/", max_files=5):
    """Load the CSVs in the data directory into a single
    processed dataframe.

    Every row is passed through _process_row after the files are
    concatenated.

    Arguments:
        data_dir {string} -- directory holding the CSV files
            (default: "data/")
        max_files {int or None} -- maximum number of files to read, taken
            in sorted file-name order; None reads every file (default: 5)

    Returns:
        dataframe -- the processed rows of all the files that were read

    Raises:
        ValueError -- raised by pd.concat when the directory yields no files
    """
    # NOTE(review): the default of 5 keeps the original `[0:5]` slice, which
    # disagrees with the function name; pass max_files=None to load everything.

    # os.listdir returns entries in arbitrary order; sort so that the same
    # files are selected on every run and platform.
    file_names = sorted(os.listdir(data_dir))
    if max_files is not None:
        file_names = file_names[:max_files]

    dataframes = [
        pd.read_csv(os.path.join(data_dir, file_name))
        for file_name in file_names
    ]
    dataframe = pd.concat(dataframes)

    # DataFrame.apply returns a new object; mutating the row passed to the
    # callback is not guaranteed to reach the original frame, so the result
    # must be kept rather than discarded.
    return dataframe.apply(_process_row, axis=1)

In [6]:
def get_percent_by_women(dataframe, filter):
    """Compute the share of filtered rows whose "gender" is "F".

    Despite the name, the result is a fraction between 0 and 1,
    not a percentage.

    Arguments:
        dataframe {dataframe} -- rows to inspect; must have a "gender" column
        filter {callable} -- called with each row; rows for which it returns
            a truthy value are counted

    Returns:
        float -- matched rows divided by filtered rows, or NaN when no row
        passes the filter (previously this raised ZeroDivisionError)
    """
    total = 0.
    matched = 0.
    for _, row in dataframe.iterrows():
        if not filter(row):
            continue
        total += 1
        if row["gender"] == "F":
            matched += 1

    # The share is undefined when nothing matched; report NaN instead of
    # dividing by zero.
    if total == 0:
        return float("nan")
    return matched / total

In [13]:
def _get_unique_categories(dataframe):
    """Collect every distinct category that appears in the dataframe's
    section_name column, after unpacking and standardizing.

    Arguments:
        dataframe {dataframe} -- the dataframe which contains the NYT data

    Returns:
        set of String -- the unique standardized categories
    """

    return {
        category
        for raw_value in dataframe.section_name.unique()
        # str() so that non-string values can still be split.
        for category in _unpack_categories(str(raw_value))
    }

In [8]:
data = load_all_data()
# load_all_data also applies _process_row to every row, which is the step that
# unpacks section names and rewrites duplicate category spellings.

In [15]:
# Example: share of rows with gender "F" among the rows for one month and one
# section (here June 2013, 'Sports'). The result is a fraction, not a percentage.
# The filter assumes pub_date holds date objects and section_name holds the
# unpacked categories, as _process_row produces.
get_percent_by_women(data, lambda k: k['pub_date'].month == 6 and k['pub_date'].year == 2013 and 'Sports' in k['section_name'])

0.22033898305084745