In [1]:
import json

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils import Bunch

### Define dataset loader & classifiers

In [2]:
def load_articles(return_X_y=False, path='data/nytimes_articles.json'):
    """Load the articles whose author gender is known.

    Follows the scikit-learn dataset-loader convention:
    https://scikit-learn.org/stable/datasets/index.html#general-dataset-api

    Parameters
    ----------
    return_X_y : bool, default=False
        If true, return the tuple ``(data, target)`` instead of a ``Bunch``.
    path : str or path-like, default='data/nytimes_articles.json'
        JSON file containing an array of article objects. Each object is
        read through its 'headline', 'summary' and 'author_gender' keys.

    Returns
    -------
    Bunch or (list of str, list of str)
        ``data`` holds one "<headline> <summary>" string per article and
        ``target`` the matching 'author_gender' value. Articles whose
        'author_gender' equals 'unclear' are dropped.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    KeyError
        If an article lacks one of the keys listed above.
    """
    # JSON text is UTF-8 by specification; pinning the encoding keeps the
    # loader independent of the platform's locale default.
    with open(path, encoding='utf-8') as articles_fp:
        records = json.load(articles_fp)

    # The file is fully parsed at this point, so it is closed before the
    # (potentially large) list processing starts.
    articles = [a for a in records if a['author_gender'] != 'unclear']
    data = [f"{a['headline']} {a['summary']}" for a in articles]
    target = [a['author_gender'] for a in articles]
    return (data, target) if return_X_y else Bunch(data=data, target=target)


class HeOrSheBaselineClassifier(DummyClassifier):
    """Baseline that always predicts the label 'male', ignoring the input text."""

    def __init__(self):
        # The configuration is fixed, so the constructor takes no arguments.
        baseline_params = {'strategy': 'constant', 'constant': 'male'}
        super().__init__(**baseline_params)


class HeOrSheMultinomialNBClassifier(Pipeline):
    """Text pipeline: TF-IDF features fed into a multinomial naive Bayes model."""

    def __init__(self):
        # Both steps use library defaults; the step names 'tfidf' and 'clf'
        # are the keys under which the pipeline exposes them.
        vectorizer = TfidfVectorizer()
        classifier = MultinomialNB()
        super().__init__([('tfidf', vectorizer), ('clf', classifier)])


class HeOrSheGradientBoostingClassifier(Pipeline):
    """Text pipeline: TF-IDF features fed into gradient-boosted depth-1 trees."""

    def __init__(self):
        vectorizer = TfidfVectorizer()
        # Hyperparameters copied from the example in the scikit-learn ensemble
        # user guide. They are not the estimator's defaults (the defaults use
        # learning_rate=0.1 and max_depth=3); random_state=0 makes the fit
        # reproducible.
        booster = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=1.0,
            max_depth=1,
            random_state=0,
        )
        super().__init__([('tfidf', vectorizer), ('clf', booster)])

### Load the data

In [3]:
# Load the article texts (X) and the author-gender labels (y) as two parallel lists.
X, y = load_articles(return_X_y=True)
# Report the class balance; y is a plain list, so list.count() is available.
print(f"Male-to-female ratio: {y.count('male')} / {y.count('female')}")

Male-to-female ratio: 151372 / 60268


### Instantiate the classifiers & set up evaluation

In [4]:
# One instance per model: constant baseline, TF-IDF + naive Bayes, TF-IDF + gradient boosting.
bl_clf = HeOrSheBaselineClassifier()
nb_clf = HeOrSheMultinomialNBClassifier()
gb_clf = HeOrSheGradientBoostingClassifier()

# Metrics collected on every fold. Each key shows up as `test_<key>` in the
# cross_validate result; the *_macro scorers average the per-class scores
# without weighting by class frequency.
scoring = {
    'acc': 'accuracy',
    'f1': 'f1_macro',
    'prec': 'precision_macro',
    'rec': 'recall_macro',
}
# 5 shuffled folds with a fixed seed, so every classifier is scored on the same splits.
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)

### Cross-validate the classifiers

In [5]:
# Evaluate every classifier with the same folds and metrics. Results are
# printed in list order (baseline, naive Bayes, gradient boosting) without a label.
# NOTE(review): the truncated warning lines in the saved output presumably come from
# the macro-precision scorer on the constant baseline, which never predicts
# 'female' - confirm against the full warning text.
for clf in [bl_clf, nb_clf, gb_clf]:
    # Dict mapping 'fit_time', 'score_time' and each 'test_<metric>' to one value per fold.
    scores = cross_validate(clf, X, y, scoring=scoring, cv=k_fold)
    # Average every entry over the folds (the timing entries included).
    mean_scores = {f"mean_{k}": sum(v) / len(v) for k, v in scores.items()}
    print(json.dumps(mean_scores, indent=4))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{
    "mean_fit_time": 0.11309618949890136,
    "mean_score_time": 0.39141201972961426,
    "mean_test_acc": 0.7152334152334152,
    "mean_test_f1": 0.4169883107611906,
    "mean_test_prec": 0.3576167076167076,
    "mean_test_rec": 0.5
}
{
    "mean_fit_time": 6.643387746810913,
    "mean_score_time": 6.8762530326843265,
    "mean_test_acc": 0.7488045738045738,
    "mean_test_f1": 0.5418942223810495,
    "mean_test_prec": 0.8157115822711202,
    "mean_test_rec": 0.5640267253071867
}
{
    "mean_fit_time": 102.70812916755676,
    "mean_score_time": 6.879893779754639,
    "mean_test_acc": 0.7671801171801171,
    "mean_test_f1": 0.6511777942815625,
    "mean_test_prec": 0.7337284785417665,
    "mean_test_rec": 0.63698100750024
}
