In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics as sk_metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import random_split

from nam.wrapper import NAMClassifier, MultiTaskNAMClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single seed reused by the train/test split and by both NAM models below.
random_state = 2016

In [3]:

# Read the space-delimited, header-less recidivism file and name its seven
# columns directly while parsing.
dataset = pd.read_csv(
    '~/data/nam/recid.data',
    sep=' ',
    header=None,
    names=[
        "age",
        "race",
        "sex",
        "priors_count",
        "length_of_stay",
        "c_charge_degree",
        "two_year_recid",
    ],
)

In [4]:
# Preview the first rows as loaded (this cell sits before the scaling step).
dataset.head()

Unnamed: 0,age,race,sex,priors_count,length_of_stay,c_charge_degree,two_year_recid
0,69,6,2,0,1,1,0
1,34,1,2,0,10,1,1
2,24,1,2,4,1,1,1
3,44,6,2,0,1,2,0
4,41,3,2,14,6,1,1


In [5]:
# Two-valued feature columns; the preprocessing cell shifts them from 1/2 to 0/1.
binary = [
    'sex',
    'c_charge_degree',
]
# Remaining feature columns; the preprocessing cell min-max scales them to [-1, 1].
other = [
    'age',
    'race',
    'priors_count',
    'length_of_stay',
]

In [6]:
# Rescale every non-binary feature column to the range [-1, 1].
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set minima/maxima influence the scaling — confirm
# that this is acceptable for the reported test metrics.
scaler = MinMaxScaler((-1, 1))
dataset[other] = scaler.fit_transform(dataset[other])

# Shift the binary columns so that they start at 0 (1/2 -> 0/1). Subtracting the
# per-column minimum gives the same result as subtracting 1 on the first run,
# but leaves the values unchanged when this cell is executed again, whereas a
# literal `- 1` would keep shifting them on every re-run.
dataset[binary] = dataset[binary] - dataset[binary].min()

In [7]:
# Inspect the frame after preprocessing (pandas truncates the display of long frames).
dataset

Unnamed: 0,age,race,sex,priors_count,length_of_stay,c_charge_degree,two_year_recid
0,0.307692,1.0,1,-1.000000,-0.9975,0,0
1,-0.589744,-1.0,1,-1.000000,-0.9750,0,1
2,-0.846154,-1.0,1,-0.789474,-0.9975,0,1
3,-0.333333,1.0,1,-1.000000,-0.9975,1,0
4,-0.410256,-0.2,1,-0.263158,-0.9850,0,1
...,...,...,...,...,...,...,...
6167,-0.871795,-1.0,1,-1.000000,-0.9950,0,0
6168,-0.871795,-1.0,1,-1.000000,-0.9950,0,0
6169,0.000000,1.0,1,-1.000000,-0.9975,0,0
6170,-0.615385,-1.0,0,-0.842105,-0.9975,1,0


In [8]:
# Hold out 20% of the rows for testing; the split is seeded via `random_state`.
data_train, data_test = train_test_split(
    dataset,
    train_size=0.8,
    test_size=0.2,
    random_state=random_state,
)

# Feature order is: scaled columns first, then the binary columns.
X_train = data_train[other + binary]
y_train = data_train['two_year_recid']
X_test = data_test[other + binary]
y_test = data_test['two_year_recid']

## Single Task NAMs Classification

In [9]:
# Single-task NAM trained on all six features (including `sex`).
# NOTE(review): parameter meanings come from the nam wrapper; presumably early
# stopping tracks AUROC (mode 'max') instead of the loss — TODO confirm.
model = NAMClassifier(
    random_state=random_state,
    n_jobs=10,
    num_learners=20,
    num_epochs=1000,
    metric='auroc',
    monitor_loss=False,
    early_stop_mode='max',
)

model.fit(X_train, y_train)

<nam.wrapper.wrapper.NAMClassifier at 0x7f964260b9d0>

In [10]:
# Score the held-out rows with the single-task model and report the test AUROC.
pred = model.predict_proba(X_test)
sk_metrics.roc_auc_score(y_true=y_test, y_score=pred)

0.7409598053496822

## Multitask NAMs Classification

In [11]:
def make_gender_mtl_data(X, y):
    """Split one label vector into two per-sex task columns.

    Each row keeps its label in exactly one of the two output columns,
    selected by the value of ``X['sex']``; the other column holds NaN to
    mark the label as missing for that task.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a ``'sex'`` column coded as 0/1. It must
        share its index with ``y``.
    y : pandas.Series
        Labels aligned with ``X``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns, both carrying the name of ``y``:
        column 0 holds the labels of rows where ``sex != 0`` (NaN where
        ``sex == 0``); column 1 holds the labels of rows where ``sex != 1``
        (NaN where ``sex == 1``).

    Notes
    -----
    Which numeric code corresponds to which gender is not established by
    this function, so the columns are described by sex code only.
    ``Series.mask`` is used instead of assigning NaN into a copy of ``y``:
    item assignment of NaN into an integer Series relies on implicit
    upcasting, which pandas has deprecated.
    """
    sex = X['sex']
    # Task 0: blank out the rows that belong to sex code 0.
    labels_without_sex0 = y.mask(sex == 0)
    # Task 1: blank out the rows that belong to sex code 1.
    labels_without_sex1 = y.mask(sex == 1)
    return pd.concat([labels_without_sex0, labels_without_sex1], axis=1)

In [12]:
# Build the two-column (one per sex code) label frames for both splits.
y_train_mtl = make_gender_mtl_data(X=X_train, y=y_train)
y_test_mtl = make_gender_mtl_data(X=X_test, y=y_test)

In [13]:
# `sex` now selects the task column, so it is removed from the model inputs.
X_train_mtl = X_train.drop(columns='sex')
X_test_mtl = X_test.drop(columns='sex')

In [14]:
# One column per task; NaN means the row has no label for that task.
y_train_mtl

Unnamed: 0,two_year_recid,two_year_recid.1
4819,0.0,
1581,,1.0
0,0.0,
1575,0.0,
1159,1.0,
...,...,...
4604,0.0,
653,0.0,
4691,0.0,
5386,,1.0


In [15]:
# Multitask NAM: one output per label column, trained without the `sex` feature.
# NOTE(review): parameter meanings come from the nam wrapper; presumably
# `patience` is the early-stopping patience and `num_subnets` the number of
# subnetworks per feature — TODO confirm.
model = MultiTaskNAMClassifier(
    random_state=random_state,
    n_jobs=10,
    num_learners=20,
    num_subnets=10,
    num_epochs=1000,
    patience=60,
    metric='auroc',
    monitor_loss=False,
    early_stop_mode='max',
)

model.fit(X_train_mtl, y_train_mtl)

<nam.wrapper.wrapper.MultiTaskNAMClassifier at 0x7f96bde92410>

In [16]:
# Multitask predictions for the test rows.
# NOTE(review): assumed to hold one probability per row and task, matching the
# layout of y_test_mtl — TODO confirm against the nam wrapper.
pred = model.predict_proba(X_test_mtl)

In [17]:
# Flatten labels and predictions in the same element order, then keep only the
# positions that actually carry a label (NaN marks a missing label).
y_test_mtl_flat = y_test_mtl.to_numpy().reshape(-1)
pred_flat = pred.reshape(-1)

# Fail early with a readable message if predictions and labels do not line up
# one-to-one; the boolean mask below is only meaningful in that case.
assert pred_flat.shape == y_test_mtl_flat.shape, (
    f"prediction/label size mismatch: {pred_flat.shape} vs {y_test_mtl_flat.shape}"
)

# Explicit NaN test instead of relying on the fact that NaN != NaN.
non_nan_indices = ~np.isnan(y_test_mtl_flat)
y_test_mtl_flat = y_test_mtl_flat[non_nan_indices]
pred_flat = pred_flat[non_nan_indices]

In [18]:
# AUROC pooled over the labelled test entries of both tasks.
sk_metrics.roc_auc_score(y_test_mtl_flat, pred_flat)

0.7416374531115772