In [1]:
nidx = 1
label = 'xe'

In [2]:
cpu_split = 1
fold_seed = 5295; 
s= 0; e = 80

In [3]:
GN = 0.025
IN_WT = 2

In [4]:
PATH =   ''# '../input/nrc-inputs/'
TRAIN_FILE = 'NIJ_s_Recidivism_Challenge_Training_Dataset.csv'
TEST_FILE = 'NIJ_s_Recidivism_Challenge_Test_Dataset1.csv'

In [5]:
import numpy as np
import pandas as pd
import time
from collections import defaultdict
from IPython.display import display, clear_output

In [None]:
!pip install torch

In [6]:
!pip install -q pytorch-lightning

In [7]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset#,# random_split
import pytorch_lightning as pl

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error

In [9]:
def mish(x):
    return (x * torch.tanh(F.softplus(x)))

In [10]:
def norm(x, scaler = StandardScaler):
    return pd.DataFrame( scaler().fit_transform(x), 
                            index = x.index, columns = x.columns)#, scaler

In [11]:
def clean(t):
    return t.detach().cpu().numpy()

In [12]:
g = 8
alt_weight = 1

In [13]:
class MLP2(pl.LightningModule):
    def __init__(self, input_size, embedding_size = 32, 
                 input_dropout = 0.2, final_dropout = 0.5,
                     lr = 3e-4, weight_decay = 1e-3, prelu_init = 0.25,
                        n_outputs = 1):
        super().__init__()
        self.lr = lr
        self.embedding_size = embedding_size
        self.weight_decay = weight_decay
        self.prelu_init = prelu_init
        
        self.input_dropout = nn.Dropout(input_dropout)
        
        g = 8
        self.layer1 = nn.Linear(input_size, embedding_size)
        self.layer1_norm = nn.GroupNorm(g, embedding_size)
        self.dropout1 = nn.Dropout(final_dropout)
        
        self.layer2 = nn.Linear(embedding_size, embedding_size)
        self.layer2_norm = nn.GroupNorm(g, embedding_size)
        self.dropout2 = nn.Dropout(final_dropout)
        
        self.a1 = nn.LeakyReLU(0.25)
        self.a2 = nn.LeakyReLU(0.25)

        
        self.final = nn.Linear(embedding_size, n_outputs)
        
    def forward(self, x):
        xn = x + GN * torch.randn( x.shape )
        m1 = self.a1(self.layer1_norm(self.layer1(self.input_dropout( IN_WT * xn))))
        m2 = self.a2(self.layer2_norm(self.layer2(self.dropout1(m1))))
        out = self.final(self.dropout2(m2))
        return out
    
    def training_step(self, batch, batch_idx):
        x, y, ID = batch
       #  y = y.view(y.shape[0], -1)

        output = self.forward(x)
        loss = ( ( F.mse_loss(y, output) * 0.30
                    + F.mse_loss(y[:, -4:], output[:, -4:]) * 0.20 ) * alt_weight
                        + F.mse_loss(y[:, -1], output[:, -1] ) * 1.00 )
        self.log('train_loss', loss, on_step=False, on_epoch=True, 
                 prog_bar=True, logger=True)
        return loss
    
    
    def validation_step(self, batch, batch_idx):
        x, y, ID = batch
#         y = y.view(y.shape[0], -1)
        output = self.forward(x)
    
        is_test = y[:, -1] < 0
        
        holdout = output[~is_test]
        scored_y = y[~is_test]
        test = output[is_test]
        
#         print(scored_y.shape)
#         print(holdout.shape)
        

        loss = ( F.mse_loss(scored_y, holdout) * 0.0
                    + F.mse_loss(scored_y[:, -4:], holdout[:, -4:]) * 0.0
                        + F.mse_loss(scored_y[:, -1], holdout[:, -1] ) * 1.00 )
        self.log('holdout_loss', loss, on_step=False, on_epoch=True, 
                 prog_bar=True, logger=True)
        all_preds.append(pd.Series(clean(holdout[:, -1]), index = clean(ID[~is_test])))
        all_tests.append(pd.Series(clean(test[:, -1]), index = clean(ID[is_test])))

        return loss

    
    def test_step(self, batch, batch_idx):
        x, y, ID = batch
        output = self.forward(x)
        return output

    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr = self.lr,
                                             weight_decay = self.weight_decay )
        return optimizer
    
    
    

In [14]:
class NRCDataset(Dataset):

    def __init__(self, features, targets, indices, test_indices = None):
#         print(targets.shape)
        self.y = targets.iloc[indices]
        self.x = features.reindex(self.y.index)
        
        if test_indices is not None:
            
            self.y = pd.concat( (self.y,
                                     pd.DataFrame( -1 * np.ones((len(test_indices), len(targets.columns)) ), 
                                                      index = test_indices, columns = targets.columns) ) )
#             print(self.y)
            self.x = pd.concat( ( self.x, features.reindex(test_indices) ) )
    
    
#             idxs = np.arange(0, len(self.x))
#             random.shuffle(idxs)
            
#             self.y = x.iloc[idxs]
    
#             print(self.x)
        
        self.indices = self.y.index.to_list()
        
        # this is a weird bug -- but it's basically gaussian noise at inference time
#         self.x = norm(self.x)
# fixed
 
        self.x = self.x.values.astype(np.float32)
        self.y = self.y.values.astype(np.float32)  

        
    def __getitem__(self, index):
        return (self.x[index], self.y[index], self.indices[index] )

    def __len__(self):
        return len(self.x)


In [15]:
xe = pd.read_csv(PATH + 'xe.csv', index_col = 'ID')
# xe2 = pd.read_csv(PATH + 'xe2.csv', index_col = 'ID')
y = pd.read_csv(PATH + TRAIN_FILE, index_col = 'ID')#['Recidivism_Arrest_Year1']
test = pd.read_csv(PATH + TEST_FILE, index_col = 'ID')

In [16]:
y = pd.read_csv(PATH + 'all_targets.csv', index_col = 'ID').fillna(0)\
    .drop(columns = 'Avg_Days_per_DrugTest_1')
y = y[sorted([c for c in y.columns if 'Recidivism' not in c]) +  \
                  ['Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3',
                          'Recidivism_Within_3years',
                    'Recidivism_Arrest_Year1'  ]]

In [17]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10, 8)

In [18]:
xe = norm(xe)
# xe2 = norm(xe2)

In [19]:
y = norm(y,  MinMaxScaler
        ).astype(np.float32)

In [20]:
folds = []; 
for i in range(500):
    folds.extend(list(StratifiedKFold(random_state = i + fold_seed, shuffle = True)
                              .split(np.zeros(len(y)), y.iloc[:,-1])))

In [21]:
features = eval(label); 

network = [MLP2, MLP2,]
layers = [2, 2,]
dims = [384, 640,] 
wd = [0.5] * 2

batch_size = [256] * 2
drop_last = [True] * 2

epochs = [60] * 2 


In [22]:
N_WORKERS = 2

In [23]:
# %%time
start = time.time()

final_preds = defaultdict(list)
final_test_preds = defaultdict(list)

wtg_scores = defaultdict(list)
median_scores = []
wtg_median_scores = defaultdict(list)

for fold in folds[s:e]:

    test_indices = list(set(features.index)-set(y.index) )
    train_data = NRCDataset(features, y, fold[0])
    holdout_data = NRCDataset(features, y, fold[1], test_indices)

    train_loader = DataLoader(train_data, batch_size = batch_size[nidx], shuffle=True, 
                             num_workers = N_WORKERS, 
                                                      drop_last = drop_last[nidx], pin_memory = True)
    holdout_loader = DataLoader(holdout_data, batch_size = 32768, 
                             num_workers = N_WORKERS, pin_memory = True)
    
    mlp = network[nidx](len(train_data[0][0]), dims[nidx], weight_decay = wd[nidx],
                               n_outputs = y.shape[1])
    
    
    
    all_preds = []; all_tests = []
    trainer = pl.Trainer(min_epochs=epochs[nidx], max_epochs=epochs[nidx])#gpus = 1 if )
    trainer.fit(mlp, train_loader, holdout_loader)
    
    
    def expAvg(x, decay, final = 20):
        x = x[:final]
        wts = np.exp( - decay * np.arange(0, len(x)) )
        return (x.values * wts).sum() / wts.sum()
    

    clear_output(wait = True); 
    print('{:.1f}s elapsed for {} folds'.format(time.time() - start, 1 + len(median_scores)))

    combined_preds = pd.concat(all_preds)
    combined_tests = pd.concat(all_tests)

#     test_preds = combined_preds[combined_preds.index.isin(test_indices)]
#     preds = combined_preds[~combined_preds.index.isin(test_indices)]
    
#     preds = preds.iloc[preds.index.nunique()*0:preds.index.nunique()*1000]

    for last_epoch in [40, 60]:
        print(' through epoch {}'.format(last_epoch))
        for decay in [  -0.08, -0.15]:
            avg_pred = combined_preds.groupby(combined_preds.index).apply(
                            expAvg, decay, last_epoch).clip(0, 1)
            score = mean_squared_error( y.reindex(avg_pred.index).iloc[:, -1], avg_pred )
            print(' {:.2f}: {:.4f}'.format(decay, score ))
            wtg_scores['{} {:.2f}'.format(last_epoch, decay)].append(score)
        print()

    print()
    
    avg_pred = combined_preds.groupby(combined_preds.index).median()
    score = mean_squared_error( y.reindex(avg_pred.index).iloc[:, -1], avg_pred )
    print('median: {:.4f}'.format(score ))
    median_scores.append(score)
    
    
    for d, s in wtg_scores.items():
        print('{}: {:.5f}'.format(d, np.mean(s)))

    print()

    print('median: {:.5f}'.format(np.mean(median_scores)))



    for s in [ -0.08, -0.15]:
        for e in [40, 60]:
            final_preds['mean{}_{:.2f}'.format(e, -s)].append(combined_preds.groupby(combined_preds.index).apply(expAvg, s, e))
            final_test_preds['mean{}_{:.2f}'.format(e, -s)].append(combined_tests.groupby(combined_tests.index).apply(expAvg, s, e))
            
#     final_preds['median'].append(combined_preds.groupby(combined_preds.index).median())
#     final_preds['all'].append(combined_preds);

#     final_test_preds['median'].append(combined_tests.groupby(combined_tests.index).median())
#     final_test_preds['all'].append(combined_tests);
    



14849.3s elapsed for 80 folds
 through epoch 40
 -0.08: 0.1874
 -0.15: 0.1873

 through epoch 60
 -0.08: 0.1876
 -0.15: 0.1880


median: 0.1873
40 -0.08: 0.18706
40 -0.15: 0.18708
60 -0.08: 0.18739
60 -0.15: 0.18771

median: 0.18706


In [24]:
for d, s in wtg_scores.items():
    print('{}: {:.5f}'.format(d, np.mean(s)))

print()

print('median: {:.5f}'.format(np.mean(median_scores)))

40 -0.08: 0.18706
40 -0.15: 0.18708
60 -0.08: 0.18739
60 -0.15: 0.18771

median: 0.18706


In [25]:
stacked_preds = dict([(k, pd.concat(v)) for k, v in final_preds.items()])
stacked_tests = dict([(k, pd.concat(v)) for k, v in final_test_preds.items()])

In [26]:
if not os.path.exists('out/'): os.mkdir('out')
# if not os.path.exists('out_test/'): os.mkdir('out_test')

In [27]:
print(len(final_preds['mean']))
[ stacked_preds[k].to_csv('out/NN_{}_{}layer_{}ch_f{}*{}_{}.csv'.format(
    label, layers[nidx], dims[nidx], fold_seed, len(median_scores), k))
         for k in stacked_preds.keys() ]

[ stacked_tests[k].to_csv('out/NN_{}_{}layer_{}ch_f{}*{}_{}_test.csv'.format(
    label, layers[nidx], dims[nidx], fold_seed, len(median_scores), k))
         for k in stacked_tests.keys() ]

0


[None, None, None, None]