In [1]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
data_path = Path('/kaggle/input/playground-series-s3e8/')

# `id` is not used as a predictor, so it is dropped from the training frame
# right after loading; the test frame is loaded with all of its columns.
train = pd.read_csv(data_path / 'train.csv').drop(columns='id')
test = pd.read_csv(data_path / 'test.csv')

train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [3]:
# list the distinct clarity grades present in the training data
train['clarity'].unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [4]:
# The categorical predictors are ordered grades, so each level is mapped to its
# integer rank. The level orderings come from the original dataset.
level_orders = {
    'cut': ['Fair','Good','Very Good','Premium','Ideal'],              # worst to best
    'clarity': ['IF','VVS1','VVS2','VS1','VS2','SI1','SI2','I1'],      # best to worst
    'color': ['D','E','F','G','H','I','J'],                            # best to worst
}

ordinal_encoding = {
    col: dict(zip(levels, range(len(levels))))
    for col, levels in level_orders.items()
}

# apply the same encoding to both frames and store the ranks as floats
for col, mapper in ordinal_encoding.items():
    for frame in (train, test):
        frame[col] = frame[col].replace(mapper).astype(float)

In [5]:
# SANITY check: after the ordinal encoding the categorical columns should show up
# as float64 with no missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   carat    193573 non-null  float64
 1   cut      193573 non-null  float64
 2   color    193573 non-null  float64
 3   clarity  193573 non-null  float64
 4   depth    193573 non-null  float64
 5   table    193573 non-null  float64
 6   x        193573 non-null  float64
 7   y        193573 non-null  float64
 8   z        193573 non-null  float64
 9   price    193573 non-null  int64  
dtypes: float64(9), int64(1)
memory usage: 14.8 MB


In [6]:
# preprocess data
from sklearn.preprocessing import RobustScaler, StandardScaler

# Predictors: every column except the target, scaled with a RobustScaler.
input_sc = RobustScaler()
X = input_sc.fit_transform(
    train.drop(columns='price').to_numpy()
)

# Target: the scaler is fitted on a single-column 2-D array, and the scaled
# result is flattened back to 1-D.
output_sc = StandardScaler()
y = output_sc.fit_transform(
    train[['price']].to_numpy()
).ravel()

In [7]:
import tensorflow as tf
import keras
from typing import Dict
from sklearn.utils import resample

In [8]:
def construct_model(params:Dict) -> keras.Model:
    """
    Build and compile an MLP regressor for hyperparameter tuning.

    Every hidden block is Dense -> LayerNormalization -> ReLU -> Dropout, and the
    network ends in a single linear output unit. The input width is read from the
    notebook-level feature matrix ``X``.

    Parameters
    ----------
    params : Dict
        Hyperparameters. Must contain ``hsize<i>`` and ``dropout<i>`` for
        i = 1..n_hidden, and ``learning_rate``. An optional ``n_hidden`` entry
        sets the number of hidden blocks; it defaults to 4.

    Returns
    -------
    keras.Model
        A compiled ``keras.Sequential`` model (Adam optimizer, mean squared
        error loss).

    Raises
    ------
    ValueError
        If ``n_hidden`` is smaller than 1.
    KeyError
        If a required hyperparameter is missing from ``params``.
    """
    # number of hidden blocks; 4 unless the caller tunes it through params
    n_hidden = int(params.get('n_hidden', 4))
    if n_hidden < 1:
        raise ValueError('n_hidden must be at least 1')

    n_hidden_list = [params['hsize%d'%i] for i in range(1,n_hidden+1)]
    dropouts_list = [params['dropout%d'%i] for i in range(1,n_hidden+1)]

    model = keras.Sequential()
    for i,(n_units,dropout_rate) in enumerate(zip(n_hidden_list,dropouts_list)):
        if i == 0:
            # only the first layer declares the input shape
            model.add(keras.layers.Dense(n_units,input_shape=(X.shape[-1],)))
        else:
            model.add(keras.layers.Dense(n_units))
        model.add(keras.layers.LayerNormalization())
        model.add(keras.layers.Activation(keras.activations.relu))
        model.add(keras.layers.Dropout(dropout_rate))

    # linear output unit (no activation): predictions are unbounded,
    # not restricted to [0, 1]
    model.add(keras.layers.Dense(1))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=params['learning_rate']),
        loss='mean_squared_error'
    )

    return model


def build_and_fit(params:Dict,verbose:bool=False,resample_seed:int=0) -> keras.Model:
    '''
    Train a MLP with the given hyperparameters on a bootstrap sample. The function uses
    the out-of-bag samples as the validation set for early stopping.

    Parameters
    ----------
    params : Dict
        Hyperparameters forwarded to ``construct_model``; must also contain
        ``batch_size``.
    verbose : bool
        Passed to ``model.fit`` as its ``verbose`` argument.
    resample_seed : int
        ``random_state`` of the bootstrap draw, so each seed gives a different
        train/validation split.

    Returns
    -------
    keras.Model
        The fitted model. Early stopping is configured with
        ``restore_best_weights=True``.
    '''
    n_obs = X.shape[0]

    # first draw bootstrap samples (row indices, with replacement)
    idxs_train = resample(np.arange(n_obs),replace=True,random_state=resample_seed)
    # rows never drawn (out-of-bag) form the validation set; setdiff1d returns them
    # sorted and unique without a Python-level loop over every row
    idxs_val = np.setdiff1d(np.arange(n_obs),idxs_train)
    # number of out-of-bag rows
    print(idxs_val.shape[0])

    # get training and validation data
    X_train,X_val,y_train,y_val = X[idxs_train,:],X[idxs_val,:], y[idxs_train],y[idxs_val]

    # create model
    model = construct_model(params)

    # optimization setup
    EPOCHS = 100
    BATCH_SIZE = params['batch_size']

    # stop once the out-of-bag loss has not improved for 10 epochs and roll back
    # to the best weights seen
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        verbose=0,
        patience=10,
        restore_best_weights=True
    )

    # train model
    model.fit(
        X_train,
        y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=[early_stopping],
        validation_data=(X_val, y_val),
        verbose=verbose,
    )

    return model

In [9]:
# best hyperparameters found through hyperopt
# (hsize<i> / dropout<i> belong to the i-th hidden block of the MLP)
best = dict(
    batch_size=459,
    dropout1=0.05017815582897199,
    dropout2=0.21965806468947463,
    dropout3=0.10533572184114517,
    dropout4=0.051436413944709755,
    hsize1=128,
    hsize2=137,
    hsize3=86,
    hsize4=117,
    learning_rate=0.0007263155962552988,
)

In [10]:
# reset the Keras backend state before building the ensemble members
keras.backend.clear_session()

n_ensemble = 10
models = []

# every member is fitted on its own bootstrap sample, selected by its index
for member_idx in range(n_ensemble):
    print(f'********* Model {member_idx+1} *********')
    fitted = build_and_fit(best,verbose=True,resample_seed=member_idx)
    models.append(fitted)
    print()

********* Model 1 *********
71284
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100

********* Model 2 *********
71294
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


In [11]:
# Create ensemble model: feed one shared input to every trained member and
# average their outputs
model_input = keras.Input(shape=(X.shape[1],))
model_outputs = [model(model_input) for model in models]
ensemble_output = keras.layers.Average()(model_outputs)
ensemble_model = keras.Model(inputs=model_input, outputs=ensemble_output)

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output
ensemble_model.summary()

ensemble_model.save('ensemble_model.h5')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 9)]          0           []                               
                                                                                                  
 sequential (Sequential)        (None, 1)            42054       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 1)            42054       ['input_1[0][0]']                
                                                                                                  
 sequential_2 (Sequential)      (None, 1)            42054       ['input_1[0][0]']                
                                                                                              

In [12]:
# generate predictions on the test set
# (the input scaler fitted on the training predictors is reused; `id` is dropped first)
X_test = input_sc.transform(
    test.drop(columns='id').to_numpy()
)
y_test_pred = ensemble_model.predict(X_test)

# undo the target scaling so the predictions are back on the original price scale
y_test_pred_2d = y_test_pred.reshape(-1,1)
y_test_pred_orig = output_sc.inverse_transform(y_test_pred_2d).ravel()



In [13]:
# submission file: the test ids next to the predicted prices, without the index
submission = test[['id']].assign(price=y_test_pred_orig)
submission.to_csv('submission.csv',index=False)