In [1]:
import numpy as np
import pandas as pd
import subprocess
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [65]:
train = pd.read_csv('../input/train.csv.zip')
test = pd.read_csv('../input/test.csv.zip')
submission = pd.read_csv('../input/sample_submission.csv.zip')

In [8]:
train.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [9]:
test.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,4,A,B,A,A,A,A,A,A,B,...,0.281143,0.466591,0.317681,0.61229,0.34365,0.38016,0.377724,0.369858,0.704052,0.392562
1,6,A,B,A,B,A,A,A,A,B,...,0.836443,0.482425,0.44376,0.7133,0.5189,0.60401,0.689039,0.675759,0.453468,0.208045
2,9,A,B,A,B,B,A,B,A,B,...,0.718531,0.212308,0.325779,0.29758,0.34365,0.30529,0.24541,0.241676,0.258586,0.297232
3,12,A,A,A,A,B,A,A,A,A,...,0.397069,0.36993,0.342355,0.40028,0.33237,0.3148,0.348867,0.341872,0.592264,0.555955
4,15,B,A,A,A,A,B,A,A,A,...,0.302678,0.398862,0.391833,0.23688,0.43731,0.50556,0.359572,0.352251,0.301535,0.825823


In [11]:
np.random.seed(123)
index = list(train.index)
print(index[0:10])
np.random.shuffle(index)
print(index[0:10])
train = train.iloc[index]
'train = train.iloc[np.random.permutation(len(train))]'

## set test loss to NaN
test['loss'] = np.nan

## response and IDs
y = np.log(train['loss'].values+200)
id_train = train['id'].values
id_test = test['id'].values

## stack train test
ntrain = train.shape[0]
tr_te = pd.concat((train, test), axis = 0)

## Preprocessing and transforming to sparse data
sparse_data = []

f_cat = [f for f in tr_te.columns if 'cat' in f]
for f in f_cat:
    dummy = pd.get_dummies(tr_te[f].astype('category'))
    tmp = csr_matrix(dummy)
    sparse_data.append(tmp)

f_num = [f for f in tr_te.columns if 'cont' in f]
scaler = StandardScaler()
tmp = csr_matrix(scaler.fit_transform(tr_te[f_num]))
sparse_data.append(tmp)

del(tr_te, train, test)

## sparse train and test data
xtr_te = hstack(sparse_data, format = 'csr')
xtrain = xtr_te[:ntrain, :]
xtest = xtr_te[ntrain:, :]

print('Dim train', xtrain.shape)
print('Dim test', xtest.shape)

del(xtr_te, sparse_data, tmp)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[33898, 43407, 3458, 100139, 60086, 117290, 89806, 7751, 164059, 143297]
Dim train (188318, 1190)
Dim test (125546, 1190)


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization, PReLU

In [17]:
y

array([7.56987608, 7.84829365, 7.41602364, ..., 7.33236921, 9.17228019,
       6.37051723])

In [18]:
xtrain

<188318x1190 sparse matrix of type '<class 'numpy.float64'>'
	with 24481340 stored elements in Compressed Sparse Row format>

In [66]:
# Separate features and targets
X = xtrain
Y = y

# Set the input shape
input_shape = (xtrain.shape[1],)
print(f'Feature shape: {input_shape}')

# Create the model
model = Sequential()
model.add(Dense(16, input_shape=input_shape, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))


Feature shape: (1190,)


In [67]:
X.toarray()

array([[ 0.        ,  1.        ,  1.        , ..., -0.95914517,
        -0.61232072, -1.26329393],
       [ 1.        ,  0.        ,  1.        , ...,  0.39395194,
        -0.71240011, -1.1963163 ],
       [ 1.        ,  0.        ,  1.        , ...,  0.91652738,
         0.94750019, -0.90385702],
       ...,
       [ 0.        ,  1.        ,  1.        , ..., -0.83567771,
        -0.01839175,  1.55127219],
       [ 1.        ,  0.        ,  0.        , ..., -0.23903515,
        -0.80958157, -0.56602101],
       [ 0.        ,  1.        ,  1.        , ...,  0.39395194,
         1.53407398, -0.95125139]])

In [68]:
# Configure the model and start training
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(X.toarray(), Y, epochs=20, batch_size=1024, verbose=1, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f20e41ee070>

In [70]:
preds = np.exp(model.predict(xtest))-200
preds

array([[ 1296.1677],
       [ 1648.4943],
       [11054.149 ],
       ...,
       [ 2436.859 ],
       [  904.2169],
       [ 3309.8083]], dtype=float32)

In [72]:
submission['loss'] = preds
submission.to_csv('../submissions/sub_simple_mlp_0.csv', index=False)
submission.head()

Unnamed: 0,id,loss
0,4,1296.167725
1,6,1648.494263
2,9,11054.149414
3,12,5829.130859
4,15,771.655334


In [74]:
## neural net
def nn_model():
    model = Sequential()
    
    model.add(Dense(400, input_dim = xtrain.shape[1], kernel_initializer = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
        
    model.add(Dense(200, kernel_initializer = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.2))
    
    model.add(Dense(50, kernel_initializer = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.2))
    
    model.add(Dense(1, kernel_initializer = 'he_normal'))
    model.compile(loss = 'mae', optimizer = 'adadelta')
    return(model)

In [78]:
model = nn_model()

In [79]:
model.fit(X.toarray(), Y, epochs=600, batch_size=1024, verbose=1, validation_split=0.2)

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

<keras.callbacks.History at 0x7f20c80a25b0>

In [80]:
preds = np.exp(model.predict(xtest))-200
preds

array([[ 840.42236],
       [1138.6089 ],
       [2698.374  ],
       ...,
       [3150.0344 ],
       [ 999.2999 ],
       [2696.4177 ]], dtype=float32)

In [81]:
submission['loss'] = preds
submission.to_csv('../submissions/sub_cmplx_mlp_0.csv', index=False)
submission.head()

Unnamed: 0,id,loss
0,4,840.422363
1,6,1138.608887
2,9,2698.374023
3,12,4880.146484
4,15,738.596924


In [82]:
# Configure the model and start training
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(X.toarray(), Y, epochs=20, batch_size=100, verbose=1, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f20b047a430>

In [83]:
preds = np.exp(model.predict(xtest))-200
preds

array([[1631.2096],
       [1847.1476],
       [9865.234 ],
       ...,
       [2451.0864],
       [1141.5431],
       [3928.2632]], dtype=float32)

In [85]:
submission['loss'] = preds
submission.to_csv('../submissions/sub_cmplx_mlp_1.csv', index=False)
submission.head()

Unnamed: 0,id,loss
0,4,1631.209595
1,6,1847.147583
2,9,9865.234375
3,12,7891.284668
4,15,862.37085


In [87]:
model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_48 (Dense)             (None, 400)               476400    
_________________________________________________________________
p_re_lu_15 (PReLU)           (None, 400)               400       
_________________________________________________________________
batch_normalization_15 (Batc (None, 400)               1600      
_________________________________________________________________
dropout_15 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_49 (Dense)             (None, 200)               80200     
_________________________________________________________________
p_re_lu_16 (PReLU)           (None, 200)               200       
_________________________________________________________________
batch_normalization_16 (Batc (None, 200)             

In [88]:
# Configure the model and start training
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(X.toarray(), Y, epochs=20, batch_size=50, verbose=1, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f20b01d0730>

In [89]:
preds = np.exp(model.predict(xtest))-200
submission['loss'] = preds
submission.to_csv('../submissions/sub_cmplx_mlp_2.csv', index=False)
submission.head()

Unnamed: 0,id,loss
0,4,1593.620483
1,6,1755.819824
2,9,9094.369141
3,12,8058.703125
4,15,849.200195


In [91]:
# Configure the model and start training
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(X.toarray(), Y, epochs=10, batch_size=1024, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f20985559d0>

In [92]:
preds = np.exp(model.predict(xtest))-200
submission['loss'] = preds
submission.to_csv('../submissions/sub_cmplx_mlp_3.csv', index=False)
submission.head()

Unnamed: 0,id,loss
0,4,1480.35791
1,6,1785.122314
2,9,9858.220703
3,12,7311.822266
4,15,842.061646


In [94]:
# Configure the model and start training
model = Sequential()
model.add(Dense(16, input_shape=input_shape, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(X.toarray(), Y, epochs=10, batch_size=2048, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f209806d280>

In [96]:
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(X.toarray(), Y, epochs=10, batch_size=128, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f208460c490>

In [97]:
model.compile(loss='mae', optimizer='adam', metrics=['mae'])
model.fit(X.toarray(), Y, epochs=20, batch_size=128, verbose=1, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f20844ec520>

In [100]:
model = nn_model()
model.fit(X.toarray(), Y, epochs=100, batch_size=1000, verbose=1, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f20b02d5bb0>

In [101]:
model.fit(X.toarray(), Y, epochs=100, batch_size=100, verbose=1, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f20505c55b0>