In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw training and test tables from the local data/ directory.
train, test = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")

In [3]:
from preprocess import preprocess_train, preprocess, FEATURES, CATEGORICAL_FEATURES, TEST_FEATURES, CATEGORICAL_TEST_FEATURES_IDX

In [4]:
# Build the feature matrix and the target vector from the raw training table.
# NOTE(review): the keyword is spelled `categotical_features`; presumably this
# matches the parameter name declared in preprocess.py -- confirm there.
X, y = preprocess_train(
    train,
    features=TEST_FEATURES,
    categotical_features=CATEGORICAL_FEATURES,
)

In [5]:
# Shuffle features and targets together so that rows stay aligned.
X, y = shuffle(X, y)
# Turn y into a 2-D column (n_samples, 1), matching the [None, 1] target
# placeholder of the network defined further down. reshape(-1, 1) is
# idempotent, so re-running this cell does not keep appending axes the way
# `y[:, None]` would.
y = np.asarray(y).reshape(-1, 1)

In [6]:
# Scale data
# Only the scaler object is created here; it is fitted later, on the encoded
# feature matrix.
scaler = StandardScaler()

In [7]:
# Sanity check: after the reshape above y should be a 2-D column (n_samples, 1).
y.shape

(8716, 1)

In [8]:
# Hold-out split of the raw (not yet one-hot encoded) feature matrix.
# NOTE(review): test_size=0.75 puts 75% of the rows into the test part and
# leaves only 25% for training -- confirm this is not meant to be 0.25.
# No random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75)

## Feature preprocessing

In [9]:
def encode_categorical(X, cat_feat):
    '''
    Replaces categorical columns of X with binarized indicator columns.

    Every column listed in ``cat_feat`` is expanded with
    ``sklearn.preprocessing.label_binarize``, using the sorted unique values
    of that column (cast to int) as the classes. The encoded blocks come
    first, in the order given by ``cat_feat``, followed by all remaining
    columns in their original order.

    Parameters
    ----------
    X: numpy.ndarray
        2-D feature matrix of shape (n_samples, n_features)
    cat_feat: list of int
        Categorical features indices (column positions in X)

    Returns
    -------
    tweaked_X: numpy.ndarray
        Horizontal stack of [encoded categorical blocks..., other columns]

    Notes
    -----
    ``label_binarize`` produces a single 0/1 column for a feature that has
    exactly two distinct values, so such a feature does not become wider.
    '''

    # An explicit integer index array also makes tuples and empty inputs safe.
    cat_idx = np.asarray(cat_feat, dtype=int)

    # Mask selecting every non-categorical column. The builtin ``bool`` is
    # used because the ``np.bool`` alias no longer exists in current NumPy.
    rest = np.ones(X.shape[1], dtype=bool)
    rest[cat_idx] = False

    blocks = []
    for col_idx in cat_idx:
        column = X[:, col_idx]
        classes = np.unique(column).astype(int)
        # ``classes`` is keyword-only in current scikit-learn; passing it by
        # keyword also works on older releases.
        blocks.append(label_binarize(column, classes=classes))

    # All the rest: untouched columns go after the encoded ones.
    blocks.append(X[:, rest])

    return np.hstack(blocks)

In [10]:
# Expand the categorical columns of the raw feature matrix into indicator
# columns; the result is what the neural network below is trained on.
encoded_X = encode_categorical(X, CATEGORICAL_TEST_FEATURES_IDX)

In [11]:
# Standardize every column of the encoded matrix (indicator columns included).
# NOTE(review): the scaler is fitted on all rows, including the rows that the
# K-fold loop later holds out, so the fold scores see scaling statistics from
# their own validation data. This cell also overwrites its own input, so
# running it twice re-scales already scaled data.
encoded_X = scaler.fit_transform(encoded_X)

In [12]:
# Compare the feature count before and after encoding. Formatting into a
# single string prints the same text on Python 2 and Python 3.
print("{} {}".format(X.shape, encoded_X.shape))

(8716, 44) (8716, 46)


In [13]:
def batch_iterator(X, y, batch_size):
    '''
    Yields consecutive (X_batch, y_batch) slices that cover every sample.

    Parameters
    ----------
    X: sequence or numpy.ndarray
        Features, sliced along the first axis
    y: sequence or numpy.ndarray
        Targets aligned with X, sliced with the same bounds
    batch_size: int
        Maximum number of samples per batch, must be >= 1

    Yields
    ------
    (X_batch, y_batch): tuple
        Slices of at most ``batch_size`` samples; the last batch is shorter
        when ``len(X)`` is not a multiple of ``batch_size``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (raised on first iteration).
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Stop at len(X), not len(X) - batch_size: the old bound dropped the final
    # full batch when len(X) was a multiple of batch_size and yielded nothing
    # at all when len(X) <= batch_size.
    for start in range(0, len(X), batch_size):
        stop = start + batch_size
        yield X[start:stop], y[start:stop]

## Neural Network

#### CrossVal mean scores
* catboost_d10: 224.01548277848525
* catboost_d16: 

In [14]:
# Training simple model
from sklearn.model_selection import KFold
import tensorflow as tf

In [15]:
# 5-fold splitter used by the training loop below. No shuffle is requested
# here; the rows were already shuffled once right after preprocessing.
kfold = KFold(n_splits=5)

In [16]:
# Mini-batch size passed to batch_iterator in the training loop.
BATCH_SIZE = 32
# Number of passes over the training part of every fold.
EPOCHS = 1000

In [None]:
# K-fold training of a small fully connected regression network.
#
# Deliberate differences from a naive loop:
# * the loop variables are `train_idx` / `test_idx`, so the `train` / `test`
#   DataFrames loaded from CSV are not overwritten (the submission cells at
#   the end of the notebook still need `test`);
# * fold data uses fold-local names, so the hold-out split stored in
#   X_train / X_test / y_train / y_test stays intact;
# * the default graph is reset for every fold instead of once, otherwise each
#   fold adds another copy of the network to the same graph;
# * the held-out fold is scored after training, so the split is actually used.

fold_scores = []

for fold_num, (train_idx, test_idx) in enumerate(kfold.split(encoded_X), 1):

    print("Fold {}".format(fold_num))

    tf.reset_default_graph()

    X_fold, X_holdout = encoded_X[train_idx], encoded_X[test_idx]
    y_fold, y_holdout = y[train_idx], y[test_idx]

    # Create validation dataset (sklearn default: 25% of the fold's rows)
    X_fit, X_val, y_fit, y_val = train_test_split(X_fold, y_fold)

    # Model: five ELU hidden layers of 50 units and a linear output unit
    input_var = tf.placeholder(tf.float32, shape=[None, X_fit.shape[1]])
    gt_var = tf.placeholder(tf.float32, shape=[None, 1])

    model = input_var
    for _ in range(5):
        model = tf.layers.dense(model, 50, activation=tf.nn.elu)
    output = tf.layers.dense(model, 1)

    # Loss function: root of the batch mean squared error (RMSE).
    # mean_squared_error already reduces to a scalar, so no extra mean is needed.
    loss = tf.sqrt(tf.losses.mean_squared_error(gt_var, output))
    opt = tf.train.AdamOptimizer().minimize(loss)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(1, EPOCHS + 1):
            epoch_loss = []
            for X_batch, y_batch in batch_iterator(X_fit, y_fit, BATCH_SIZE):
                batch_loss, _ = sess.run(
                    [loss, opt],
                    feed_dict={input_var: X_batch, gt_var: y_batch},
                )
                epoch_loss.append(batch_loss)

            # Validation
            val_loss = sess.run(loss, feed_dict={input_var: X_val, gt_var: y_val})

            print("Epoch {} Loss:{} Val loss: {}".format(epoch, np.mean(epoch_loss), val_loss))

        # Score the rows this fold never saw during training or validation.
        holdout_loss = sess.run(loss, feed_dict={input_var: X_holdout, gt_var: y_holdout})
        fold_scores.append(holdout_loss)
        print("Fold {} held-out loss: {}".format(fold_num, holdout_loss))

print("Mean held-out loss over folds: {}".format(np.mean(fold_scores)))

Fold 1
Epoch 1 Loss:383.995269775 Val loss: 347.532440186
Epoch 2 Loss:341.911956787 Val loss: 337.476104736
Epoch 3 Loss:334.416351318 Val loss: 330.11340332
Epoch 4 Loss:326.847381592 Val loss: 321.784698486
Epoch 5 Loss:318.927429199 Val loss: 314.336547852
Epoch 6 Loss:311.939758301 Val loss: 308.616119385
Epoch 7 Loss:305.973480225 Val loss: 303.845458984
Epoch 8 Loss:300.771514893 Val loss: 299.565093994
Epoch 9 Loss:296.221923828 Val loss: 295.799682617
Epoch 10 Loss:292.042358398 Val loss: 292.37701416
Epoch 11 Loss:288.230957031 Val loss: 289.510955811
Epoch 12 Loss:284.77166748 Val loss: 286.966888428
Epoch 13 Loss:281.52243042 Val loss: 284.697601318
Epoch 14 Loss:278.445800781 Val loss: 282.68838501
Epoch 15 Loss:275.51171875 Val loss: 280.936523438
Epoch 16 Loss:272.685577393 Val loss: 279.291778564
Epoch 17 Loss:270.029174805 Val loss: 277.783111572
Epoch 18 Loss:267.564605713 Val loss: 276.409942627
Epoch 19 Loss:265.238800049 Val loss: 275.252471924
Epoch 20 Loss:263.05

In [84]:
# scorer = make_scorer(lambda a, b: mean_squared_error(a, b)**.5)
# scores = cross_val_score(lr, encoded_X, y, cv=5, scoring=scorer, verbose=1)

In [57]:
# Mean of the cross-validation scores.
# NOTE(review): `scores` is only assigned in the commented-out cell above
# (and that line refers to an estimator `lr` that is not defined anywhere in
# the visible cells), so this cell fails on a fresh kernel.
np.mean(scores)

340.449211318938

In [36]:
# Fit on the training part of the hold-out split and predict its test part.
# NOTE(review): `cbr` is not created in any visible cell (the stored output of
# a later cell suggests a CatBoost model) -- it must be defined before this
# cell can run on a fresh kernel.
y_pred = cbr.fit(X_train, y_train, cat_features=CATEGORICAL_TEST_FEATURES_IDX).predict(X_test)
# The square root of the MSE is taken, so the printed number is an RMSE even
# though the label says "Mean squared error". The parenthesized single-argument
# print behaves the same on Python 2 and Python 3.
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)**.5))

Mean squared error: 284.83019248


In [13]:
# Make submission
cbr.fit(X, y, cat_features=CATEGORICAL_TEST_FEATURES_IDX)

<catboost.core._CatBoostBase at 0x7f8a7f594a50>

In [14]:
# Preprocess the test table with the project helper.
# `test` has to be the DataFrame read from data/test.csv when this runs.
# The cell overwrites its own input, so running it a second time feeds
# already preprocessed data back into `preprocess`.
test = preprocess(test, CATEGORICAL_FEATURES)

In [15]:
# Feature matrix for the test rows, columns in TEST_FEATURES order.
# Note: this rebinds `X`, which held the training features until now.
X = test[TEST_FEATURES].values

In [16]:
# Predict the target for every row of the test feature matrix built above.
predictions = cbr.predict(X)

In [17]:
# Quick look at the raw predictions; the stored output contains negative
# values (e.g. -62.35).
predictions

array([ 879.78794459, 1165.70020084,  279.39788828, ...,  154.47198918,
        -62.34725293,   36.55400176])

In [18]:
def make_submission(ids, predictions):
    '''
    Builds the submission table: the id column followed by a 'value' column.

    Parameters
    ----------
    ids: pandas.Series
        Row identifiers; its name becomes the first column label and its
        index becomes the index of the result
    predictions: array-like
        One prediction per id, in the same order as ``ids``

    Returns
    -------
    df: pandas.DataFrame
        Two columns: the ids and 'value'

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids.
    '''
    # Attach the predictions by position. Concatenating two Series with
    # axis=1 aligns them on their index labels instead, which pads both
    # columns with NaN as soon as `ids` does not carry a default 0..n-1 index.
    values = np.asarray(predictions).ravel()
    df = ids.to_frame()
    df['value'] = values
    return df

In [19]:
# Keep a handle on the id column of the test table.
# NOTE(review): `l` is not referenced by any later visible cell (they use
# `test.id` directly).
l = test.id

In [20]:
# Put ids and predictions side by side. The predictions get the index of the
# test table so that concat pairs them row by row; with a fresh 0..n-1 index
# they would be aligned by label and NaN-padded whenever `test` carries a
# non-default index (e.g. if preprocessing dropped rows -- not visible here).
# The Series is left unnamed, so its column is labeled 0.
df = pd.concat([test.id, pd.Series(predictions, index=test.index)], axis=1)

In [21]:
# The unnamed predictions column is labeled 0 after concat; give it the
# name 'value'.
df = df.rename(columns={0: 'value'})

In [22]:
# Write the submission without the DataFrame index column.
df.to_csv("catboost_d10.csv", index=False)