In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
np.random.seed(7)

In [20]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [21]:
from preprocess import preprocess_train, preprocess, FEATURES, CATEGORICAL_FEATURES, TEST_FEATURES, CATEGORICAL_TEST_FEATURES_IDX

In [22]:
X, y = preprocess_train(train, categotical_features=CATEGORICAL_FEATURES, features=TEST_FEATURES)

In [23]:
X, y = shuffle(X, y)
y = y[:, None]    

In [24]:
# Scale data
scaler = StandardScaler()

In [25]:
y.shape

(8716, 1)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75)

## Feature preprocessing

In [27]:
def encode_categorical(X, cat_feat):
    '''
    Encodes categorical features with one-hot encoding and adds it into model
    
    Parameters
    ----------
    X: numpy.ndarray
        Training features
    cat_feat: list of int
        Categorical features indices
    
    Returns
    -------
    tweaked_X: numpy.ndarray
        Tweaked X
        
    '''
    
    # All the rest
    rest = np.ones(X.shape[1], np.bool)
    rest[cat_feat] = False
    
    X_rest = X[:, rest]
    
    # Encoded
    one_hot_encoded = []
    
    for col_idx in cat_feat:  
        encoded = label_binarize(X[:, col_idx], np.unique(X[:, col_idx]).astype(int))
        
        #print encoded.shape
        
        one_hot_encoded.append(
            encoded
        )
    
    one_hot_encoded.append(X_rest)
    
    return np.hstack(one_hot_encoded)

In [28]:
encoded_X = encode_categorical(X, CATEGORICAL_TEST_FEATURES_IDX)

In [29]:
encoded_X = scaler.fit_transform(encoded_X)

In [30]:
print X.shape, encoded_X.shape

(8716, 44) (8716, 46)


In [31]:
def batch_iterator(X, y, batch_size):
    for i in range(0, len(X) - batch_size, batch_size):
        yield X[i:i+batch_size], y[i:i+batch_size]

## Ynet4

#### CrossVal mean scores
* catboost_d10: 224.01548277848525
* catboost_d16: 

In [42]:
# Training simple model
from sklearn.model_selection import KFold
import tensorflow as tf

In [43]:
kfold = KFold(n_splits=5)

In [46]:
BATCH_SIZE = 64
EPOCHS = 1000
LEARNING_RATE=0.05

In [49]:
tf.reset_default_graph()

fold_num = 1

for train, test in kfold.split(encoded_X):
   
    print "Fold {}".format(fold_num)
    
    X_train, X_test = encoded_X[train], encoded_X[test]
    y_train, y_test = y[train], y[test]
    
    # Create validation datasest
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)
    
    # Model
    input_var = tf.placeholder(tf.float32, shape=[None, X_train.shape[1]])
    gt_var = tf.placeholder(tf.float32, shape=[None, 1])
    
    model = tf.layers.dense(input_var, 50, activation=tf.nn.elu)
    
    # Block1
    block1 = tf.layers.dense(model, 50, activation=tf.nn.elu)
    output1 = tf.layers.dense(block1, 1) 
    
    # Block2
    block2 = tf.layers.dense(model, 20, activation=tf.nn.elu)
    output2 = tf.layers.dense(block2, 1) 
    
    # Block2
    block3 = tf.layers.dense(model, 10, activation=tf.nn.elu)
    output3 = tf.layers.dense(block3, 1) 
    
    # Block2
    block4 = tf.layers.dense(model, 10, activation=tf.nn.elu)
    output4 = tf.layers.dense(block4, 1) 
    
    # Final output
    #final_output = tf.reduce_mean([output1, output2, output3, output4], axis=0)
    final_output = tf.layers.dense(tf.concat([output1, output2, output3, output4], axis=1), 1)
    
    # Loss function 
    loss = tf.reduce_mean(tf.losses.mean_squared_error(gt_var, output1)**.5 ) \
    + tf.reduce_mean(tf.losses.mean_squared_error(gt_var, output2)**.5 ) \
    + tf.reduce_mean(tf.losses.mean_squared_error(gt_var, output3)**.5 ) \
    + tf.reduce_mean(tf.losses.mean_squared_error(gt_var, output3)**.5 ) \
    + tf.reduce_mean(tf.losses.mean_squared_error(gt_var, final_output)**.5 )   
    
    metric = tf.reduce_mean(tf.losses.mean_squared_error(gt_var, final_output)**.5 )   
    
    
    opt = tf.train.AdamOptimizer().minimize(loss)
    
    init = tf.global_variables_initializer()
      
    results = []    
    
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(1, EPOCHS+1):  
            epoch_loss = []
            for X_batch, y_batch in batch_iterator(X_train, y_train, BATCH_SIZE):
                batch_loss, pred, _ = sess.run([metric, final_output, opt], feed_dict={input_var: X_batch, gt_var: y_batch})                      
                epoch_loss.append(batch_loss)                                
             # Validation
            val_loss = sess.run(metric, feed_dict={input_var: X_val, gt_var: y_val})
            if epoch % 25 == 0:
                print "Epoch {} Loss:{} Val loss: {}".format(epoch, np.mean(epoch_loss), val_loss)                                
        # Testing
        fold_test_loss = sess.run(metric, feed_dict={input_var: X_test, gt_var: y_test})

        results.append(fold_test_loss)        
        
        fold_num += 1   
    
    print "After 5 folds: {}".format(np.mean(results))

Fold 1
Epoch 25 Loss:314.149505615 Val loss: 340.738677979
Epoch 50 Loss:287.370697021 Val loss: 317.534088135
Epoch 75 Loss:275.498962402 Val loss: 309.536956787
Epoch 100 Loss:267.550140381 Val loss: 304.236724854
Epoch 125 Loss:260.264801025 Val loss: 299.75112915
Epoch 150 Loss:253.576538086 Val loss: 296.292510986
Epoch 175 Loss:247.419265747 Val loss: 293.496368408
Epoch 200 Loss:241.627487183 Val loss: 291.108856201
Epoch 225 Loss:235.946426392 Val loss: 289.057373047
Epoch 250 Loss:230.761474609 Val loss: 287.2605896
Epoch 275 Loss:226.099151611 Val loss: 285.719665527
Epoch 300 Loss:221.905426025 Val loss: 284.526123047
Epoch 325 Loss:218.067077637 Val loss: 283.495605469
Epoch 350 Loss:214.459060669 Val loss: 282.548126221
Epoch 375 Loss:211.055526733 Val loss: 281.543731689
Epoch 400 Loss:207.642318726 Val loss: 280.653747559
Epoch 425 Loss:204.280700684 Val loss: 280.069580078
Epoch 450 Loss:200.874588013 Val loss: 278.985717773
Epoch 475 Loss:197.576934814 Val loss: 278.32

In [84]:
# scorer = make_scorer(lambda a, b: mean_squared_error(a, b)**.5)
# scores = cross_val_score(lr, encoded_X, y, cv=5, scoring=scorer, verbose=1)

In [57]:
np.mean(scores)

340.449211318938

In [36]:
y_pred = cbr.fit(X_train, y_train, cat_features=CATEGORICAL_TEST_FEATURES_IDX).predict(X_test)
print "Mean squared error: {}".format(mean_squared_error(y_test, y_pred)**.5)

Mean squared error: 284.83019248


In [13]:
# Make submission
cbr.fit(X, y, cat_features=CATEGORICAL_TEST_FEATURES_IDX)

<catboost.core._CatBoostBase at 0x7f8a7f594a50>

In [14]:
test = preprocess(test, CATEGORICAL_FEATURES)

In [15]:
X = test[TEST_FEATURES].values

In [16]:
predictions = cbr.predict(X)

In [17]:
predictions

array([ 879.78794459, 1165.70020084,  279.39788828, ...,  154.47198918,
        -62.34725293,   36.55400176])

In [18]:
def make_submission(ids, predictions):
    df = pd.concat([ids, pd.Series(predictions)], axis=1)
    return df.rename(columns={0: 'value'})

In [19]:
l = test.id

In [20]:
df = pd.concat([test.id, pd.Series(predictions)], axis=1)

In [21]:
df = df.rename(columns={0: 'value'})

In [22]:
df.to_csv("catboost_d10.csv", index=False)