In [1]:
import gym as kagglegym
import tensorflow
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LinearRegression, SGDRegressor, Ridge
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import gc

%matplotlib inline

In [2]:
class Target(object):
    columns = ["id", "timestamp"]
class Observation(object):
    target = Target()
    train = None
    
observation = Observation()
with pd.HDFStore("./train.h5", "r") as train:
    
    observation.train = train.get("train")
    print(observation.train.columns)
    
# The "environment" is our interface for code competitions
# env = kagglegym.make('CartPole-v0')

# # We get our initial observation by calling "reset"
# observation = env.reset()
# print(observation)
print(observation.train[observation.target.columns].head())
# # Note that the first observation we get has a "train" dataframe
print("Train has {} rows".format(len(observation.train)))

# # The "target" dataframe is a template for what we need to predict:
print("Target column names: {}".format(", ".join(['"{}"'.format(col) for col in list(observation.target.columns)])))



Index(['id', 'timestamp', 'derived_0', 'derived_1', 'derived_2', 'derived_3',
       'derived_4', 'fundamental_0', 'fundamental_1', 'fundamental_2',
       ...
       'technical_36', 'technical_37', 'technical_38', 'technical_39',
       'technical_40', 'technical_41', 'technical_42', 'technical_43',
       'technical_44', 'y'],
      dtype='object', length=111)
   id  timestamp
0  10          0
1  11          0
2  12          0
3  25          0
4  26          0
Train has 1710756 rows
Target column names: "id", "timestamp"


In [3]:
cols = list(observation.train.columns)
cols.remove('id')
cols.remove('timestamp')
cols.remove('y')

mean_values = observation.train[cols].mean(axis=0)
xStd = observation.train[cols].std(axis=0)  # Remember to save this value



In [4]:
for c in cols:
    observation.train[c].fillna(mean_values[c], inplace=True)
    observation.train[c] = np.tanh((observation.train[c] - mean_values[c])/xStd[c])

In [None]:
observation.train.describe()

In [5]:
train, test = train_test_split(observation.train.copy(), test_size=0.33, random_state=42)

In [6]:
(train.shape, test.shape)

((1146206, 111), (564550, 111))

In [7]:
desc = dict(observation.train.y.describe())

cut_max = 0.093
cut_min = -0.086
(desc["max"], desc["min"])

(0.093497805297374725, -0.086094126105308533)

In [8]:
median_group = dict(observation.train.groupby(["id"])["y"].median())

In [None]:
#median_group

In [None]:
sns.distplot(observation.train.y);

In [9]:
def get_weighted_y(series):
    id, y = series["id"], series["y"]
    return 0.95 * y + 0.05 * median_group[id] if id in median_group else y

In [None]:
# Ridge

In [10]:
ridge_cols_to_use = ['technical_30', 'technical_20', 'fundamental_11'] + ['technical_19', 'technical_36'] 
y_is_above_cut = (train.y > cut_max)
y_is_below_cut = (train.y < cut_min)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)

In [11]:
X = np.array(train.loc[y_is_within_cut, ridge_cols_to_use].values)
y = train.loc[y_is_within_cut, "y"]

poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(np.array(train.loc[y_is_within_cut, ridge_cols_to_use].values))


In [12]:
model = Ridge()
model.fit(X_poly, y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [15]:
ridge_pred_test = test[["id", "y"]].copy()
X_test = poly.transform(test[ridge_cols_to_use])
ridge_pred_test.y = model.predict(X_test).clip(cut_min, cut_max)
ridge_pred_test.y = ridge_pred_test.apply(get_weighted_y, axis = 1)
test["y_ridge"] = ridge_pred_test.y.astype(np.float32)

ridge_pred_train = train[["id", "y"]].copy()
ridge_pred_train.y = model.predict(poly.transform(train[ridge_cols_to_use])).clip(cut_min, cut_max)
ridge_pred_train.y = ridge_pred_train.apply(get_weighted_y, axis = 1)
train["y_ridge"] = ridge_pred_train.y.astype(np.float32)

In [26]:
# GBoost
gb_cols = ridge_cols_to_use + ['y_ridge']
X_gb = np.array(train.loc[y_is_within_cut, gb_cols].values)

In [37]:
gboptions = {"verbose":True, "max_depth":4, "n_estimators":250, "subsample":0.8, "learning_rate":0.01}
rfoptions = {"verbose":True, "max_depth":4, "n_estimators":250, "n_jobs":-1}

regression = ensemble.GradientBoostingRegressor(**gboptions)
#regression = ensemble.RandomForestRegressor(**rfoptions)
regression.fit(X_gb, y)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.8min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=-1, oob_score=False, random_state=None,
           verbose=True, warm_start=False)

In [38]:
gb_pred_test = test[["id", "y"]].copy()
gb_pred_test.y = regression.predict(test[gb_cols])
gb_pred_test.y = ridge_pred_test.apply(get_weighted_y, axis = 1)
#test["y_ridge"] = ridge_pred_test.y.astype(np.float32)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.8s finished


In [39]:
# Metric
metrics.r2_score(test.y.values, gb_pred_test.y)

0.00090052366552484386

In [None]:
# metrics.r2_score(test.y.values, gb_pred_test.y)
metrics.r2_score(test.y.values, ridge_pred_test.y)
# ElasticNetCV - 0.000107 (3)
# ElasticNetCV - 0.000109 (5)
# Ridge        - 0.000270 (3)
# Ridge        - 0.000303 (5)
# Ridge        - 0.000727 (5) + poly 3
# Ridge        - 0.000823 (5) + poly 4
# GBoost       - 0.000912 (5)

In [None]:
plt.plot(range(len(test)), np.cumsum(ridge_pred_test.y.values))
plt.plot(range(len(test)), np.cumsum(test.y.values))

In [None]:
# cols = list(train.columns)
# cols.remove('id')
# cols.remove('timestamp')
# cols.remove('y')
cols = ridge_cols_to_use + ['y_ridge']

In [None]:
#GBoost

In [None]:
gboptions = {"verbose":True, "max_depth":4, "n_estimators":200, "subsample":0.8, "learning_rate":0.02}

regression = ensemble.GradientBoostingRegressor(**gboptions)
regression.fit(X, y)

In [None]:
metrics.r2_score(test.y.values, gb_pred_test.y)

In [None]:
# NN for dummies

In [None]:
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=cols.__len__())]
DNN = tf.contrib.learn.DNNRegressor(hidden_units=[200, 100, 40], feature_columns=feature_columns, activation_fn=tf.tanh)
DNN.fit(train[cols].values, train["y"].values, steps=50000, batch_size=16)


In [None]:
prediction_tf = DNN.predict(test[cols].values)

In [None]:
prediction_tf_2  = list(prediction_tf)

In [None]:
metrics.r2_score(test.y.values, prediction_tf_2)

In [None]:
# NN Classic

In [None]:
def wb(wshape=[None], bshape=[None], device='/cpu:0'):
    with tf.device(device):
        w = tf.get_variable("w", wshape, initializer=tf.truncated_normal_initializer(stddev=0.1))
        b = tf.get_variable('b', bshape, initializer=tf.constant_initializer(0.0))
    return w, b

In [None]:
batch_size = 32
l2_reg_norm = 5e-5
features = cols.__len__()
train_size = train.shape[0]

widest = 64

tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    X_tf = tf.placeholder(tf.float32, shape=(batch_size, features))
    y_tf = tf.placeholder(tf.float32, shape=(batch_size, 1))
    X_test = tf.constant(test[cols].values)
    y_test = tf.constant(test[["y"]].values)

    with tf.variable_scope("Layer1"):
            layer1_weights, layer1_biases = wb([features, widest], [widest])
    with tf.variable_scope("Layer2"):
            layer2_weights, layer2_biases = wb([widest, widest], [widest])
    with tf.variable_scope("Layer3"):
            layer3_weights, layer3_biases = wb([widest, 1], [1])

    def model(data, train=True):
        print("data", data.get_shape())
        
        layer1 = tf.nn.relu(tf.matmul(data, layer1_weights) + layer1_biases)
        print("layer1", layer1.get_shape())
        if train:
            layer1 = tf.nn.dropout(layer1, 0.5)
        
        layer2 = tf.nn.relu(tf.matmul(layer1, layer2_weights) + layer2_biases)
        print("layer2", layer2.get_shape())
        if train:
            layer2 = tf.nn.dropout(layer2, 0.5)
            
        layer3 = tf.tanh(tf.matmul(layer2, layer3_weights) + layer3_biases)
        print("layer3", layer3.get_shape())
        
        return layer3
    
    with tf.device("/cpu:0"):
        predictions = model(X_tf)
        regularizers = (tf.nn.l2_loss(layer1_weights) + tf.nn.l2_loss(layer1_biases) + 
                        tf.nn.l2_loss(layer2_weights) + tf.nn.l2_loss(layer2_biases) + 
                        tf.nn.l2_loss(layer3_weights) + tf.nn.l2_loss(layer3_biases))
        loss_l2 = l2_reg_norm * regularizers
        loss_data = tf.reduce_mean(tf.square(predictions - y_tf))
        loss = loss_data #+ loss_l2
        

        # Optimizer.
        global_step = tf.Variable(0, trainable=False)
        learn_rate  = tf.train.exponential_decay(.001, global_step*batch_size, train_size, 0.5, staircase=True)
        optimizer = tf.train.AdamOptimizer(learn_rate).minimize(loss, global_step=global_step, name="Optimizer")

        #test
    
        test_prediction = model(X_test, train=False)
        loss_test = metrics.r2_score(y_test, test_prediction)

In [None]:
num_steps = train_size//batch_size * 4
print(num_steps)
y_vals = train[["y"]].values
x_vals = train[cols].values
train_preds_nn = []
with tf.Session(graph=graph) as sess:
    init_op = tf.initialize_all_variables()
    saver = tf.train.Saver()
    init_op.run()
    print("Initialized valiables")
    for i in range(num_steps):

        offset = (i*batch_size) % (train_size - batch_size)
        y_ = y_vals[offset:offset+batch_size, :]
        X_ = x_vals[offset:offset+batch_size, :]

        feed_dict = {X_tf : X_, y_tf : y_}
        _, l, pred = sess.run([optimizer, loss, predictions], feed_dict=feed_dict)
        if (i < 1000 and i%100 == 0) or i%10000 == 0:
            print(i, l)
        if i>0 and i%30000 == 0:
            print(i, "test", loss_test.eval())
#         if i>0 and i%30000 == 0:
#             save_path = saver.save(sess, "ts_{}.ckpt".format(i))
    train_preds_nn = test_prediction.eval()
    save_path = saver.save(sess, "ts_end.ckpt")

In [None]:

# fig = plt.figure(figsize=(8, 20))
# plot_count = 0
# for col in cols:
#     plot_count += 1
#     plt.subplot(cols.__len__(), 1, plot_count)
#     plt.scatter(range(train.shape[0]), train[col].values)
#     plt.title("Distribution of "+col)
# plt.show()

In [None]:

plt.plot(range(100), test.y.values[:100])
plt.plot(range(100), train_preds_nn[:100])

In [None]:
ridge_pred_test.y.values[:10]