In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Standard plotly imports
import plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import cufflinks as cf
import plotly.figure_factory as ff
import os


import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import xgboost as xgb
# Print the version string of the imported XGBoost package.
xgb_version = xgb.__version__
print("XGBoost version:", xgb_version)

In [None]:
%%time

# Parse the full training CSV with datatable, convert it to pandas, then keep a
# seeded 0.7% random sample drawn without replacement.
train_data_datatable = dt.fread('../input/jane-street-market-prediction/train.csv')
train_full = train_data_datatable.to_pandas()
train = train_full.sample(frac=0.007, replace=False, random_state=42)

# Release the full-size objects; only the sampled frame is kept.
del train_data_datatable, train_full

# The remaining competition files are read directly with pandas.
features = pd.read_csv('../input/jane-street-market-prediction/features.csv')
example_test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')
sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')

print("Data is loaded!")

In [None]:
# Report the (rows, columns) shape of each loaded frame, one line per frame.
# A generator is used so no loop variable is left behind in the kernel namespace.
print(
    *(
        template.format(frame.shape)
        for template, frame in (
            ('train shape is {}', train),
            ('features shape is {}', features),
            ('example_test shape is {}', example_test),
            ('sample_prediction_df shape is {}', sample_prediction_df),
        )
    ),
    sep='\n',
)

In [None]:
# Preview the first five rows of the sampled training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the example test file.
example_test.head(n=5)

In [None]:
# Preview the first five rows of the example submission file.
sample_prediction_df.head(n=5)

In [None]:
# Number of missing values in each column of the training sample.
missing_values_count = train.isnull().sum()
print (missing_values_count)

# Overall share of missing cells. DataFrame.size is rows * columns; it replaces
# np.product, which is deprecated and no longer exists in NumPy 2.0.
total_cells = train.size
total_missing = missing_values_count.sum()
print ("% of missing data = ",(total_missing/total_cells) * 100)

In [None]:
# Keep only rows whose weight is non-zero. The explicit .copy() makes the
# filtered result an independent frame, so adding the target column below is a
# plain column assignment rather than a write to a frame pandas tracks as a
# possible copy (the SettingWithCopyWarning path).
train = train[train['weight'] != 0].copy()

# Binary target: 1 when resp is strictly positive, otherwise 0.
train['action'] = (train['resp'].values > 0).astype('int')

# Model inputs are all columns whose name contains 'feature'; the label is 'action'.
X_train = train.loc[:, train.columns.str.contains('feature')]
y_train = train.loc[:, 'action']

In [None]:
# Model params and some modifications taken from this kernel:
# https://www.kaggle.com/wilddave/xgb-starter
#X_train = X_train.fillna(-999)

# Per-column mean of the training features.
f_mean = X_train.mean(axis=0)

# NOTE(review): fillna returns a new frame and the result is not assigned, so
# this line only renders a mean-filled copy as the cell output; X_train itself
# is unchanged and still holds its NaNs when the model is fitted. If imputation
# is actually intended, assign the result AND apply the same f_mean to X_test in
# the inference loop so that train and test are preprocessed identically.
X_train.fillna(value=f_mean)

In [None]:
# Bar chart of how many rows fall into each value of the binary 'action' target.
# x and y stay as top-level names because a later cell deletes them explicitly.
action_counts = train['action'].value_counts()
x = action_counts.index
y = action_counts.values

# Bars are coloured by their own height on a reversed Viridis scale.
bar_marker = dict(color=y, colorscale='Viridis', reversescale=True)
trace2 = go.Bar(x=x, y=y, marker=bar_marker, name="Imbalance")

layout = dict(
    title="Data imbalance - action",
    xaxis=go.layout.XAxis(automargin=True),
    yaxis=dict(showgrid=False, showline=False, showticklabels=True),
)

fig1 = go.Figure(data=[trace2], layout=layout)
iplot(fig1)

In [None]:
# Drop the plotting helpers and the frames that are not referenced by the
# training cells; X_train and y_train are intentionally left in place.
del x, y
del train, features, example_test, sample_prediction_df

In [None]:
%%time

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import uniform, randint

# Base XGBoost classifier with a fixed random_state.
clf = xgb.XGBClassifier(random_state=10)

# Search space: one value per hyper-parameter, i.e. a single candidate.
param_grid = {'max_depth': [6], 'n_estimators': [100]}

# Randomized search with 5-fold cross-validation over param_grid (not fitted here).
tree_cv = RandomizedSearchCV(clf, param_grid, cv=5)

In [None]:
%time tree_cv.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

import janestreet
# Initialise the competition environment and get its iterator over the test set.
env = janestreet.make_env()
iter_test = env.iter_test()

# Each iteration yields a test batch and the submission frame to fill in: select
# the feature columns, predict with the fitted search object, write the
# predictions into the 'action' column and hand the frame back to the environment.
for test_df, sample_prediction_df in iter_test:
    feature_columns = test_df.columns.str.contains('feature')
    X_test = test_df.loc[:, feature_columns]
    y_preds = tree_cv.predict(X_test)
    sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)