# Table of contents

1. Data processing and manipulation
2. Model training with initial setting
3. RandomizedSearchCV to search for the best hyperparameters
4. Model training with tuned setting
5. Model error comparison between initial setting and tuned hyper-parameter model
6. Tuned-model feature importance visualization

### Part 1: Data processing and manipulation

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import janestreet
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import time
from xgboost import XGBRegressor# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
chunksize = 10 ** 6  # NOTE(review): not passed to read_csv below; the file is read in one go
filename = r'/kaggle/input/jane-street-market-prediction/train.csv'
# Time the read of the whole training CSV into a single dataframe.
# (The former `data_chunk = []` placeholder was overwritten immediately and has been dropped.)
start_time = time.time()
data_chunk = pd.read_csv(filename)
print("--- %s seconds ---" % (time.time() - start_time))

***
Start by processing the label, since the evaluation only distinguishes between two actions: buy or pass.

Define a function that returns 0 for pass and 1 for buy:

**Buy = weight * resp > 0**

**Pass = weight * resp <= 0** 
***

In [None]:
def buy_or_pass(df):
    '''
    Input : a single row (anything that can be indexed with 'action')
    Output : 1 (buy) when the row's 'action' value is strictly positive,
             0 (pass) otherwise
    '''
    return 1 if df['action'] > 0 else 0

Split the data into features and the action label for model training

In [None]:
# The data description states that the return depends on the weight and resp columns,
# so the binary target is derived from the sign of weight * resp.
def feature_action_split(dataframe_market):
    '''
    Split a Jane Street market dataframe into model features and a binary action label.

    Input : dataframe containing at least the columns date, weight, resp_1..resp_4,
            resp and ts_id; every remaining column is treated as a feature.
            NOTE: an 'action' column is written to this dataframe in place.
            Rows are not filtered here.
    Output : feature = dataframe without date, weight, resp_1..resp_4, resp, ts_id and action
             action = single-column dataframe, 1 (buy) where weight * resp > 0, else 0 (pass)
    '''
    # Vectorized label instead of a per-row Python call: same 0/1 values, int64 dtype,
    # and it also works on an empty frame. NaN products compare False and map to 0.
    dataframe_market['action'] = (
        (dataframe_market['weight'] * dataframe_market['resp']) > 0
    ).astype('int64')
    feature = dataframe_market.drop(
        ['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp', 'ts_id', 'action'],
        axis=1,
    )
    print("Features columns : ",feature.columns)
    action = dataframe_market[['action']]
    print("Action counts : \n",action.value_counts())
    return feature,action
# Keep only rows with a strictly positive weight, then derive the feature frame and the
# 0/1 action label from them. feature_action_split also writes an 'action' column onto
# data_chunk itself.
data_chunk = data_chunk[data_chunk['weight'] > 0]
feature,action = feature_action_split(data_chunk)


In [None]:
# Vectorized form of the label: 1 where weight * resp > 0, else 0, as a NumPy integer array.
action_transform = ((data_chunk['weight'].values * data_chunk['resp'].values) > 0).astype('int')


In [None]:
# Display the vectorized labels.
action_transform

In [None]:
# Display the first 10 labels returned by feature_action_split.
action[:10]

### Part 2 : Model training with initial setting

Model training. An imputer is not strictly necessary for XGBoost, which can handle missing values on its own.

For the basic model, we train with a fixed set of parameters.

If you are running XGBoost on a GPU, set tree_method = 'gpu_hist'; otherwise run on a normal CPU.

In [None]:
import xgboost as xgb
import time


# Hold out 25% of the rows (shuffled by default). The fixed random_state makes the
# partition reproducible: rerunning this cell, or splitting the same data with the same
# seed anywhere else, yields the same train/test rows.
train_X, test_X, train_y, test_y = train_test_split(
    feature, action.values.flatten(), test_size=0.25, random_state=42)

# Fill missing feature values (SimpleImputer defaults to the column mean). The statistics
# are learned from the training split only and then applied to the held-out split.
my_imputer = SimpleImputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)
start_time = time.time()

# Baseline classifier with hand-picked parameters.
# base_score starts the boosting from the share of 'buy' labels in the training split.
# tree_method='gpu_hist' requires a GPU runtime.
initial_model = xgb.XGBClassifier(n_estimators=1000, 
                        max_depth=5, 
                        learning_rate=0.1, 
                        subsample=0.7,
                        colsample_bytree=0.8, 
                        colsample_bylevel=0.8, 
                        base_score=train_y.mean(),
                        tree_method= 'gpu_hist',
                        random_state=42, seed=42)

# Stop once the classification error on the eval set has not improved for 10 rounds;
# progress is printed every 100 rounds.
# NOTE(review): the same held-out split drives early stopping and is later used for
# evaluation, so the reported error is likely optimistic.
init_mod = initial_model.fit(train_X, train_y, 
                    early_stopping_rounds=10, 
                    eval_set=[(test_X, test_y)], eval_metric='error', 
                    verbose=100)

print("--- %s seconds ---" % (time.time() - start_time))


In [None]:
# Display names in label order: 0 = pass, 1 = buy.
class_names = ['pass','buy']

# Confusion matrix of the initial model's predictions on the held-out split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
disp = plot_confusion_matrix(init_mod, test_X, test_y,
                             display_labels=class_names,
                             cmap=plt.cm.Blues)
plt.title('Initial Model without tuning using Xgboost')

The initial result is decent on a test set of 250k points. Losses are incurred when the model predicts __buy__ but the actual label is __pass__.

The ideal outcome for the market is, first, to minimize losses from false positives (true label is pass but predicted as buy) and, second, to maximize profit from true positives (predicted buy and true label is buy).



### Part 3: Model tuning using RandomizedSearchCv to search for best hyper parameter



This search takes a long time: it took 155 minutes to run 1500 fits with the GPU enabled.
* Please make sure the GPU setting is turned on when you run this.

__This part is commented out because it takes approximately 150 minutes to run.__

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# params = {
#         'learning_rate': [0.03, 0.01, 0.003, 0.001],
#         'min_child_weight': [1,3, 5,7, 10],
#         'gamma': [0, 0.5, 1, 1.5, 2, 2.5, 5],
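#         'subsample': [0.6, 0.8, 1.0, 1.2, 1.4],  # NOTE: XGBoost requires subsample in (0, 1]; 1.2 and 1.4 are out of range
#         'colsample_bytree': [0.6, 0.8, 1.0, 1.2, 1.4],  # NOTE: XGBoost requires colsample_bytree in (0, 1]; 1.2 and 1.4 are out of range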
#         'max_depth': [3, 4, 5, 6, 7, 8, 9 ,10, 12, 14],
#         'reg_lambda':np.array([0.4, 0.6, 0.8, 1, 1.2, 1.4])}

# # specific parameters. I set early stopping to avoid overfitting and specify the validation dataset 
# fit_params = { 
#         'early_stopping_rounds':10,
#         'eval_set':[(test_X, test_y)]}

# # let's run the optimization
# random_search = RandomizedSearchCV(init_mod, param_distributions=params, n_iter=500,
#                                    scoring="precision", n_jobs=-1,  verbose=3, random_state=42, cv=3 )


In [None]:
# random_search.fit(train_X,train_y, **fit_params)
# print(" Results from Random Search " )
# print("\n The best estimator across ALL searched params:\n", random_search.best_estimator_)
# print("\n The best score across ALL searched params:\n", random_search.best_score_)
# print("\n The best parameters across ALL searched params:\n", random_search.best_params_)

### Part 4:  Model training with best hyper-parameter result

In [None]:
import xgboost as xgb
import time


# Hold out 25% of the rows (shuffled by default). The fixed random_state makes the
# partition reproducible: rerunning this cell, or splitting the same data with the same
# seed anywhere else, yields the same train/test rows, which keeps error comparisons
# between models on a common hold-out set.
train_X, test_X, train_y, test_y = train_test_split(
    feature, action.values.flatten(), test_size=0.25, random_state=42)

# Fill missing feature values (SimpleImputer defaults to the column mean). The statistics
# are learned from the training split only and then applied to the held-out split.
my_imputer = SimpleImputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)
start_time = time.time()

# Classifier configured with the tuned hyperparameters (deeper trees, smaller learning
# rate, gamma and reg_lambda set explicitly).
# base_score starts the boosting from the share of 'buy' labels in the training split.
# tree_method='gpu_hist' requires a GPU runtime.
tuned_model = xgb.XGBClassifier(n_estimators=1000, 
                        max_depth=14, 
                        learning_rate=0.01, 
                        subsample=1,
                        colsample_bytree=0.8, 
                        colsample_bylevel=0.8, 
                        gamma=0.5,reg_lambda = 1.4,
                        base_score=train_y.mean(),
                        tree_method= 'gpu_hist',
                        random_state=42, seed=42)


# Stop once the classification error on the eval set has not improved for 10 rounds;
# progress is printed every 100 rounds.
# NOTE(review): the same held-out split drives early stopping and is later used for
# evaluation, so the reported error is likely optimistic.
tuned_mod = tuned_model.fit(train_X, train_y, 
                    early_stopping_rounds=10, 
                    eval_set=[(test_X, test_y)], eval_metric='error', 
                    verbose=100)

print("--- %s seconds ---" % (time.time() - start_time))


In [None]:
# Display names in label order: 0 = pass, 1 = buy.
class_names = ['pass','buy']

# Confusion matrix of the tuned model's predictions on the held-out split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
disp = plot_confusion_matrix(tuned_mod, test_X, test_y,
                             display_labels=class_names,
                             cmap=plt.cm.Blues)
plt.title('Tuned Model')

### Part 5: Model error comparison between the initial XGBoost classifier model and the tuned-hyperparameter XGBoost classifier model

In [None]:

# Per-round classification error that each fitted model recorded on its eval set.
init_model_res = init_mod.evals_result()
tuned_model_res = tuned_mod.evals_result()
init_errors = init_model_res['validation_0']['error']
tuned_errors = tuned_model_res['validation_0']['error']

# One x value per boosting round actually run by each model.
epochs_init, epochs_tuned = len(init_errors), len(tuned_errors)
x_axis_init, x_axis_tuned = range(0, epochs_init), range(0, epochs_tuned)

# Draw both error curves on a shared axis.
fig, ax = plt.subplots()
curves = (
    (x_axis_init, init_errors, 'Initial classification error'),
    (x_axis_tuned, tuned_errors, 'Tuned classification error'),
)
for rounds, errors, curve_label in curves:
    ax.plot(rounds, errors, label=curve_label)

ax.legend()
ax.set_ylabel('Classification Error')
ax.set_title('XGBoost Classification error on test data')
plt.show()

### Part 6: Tuned-model feature importance visualization

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

# Print the raw 'weight' importance scores of the tuned model's booster.
tuned_booster = tuned_mod.get_booster()
weight_scores = tuned_booster.get_score(importance_type='weight')
print(weight_scores)

# Bar chart restricted to the 10 highest-ranked features.
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(tuned_mod, ax=ax, max_num_features=10)
plt.show()

In [None]:
# Debug peek at the matrix handed to the model. `test_df` is first bound by the
# submission loop in the following cell, so on a fresh top-to-bottom run this expression
# raised NameError; it is kept commented out for reference only.
# test_df.iloc[:,2:].values

In [None]:
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# Columns the model was trained on, in training order.
feature_columns = list(feature.columns)

for (test_df, sample_prediction_df) in iter_test:
    # Select the features by name instead of by position (the column layout of test_df
    # is not established in this notebook) and apply the imputer fitted on the training
    # split, so inference sees the same columns and preprocessing as training did.
    test_features = my_imputer.transform(test_df[feature_columns])
    sample_prediction_df['action'] = tuned_mod.predict(test_features)
    env.predict(sample_prediction_df)


In [None]:
import torch