In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
from sklearn.linear_model import Lasso
import lightgbm as lgbm

# Load the low-memory training parquet, drop every row whose index label is
# listed in validation_indexes.csv (that file is read with its first column as
# the index), and keep a random half of what remains.
validation_index = pd.read_csv('../input/math5470/validation_indexes.csv', index_col=0).index
X = (
    pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
    .drop(validation_index)
    # Fixed seed: without it the 50% subsample (and every score and
    # hyper-parameter choice derived from it) changes on each run.
    .sample(frac=0.5, random_state=1)
    # Replace the shuffled labels with a fresh 0..n-1 index.
    .reset_index(drop=True)
)
X.shape

(1178029, 304)

In [2]:
# Sanity check: display the shape of the validation-index file when its first
# column is used as the index.
validation_index_frame = pd.read_csv('../input/math5470/validation_indexes.csv', index_col=0)
validation_index_frame.shape

(785352, 0)

In [3]:
# Pearson correlation of every feature column with the target.
# The first four columns are skipped -- presumably the identifier columns and
# 'target' itself; TODO confirm against the parquet schema.
features = X.columns[4:]
target = X['target']
# np.corrcoef returns a 2x2 matrix; the off-diagonal entry is the correlation.
corrs = [np.corrcoef(target, X[col])[0, 1] for col in features]
corrs_feature_assetNum = pd.Series(corrs, index=features)

In [4]:
import numpy as np
from sklearn.model_selection import KFold
import scipy.stats as st
import xgboost as xgb

# Column order for the model: every feature, sorted by its correlation with the
# target from highest to lowest (no feature is dropped here).
feature_subset = corrs_feature_assetNum.sort_values(ascending=False).index

# NOTE: X is rebound to the feature-only frame, so this cell can only be run
# once per load of X (a second run would no longer find the 'target' column).
y = X['target']
X = X[feature_subset]

# 3 folds, default settings (no shuffling inside KFold).
kf = KFold(n_splits=3)

params_1 = [0.1, 0.2, 0.3]  # learning_rate candidates
params_2 = [1, 10]          # max_depth candidates

# One list of per-fold Pearson scores for every (learning_rate, max_depth) pair.
scores = {(param_1, param_2): [] for param_1 in params_1 for param_2 in params_2}

for train_index, test_index in kf.split(X):
    # KFold.split yields *positional* indices, so rows are selected with .iloc;
    # .loc would silently depend on the frame having a 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for param_1 in params_1:
        for param_2 in params_2:
            model = xgb.XGBRegressor(
                n_estimators=100,
                learning_rate=param_1,
                max_depth=param_2,
                subsample=0.5,
                random_state=1)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Score = Pearson correlation between predictions and held-out target.
            scores[(param_1, param_2)].append(st.pearsonr(y_pred, y_test)[0])
            print(scores[(param_1, param_2)])

[0.1076037017768402]
[0.1454906606749752]
[0.11412031608057359]
[0.1265881388215182]
[0.11592074189737428]
[0.10424746288636874]
[0.1076037017768402, 0.10735933986155867]
[0.1454906606749752, 0.14155094808967827]
[0.11412031608057359, 0.11376039236937585]
[0.1265881388215182, 0.12209790826626798]
[0.11592074189737428, 0.11636415565407836]
[0.10424746288636874, 0.1099567292775272]
[0.1076037017768402, 0.10735933986155867, 0.11007961402087993]
[0.1454906606749752, 0.14155094808967827, 0.14487564515846368]
[0.11412031608057359, 0.11376039236937585, 0.11528279872544556]
[0.1265881388215182, 0.12209790826626798, 0.1195026430952446]
[0.11592074189737428, 0.11636415565407836, 0.11706001503109267]
[0.10424746288636874, 0.1099567292775272, 0.10341988019556358]


In [5]:
# Disabled experiment, kept for reference only (every line is commented out):
# a 3-fold XGBoost grid search restricted to the first 100 features of the
# descending correlation ranking, with learning_rate in [0.01, 0.1, 1] and
# max_depth in [1, 6].

# feature_subset = corrs_feature_assetNum.sort_values(ascending=False)[:100].index

# X = X[feature_subset]

# kf = KFold(n_splits=3)
# kf.get_n_splits(X)

# params_1 = [0.01,0.1,1]
# params_2 = [1,6]
# scores = {}

# for param_1 in params_1:
#     for param_2 in params_2:
#         scores[(param_1, param_2)] = []
        
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X.loc[train_index], X.loc[test_index]
#     y_train, y_test = y.loc[train_index], y.loc[test_index]
    
#     for param_1 in params_1:
#         for param_2 in params_2:
#             model = xgb.XGBRegressor(
#                 n_estimators=100,
#                 learning_rate=param_1,
#                 max_depth=param_2,
#                 subsample=0.5,
#                 random_state=1)
#             model.fit(X_train, y_train)
#             y_pred = model.predict(X_test)
#             scores[(param_1, param_2)] += [st.pearsonr(y_pred, y_test)[0]]
#             print(scores[(param_1, param_2)])

In [6]:
# Disabled experiment, kept for reference only (every line is commented out):
# a 3-fold XGBoost grid search restricted to the first 50 features of the
# descending correlation ranking, with learning_rate in [0.01, 0.1, 1] and
# max_depth in [1, 6].

# feature_subset = corrs_feature_assetNum.sort_values(ascending=False)[:50].index

# X = X[feature_subset]

# kf = KFold(n_splits=3)
# kf.get_n_splits(X)

# params_1 = [0.01,0.1,1]
# params_2 = [1,6]
# scores = {}

# for param_1 in params_1:
#     for param_2 in params_2:
#         scores[(param_1, param_2)] = []
        
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X.loc[train_index], X.loc[test_index]
#     y_train, y_test = y.loc[train_index], y.loc[test_index]
    
#     for param_1 in params_1:
#         for param_2 in params_2:
#             model = xgb.XGBRegressor(
#                 n_estimators=100,
#                 learning_rate=param_1,
#                 max_depth=param_2,
#                 subsample=0.5,
#                 random_state=1)
#             model.fit(X_train, y_train)
#             y_pred = model.predict(X_test)
#             scores[(param_1, param_2)] += [st.pearsonr(y_pred, y_test)[0]]
#             print(scores[(param_1, param_2)])

In [7]:
# Average each (learning_rate, max_depth) pair's per-fold Pearson scores and
# keep the pair with the highest mean.
mean_scores = pd.DataFrame.from_dict(scores, orient='index').mean(axis=1)
# idxmax skips NaN means. Sorting and taking the last label would instead pick
# a configuration whose mean is NaN, because sort_values places NaN last.
best_params = mean_scores.idxmax()
best_params

(0.1, 10)

In [8]:
# Disabled (every line is commented out): would delete the feature/target
# frames and their last CV splits, then trigger a garbage collection.
# del X, X_train, X_test
# del y, y_train, y_test
# import gc
# gc.collect()

In [9]:
# Disabled (every line is commented out): would reload the training parquet
# without dropping the validation rows (the sampling call is itself commented
# out) and rebuild y / X from it using the full correlation-ordered feature list.
# X = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')#.sample(frac=0.2)
# feature_subset = corrs_feature_assetNum.sort_values(ascending=False).index
# y = X['target']
# X = X[feature_subset]

In [10]:
# Refit a single model on all of X / y with the selected hyper-parameters:
# best_params[0] is used as the learning rate, best_params[1] as the tree depth.
final_params = {
    'n_estimators': 100,
    'learning_rate': best_params[0],
    'max_depth': best_params[1],
    'subsample': 0.5,
    'random_state': 1,
}
model = xgb.XGBRegressor(**final_params)
model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=1, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.5,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
import ubiquant

# Set up the competition environment and its test-set iterator, which yields
# (test frame, sample submission frame) pairs.
env = ubiquant.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    # Select the feature columns in the same order the model was fitted on.
    test_features = test_df[feature_subset]
    sample_prediction_df['target'] = model.predict(test_features)
    # Register this batch's predictions with the environment.
    env.predict(sample_prediction_df)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
