In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import gc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ubiquant-market-prediction/example_sample_submission.csv
/kaggle/input/ubiquant-market-prediction/example_test.csv
/kaggle/input/ubiquant-market-prediction/train.csv
/kaggle/input/ubiquant-market-prediction/ubiquant/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/ubiquant-market-prediction/ubiquant/__init__.py


In [2]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name

        if col_type not in ['object', 'category', 'datetime64[ns, UTC]']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
# insIdCt=getDistinctCountByColumn(path,'investment_id')
# insIdCt=getCount(path)

In [4]:
from sklearn.multioutput import MultiOutputRegressor
import lightgbm as lgb
# !pip install optuna
import optuna
import os.path
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.metrics import mean_squared_error as MSE
class Objective():

    def __init__(self):
        self.best_booster = None
        self._booster = None

    def __call__(self, trial):
        param = {
            "objective": "regression",
            "metric": "mean_squared_error",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "feature_fraction": trial.suggest_loguniform("feature_fraction", 0.5, 1.0),
            "bagging_fraction": trial.suggest_loguniform("bagging_fraction", 0.3, 1.0),
#             "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", 0.3, 1.0),
            "max_depth": trial.suggest_int("max_depth", 400, 550),
#             "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
#             "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
            "num_leaves": trial.suggest_int("num_leaves", 900, 1200),
#             "num_iterations": trial.suggest_int("num_iterations", 20, 100),
            "learning_rate":trial.suggest_loguniform("learning_rate", 0.01, 0.1),
            "n_estimators": trial.suggest_int("n_estimators", 10, 25),
            "max_bin":trial.suggest_int("max_bin", 500, 1000),
        }
        dtrain = lgb.Dataset(X, label=y)
        # Add a callback for pruning.
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "mean_squared_error")
        if(os.path.isfile("./ubiquant-train-models/saved_models/lgbm/mdl.txt")==True):
            gbm=lgb.Booster(model_file=path1+"mdl.txt")
            gbm = lgb.train(param, dtrain, valid_sets=[dtrain], verbose_eval=False,keep_training_booster =True,init_model=gbm)
            gbm.save_model(path1+'mdl.txt')
        if(os.path.isfile("./ubiquant-train-models/saved_models/lgbm/mdl.txt")==False):
            gbm = lgb.train(param, dtrain, valid_sets=[dtrain], verbose_eval=False,keep_training_booster =True)
            gbm.save_model(path1+'mdl.txt')

        self._booster = gbm

        preds = gbm.predict(X)
        pred_labels = np.rint(preds)
        accuracy = mean_absolute_error(y, pred_labels)
        return accuracy


In [5]:
def hyperTune(n_trials):
    objective = Objective()
    optuna.logging.set_verbosity(optuna.logging.ERROR)
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="minimize"
    )
    study.optimize(objective, n_trials=n_trials)
    trial_co = study.best_trial
    print("  Params: ")
    for key, value in trial_co.params.items():
        print("    {}: {}".format(key, value))
    return objective.best_booster


In [6]:
import math
import lightgbm as lgb
import numpy as np
import pandas as pd
import pickle
import os
from pathlib import Path
from sklearn.metrics import mean_squared_error as MSE
import gc
gc.enable()
Path("./ubiquant-train-models/saved_models/lgbm/").mkdir(parents=True, exist_ok=True)
path="../input/ubiquant-market-prediction/train.csv"
path1="./ubiquant-train-models/saved_models/lgbm/"
# model=lgb.LGBMRegressor(max_depth=100,num_leaves=600,num_iterations=100, keep_training_booster=True )
sample_size=50000
# loop=math.ceil(insIdCt[0][0]/sample_size)
i=0
df=df=reduce_mem_usage(pd.read_csv("../input/ubiquant-market-prediction/train.csv", skiprows  = [j for j in range(1, i*sample_size) ],nrows=sample_size))
lgb_path="./lg_model/"
while(len(df)>0):
    print("*************************")
    print(i,len(df))
    print("*************************")
#     df=extractData(i*sample_size,(i*sample_size)+sample_size,path,'investment_id').toPandas()
    df=reduce_mem_usage(pd.read_csv("../input/ubiquant-market-prediction/train.csv", skiprows  = [j for j in range(1, i*sample_size) ],nrows=sample_size))
    df_1=pd.read_csv("../input/ubiquant-market-prediction/train.csv", skiprows = 0,nrows=1)
    df.columns=df_1.columns
    del df_1
    if(len(df)==0):
        break;
    X=np.array(df.drop(['target','row_id','investment_id'],axis=1))
    y=np.array(df['target'])
    
    params={
        "boosting_type": "gbdt",
        "feature_fraction": 0.7,
    "bagging_fraction": 0.30,
#     "min_gain_to_split": 0.48621442782475605,
    "max_depth": 700,
#     "lambda_l1": 0.03742217988743394,
#     "lambda_l2": 0.20030511153917615294,
    "num_leaves": 1400,
    "num_iterations": 20,
    "learning_rate": 0.1,
    "n_estimators": 50,
    "max_bin": 500}
    dtrain = lgb.Dataset(X, label=y, free_raw_data=False)
#     params =  hyperTune(25)
#     gbm=lgb.Booster(model_file=path1+"mdl.txt")
    gbm= lgb.LGBMRegressor(params)
    if(i==0):
        print("&&&&&&&&&&&&&&&&&&&&&&")
        gbm = lgb.train(params, dtrain, valid_sets=[dtrain], verbose_eval=False,keep_training_booster =True)
        gbm.save_model(path1+'mdl.txt')
    gbm=lgb.Booster(model_file=path1+"mdl.txt")
    gbm = lgb.train(params, dtrain, valid_sets=[dtrain], verbose_eval=False,keep_training_booster =True,init_model=gbm)
    gbm.save_model(path1+'mdl.txt')
#     gbm = model.fit(X,y)
    y_pred=gbm.predict(X)
    MSE(y, y_pred)
    print("***************************")
    print(i,MSE(pd.DataFrame(y), pd.DataFrame(y_pred)))
    print("***************************")
    del X
    del y
    del dtrain
    i+=1
    gc.collect()
# del df

Memory usage of dataframe is 115.97 MB
Memory usage after optimization is: 29.28 MB
Decreased by 74.8%
*************************
0 50000
*************************
Memory usage of dataframe is 115.97 MB
Memory usage after optimization is: 29.28 MB
Decreased by 74.8%
&&&&&&&&&&&&&&&&&&&&&&
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129415
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 300
[LightGBM] [Info] Start training from score -0.023366
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 129415
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 300
***************************
0 0.09763994784252958
***************************
*************************
1 50000
*************************
Memory usage of dataframe is 115.97 MB
Memory usage after optimization is: 29.28 MB
Decreased by 74.8%
You can set `force_col_wise=true` to rem

In [7]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 
model=lgb.Booster(model_file=path1+"mdl.txt") 
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df['target'] = model.predict(test_df.drop(['row_id'],axis=1))
#     sample_prediction_df
    env.predict(sample_prediction_df)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [8]:
# insIdCt=getCount(path)

In [9]:
# import math
# import lightgbm as lgb
# import numpy as np
# import pandas as pd
# from sklearn.metrics import mean_squared_error as MSE

# def generator(size1):
# #     loop=math.ceil(insIdCt[0][0]/size1)
#     i=0
#     df=reduce_mem_usage(pd.read_csv("../input/ubiquant-market-prediction/train.csv", skiprows = [j for j in range(1, i*size1) ],nrows=size1))
#     while(i>=0):
        
#         df=reduce_mem_usage(pd.read_csv("../input/ubiquant-market-prediction/train.csv", skiprows = [j for j in range(1, i*size1) ],nrows=size1))
#         if(len(df)==0):
#             i=-1
#         df_1=pd.read_csv("../input/ubiquant-market-prediction/train.csv",nrows=1)
#         df.columns=df_1.columns
#         X=np.array(df.drop(['target','row_id'],axis=1))
#         y=np.array(df['target'])
#         i+=1
# #         del df
#         yield X,y

In [10]:
# i=0
# size1=20000
# df=reduce_mem_usage(pd.read_csv("../input/ubiquant-market-prediction/train.csv", skiprows = [j for j in range(1, i*size1) ],nrows=size1))
# while(i>=0):
#     df=reduce_mem_usage(pd.read_csv("../input/ubiquant-market-prediction/train.csv", skiprows = [j for j in range(1, i*size1) ],nrows=size1))
#     print(len(df),i)
#     if(len(df)==0):
#         i=-1
#     df_1=pd.read_csv("../input/ubiquant-market-prediction/train.csv",nrows=1)
#     df.columns=df_1.columns
#     X=np.array(df.drop(['target','row_id'],axis=1))
#     y=np.array(df['target'])
#     i+=1
# #         del df
# print(i)

In [11]:
# size1=64
# train_generator =generator(size1)

In [12]:
# # import the necessary modules from the library
# size1=128
# train_generator =generator(size1)
# from keras.models import Sequential
# from keras.layers import Dense, Conv2D, Flatten, Activation, LSTM, Dropout,BatchNormalization,RepeatVector
# model = Sequential()
# model.add(BatchNormalization())
# model.add(Dense(1024))
# model.add(Dense(1024))
# model.add(Dense(1024))
# model.add(Activation('swish'))
# model.add(RepeatVector(3))
# model.add(LSTM(512, activation = 'swish', return_sequences=True))
# # model.add(Dropout(0.3))
# model.add(LSTM(256, activation = 'swish', return_sequences=True))
# # model.add(Dropout(0.3))
# model.add(Dense(1))
# model.add(Activation('softmax'))
# model.compile(loss='MeanSquaredError',
#               optimizer='adam',#'rmsprop',
#               metrics=['mean_squared_error'])



In [13]:
# model.fit_generator(
#         train_generator,
#         steps_per_epoch=size1, #// batch_size,
#         epochs=1
# #         validation_data=train_generator#,
# #         validation_steps=64
# )


In [14]:
# a=extractData(i*5,(i*5)+5,path,'investment_id')
# [j for j in range(1, i*size1) ]

In [15]:
# # b=a.toPandas()[100]
# a.dtypes