# Classification lightgbm with parameter tuning
## Optuna will be used for hyper parameter optimization


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics
import datatable as dt
!pip install --quiet optuna
import optuna
import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Shrinking the data frame by choosing an appropriate data type for each column not only reduces modelling time but also makes better use of computational resources.
[https://towardsdatascience.com/make-working-with-large-dataframes-easier-at-least-for-your-memory-6f52b5f4b5c4](https://towardsdatascience.com/make-working-with-large-dataframes-easier-at-least-for-your-memory-6f52b5f4b5c4)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are tried against int8, int16, int32 and int64 in that
    order; float columns against float16 and float32, falling back to
    float64. A dtype "fits" when the column's minimum and maximum both lie
    inside the dtype's representable range, bounds included.

    Columns whose dtype is not one of the listed signed-int / float dtypes
    (e.g. object, bool, unsigned ints) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Only the value *range* is checked. Casting floats to float16/float32
    reduces precision, so values may be rounded.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # If nothing matched (min/max are NaN for an empty column) the
            # column keeps its current dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (all-NaN).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

Using datatable speeds up data loading; it can also be used directly without any conversions.

In [None]:
# Fixed seed so the train/validation split is the same on every run; without
# it each "Run All" tunes against a different split.
SEED = 42
# One absolute location for the competition files (the original mixed
# '../input/...' and '/kaggle/input/...' for the same directory).
DATA_DIR = '/kaggle/input/jane-street-market-prediction'

# Read each CSV with datatable, convert to pandas, then downcast numeric dtypes.
train = reduce_mem_usage(dt.fread(os.path.join(DATA_DIR, 'train.csv')).to_pandas())
features = reduce_mem_usage(dt.fread(os.path.join(DATA_DIR, 'features.csv')).to_pandas())
test = reduce_mem_usage(dt.fread(os.path.join(DATA_DIR, 'example_test.csv')).to_pandas())

# Binary target: 1 where `resp` is positive, 0 otherwise.
y = np.where(train['resp'] > 0, 1, 0)

# Drop every column whose name starts with 'res' (the target columns) and
# every column whose name starts with 'ts_id', in both frames.
# (Alternative kept from the original: select train[features['feature']].)
train = train.loc[:, ~train.columns.str.startswith('res')]
train = train.loc[:, ~train.columns.str.startswith('ts_id')]
test = test.loc[:, ~test.columns.str.startswith('res')]
test = test.loc[:, ~test.columns.str.startswith('ts_id')]

# Scale every column to [0, 1] using ranges learned from the training frame.
# NOTE(review): the scaler is fitted on all training rows before the split
# below, so validation rows contribute to the scaling ranges.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train)
X_test = scaler.transform(test)

# Hold out 25% of the rows for validation, reproducibly.
train_x, valid_x, train_y, valid_y = train_test_split(
    X_train, y, test_size=0.25, random_state=SEED
)
dtrain = lgb.Dataset(train_x, label=train_y)
dvalid = lgb.Dataset(valid_x, label=valid_y)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Count missing values in each training column named by features['feature'],
# listed from most to fewest missing.
train[features['feature']].isnull().sum().sort_values(ascending=False)

In [None]:
# Print pandas' structural summary of the training frame.
train.info()

In [None]:
# Print pandas' structural summary of the test frame.
test.info()

### Hyper Parameters to be tuned
metric: different metric types like auc.<br><br>
boosting type: there are 4 different values:<br><br>
                default: gbdt - stable and reliable but time consuming.<br><br>
                dart: better accuracy but requires hyper parameters to be tuned.<br><br>
                goss: converges faster but may overfit.<br><br>
                rf: random forest mode.<br><br>
max_bin: choose smaller value to avoid overfit<br><br>
num_leaves: choose smaller value to avoid overfit<br><br>
bagging_fraction and bagging_freq:  example, {"bagging_freq": 5, "bagging_fraction": 0.75} tells LightGBM “re-sample without replacement every 5 iterations, and draw samples of 75% of the training data”<br><br>
feature_fraction: value between 0-1, tells what fraction of features have to be considered.<br><br>
min_gain_to_split: Gain is basically the reduction in training loss that results from adding a split point. The default value is 0; increasing it can speed up training and also acts as regularization. <br><br>
max_depth: keep it as low as possible to avoid overfitting.<br><br>
lambda1(l1) and lambda2(l2): helps in regularizing and to avoid overfitting.<br><br>
num_iterations: Start with a low number for base model and increase to increase efficiency.<br><br>
early_stopping_rounds: Ideally should be 10% of num_iterations.<br><br>
categorical_feature: can provide a list which has categorical values<br><br>
is_unbalance: if true, class balancing is done automatically<br><br>
scale_pos_weight: weight applied to the positive class (must be > 0); a common choice is number of negative samples / number of positive samples. Use either this or is_unbalance, not both.

In [None]:

class Objective:
    """Optuna objective that trains a LightGBM binary classifier per trial.

    Besides returning the trial score, the instance remembers the booster of
    the best trial so it can be reused after ``study.optimize`` finishes.
    Pass ``callback`` in the ``callbacks`` list of ``study.optimize`` for that
    bookkeeping to happen.

    The training and validation data are read from the module-level names
    ``dtrain``, ``dvalid``, ``valid_x`` and ``valid_y``.
    """

    def __init__(self):
        # Booster belonging to the best trial seen so far (None until one completes).
        self.best_booster = None
        # Booster produced by the most recent call to __call__.
        self._booster = None

    def __call__(self, trial):
        """Train one model with hyper-parameters sampled from ``trial``.

        Returns the accuracy on the validation split after rounding the
        predicted probabilities to 0/1. The pruning callback monitors
        validation AUC during training, so the pruned-on metric (AUC) and the
        returned score (accuracy) are different quantities; both are
        "higher is better".
        """
        param = {
            "objective": "binary",
            "metric": "auc",
            "verbosity": -1,
            "boosting_type": "gbdt",
            # suggest_float(..., log=True) replaces the deprecated
            # suggest_loguniform and samples from the same distribution.
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.3, 1.0, log=True),
            # bagging_fraction is only applied when bagging_freq > 0.
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.3, 1.0, log=True),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.3, 1.0, log=True),
            "max_depth": -1,
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 300),
            # `n_estimators` is an alias of `num_iterations` in LightGBM, so
            # only one of them is tuned; supplying both is ambiguous.
            "num_iterations": trial.suggest_int("num_iterations", 200, 1000),
            "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 20, 100),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        }

        # Report validation AUC to Optuna each iteration so poor trials can be pruned.
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
        gbm = lgb.train(
            param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback]
        )

        # Keep the booster so `callback` can promote it if this trial is the best.
        self._booster = gbm

        preds = gbm.predict(valid_x)
        pred_labels = np.rint(preds)
        accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
        return accuracy

    def callback(self, study, trial):
        """Store the latest booster when ``trial`` is the study's best trial.

        Trials are matched by their ``number`` rather than by comparing whole
        trial objects. If the study has no best trial yet (``best_trial``
        raises ``ValueError``), nothing is stored.
        """
        try:
            best_number = study.best_trial.number
        except ValueError:
            return
        if best_number == trial.number:
            self.best_booster = self._booster

In [None]:
objective = Objective()

# Seed the sampler so its hyper-parameter proposals are repeatable for the
# same sequence of trial results; prune trials whose intermediate score falls
# below the median of earlier trials after 10 warm-up steps.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    direction="maximize",
)
study.optimize(objective, n_trials=10, callbacks=[objective.callback])

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

best_model = objective.best_booster
if best_model is None:
    # Fail with a clear message instead of an AttributeError on None.predict.
    raise RuntimeError("No completed trial stored a booster; nothing to evaluate.")

# Accuracy on the held-out validation split (rows not used for fitting).
valid_pred_labels = np.rint(best_model.predict(valid_x))
print("Validation accuracy: {}".format(
    sklearn.metrics.accuracy_score(valid_y, valid_pred_labels)
))

# Accuracy over every training row; this includes the rows the model was
# fitted on, so it is an optimistic figure.
new_preds = best_model.predict(X_train)
new_pred_labels = np.rint(new_preds)
print("Accuracy on all training rows: {}".format(
    sklearn.metrics.accuracy_score(y, new_pred_labels)
))

In [None]:
# import lightgbm as lgb
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import train_test_split

# xtr,xval,ytr,yval = train_test_split(X_train ,y,test_size=0.30,stratify=y)
# lg = lgb.LGBMClassifier(eval_metric='auc',metric='auc', 
#                         bagging_fraction= 0.7074669591958046,
#     feature_fraction= 0.42131413818337216,
#     min_gain_to_split= 0.3812146181153679,
#     lambda_l1= 0.00167686294813971,
#     lambda_l2= 1.3583979451485277,
#     num_leaves= 189,
#     num_iterations= 569,
#     learning_rate= 0.09196718727893016,
#     n_estimators= 214,
#                         objective='binary',#learning_rate=0.05, 
#                         silent=False,
#                         force_col_wise=True)
# lg.fit(xtr,ytr)

In [None]:
# y_pred=lg.predict(xtr)
# y_val1=lg.predict(xval)

In [None]:
# from sklearn.metrics import f1_score,accuracy_score,classification_report
# print(f1_score(ytr, y_pred, average='macro'))
# print(accuracy_score(ytr, y_pred))
# print(classification_report(ytr, y_pred))
# print("********************************************")
# print(f1_score(yval, y_val1, average='macro'))
# print(accuracy_score(yval, y_val1))
# print(classification_report(yval, y_val1))

In [None]:
# import lightgbm as lgbm
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import learning_curve, GridSearchCV
# from sklearn.model_selection import train_test_split
# params={"num_leaves":[100,150,200,300],
#        "max_bin":[30,50,7],
# #        "feature_fraction":0.52,
# #        "bagging_fraction":0.52,
#        "objective":["binary"],
#        "learning_rate":[0.01,0.05,0.1],
#        "boosting_type":["gbdt"],
#        "metric":["auc"],
#         "n_estimators": [200]
#        }
# models = [] # list of model , we will train 
# xtr,xval,ytr,yval = train_test_split(X_train ,y,test_size=0.25,stratify=y)

# lg = lgb.LGBMClassifier(silent=False)
# grid_search = GridSearchCV(lg, n_jobs=2, param_grid=params, cv = 3, scoring="roc_auc", verbose=5)
# grid_search.fit(xtr,ytr)
# grid_search.best_estimator_

# # d_train = lgbm.Dataset(xtr,label=ytr)
# # d_eval = lgbm.Dataset(xval,label=yval,reference=d_train)
# # clf = lgbm.train(params,d_train,valid_sets=[d_train,d_eval],num_boost_round=2000,\
# #                 early_stopping_rounds=100,verbose_eval=100)
# # models.append(clf)