In [3]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
boston = load_boston()
data = boston.data
target = boston.target
X_train,X_test,y_train,y_test = train_test_split(data,target,test_size=0.2)



In [4]:
lgb_train = lgb.Dataset(X_train,y_train)
lgb_eval = lgb.Dataset(X_test,y_test,reference=lgb_train)


In [6]:
params = {
    'task':'train',
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':{'l2','auc'},
    'num_leaves':31,
    'learning_rate': 0.05,
    'feature_fraction':0.9,
    'bagging_freq':5,
    'bagging_fraction':0.8,
    'verbose':1
}
gbm = lgb.train(params,lgb_train,num_boost_round=20,valid_sets=lgb_eval,early_stopping_rounds=5)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1030
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 13
[LightGBM] [Info] Start training from score 22.499257
[1]	valid_0's l2: 84.3308	valid_0's auc: 1
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 78.0457	valid_0's auc: 1
[3]	valid_0's l2: 71.6756	valid_0's auc: 1
[4]	valid_0's l2: 66.6292	valid_0's auc: 1
[5]	valid_0's l2: 61.3443	valid_0's auc: 1
[6]	valid_0's l2: 56.4748	valid_0's auc: 1
Early stopping, best iteration is:
[1]	valid_0's l2: 84.3308	valid_0's auc: 1


In [8]:
import joblib
joblib.dump(gbm,'lgb.pkl')



['lgb.pkl']

In [10]:
y_pred = gbm.predict(X_test,num_iteration=gbm.best_iteration)
print(mean_squared_error(y_pred,y_test))

84.3308375403153


In [11]:
gbm1= lgb.LGBMRegressor(objective='regression',num_leaves=31,learning_rate=0.05,n_estimators=20)
gbm1.fit(X_train,y_train,eval_set=[(X_test,y_test)],eval_metric='l1',early_stopping_rounds=5)

[1]	valid_0's l1: 6.85262	valid_0's l2: 83.4476
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 6.56281	valid_0's l2: 76.3647
[3]	valid_0's l1: 6.31694	valid_0's l2: 70.2935
[4]	valid_0's l1: 6.05934	valid_0's l2: 64.5003
[5]	valid_0's l1: 5.79987	valid_0's l2: 59.2033
[6]	valid_0's l1: 5.60159	valid_0's l2: 54.831
[7]	valid_0's l1: 5.38863	valid_0's l2: 50.7664
[8]	valid_0's l1: 5.18645	valid_0's l2: 47.0761
[9]	valid_0's l1: 4.99656	valid_0's l2: 43.4979
[10]	valid_0's l1: 4.84842	valid_0's l2: 40.5659
[11]	valid_0's l1: 4.70451	valid_0's l2: 38.0053
[12]	valid_0's l1: 4.53619	valid_0's l2: 35.3042
[13]	valid_0's l1: 4.39066	valid_0's l2: 33.0963
[14]	valid_0's l1: 4.26279	valid_0's l2: 31.1426
[15]	valid_0's l1: 4.1234	valid_0's l2: 29.1759
[16]	valid_0's l1: 3.99968	valid_0's l2: 27.561
[17]	valid_0's l1: 3.89134	valid_0's l2: 25.9545
[18]	valid_0's l1: 3.78875	valid_0's l2: 24.6478
[19]	valid_0's l1: 3.69682	valid_0's l2: 23.4513
[20]	valid_0's l1: 3.

LGBMRegressor(learning_rate=0.05, n_estimators=20, objective='regression')

In [12]:
y_pred=gbm1.predict(X_test,num_iteration=gbm1.best_iteration_)
print(mean_squared_error(y_test,y_pred))

22.22435973047348


In [13]:
list(gbm1.feature_importances_)

[50, 0, 6, 0, 24, 52, 13, 25, 8, 20, 10, 9, 81]

In [15]:
estimator = lgb.LGBMRegressor(num_leaves=31)
import sklearn
param_grid = {
    'learning_rate':[0.01,0.1,1],
    'n_estimators':[20,40]
}
from sklearn.model_selection import GridSearchCV
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)


Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 40}


In [20]:
import pandas as pd
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cannce_data = load_breast_cancer()
X=cannce_data.data
y = cannce_data.target


In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0,test_size=0.2)
params = {
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':'auc',
    'nthread':4,
    'learning_rate':0.1,
    'num_leaves':30,
    'max_depth':5,
    'subsample':0.8,
    'colsample_bytree':0.8
}
data_train = lgb.Dataset(X_train,y_train)
cv_results = lgb.cv(params,data_train,num_boost_round=1000,nfold=5,stratified=False,
                   shuffle=True,metrics='auc',early_stopping_rounds=50,seed=0)

[LightGBM] [Info] Number of positive: 235, number of negative: 129
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4548
[LightGBM] [Info] Number of data points in the train set: 364, number of used features: 30
[LightGBM] [Info] Number of positive: 229, number of negative: 135
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4548
[LightGBM] [Info] Number of data points in the train set: 364, number of used features: 30
[LightGBM] [Info] Number of positive: 229, number of negative: 135
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4548
[LightGBM] [Info] Number of data points in the train set: 364, number of used features: 30
[LightGBM] [Info] Number of positive: 232, number of negative: 132
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4548
[LightGBM] [Info] Number of data points in the train set: 364, number of used features: 30








In [24]:
len(cv_results['auc-mean']),pd.Series(cv_results['auc-mean']).max()


(58, 0.989688290479698)

In [27]:
param_test1={'max_depth':range(3,8,1),'num_leaves':range(5,100,5)}
gsearch1 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1,
                                                     n_estimators=188,max_depth=6,bagging_fraction=0.8,feature_fraction=0.8),
                       param_grid=param_test1,scoring='roc_auc',cv=5,n_jobs=-1)
gsearch1.fit(X_train,y_train)





GridSearchCV(cv=5,
             estimator=LGBMClassifier(bagging_fraction=0.8,
                                      feature_fraction=0.8, max_depth=6,
                                      metrics='auc', n_estimators=188,
                                      objective='binary'),
             n_jobs=-1,
             param_grid={'max_depth': range(3, 8),
                         'num_leaves': range(5, 100, 5)},
             scoring='roc_auc')

In [30]:
gsearch1.best_score_,gsearch1.best_params_

(0.99435736677116, {'max_depth': 4, 'num_leaves': 15})

In [31]:
param_test1={'max_bin':range(5,256,10),'min_data_in_leaf':range(1,102,10)}
gsearch1 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1,
                                                     n_estimators=188,max_depth=4,num_leaves=15,bagging_fraction=0.8,feature_fraction=0.8),
                       param_grid=param_test1,scoring='roc_auc',cv=5,n_jobs=-1)
gsearch1.fit(X_train,y_train)


GridSearchCV(cv=5,
             estimator=LGBMClassifier(bagging_fraction=0.8,
                                      feature_fraction=0.8, max_depth=4,
                                      metrics='auc', n_estimators=188,
                                      num_leaves=15, objective='binary'),
             n_jobs=-1,
             param_grid={'max_bin': range(5, 256, 10),
                         'min_data_in_leaf': range(1, 102, 10)},
             scoring='roc_auc')

In [32]:
gsearch1.best_score_,gsearch1.best_params_

(0.9948798328108672, {'max_bin': 55, 'min_data_in_leaf': 21})

In [34]:
params_test3={'feature_fraction': [0.6,0.7,0.8,0.9,1.0],
'bagging_fraction': [0.6,0.7,0.8,0.9,1.0],
'bagging_freq': range(0,81,10)
}
gsearch1 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',max_bin=55,min_data_in_leaf=21,objective='binary',metrics='auc',learning_rate=0.1,
                                                     n_estimators=188,max_depth=4,num_leaves=15,bagging_fraction=0.8,feature_fraction=0.8),
                       param_grid=params_test3,scoring='roc_auc',cv=5,n_jobs=-1)
gsearch1.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=LGBMClassifier(bagging_fraction=0.8,
                                      feature_fraction=0.8, max_bin=55,
                                      max_depth=4, metrics='auc',
                                      min_data_in_leaf=21, n_estimators=188,
                                      num_leaves=15, objective='binary'),
             n_jobs=-1,
             param_grid={'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
                         'bagging_freq': range(0, 81, 10),
                         'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]},
             scoring='roc_auc')

In [35]:
gsearch1.best_score_,gsearch1.best_params_

(0.9956112852664578,
 {'bagging_fraction': 0.6, 'bagging_freq': 50, 'feature_fraction': 0.8})

In [41]:
params_test5={'min_split_gain':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}
gsearch1 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',bagging_fraction=0.6,bagging_freq=50,feature_fraction=0.8,max_bin=55,min_data_in_leaf=21,objective='binary',metrics='auc',learning_rate=0.1,
                                                     n_estimators=188,max_depth=4,num_leaves=15),
                       param_grid=params_test5,scoring='roc_auc',cv=5,n_jobs=-1)

gsearch1.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=LGBMClassifier(bagging_fraction=0.6, bagging_freq=50,
                                      feature_fraction=0.8, max_bin=55,
                                      max_depth=4, metrics='auc',
                                      min_data_in_leaf=21, n_estimators=188,
                                      num_leaves=15, objective='binary'),
             n_jobs=-1,
             param_grid={'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
                                            0.7, 0.8, 0.9, 1.0]},
             scoring='roc_auc')

In [42]:
gsearch1.best_score_,gsearch1.best_params_

(0.9956112852664578, {'min_split_gain': 0.0})

In [43]:
L = []
def my_func(x):
    return 2*x
for i in range(5):
    L.append(my_func(i))
L

[0, 2, 4, 6, 8]

In [44]:
[my_func(i) for i in range(5)]

[0, 2, 4, 6, 8]

In [46]:
[m+'_'+n for m in ['a','b'] for n in ['c','d']]


['a_c', 'a_d', 'b_c', 'b_d']

In [49]:
value ='cat' if 2>1 else 'dog'
value

'cat'

In [52]:
a,b = 'cat','dog'
condition = 2>1
if condition:
    value = a
else:
    value = b


In [54]:
L = [1,2,3,4,5,6,7]
[i if i<=5 else 5 for i in L]


[1, 2, 3, 4, 5, 5, 5]

In [56]:
my_func = lambda x:2*x
my_func(3)

6

In [58]:
multi_para_func = lambda a,b : a+b
multi_para_func(1,2)

3

In [60]:
list(map(lambda x:2*x,range(5)))


[0, 2, 4, 6, 8]

In [62]:
list(map(lambda x,y : str(x)+'_'+y,range(5),list('abcde')))


['0_a', '1_b', '2_c', '3_d', '4_e']

In [64]:
L1,L2,L3 = list('abc'),list('def'),list('hij')
list(zip(L1,L2,L3))

[('a', 'd', 'h'), ('b', 'e', 'i'), ('c', 'f', 'j')]

In [68]:

tuple(zip(L1,L2,L3))

(('a', 'd', 'h'), ('b', 'e', 'i'), ('c', 'f', 'j'))

In [71]:
for i , j, k  in zip(L1,L2,L3):
    print(i,j,k)

a d h
b e i
c f j


In [73]:
L = list('abcd')
for index,value in enumerate(L):
    print(index,value)

0 a
1 b
2 c
3 d


In [76]:
for index,value in zip(range(len(L)),L):
    print(index,value)

0 a
1 b
2 c
3 d


In [78]:
dict(zip(L1,L2))


{'a': 'd', 'b': 'e', 'c': 'f'}

In [80]:
zipped = list(zip(L1,L2,L3))
zipped

[('a', 'd', 'h'), ('b', 'e', 'i'), ('c', 'f', 'j')]

In [82]:
list(zip(*zipped))

[('a', 'b', 'c'), ('d', 'e', 'f'), ('h', 'i', 'j')]

In [84]:
import numpy as np
np.array([1,2,3])

array([1, 2, 3])

In [86]:
np.linspace(1,5,11),np.arange(1,5,2)

(array([1. , 1.4, 1.8, 2.2, 2.6, 3. , 3.4, 3.8, 4.2, 4.6, 5. ]), array([1, 3]))

In [87]:
np.zeros((2,3)),np.eye(3),np.eye(3,k=1)


(array([[0., 0., 0.],
        [0., 0., 0.]]),
 array([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]),
 array([[0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 0.]]))

In [89]:
np.full((2,2),10)

array([[10, 10],
       [10, 10]])

In [91]:
np.full((2,3),[1,2,3])

array([[1, 2, 3],
       [1, 2, 3]])

In [93]:
np.random.rand(3)


array([0.13508113, 0.38272667, 0.66226664])

In [95]:
np.random.randn(3)

array([-0.22948393,  0.56309834, -1.00320601])

In [97]:
np.random.rand(3,3)

array([[0.09910211, 0.70766039, 0.12430163],
       [0.14508177, 0.45484778, 0.03790371],
       [0.94906725, 0.79257723, 0.65671829]])

In [98]:
np.random.randn(3,3
               )

array([[-0.85502157,  0.38838427, -0.35737843],
       [-0.77362833,  0.22848659,  0.79613381],
       [-0.78329183,  0.82028966,  0.73736822]])

In [101]:
a,b = 15, 5
(a-b)*np.random.rand(3)+b

array([8.30037229, 9.30891052, 6.48149008])

In [103]:
np.random.uniform(5,15,3)

array([13.71220679, 13.86281842,  5.12453837])

In [105]:
np.random.randn(3)

array([ 1.53179681, -1.66242795,  0.38690958])

In [107]:
np.random.randn(2,2)

array([[-0.1054035 , -0.07254433],
       [-0.4241826 , -0.80355696]])

In [109]:
sigma,mu = 2.5,3
mu + np.random.randn(3)*sigma

array([3.15609406, 6.60486886, 0.47063194])

In [111]:
np.random.normal(3,2.5,3)

array([6.65005155, 5.37790123, 4.63672487])

In [113]:
low,high, size = 5,15,(2,3)
np.random.randint(low,high,size)

array([[ 8, 12, 10],
       [ 9, 10, 13]])

In [115]:
my_list = ['a','b','c','d']
np.random.choice(my_list,2,replace=False,p=[0.1,0.7,0.1,0.1])

array(['b', 'c'], dtype='<U1')

In [117]:
np.random.choice(my_list,(3,3))

array([['a', 'd', 'b'],
       ['b', 'b', 'b'],
       ['a', 'a', 'd']], dtype='<U1')

In [119]:
np.random.permutation(my_list)

array(['d', 'a', 'c', 'b'], dtype='<U1')

In [121]:
np.random.seed(0)
np.random.rand()

0.5488135039273248

In [123]:
np.random.seed(0)
np.random.rand()

0.5488135039273248

In [126]:
np.zeros((2,4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [127]:
np.zeros((2,4)).T

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [128]:
np.r_[np.zeros((2,3)),np.zeros((2,3))]

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [129]:
np.c_[np.zeros((2,3)),np.zeros((2,3))]

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [131]:
try :
    np.r_[np.array([2,2]),np.zeros((2,1))]
except Exception as e:
    Err_Msg = e

In [132]:
Err_Msg

ValueError('all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)')

In [133]:
np.r_[np.array([2,2]),np.zeros(2)]


array([2., 2., 0., 0.])

In [134]:
np.c_[np.array([2,2]),np.zeros((2,1))]

array([[2., 0.],
       [2., 0.]])

In [136]:
target = np.arange(8).reshape(2,4)


In [137]:
target

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [138]:
target.reshape((4,2),order='C')

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [140]:
target.reshape((4,2),order='F')

array([[0, 2],
       [4, 6],
       [1, 3],
       [5, 7]])

In [141]:
target.reshape((4,-1))


array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [142]:
target=np.ones((3,1))
target.reshape(-1)

array([1., 1., 1.])