In [1]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Location of the training CSV. Set the TRAIN_CSV_PATH environment variable to
# point at a different copy; the default is the path this notebook was
# originally run with.
TRAIN_CSV_PATH = os.environ.get("TRAIN_CSV_PATH", "/Users/wumx/Downloads/train.csv")
data = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Each plain NumPy integer or float column is replaced by the smallest dtype
    of the same family (signed int, unsigned int, float) whose representable
    range contains the column's minimum and maximum. Columns are never widened,
    and every other column (object, bool, datetime, category, pandas extension
    dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. Only the value range is
        checked, not precision, so float16 keeps roughly three significant
        decimal digits; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy kind code, narrowest first.
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32), np.iinfo),
        'f': (float_candidates, np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are candidates. This keeps
        # bool and unsigned columns out of the float branch and avoids calling
        # min()/max() on dtypes where the range test is meaningless or raises.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Never widen: stop once the candidate is no smaller than the
            # dtype the column already has.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons, so
            # such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decreased_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased_pct))

    return df

In [4]:
# Shrink the numeric columns of the loaded frame to cut its memory footprint.
data = data.pipe(reduce_mem_usage)

Memory usage of dataframe is 7285.99 MB
Memory usage after optimization is: 1839.47 MB
Decreased by 74.8%


In [5]:
# Store investment_id as a pandas categorical so it is an identifier rather
# than an ordinary number.
# NOTE(review): presumably this is so LightGBM handles it as a categorical
# feature via lgb.Dataset's default categorical_feature='auto' — confirm.
data['investment_id'] = data['investment_id'].astype('category')

In [6]:
# Pull the identifier columns and the regression target out of the frame;
# every remaining column becomes a model feature.
row_id = data['row_id']
time_id = data['time_id']
y = data['target']
X = data.drop(columns=['row_id', 'time_id', 'target'])

In [7]:
# Hold out 20% of the rows for validation. The split is seeded so that a
# Restart & Run All reproduces the same partition (21 mirrors the 'seed' used
# in the LightGBM params).
# NOTE(review): rows are shuffled without regard to time_id; if the rows are
# time-ordered, a time-based split may give a more honest validation score.
SPLIT_SEED = 21
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Wrap both partitions for LightGBM; the validation set references the
# training set.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [15]:
# LightGBM training parameters for the regression model.
#
# Candidate values noted during earlier manual tuning:
#   max_depth:        [3, 5, 6, 7, 9, 12, 15, 17, 25]
#   learning_rate:    [0.01, 0.015, 0.025, 0.05, 0.1]
#   feature_fraction: [0.6, 0.7, 0.8, 0.9, 1]
#   bagging_fraction: [0.6, 0.7, 0.8, 0.9, 1]
#
# Only primary parameter names are listed. The None-valued alias entries
# (subsample, colsample_bytree, subsample_freq, min_child_samples, reg_lambda,
# reg_alpha) duplicated bagging_fraction, feature_fraction, bagging_freq,
# min_data_in_leaf, lambda_l2 and lambda_l1 and are omitted.
params = {
    'learning_rate': 0.1,
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
    'seed': 21,
    'lambda_l1': 1.1895057699067542,
    'lambda_l2': 1.9079686837880768e-08,
    'num_leaves': 112,
    'feature_fraction': 0.6259927292757151,  # share of features sampled per tree
    'bagging_fraction': 0.9782210574588895,  # share of rows sampled per tree
    'bagging_freq': 1,  # k means bagging is performed every k iterations
    'n_estimators': 306,  # alias of num_iterations (number of boosting rounds)
    'max_depth': 12,
    'max_bin': 255,  # larger values are more accurate but slower
    'min_data_in_leaf': 366,
}

In [16]:
# Train the booster, logging the validation metric every 100 rounds and
# stopping early once it has not improved for 300 rounds.
# Logging and early stopping are passed as callbacks because the
# `verbose_eval` and `early_stopping_rounds` keyword arguments were removed
# from lgb.train() in LightGBM 4.0.
# NOTE(review): `params` also sets 'n_estimators', an alias of num_iterations,
# which appears to take precedence over num_boost_round here — the logged run
# ended at iteration 306, not 500. Confirm which round count is intended.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=300),
        lgb.log_evaluation(period=100),
    ],
)
# Save the trained model to disk.
gbm.save_model('model.txt')



Training until validation scores don't improve for 300 rounds
[100]	valid_0's rmse: 0.906656
[200]	valid_0's rmse: 0.903319
[300]	valid_0's rmse: 0.900994
Did not meet early stopping. Best iteration is:
[306]	valid_0's rmse: 0.900828


<lightgbm.basic.Booster at 0x7f78e355ac90>

In [17]:
# Score the held-out rows with the best iteration found during training and
# report the root-mean-squared error.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
holdout_mse = mean_squared_error(y_test, y_pred)
print('The rmse of prediction is:', holdout_mse ** 0.5)

The rmse of prediction is: 0.9008282186381865
