# GBDT with LightGBM

In [None]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Let pandas render up to 500 columns so wide frames are not truncated in cell output.
pd.options.display.max_columns = 500

### 导入数据

In [None]:
# Read the training and test CSVs with identical options: the first CSV column
# becomes the index, and low_memory=False makes pandas parse each file in one pass.
train_df, test_df = (
    pd.read_csv(csv_path, low_memory=False, index_col=0)
    for csv_path in ('data_train.csv', 'data_test.csv')
)

## 处理数据

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

In [None]:
# Feature columns: every numeric column of the training frame after the first five.
# The first five numeric columns are sliced off as the targets in the `label` cell.
# select_dtypes picks the same numeric columns that describe() reports, without
# computing summary statistics over the whole frame just to read column names.
feature = train_df.select_dtypes(include=np.number).columns.tolist()[5:]

In [None]:
# Target columns: the first five numeric columns of the training frame
# (assumes the CSV stores the targets before the features -- TODO confirm).
# select_dtypes picks the same numeric columns that describe() reports, without
# computing summary statistics over the whole frame just to read column names.
label = train_df.select_dtypes(include=np.number).columns.tolist()[:5]
print(label)

In [None]:
# Per-column medians of the training data, used as imputation values everywhere
# (the test set is filled with *training* medians as well).
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns silently and raises TypeError on them instead.
to_fill = train_df.median(numeric_only=True)

# fillna with a Series aligns on column names, so each column gets its own median.
X = train_df.loc[:, feature].fillna(to_fill)
# NOTE(review): missing targets are imputed with the median too, so those rows
# are trained and scored against synthetic labels -- confirm this is intended.
Y = train_df.loc[:, label].fillna(to_fill)
X_test = test_df.loc[:, feature].fillna(to_fill)

# Alternative without imputation (kept for reference):
# X = train_df.loc[:,feature]
# Y = train_df.loc[:,label]
# X_test = test_df.loc[:,feature]

In [None]:
# Preview the first five rows of the imputed feature matrix.
X.head(n=5)

In [None]:
# (rows, columns) of the training features, training targets and test features.
tuple(frame.shape for frame in (X, Y, X_test))

In [None]:
# Two random 70/30 shuffle splits with a fixed seed.
# NOTE(review): `cv` is not used by any cell visible in this notebook.
cv = ShuffleSplit(
    n_splits=2,
    test_size=0.3,
    random_state=0,
)

### GBDT

In [None]:
import lightgbm as lgb

In [None]:
# Hold out 20% of the training rows for validation; the same split is reused for
# every target column.
X_train, X_eval, y_train, y_eval = train_test_split(X, Y, test_size=0.2, random_state=0)

# Hyper-parameters are the same for every target, so the dict is built once
# outside the loop.
params = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'metric': 'rmse',
    'sub_feature': 0.7,
    'num_leaves': 70,
    'min_data': 120,
    'min_hessian': 1,
}

# One column of test-set predictions per target.
Y_pred_df = pd.DataFrame()
# NOTE: despite its name, this list holds the mean squared *log* error of each
# target on the validation split. The name is kept because later cells read `rmse`.
rmse = []

# Train one independent booster per target. Iterating over `label` instead of a
# hard-coded range(5) keeps the loop correct if the number of targets changes.
for target in label:
    lgb_train = lgb.Dataset(X_train, y_train[target])
    # Tie the validation set to the training set, as LightGBM recommends for
    # validation data.
    lgb_eval = lgb.Dataset(X_eval, y_eval[target], reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=1000,
                    valid_sets=[lgb_eval],
                    )

    # NOTE(review): no early stopping is configured, so `best_iteration` is
    # presumably unset and prediction uses all boosting rounds -- confirm against
    # the installed LightGBM version.
    y_pred = gbm.predict(X_eval, num_iteration=gbm.best_iteration)
    y_pred_test = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    Y_pred_df[target] = y_pred_test

    # mean_squared_log_error rejects negative inputs, so clamp the rounded
    # predictions at zero instead of letting one negative value abort the loop.
    y_pred_scored = np.clip(np.round(y_pred, 3), 0, None)
    rmse.append(mean_squared_log_error(y_eval[target], y_pred_scored))
    print('The MSLE of prediction for', target, 'is:', rmse[-1])

In [None]:
# Average validation score across all targets. Dividing by len(rmse) rather than
# a literal 5 keeps the mean correct if the number of targets changes.
sum(rmse) / len(rmse)

In [None]:
# %matplotlib tk
# Feature-importance chart limited to the 50 top-ranked features.
# `gbm` is the booster left over from the final iteration of the training loop,
# so this chart describes only the last target's model.
lgb.plot_importance(booster=gbm, max_num_features=50)

In [None]:
# Per-target validation scores appended by the training loop. The values are
# produced by mean_squared_log_error, not by a root-mean-squared error.
rmse

In [None]:
# Preview the first five rows of the per-target test-set predictions.
Y_pred_df.head(n=5)

In [None]:
# Attach the sample ids positionally. Y_pred_df was filled from plain prediction
# arrays and so has a 0..n-1 index, whereas test_df is indexed by the first CSV
# column. Assigning the Series directly would align on index labels and could
# silently give NaN or mismatched ids. Prediction rows are in test_df row order,
# so taking the raw values is the correct pairing.
Y_pred_df['vid'] = test_df['vid'].values

# Submission layout: id first, then the targets in `label` order; the file is
# written with no header row and no index column.
Y_pred_gbdt_df = Y_pred_df.loc[:, ['vid'] + label]
Y_pred_gbdt_df.to_csv('gbdt_output.csv', index=False, header=False)

## 回归树

In [None]:
# regr = DecisionTreeRegressor()
# regr.fit(X_train, y_train)
# acc_decision_tree = round(regr.score(X_train, y_train) * 100, 2)
# acc_decision_tree

# Y_pred_regr = regr.predict(X_test)
# Y_pred_regr_df = pd.DataFrame(Y_pred_regr, columns=label)
# Y_pred_regr_df.head()

# Y_pred_regr_df['vid']=test_df['vid']
# Y_pred_regr_df = Y_pred_regr_df.loc[:, ['vid']+label]

# Y_pred_regr_df.to_csv('regr_output.csv',index=False, header=False)

### *可参考的一些处理错误数据的方法

In [None]:
# 可以对一些明显错误的数据进行修改，数据量很小，后来选择直接改成 NaN
# train_df[train_df['100006'].str.contains(r'[0-9]')==False]['100006']
# train_df[train_df['269004'].str.contains(r'[0-9]')==False]['269004']
# train_df[train_df['269005'].str.contains(r'[0-9]')==False]['269005']
# **把所有为`---`的值改为NaN**

# train_df.replace(to_replace=r'\-+', value=np.nan, inplace=True, regex=True)

# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         train_df[col] = pd.to_numeric(train_df[col], downcast='float', errors='ignore')

# 只是多了两列特征，因为不只是---的问题

# train_df[train_df['269004'].str.match(r'^(-?\d+\.\d+)?;(-?\d+\.\d+)?')==True]['269004']

# **有很多特征值有重复，变成了数值；数值的格式**

# train_df.replace(to_replace=r'^(-?\d+\.\d+)?;(-?\d+\.\d+)?', value=r'\1', inplace=True, regex=True)

# train_df.loc[23268]['269004']

# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         train_df[col] = pd.to_numeric(train_df[col], downcast='float', errors='ignore')

# 还有几列有问题，print出来看看。

# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         print(col)

# print(train_df['269012'].count(),
# train_df['313'].count(),
# train_df['32'].count(),
# train_df['38'].count())

# train_df[train_df['269012'].str.match(r'^(-?\d+)(\.\d+)?')==False]['269012']

# pd.to_numeric(train_df['269012'], downcast='float')

# 可以看到还有末尾多一个小数点的数据，把小数点去掉

# train_df.replace(to_replace=r'^(-?\d+(?:\.\d+)?)\.$', value=r'\1', regex=True, inplace=True)

# 处理特例

# train_df.loc[26333]['313'] 

# train_df.loc[26333,['313']] = 189

# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         train_df[col] = pd.to_numeric(train_df[col], downcast='float', errors='ignore') #用apply更好