In [None]:
import pandas as pd
import numpy as np
from util import read_data, save, plt_encoding_error, error, normal
import matplotlib.pyplot as plt
from Feature import Feature
from Smote import Smote

In [None]:
# Load the datasets through the project helper `read_data`; its third return value is not used in this notebook.
train, test_A, _ = read_data()

In [None]:
# Remember how many rows each partition has so the stacked frame can be split again later.
num_train, num_test_A = len(train), len(test_A)

# Stack the training rows on top of the test rows so both go through the same feature engineering.
train_m = pd.concat([train, test_A])

In [None]:
# Use the fully qualified option name: on recent pandas the bare 'max_columns' pattern
# matches more than one option (e.g. styler.render.max_columns) and raises OptionError.
pd.set_option('display.max_columns', 100)
train_m.shape

In [None]:
# Regression target, taken from the raw training frame (not from the stacked train_m).
train_y = train['血糖']

In [None]:
# Replace the raw examination date by a one-hot encoded day-of-week feature.
exam_dates = pd.to_datetime(train_m['体检日期'])
train_m = train_m.drop(['体检日期'], axis=1)
train_m['weekday'] = exam_dates.dt.weekday
train_m = pd.get_dummies(train_m, columns=['weekday'])

In [None]:
# Preview the stacked frame after the weekday dummies were added.
train_m.head()

In [None]:
# Lab measurements of which at least one must be present for a training row to be kept.
important_feature = ['*天门冬氨酸氨基转换酶','*丙氨酸氨基转换酶','*碱性磷酸酶','*r-谷氨酰基转换酶','*总蛋白','白蛋白','*球蛋白','白球比例','甘油三酯','总胆固醇','高密度脂蛋白胆固醇','低密度脂蛋白胆固醇','尿素','肌酐','尿酸','乙肝表面抗原','乙肝表面抗体','乙肝e抗原','乙肝e抗体','乙肝核心抗体']
train_imp = train.loc[:, important_feature]
# Boolean mask (by position) of training rows where every listed measurement is missing.
all_missing = train_imp.isnull().all(axis=1).values
row_index = train.index[all_missing]

# train_m is `train` stacked on `test_A` without re-indexing, so each partition keeps its
# own index labels and the same label may occur in both. Dropping by label could therefore
# remove test rows as well, and it left num_train / train_y describing rows that no longer
# exist, which misaligns the later positional split. Filter by position instead and keep
# the bookkeeping in sync with the rows that actually remain.
assert len(train_m) == len(train) + num_test_A, 'run this cell once, on the freshly stacked frame'
keep_rows = np.concatenate([~all_missing, np.ones(num_test_A, dtype=bool)])
train_m = train_m[keep_rows]
train_y = train.loc[~all_missing, '血糖']
num_train = int((~all_missing).sum())

In [None]:
# Check the frame size after rows without any key measurement were removed.
train_m.shape

In [None]:
# First pass with the project-local Feature helper (implemented in Feature.py, not visible here).
# NOTE(review): `feature` is rebuilt from train_m in the next cell and get_train() is never
# called here, so unless these methods mutate train_m in place this cell is only exploratory — confirm.
feature = Feature(train_m)
# NOTE(review): '体检日期' was already dropped from train_m when the weekday dummies were built;
# verify that drop_feature tolerates columns that are no longer present.
feature.drop_feature(['id','体检日期', '性别'])
feature.long_tail()
feature.fix_missing()
feature.statistics()

In [None]:
# Feature-engineering pass whose result replaces train_m (via get_train()).
# NOTE(review): unlike the previous cell, fix_missing() is not called here — confirm this is intended.
feature = Feature(train_m)
feature.statistics()
feature.long_tail()
# Columns removed before modelling: id, sex, examination date and the five hepatitis-B markers.
# NOTE(review): '体检日期' was already dropped earlier; verify drop_feature tolerates that.
drop_list = ['id', '性别','体检日期', '乙肝表面抗原','乙肝表面抗体','乙肝e抗原','乙肝e抗体','乙肝核心抗体']
feature.drop_feature(drop_list)
train_m = feature.get_train()

In [None]:
# Fully qualified option name; the bare 'max_columns' pattern is ambiguous on recent pandas
# (it matches several options) and raises OptionError there.
pd.set_option('display.max_columns', 100)

In [None]:
# Re-split the processed frame into its training and test partitions (split is positional).
train_x = train_m.iloc[:num_train]
test_A_new = train_m.iloc[num_train:num_test_A + num_train]
# train_x = train_x[train_x['血糖']<=10]
# train_y = train_y[train_x['血糖']<=10]

# Remove the target from BOTH partitions. Previously only train_x lost the column, so the
# test matrix carried one extra column and no longer lined up with the training features.
# A non-inplace drop also avoids mutating an iloc slice (SettingWithCopyWarning).
train_x = train_x.drop(['血糖'], axis=1)
test_A_new = test_A_new.drop(['血糖'], axis=1)

In [None]:
# Oversampling (SMOTE on rows with glucose > 10; currently disabled)
# glucose_index = np.where(train_m.columns == '血糖')
# large_glucose = train_m[train_m['血糖'] > 10].index
# train_large_glucose = train_m.iloc[large_glucose]
# train_x_matrix = train_large_glucose.as_matrix()
# s = Smote(train_x_matrix, N=200)
# over_sampling_train = s.over_sampling()
# print(over_sampling_train.shape)

# over_y = over_sampling_train[:,glucose_index]
# over_y = over_y.reshape(len(over_y))
# over_sampling_train = np.delete(over_sampling_train, glucose_index, axis=1)

In [None]:
# Feature names in column order; passed to LightGBM so importances are labelled.
features = train_x.columns.tolist()
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0; `.values`
# returns the same ndarray and works on every pandas version.
train_X = train_x.values
# train_X = np.vstack((train_X, over_sampling_train))
train_Y = train_y.values
# train_Y = np.hstack((train_Y, over_y))
test_X = test_A_new.values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.2, random_state=1)

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched exhaustively (3 * 2 * 3 * 3 * 1 * 3 = 162 combinations,
# each evaluated with 5-fold CV). min_child_samples is tuned through the grid rather
# than fixed on the estimator.
param_grid = {
    'num_leaves': [17, 31, 48],
    'n_estimators': [250, 200],
    'learning_rate': [0.01, 0.02, 0.03],
    'min_child_samples': [10, 20, 40],
    'subsample': [0.8],
    'reg_lambda': [0.0, 0.2, 0.4],
}

# Base estimator: settings that stay constant across the whole grid.
lgb_regressor = lgb.LGBMRegressor(
    objective='regression',
    subsample_freq=1,
    colsample_bytree=1.0,
    reg_alpha=1.0,
    n_jobs=-1,
)

# Candidates are ranked by negated MSE, so a higher (closer to zero) score is better.
grid = GridSearchCV(
    estimator=lgb_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=4,
)
grid.fit(train_X, train_Y)
print('Best parameters found by grid search are:', grid.best_params_, '  best_score: ', grid.best_score_)

In [None]:
# result = grid.predict(test_X)
# data1 = pd.DataFrame(result)
# save(data1, 'lgb_grid_cv')

In [None]:
# Parameters for the native `lgb.train` API, shared by the K-fold and hold-out cells.
# NOTE(review): bagging_freq is set but bagging_fraction is not; per the LightGBM docs
# bagging only takes effect when bagging_fraction < 1.0 — confirm this is intended.
params = dict(
    boosting='gbdt',
    objective='regression',
    metric='mse',
    num_leaves=31,
    min_data_in_leaf=20,
    learning_rate=0.02,
    lambda_l1=1,
    lambda_l2=0.2,
    cat_smooth=10,
    feature_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb
def cv_estimate(n_splits, lgb, train_X, train_Y, online=False):
    """Estimate the validation error with shuffled K-fold cross-validation.

    One booster is trained per fold (300 rounds) using the notebook-level
    ``params`` and ``features``. Every fold's booster also predicts the
    notebook-level ``test_X`` so the per-fold predictions can be averaged.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    lgb : module-like
        Object exposing ``Dataset`` and ``train``; the caller passes the
        ``lightgbm`` module.
    train_X, train_Y : array-like
        Features and targets; both are indexed with the integer index arrays
        produced by ``KFold.split``.
    online : bool, default False
        When True, the fold-averaged test prediction is written with
        ``save(submission, 'lgb_kfold')``.

    Returns
    -------
    The mean over folds of ``error(prediction, truth)``.
    """
    test_preds = np.zeros((test_X.shape[0], n_splits))
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []
    # Distinct index names avoid shadowing the notebook-level `train` / `test` frames.
    for fold, (train_idx, valid_idx) in enumerate(cv.split(train_X, train_Y)):
        train_x_k, train_y_k = train_X[train_idx], train_Y[train_idx]
        valid_x_k, valid_y_k = train_X[valid_idx], train_Y[valid_idx]

        # No validation Dataset is built: training runs a fixed number of rounds
        # without early stopping, so the previously created one was never used.
        lgb_train = lgb.Dataset(train_x_k, train_y_k, feature_name=features)
        gbm = lgb.train(params, lgb_train, num_boost_round=300)

        valid_pred = gbm.predict(valid_x_k, num_iteration=gbm.best_iteration)
        fold_scores.append(error(valid_pred, valid_y_k))

        test_preds[:, fold] = gbm.predict(test_X)
    val_scores = sum(fold_scores) / n_splits
    if online:
        submission = pd.DataFrame({'pred': test_preds.mean(axis=1)})
        save(submission, 'lgb_kfold')
    return val_scores
# Run 5-fold CV without writing a submission file and print the mean validation error.
scores = cv_estimate(5, lgb, train_X, train_Y, online=False)
print(scores)

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Build the LightGBM datasets for the hold-out split created by train_test_split.
lgb_train = lgb.Dataset(X_train, y_train, feature_name=features)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, feature_name=features)

# Full training set; only needed by the disabled retrain-on-everything variant below.
train_all = lgb.Dataset(train_X, train_Y, feature_name=features)

print('Start training...')
# Early stopping is requested through the callback API: the `early_stopping_rounds`
# keyword was removed from lgb.train in LightGBM 4, while the callback is accepted
# by older releases as well.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=283,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(20)])

print('Start predicting...')
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Keep and print the score: a bare `error(...)` call in the middle of a cell is
# evaluated but its result is neither displayed nor stored.
# NOTE(review): X_test also drives early stopping, so this estimate is optimistic.
holdout_error = error(y_test, y_pred)
print('Hold-out error:', holdout_error)

# online: predict the real test set with the early-stopped model
predict = gbm.predict(test_X, num_iteration=gbm.best_iteration)
data1 = pd.DataFrame(predict)
# save
# save(data1, 'lgb')

# gbm_online = lgb.train(params,
#                 train_all,
#                 num_boost_round=280)
# # predict
# predict = gbm_online.predict(test_X, num_iteration=gbm_online.best_iteration)
# data1 = pd.DataFrame(predict)
# # save
# save(data1, 'lgb')

In [None]:
# Project helper imported from util. NOTE(review): presumably configures matplotlib fonts so
# the non-ASCII feature names render in plots — confirm against util.py.
plt_encoding_error()

In [None]:
# Plot the feature importances of `gbm`, the booster trained on the hold-out split.
lgb.plot_importance(gbm)
plt.show()

In [None]:
matplotlib qt