In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [9]:
# Read the tab-separated train / test files into DataFrames.
train_path = '../../datasets/zhengqi/zhengqi_train.txt'
test_path = '../../datasets/zhengqi/zhengqi_test.txt'
train_data, test_data = (pd.read_table(path) for path in (train_path, test_path))

In [10]:
# Display per-column summary statistics (count, mean, std, min, quartiles, max)
# of the training frame, including the 'target' column.
train_data.describe()

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V29,V30,V31,V32,V33,V34,V35,V36,V37,target
count,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,...,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0
mean,0.123048,0.056068,0.28972,-0.06779,0.012921,-0.558565,0.182892,0.116155,0.177856,-0.169452,...,0.097648,0.055477,0.127791,0.020806,0.007801,0.006715,0.197764,0.030658,-0.13033,0.126353
std,0.928031,0.941515,0.911236,0.970298,0.888377,0.517957,0.918054,0.955116,0.895444,0.953813,...,1.0612,0.901934,0.873028,0.902584,1.006995,1.003291,0.985675,0.970812,1.017196,0.983966
min,-4.335,-5.122,-3.42,-3.956,-4.742,-2.182,-4.576,-5.048,-4.692,-12.891,...,-2.912,-4.507,-5.859,-4.053,-4.627,-4.789,-5.695,-2.608,-3.63,-3.044
25%,-0.297,-0.22625,-0.313,-0.65225,-0.385,-0.853,-0.31,-0.295,-0.159,-0.39,...,-0.664,-0.283,-0.17025,-0.40725,-0.499,-0.29,-0.2025,-0.413,-0.79825,-0.35025
50%,0.359,0.2725,0.386,-0.0445,0.11,-0.466,0.388,0.344,0.362,0.042,...,-0.023,0.0535,0.2995,0.039,-0.04,0.16,0.364,0.137,-0.1855,0.313
75%,0.726,0.599,0.91825,0.624,0.55025,-0.154,0.83125,0.78225,0.726,0.042,...,0.74525,0.488,0.635,0.557,0.462,0.273,0.602,0.64425,0.49525,0.79325
max,2.121,1.918,2.828,2.457,2.689,0.489,1.895,1.918,2.245,1.335,...,4.58,2.689,2.013,2.395,5.465,5.11,2.324,5.238,3.0,2.538


In [11]:
# Separate the regression target from the feature columns of the training
# frame. The test frame is used as-is (assumed to hold only the feature
# columns, in the same order as the training features -- TODO confirm).
y_train = train_data['target'].values
X_train = train_data.drop('target', axis=1).values
X_test = test_data.values

# Learn the per-feature min/max on the TRAINING data only and apply that same
# mapping to the test data. Calling fit_transform on the test set would refit
# the scaler on the test set's own ranges, so the same raw value would be
# mapped to different scaled values in train and test, and models trained on
# X_train would see shifted inputs at prediction time.
# Note: with a train-fitted scaler, test values may fall slightly outside
# [0, 1]; that is expected.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

# Only the last expression of a cell is displayed, so show just the scaled
# test matrix (the earlier bare `X_train` expression had no effect).
X_test

In [23]:
##### xgb
# Booster configuration passed to xgb.train.
xgb_params = {
    'eta': 0.02,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': True,
    'nthread': 16,
}

# reg:linear -> linear regression; reg:logistic -> logistic regression

# 5-fold shuffled CV: collect out-of-fold predictions for the training rows
# and average the per-fold predictions for the test rows.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_xgb = np.zeros(len(train_data))
predictions_xgb = np.zeros(len(test_data))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))

    # Labelled matrices for fitting and for the early-stopping watchlist.
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]

    clf = xgb.train(
        params=xgb_params,
        dtrain=trn_data,
        num_boost_round=20000,
        evals=watchlist,
        early_stopping_rounds=200,
        verbose_eval=100,
    )

    # Predict with the tree limit reported by the trained booster.
    best_limit = clf.best_ntree_limit
    val_features = xgb.DMatrix(X_train[val_idx])
    test_features = xgb.DMatrix(X_test)

    oof_xgb[val_idx] = clf.predict(val_features, ntree_limit=best_limit)
    fold_test_pred = clf.predict(test_features, ntree_limit=best_limit)
    predictions_xgb += fold_test_pred / folds.n_splits

# Mean squared error of the out-of-fold predictions over the whole training set.
cv_mse_xgb = mean_squared_error(oof_xgb, y_train)
print("CV score: {:<8.8f}".format(cv_mse_xgb))

fold n°1
[0]	train-rmse:1.03032	valid_data-rmse:1.04804
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.
[100]	train-rmse:0.212907	valid_data-rmse:0.396633
[200]	train-rmse:0.073367	valid_data-rmse:0.35062
[300]	train-rmse:0.037428	valid_data-rmse:0.345292
[400]	train-rmse:0.020981	valid_data-rmse:0.343765
[500]	train-rmse:0.012301	valid_data-rmse:0.343202
[600]	train-rmse:0.00703	valid_data-rmse:0.34292
[700]	train-rmse:0.004044	valid_data-rmse:0.342757
[800]	train-rmse:0.002402	valid_data-rmse:0.342697
[900]	train-rmse:0.001426	valid_data-rmse:0.342668
[1000]	train-rmse:0.000881	valid_data-rmse:0.342656
[1100]	train-rmse:0.000594	valid_data-rmse:0.34265
[1200]	train-rmse:0.00052	valid_data-rmse:0.342649
[1300]	train-rmse:0.000502	valid_data-rmse:0.342647
[1400]	train-rmse:0.000492	valid_data-rmse:0.342647
Stopping. Best iteration:
[1281]	train-rmse:0.000503	valid_data-rmse:0.342

In [13]:
##### ridge
# Ridge regression baseline evaluated with the same 5-fold shuffled split
# settings (n_splits=5, random_state=2019) as the xgb cell.
reg = Ridge(alpha = .5)

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_reg = np.zeros(len(train_data))      # out-of-fold predictions for the training rows
predictions_reg = np.zeros(len(test_data))  # fold-averaged predictions for the test rows

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))

    # fit() returns the estimator itself, refit on this fold's training rows.
    clf = reg.fit(X_train[trn_idx], y_train[trn_idx])
    oof_reg[val_idx] = clf.predict(X_train[val_idx])

    # Report this fold's validation MSE under its own label; the previous
    # message reused "fold n°{}" and made the score look like a fold number.
    fold_mse = mean_squared_error(y_train[val_idx], oof_reg[val_idx])
    print("fold MSE: {:<8.8f}".format(fold_mse))

    predictions_reg += clf.predict(X_test) / folds.n_splits

# Overall out-of-fold MSE, with arguments in (y_true, y_pred) order.
print("CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_reg)))

fold n°1
fold n°0.10888544804801513
fold n°2
fold n°0.13954093483790903
fold n°3
fold n°0.0943499326025632
fold n°4
fold n°0.11501970468666108
fold n°5
fold n°0.10329170773049902
CV score: 0.11221967


In [18]:
#  lgb
# Training configuration passed to lgb.train.
# NOTE(review): 'min_data_in_leaf' and 'min_child_samples' look like aliases of
# the same LightGBM setting (both are 30 here) -- confirm and keep only one.
param = {
    'num_leaves': 120,
    'min_data_in_leaf': 30,
    'objective': 'regression',
    'max_depth': -1,
    'learning_rate': 0.008,
    "min_child_samples": 30,
    "boosting": "gbdt",
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.9,
    "bagging_seed": 11,
    "metric": 'mse',
    "lambda_l1": 0.1,
    "verbosity": -1,
}

# max_depth < 0 means the tree depth is not limited

# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_round = 10000

# 6-fold shuffled CV: out-of-fold predictions for the training rows and
# fold-averaged predictions for the test rows.
folds = KFold(n_splits=6, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train_data))
predictions_lgb = np.zeros(len(test_data))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))

    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    clf = lgb.train(
        params=param,
        train_set=trn_data,
        num_boost_round=num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=200,
        early_stopping_rounds=100,
    )

    # Predict with the best iteration found by early stopping.
    best_iter = clf.best_iteration
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=best_iter)

    fold_test_pred = clf.predict(X_test, num_iteration=best_iter)
    predictions_lgb += fold_test_pred / folds.n_splits

# Mean squared error of the out-of-fold predictions over the whole training set.
cv_mse_lgb = mean_squared_error(oof_lgb, y_train)
print("CV score: {:<8.8f}".format(cv_mse_lgb))

fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.116777	valid_1's l2: 0.160869
[400]	training's l2: 0.0506182	valid_1's l2: 0.121711
[600]	training's l2: 0.031426	valid_1's l2: 0.11517
[800]	training's l2: 0.0211446	valid_1's l2: 0.112382
[1000]	training's l2: 0.014749	valid_1's l2: 0.11096
[1200]	training's l2: 0.0104901	valid_1's l2: 0.11008
[1400]	training's l2: 0.00762403	valid_1's l2: 0.109445
[1600]	training's l2: 0.00564447	valid_1's l2: 0.109022
[1800]	training's l2: 0.00424934	valid_1's l2: 0.108665
[2000]	training's l2: 0.00323886	valid_1's l2: 0.108534
Early stopping, best iteration is:
[1952]	training's l2: 0.0034542	valid_1's l2: 0.108507
fold n°2
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.11778	valid_1's l2: 0.165324
[400]	training's l2: 0.0516095	valid_1's l2: 0.112552
[600]	training's l2: 0.0323487	valid_1's l2: 0.104511
[800]	training's l2: 0.0221759	valid_1's l2: 0.101972
[1000]	t

In [19]:
# Write the fold-averaged LightGBM test predictions to the submission file,
# one value per line, without index or header.
submit_path = '../../datasets/zhengqi/submit.txt'
series = pd.Series(predictions_lgb)
series.to_csv(submit_path, index=None, header=None)