In [73]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [68]:
# Read the training and test sets from the shared zhengqi dataset directory
# (paths are relative to the notebook's working directory).
train_data, test_data = (
    pd.read_table('../../datasets/zhengqi/zhengqi_train.txt'),
    pd.read_table('../../datasets/zhengqi/zhengqi_test.txt'),
)

In [69]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# training frame, used as a quick sanity check of feature and target ranges.
train_data.describe()

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V29,V30,V31,V32,V33,V34,V35,V36,V37,target
count,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,...,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0,2888.0
mean,0.123048,0.056068,0.28972,-0.06779,0.012921,-0.558565,0.182892,0.116155,0.177856,-0.169452,...,0.097648,0.055477,0.127791,0.020806,0.007801,0.006715,0.197764,0.030658,-0.13033,0.126353
std,0.928031,0.941515,0.911236,0.970298,0.888377,0.517957,0.918054,0.955116,0.895444,0.953813,...,1.0612,0.901934,0.873028,0.902584,1.006995,1.003291,0.985675,0.970812,1.017196,0.983966
min,-4.335,-5.122,-3.42,-3.956,-4.742,-2.182,-4.576,-5.048,-4.692,-12.891,...,-2.912,-4.507,-5.859,-4.053,-4.627,-4.789,-5.695,-2.608,-3.63,-3.044
25%,-0.297,-0.22625,-0.313,-0.65225,-0.385,-0.853,-0.31,-0.295,-0.159,-0.39,...,-0.664,-0.283,-0.17025,-0.40725,-0.499,-0.29,-0.2025,-0.413,-0.79825,-0.35025
50%,0.359,0.2725,0.386,-0.0445,0.11,-0.466,0.388,0.344,0.362,0.042,...,-0.023,0.0535,0.2995,0.039,-0.04,0.16,0.364,0.137,-0.1855,0.313
75%,0.726,0.599,0.91825,0.624,0.55025,-0.154,0.83125,0.78225,0.726,0.042,...,0.74525,0.488,0.635,0.557,0.462,0.273,0.602,0.64425,0.49525,0.79325
max,2.121,1.918,2.828,2.457,2.689,0.489,1.895,1.918,2.245,1.335,...,4.58,2.689,2.013,2.395,5.465,5.11,2.324,5.238,3.0,2.538


In [70]:
# Separate the regression target from the features and convert everything to
# plain NumPy arrays.
# NOTE(review): assumes test_data has the same feature columns, in the same
# order, as train_data without 'target' -- confirm against the test file.
y_train = train_data['target'].values
X_train = train_data.drop('target', axis=1).values
X_test = test_data.values

# Learn the per-feature min/max on the training features only, then apply that
# SAME mapping to the test features. Re-fitting the scaler on the test set
# (fit_transform) would scale train and test differently, so a model trained
# on X_train would see inconsistently scaled inputs at prediction time.
# Test values outside the training range may therefore fall outside [0, 1].
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

In [71]:
##### xgb
# 5-fold cross-validated gradient-boosted trees.
# Produces:
#   oof_xgb         -- out-of-fold prediction for every training row
#   predictions_xgb -- test prediction averaged over the per-fold models
xgb_params = {
    'eta': 0.1,
    'max_depth': 4,
    'subsample': 0.6,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': True,
    'nthread': 16,
}

# reg:linear = linear regression; reg:logistic = logistic regression
# NOTE(review): newer xgboost releases rename 'reg:linear' to
# 'reg:squarederror', replace 'silent' with 'verbosity', and replace
# `ntree_limit` with `iteration_range` -- confirm the installed version
# before migrating these.

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_xgb = np.zeros(len(train_data))
predictions_xgb = np.zeros(len(test_data))

# The test matrix is identical for every fold, so build it once instead of
# re-wrapping X_test inside the loop.
test_dmatrix = xgb.DMatrix(X_test)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])

    # Early stopping monitors the last entry of the watchlist (the validation fold).
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(
        params=xgb_params,
        dtrain=trn_data,
        num_boost_round=20000,
        evals=watchlist,
        early_stopping_rounds=200,
        verbose_eval=100,
    )

    # Predict with only the trees up to the best early-stopping iteration.
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
    # Each fold's model contributes an equal share to the averaged test prediction.
    predictions_xgb += clf.predict(test_dmatrix, ntree_limit=clf.best_ntree_limit) / folds.n_splits

# Overall out-of-fold mean squared error (arguments in y_true, y_pred order).
print("CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_xgb)))

fold n°1
[0]	train-rmse:0.962224	valid_data-rmse:0.981312
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.
[100]	train-rmse:0.213893	valid_data-rmse:0.332196
[200]	train-rmse:0.153767	valid_data-rmse:0.327006
[300]	train-rmse:0.114048	valid_data-rmse:0.325163
[400]	train-rmse:0.088236	valid_data-rmse:0.321667
[500]	train-rmse:0.068231	valid_data-rmse:0.320212
[600]	train-rmse:0.053331	valid_data-rmse:0.320307
[700]	train-rmse:0.041446	valid_data-rmse:0.320105
[800]	train-rmse:0.03242	valid_data-rmse:0.319863
[900]	train-rmse:0.025701	valid_data-rmse:0.320174
[1000]	train-rmse:0.02062	valid_data-rmse:0.320053
Stopping. Best iteration:
[810]	train-rmse:0.031623	valid_data-rmse:0.319828

fold n°2
[0]	train-rmse:0.977985	valid_data-rmse:0.904619
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved 

In [79]:
##### ridge
# 5-fold cross-validated ridge regression.
# Produces:
#   oof_reg         -- out-of-fold prediction for every training row
#   predictions_reg -- test prediction averaged over the per-fold fits
reg = Ridge(alpha=0.5)

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
n_train_rows, n_test_rows = len(train_data), len(test_data)
oof_reg = np.zeros(n_train_rows)
predictions_reg = np.zeros(n_test_rows)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))

    X_fold_val, y_fold_val = X_train[val_idx], y_train[val_idx]

    # The same estimator object is re-fitted on each fold's training part.
    clf = reg.fit(X_train[trn_idx], y_train[trn_idx])
    oof_reg[val_idx] = clf.predict(X_fold_val)

    # NOTE(review): the label says "fold n°" but the value printed here is the
    # fold's validation MSE; the text is kept as-is to match recorded output.
    fold_mse = mean_squared_error(oof_reg[val_idx], y_fold_val)
    print("fold n°{}".format(fold_mse))

    # Each fold's fit contributes an equal share to the averaged test prediction.
    predictions_reg += clf.predict(X_test) / folds.n_splits

# Overall out-of-fold mean squared error across all training rows.
cv_mse = mean_squared_error(oof_reg, y_train)
print("CV score: {:<8.8f}".format(cv_mse))

fold n°1
fold n°0.10888544804801513
fold n°2
fold n°0.13954093483790903
fold n°3
fold n°0.0943499326025632
fold n°4
fold n°0.11501970468666108
fold n°5
fold n°0.10329170773049902
CV score: 0.11221967


In [84]:
# Write the fold-averaged ridge test predictions as the submission file:
# no index column and no header row.
# `index` and `header` are boolean switches, so pass False explicitly rather
# than relying on None being treated as falsy.
# NOTE(review): only the ridge predictions are written; predictions_xgb is not
# used here -- confirm that is intended.
series = pd.Series(predictions_reg)
series.to_csv('../../datasets/zhengqi/submit.txt', index=False, header=False)