In [149]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR,SVC
from sklearn.linear_model import LinearRegression, ElasticNet,Ridge,Lasso,RidgeClassifierCV,LogisticRegression
from sklearn.model_selection import KFold  
from sklearn.feature_selection import SelectPercentile, f_regression
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor
import copy as cp
from sklearn.preprocessing import MaxAbsScaler, Normalizer
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from imblearn.under_sampling import NearMiss ,RandomUnderSampler, NeighbourhoodCleaningRule, OneSidedSelection, AllKNN
from imblearn.over_sampling import SMOTE
from xgboost import XGBRegressor
from imblearn.ensemble import EasyEnsemble 
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier

In [150]:
# Load the preprocessed training and test tables.
train = pd.read_csv("../data/processed/train.csv")
test = pd.read_csv("../data/processed/test.csv")
# Remove the `id` column from both frames so it is not part of the feature matrices.
train.pop("id")
test.pop("id")
# "血糖" (blood glucose) is the regression target; pop() also removes it from `train`.
target = train.pop("血糖")

# DataFrame/Series.as_matrix() is deprecated and was removed in pandas 1.0;
# `.values` yields the same NumPy arrays on both old and new pandas versions.
train_x = train.values
train_y = target.values
test_x = test.values

In [151]:
# Binary label for every training sample: +1 when the target is below 10,
# -1 otherwise (float array with the same length as train_y).
high_labels = np.where(train_y < 10, 1.0, -1.0)

In [152]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.utils import check_array
from sklearn.preprocessing import OneHotEncoder

class myStackingFeatures(BaseEstimator, TransformerMixin):
    """Logistic regression stacked on one-hot encoded boosted-tree leaf indices.

    ``fit`` trains a LightGBM classifier, one-hot encodes the output of its
    ``apply`` method on the training data, and fits a logistic regression on
    that encoding. ``predict`` / ``predict_proba`` push new samples through the
    same three stages.

    NOTE(review): ``apply`` is assumed to return the per-tree leaf index of each
    sample -- confirm for the installed lightgbm version.
    NOTE(review): the class inherits ``TransformerMixin`` but defines no
    ``transform`` method, so ``fit_transform`` is not usable.
    """

    def __init__(self):
        self.estimator = None
        self.lgb = lgb.LGBMClassifier(boosting_type="GBDT",
                                      num_leaves=31,
                                      learning_rate=0.01,
                                      feature_fraction=0.5,
                                      bagging_fraction=0.5,
                                      bagging_freq=5,
                                      n_estimators=400)
        self.grd_enc = OneHotEncoder()
        self.lr = LogisticRegression()
        # Hard-coded label set; presumably read by ensemble wrappers such as
        # BaggingClassifier -- TODO confirm.
        self.classes_ = [-1,1]

    def _encode_leaves(self, X):
        """Return the one-hot encoding of the boosted model's ``apply`` output for X."""
        return self.grd_enc.transform(self.lgb.apply(X))

    def fit(self, X, y=None, **fit_params):
        """Fit the boosted trees, the leaf encoder and the logistic regression.

        Parameters
        ----------
        X : array-like
            Training features.
        y : array-like
            Training labels.
        **fit_params
            Accepted for signature compatibility; not used.

        Returns
        -------
        self
            The fitted estimator, as the scikit-learn estimator contract expects.
        """
        self.lgb.fit(X, y)
        # Compute the leaf indices once and reuse them for both the encoder
        # fit and the logistic-regression fit.
        leaves = self.lgb.apply(X)
        self.grd_enc.fit(leaves)
        self.lr.fit(self.grd_enc.transform(leaves), y)
        return self

    def predict_proba(self, X):
        """Return the logistic regression's class probabilities for X."""
        return self.lr.predict_proba(self._encode_leaves(X))

    def predict(self, X):
        """Return the logistic regression's predicted labels for X."""
        return self.lr.predict(self._encode_leaves(X))

In [153]:
def modif_value(training_features, training_target, training_labels, testing_features, X, Y):
    """Flag test rows with a +1/-1 classifier and re-predict the -1 rows with a regressor.

    A stacking/bagging classifier pipeline is fitted on ``training_features`` and
    ``training_labels`` and applied to ``testing_features``. A row is labelled
    ``1`` when column 1 of ``predict_proba`` exceeds 0.5 and ``-1`` otherwise.
    If at least one row is labelled ``-1``, a regression pipeline is fitted on
    ``(X, Y)`` and used to predict those rows.

    Parameters
    ----------
    training_features : numpy.ndarray
        Features used to fit the classifier.
    training_target : numpy.ndarray
        Not used by the current implementation; kept for signature compatibility.
    training_labels : numpy.ndarray
        Labels (+1 / -1) used to fit the classifier.
    testing_features : numpy.ndarray
        Rows to classify and, for the -1 rows, to re-predict.
    X, Y : numpy.ndarray
        Features and targets used to fit the regression pipeline.

    Returns
    -------
    tuple
        ``(negative_results, negative_pred_list, positive_results, positive_pred_list)``.
        ``negative_results`` is an empty list when no row is labelled -1,
        otherwise the regressor's predictions for those rows. The two ``*_pred_list``
        values are lists of row indices into ``testing_features``.
        ``positive_results`` is always ``None`` (the regressor for the +1 rows
        is disabled).
    """
    classifier = Pipeline([
            ("scaler",MaxAbsScaler()),
            ("SVR",StackingEstimator(estimator=SVC())),
            ("RidgeCV",StackingEstimator(estimator=RidgeClassifierCV())),
            ("BaggingClassifier",BaggingClassifier(base_estimator=myStackingFeatures(),random_state=201801))])
    classifier.fit(training_features, training_labels)
    prob = classifier.predict_proba(testing_features)

    # Threshold the second probability column at 0.5 to obtain +1 / -1 labels.
    predicts = np.where(prob[:, 1] > 0.5, 1.0, -1.0)
    negative_pred_list = list(np.where(predicts == -1)[0])
    positive_pred_list = list(np.where(predicts == 1)[0])

    if len(negative_pred_list) == 0:
        negative_results = []
    else:
        regressor = Pipeline([
            ("scaler", MaxAbsScaler()),
            ("SVR", StackingEstimator(
                estimator=LinearSVR(C=0.01, dual=False, epsilon=1.0, loss="squared_epsilon_insensitive", tol=0.001))),
            ("RidgeCV", StackingEstimator(estimator=RidgeCV())),
            ("LGB", lgb.LGBMRegressor(objective='regression',
                                      boosting_type="GBDT",
                                      num_leaves=31,
                                      learning_rate=0.01,
                                      feature_fraction=0.5,
                                      bagging_fraction=0.5,
                                      bagging_freq=5,
                                      n_estimators=400))]
        )
        # Report how many training samples carry the -1 label.
        print(len(np.where(training_labels == -1)[0]))
        regressor.fit(X, Y)
        negative_results = regressor.predict(testing_features[negative_pred_list])

    # No regressor is trained for the +1 rows, so there is nothing to return for them.
    positive_results = None

    return negative_results,negative_pred_list, positive_results,positive_pred_list

In [166]:
class myStackingFeaturesRegressor(BaseEstimator, TransformerMixin):
    """Ridge regression stacked on one-hot encoded gradient-boosting leaf indices.

    ``fit`` trains a ``GradientBoostingRegressor``, one-hot encodes the output of
    its ``apply`` method on the training data, and fits ``RidgeCV`` on that
    encoding. ``predict`` pushes new samples through the same three stages.

    NOTE(review): the class inherits ``TransformerMixin`` but defines no
    ``transform`` method, so ``fit_transform`` is not usable.
    """

    def __init__(self):
        self.estimator = None
        # NOTE(review): loss='ls' was renamed to 'squared_error' in
        # scikit-learn 1.0 -- update this argument when upgrading.
        self.lgb = GradientBoostingRegressor(loss='ls', alpha=0.9,
                                    n_estimators=200,
                                    learning_rate=0.02,
                                    max_depth=8,
                                    subsample=0.8,
                                    min_samples_split=9,
                                    max_leaf_nodes=12)
        self.grd_enc = OneHotEncoder()
        self.lr = RidgeCV()
        # NOTE(review): a class-label list on a regressor looks like a leftover
        # from the classifier variant; kept so the public attribute is unchanged.
        self.classes_ = [-1,1]

    def _encode_leaves(self, X):
        """Return the one-hot encoding of the boosted model's ``apply`` output for X."""
        return self.grd_enc.transform(self.lgb.apply(X))

    def fit(self, X, y=None, **fit_params):
        """Fit the boosted trees, the leaf encoder and the ridge regression.

        Parameters
        ----------
        X : array-like
            Training features.
        y : array-like
            Training targets.
        **fit_params
            Accepted for signature compatibility; not used.

        Returns
        -------
        self
            The fitted estimator, as the scikit-learn estimator contract expects.
        """
        self.lgb.fit(X, y)
        # Compute the leaf indices once and reuse them for both the encoder
        # fit and the ridge fit.
        leaves = self.lgb.apply(X)
        self.grd_enc.fit(leaves)
        self.lr.fit(self.grd_enc.transform(leaves), y)
        return self

    def predict(self, X):
        """Return the ridge regression's predictions for X."""
        return self.lr.predict(self._encode_leaves(X))

In [172]:
N = 5
# The folds are not shuffled. `random_state` was dropped because it has no
# effect without shuffle=True and newer scikit-learn versions raise a
# ValueError for that combination. Use KFold(n_splits=N, shuffle=True,
# random_state=42) instead if seeded shuffling is wanted.
kf = KFold(n_splits=N)
# One column of test-set predictions per fold.
test_preds = np.zeros((test_x.shape[0], N))
for fold, (train_index, _) in enumerate(kf.split(train_x)):
    # Only the training part of each split is used; the held-out part is ignored.
    training_features, training_target = train_x[train_index], train_y[train_index]

    exported_pipeline = Pipeline([
        ("scaler", MaxAbsScaler()),
        ("SVR", StackingEstimator(estimator=LinearSVR(C=0.01, dual=False, epsilon=1.0, loss="squared_epsilon_insensitive", tol=0.001))),
        ("RidgeCV", StackingEstimator(estimator=RidgeCV())),
        ("LGB", lgb.LGBMRegressor(objective='regression',
                                  boosting_type="GBDT",
                                  num_leaves=17,
                                  learning_rate=0.01,
                                  feature_fraction=0.5,
                                  bagging_fraction=0.5,
                                  bagging_freq=5,
                                  reg_alpha=0.1,
                                  reg_lambda=0.1,
                                  n_estimators=400))]
    )

    exported_pipeline.fit(training_features, training_target)
    # Each fold's model predicts the complete test set.
    test_preds[:, fold] = exported_pipeline.predict(test_x)
# Final prediction: plain average over the per-fold predictions.
results = test_preds.mean(axis=1)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


In [173]:
#results[313] = 15.4860937360076
#results[938] = 17.5400019823901
# Single-column frame (column label 0) holding the values in `results`.
ouput = pd.DataFrame()
ouput[0] = results

# Print summary statistics, then the rows whose prediction exceeds 8.
stats = ouput.describe()
print(stats)
above_8 = ouput[0] > 8
print(ouput.loc[above_8])
# Write the predictions as a CSV without header row or index column.
ouput.to_csv("../result/1.18-LiuYuJIA.csv", header=None, index=False, encoding="utf-8")

                 0
count  1000.000000
mean      5.677471
std       0.723990
min       4.923878
25%       5.200138
50%       5.465292
75%       5.882901
max      10.523390
             0
33    8.524911
144   8.104265
247   8.887586
264   8.056839
267   9.450145
292   8.542863
303   8.795826
313   9.824358
564   8.174529
601   8.061152
602   8.176458
628   8.407411
722   8.135341
822   8.474058
928   8.934779
938  10.523390
951   8.333260
959   8.023661
968   8.539247
973   8.453776
997   8.357614


In [132]:
#ouput.loc[ouput[0]>6.64]

In [None]:
                 0
count  1000.000000
mean      5.683937
std       0.774499
min       4.923878
25%       5.200138
50%       5.465292
75%       5.882901
max      14.441510
             0
267   9.450145
313  11.205019
928  10.102352
938  14.441510

In [None]:
#not best
# Reload the preprocessed tables and fit one pipeline on the full training set.
train = pd.read_csv("../data/processed/train.csv")
test = pd.read_csv("../data/processed/test.csv")
# Remove the `id` column from both frames so it is not part of the feature matrices.
train.pop("id")
test.pop("id")
# "血糖" (blood glucose) is the regression target; pop() also removes it from `train`.
target = train.pop("血糖")

# DataFrame/Series.as_matrix() is deprecated and was removed in pandas 1.0;
# `.values` yields the same NumPy arrays on both old and new pandas versions.
train_x = train.values
train_y = target.values
test_x = test.values
exported_pipeline = Pipeline([
    ("scaler",MaxAbsScaler()),
    ("SVR",StackingEstimator(estimator=LinearSVR(C=0.01, dual=False, epsilon=1.0, loss="squared_epsilon_insensitive", tol=0.001))),
    ("RidgeCV",StackingEstimator(estimator=RidgeCV())),
    ("LGB", lgb.LGBMRegressor(objective='regression',
                      boosting_type="GBDT",
                      num_leaves=17,
                      learning_rate=0.01,
                      feature_fraction=0.5,
                      bagging_fraction=0.5,
                      bagging_freq=5,
                      reg_alpha=0.5,
                      reg_lambda=0.5,
                      n_estimators=400))]
)
exported_pipeline.fit(train_x, train_y)
results_normal = exported_pipeline.predict(test_x)
#results_normal[313] = 15.4860937360076
#results_normal[938] = 17.5400019823901
# Single-column frame (column label 0) holding the test-set predictions.
ouput = pd.DataFrame()
ouput[0] = results_normal
#ouput.to_csv("../result/1.14-LiuYuJia-0.96242-withoutPop-modifyValue.csv", header=None, index=False,encoding="utf-8")
ouput.describe()


In [16]:
#best
# Load the "best" variant of the preprocessed tables and fit one pipeline on
# the full training set.
train = pd.read_csv("../data/processed/train_best.csv")
test = pd.read_csv("../data/processed/test_best.csv")
# Remove the `id` column from both frames so it is not part of the feature matrices.
train.pop("id")
test.pop("id")
# "血糖" (blood glucose) is the regression target; pop() also removes it from `train`.
target = train.pop("血糖")

# DataFrame/Series.as_matrix() is deprecated and was removed in pandas 1.0;
# `.values` yields the same NumPy arrays on both old and new pandas versions.
train_x = train.values
train_y = target.values
test_x = test.values

exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    StackingEstimator(estimator=LinearSVR(C=0.01, dual=False, epsilon=1.0, loss="squared_epsilon_insensitive", tol=0.001)),
    StackingEstimator(estimator=RidgeCV()),
    lgb.LGBMRegressor(objective='regression',
                    boosting_type ="GBDT",
                    num_leaves=17,
                    learning_rate=0.01,
                    feature_fraction=0.5,
                    bagging_fraction=0.5,
                    bagging_freq=5,
                    reg_alpha=1,
                    reg_lambda=0.5,
                    n_estimators=500)
    )
exported_pipeline.fit(train_x, train_y)
results_best = exported_pipeline.predict(test_x)
#results_best[313] = 15.4860937360076
#results_best[938] = 17.5400019823901
# Single-column frame (column label 0) holding the test-set predictions.
ouput = pd.DataFrame()
ouput[0] = results_best
# Print summary statistics, then the rows whose prediction exceeds 8.
print(ouput.describe())
print(ouput.loc[ouput[0]>8])
# Write the predictions as a CSV without header row or index column.
ouput.to_csv("../result/1.16-LiuYuJia-withoutvalue-adddateonehot.csv", header=None, index=False,encoding="utf-8")

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


                 0
count  1000.000000
mean      5.676744
std       0.757935
min       4.892651
25%       5.181367
50%       5.451028
75%       5.881461
max      10.797793
             0
33    9.049166
169   8.112911
185   8.605971
245   8.234955
247   9.531291
249   8.553951
267   9.373990
292   8.721753
303   8.763350
313   9.854436
330   8.012021
446   8.033770
602   8.235244
628   8.465243
722   8.060280
822   8.437048
846   8.127580
928   9.122541
938  10.797793
951   8.455134
968   8.721563
971   8.064821
973   8.731417
997   8.674035
