In [26]:
#模型融合
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
# Estimator instances used as base learners (and meta-learner) of the stacking ensemble.
svr = SVC(C=10, kernel='rbf')
xgb = XGBClassifier(learning_rate=0.1,
                      n_estimators=470,           # number of boosting rounds (trees)
                      max_depth=12,               # maximum depth of each tree
                      min_child_weight=1,         # minimum sum of instance weight required in a child
                      gamma=0.3,                  # minimum loss reduction required to split a leaf further
                      subsample=1,                # use every training row for each tree
                      colsample_bytree=1,         # use every feature for each tree (was misspelled `colsample_btree`)
                      random_state=27)
knn = KNeighborsClassifier(n_neighbors=25)
gb = GradientBoostingClassifier(learning_rate=0.1,n_estimators=200,min_samples_leaf=80,min_samples_split=80,
                                  max_depth=8,max_features='sqrt',subsample=0.8,random_state=10)
rf = RandomForestClassifier(n_estimators= 150, max_depth=7, min_samples_split=70,
                                  min_samples_leaf=60,random_state=10)

class StackingAveragedModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Two-level stacking classifier trained on out-of-fold base-model predictions.

    Each base model is cloned and fitted once per K-fold split. The class labels
    predicted on the held-out part of every split form one meta-feature column
    per base model, and a clone of ``meta_model`` is fitted on those columns.

    Parameters
    ----------
    base_models : sequence of estimators
        Unfitted classifiers exposing ``fit`` and ``predict``.
    meta_model : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    n_folds : int, default=5
        Number of K-fold splits used to build the out-of-fold predictions.

    Attributes
    ----------
    base_models_ : list of lists
        ``base_models_[i]`` holds the ``n_folds`` fitted clones of ``base_models[i]``.
    meta_model_ : estimator
        Fitted clone of ``meta_model``.
    classes_ : ndarray
        Class labels, taken from the fitted meta-model when it exposes them.

    Notes
    -----
    NOTE(review): during ``fit`` a meta-feature is the hard label predicted by a
    single fold model, whereas at prediction time it is the mean of the hard
    labels predicted by all fold models, so it can take fractional values the
    meta-model never saw while training.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    @staticmethod
    def _as_row_indexable(data):
        """Return ``data`` in a form that supports ``data[int_index_array]`` row selection.

        pandas objects (label-based ``[]``) and shapeless sequences such as lists
        are converted to ndarrays; anything else that already has a ``shape``
        (ndarray, sparse matrix) is returned untouched.
        """
        if hasattr(data, "iloc") or not hasattr(data, "shape"):
            return np.asarray(data)
        return data

    def fit(self, x_train , y):
        """Fit the per-fold base-model clones and then the meta-model.

        Parameters
        ----------
        x_train : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        x_train = self._as_row_indexable(x_train)
        y = np.asarray(y)

        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        # Fixed seed so the fold assignment is reproducible between runs.
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Out-of-fold predictions: one row per training sample, one column per base model.
        out_of_fold_predictions = np.zeros((x_train.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(x_train, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(x_train[train_index], y[train_index])
                # Only rows the clone has not seen during fitting are filled in.
                out_of_fold_predictions[holdout_index, i] = instance.predict(x_train[holdout_index])

        # The meta-model learns to map base-model predictions to the true labels.
        self.meta_model_.fit(out_of_fold_predictions, y)
        self.classes_ = getattr(self.meta_model_, "classes_", np.unique(y))
        return self

    def _meta_features(self, X):
        """Build the meta-feature matrix for ``X``.

        Each column is the mean, over the fold clones of one base model, of the
        labels those clones predict for ``X``.
        """
        return np.column_stack ([
            np.column_stack([model.predict (X) for model in fold_models]).mean (axis=1)
            for fold_models in self.base_models_])

    def predict_proba(self, X):
        """Return the meta-model's class probabilities for ``X``."""
        return self.meta_model_.predict_proba(self._meta_features(X))

    def predict(self, X):
        """Return the meta-model's predicted class labels for ``X``.

        ``ClassifierMixin.score`` relies on this method being present.
        """
        return self.meta_model_.predict(self._meta_features(X))

# Stack the five classifiers defined above; the XGBoost configuration is also
# passed as the meta-learner template.
stacked_averaged_models = StackingAveragedModels(
    base_models=(svr, gb, knn, rf, xgb),
    meta_model=xgb,
)

In [18]:
import pandas as pd
import numpy as np
# Load the training and test tables; the first CSV column becomes the row index.
train = pd.read_csv(filepath_or_buffer='train_128.csv', index_col=0)
test = pd.read_csv(filepath_or_buffer='test/test_128.csv', index_col=0)

In [19]:
# Features are every column except the last; the last column is the target.
x = np.array(train.iloc[:, :-1])
y = np.array(train.iloc[:, -1])

In [20]:
# Oversample with the SMOTE implementation from the imbalanced-learn (imblearn) library.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from collections import Counter
# Build the SMOTE sampler; random_state seeds it so the synthetic samples are reproducible.
smo = SMOTE(random_state=42)
# `fit_resample` is the supported method name; the old `fit_sample` alias was
# deprecated and is no longer available in current imbalanced-learn releases.
X,Y = smo.fit_resample(x, y)
print(Counter(Y))
# NOTE(review): the split is made after oversampling, so synthetic rows in
# x_train may be interpolated from rows that end up in x_test; scores measured
# on x_test can therefore be optimistic. Confirm whether this is intended.
# NOTE(review): the training cell shown in this notebook fits on x / y, not on
# these resampled arrays - verify which data the ensemble should be trained on.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=0)
print(len(x_train))
print(len(x_test))

Counter({0: 4524, 1: 4524})
6333
2715


In [27]:
# Convert the test table to a plain array before handing it to the ensemble.
test = np.array(test)
# Fit on the x / y arrays (the oversampled X / Y are not used here);
# fit() hands back the fitted estimator, which is kept as `ensemble`.
ensemble = stacked_averaged_models.fit(x, y)
# Class-probability matrix for the test rows.
y_pred = ensemble.predict_proba(test)



In [28]:
# Sanity check: display the dimensions of the predicted-probability array.
y_pred.shape

(13899, 2)

In [29]:
# Wrap the probability matrix in a frame with one named column per output column.
label = pd.DataFrame(data=y_pred, columns=['probability0', 'probability'])

In [30]:
# Keep only the 'probability' column by discarding 'probability0'.
label = label.drop(columns=['probability0'])

In [31]:
# Write the predictions to disk (the frame's index is written as the first column).
label.to_csv(path_or_buf='test/ronghe.csv')