In [1]:
import pandas as pd
import numpy as np
import gc
import time
from contextlib import contextmanager
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import category_encoders as ce
import pickle
from sklearn.svm import SVC
import os
from sklearn.linear_model import LogisticRegression
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
train = pd.read_csv("../data/sample_train.txt", delimiter="\t")
test = pd.read_csv("../data/test_id.txt", delimiter="\t")
df = pd.concat([train, test], axis=0)

In [12]:
test.shape

(6000, 1)

# 加载训练好的模型结果

In [3]:
cook_book = {
    "lgb_628":5,
    "lgb_572":5,
    "lgb_692":5,
    "xgb_652":5,
    "xgb_572":5,
    "lgb_585":5,
}

In [4]:
for name, count in cook_book.items():
    model_dir = "../model_output/random/%s/" % name
    for i in range(count):
        pred_train = pd.read_csv(model_dir + str(i) + "/pred_train.csv")
        pred_test = pd.read_csv(model_dir + str(i) + "/all_test_preds.csv")

        pred_train['rank_prob'] = pred_train.prob.rank() / train.shape[0]
        pred_test["rank_prob"] = pred_test.groupby("fold_id").prob.rank() / test.shape[0]
        pred_test = pred_test.groupby("id").mean().reset_index().drop("fold_id", axis=1)

        a = pred_test.prob.rank()
        b = pred_test.rank_prob.rank()

        d = pd.concat([pred_train, pred_test], axis=0)
        d = d.rename(columns={"prob": "%s_%d_prob" % (name, i), "rank_prob": "%s_%d_rank_prob" % (name, i)})

        df = df.merge(d, on="id", how="left")

# 训练

In [5]:
def  process_feature(train_x, valid_x, test_df):
    result = []
    drop_cols = ['id','label']
    for df in [train_x, valid_x, test_df]:
        result.append(df.drop(drop_cols, axis=1))
    return result 

In [6]:
def cv(df, num_folds, param, model_dir, classfier="lr", stratified=True, debug=False):
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    train_df = df[df.label.notnull()]
    test_df = df[df.label.isnull()]
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=178)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=178)

    oof_preds = np.zeros(train_df.shape[0])
    all_test_preds = []    
    feature_importance_df = pd.DataFrame()
#     feats = [f for f in train_df.columns if f not in ['id','label', "prob", "tag", "loan_dt"]]
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_df['label'])):
        train_x, train_y = train_df.iloc[train_idx], train_df['label'].iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_df['label'].iloc[valid_idx]
        fold_preds = test_df[["id"]]
        
        train_x, valid_x, test = process_feature(train_x, valid_x, test_df)
        if n_fold == 0:
            print(train_x.shape, valid_x.shape, test.shape)
        
        if classfier == "lgb":
            train_data = lgb.Dataset(train_x, label=train_y)
            validation_data = lgb.Dataset(valid_x, label=valid_y)

            clf=lgb.train(params,
                          train_data,
                          num_boost_round=10000,
                          valid_sets=[train_data, validation_data],
                          valid_names=["train", "valid"],
                          early_stopping_rounds=200,
                          verbose_eval=100)

            valid_preds = clf.predict(valid_x, num_iteration=clf.best_iteration)
            test_preds = clf.predict(test, num_iteration=clf.best_iteration)
        
        if classfier == "lr":
            clf = LogisticRegression(penalty="l2", solver="sag", n_jobs=32)
            clf.fit(train_x, train_y)

            valid_preds = clf.predict_proba(valid_x)[:, 1]
            test_preds = clf.predict_proba(test)[:, 1]

        fold_preds['prob'] = test_preds
        fold_preds['fold_id'] = n_fold + 1
        all_test_preds.append(fold_preds)

        oof_preds[valid_idx] = valid_preds
        
        if classfier == "lgb":
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = train_x.columns.tolist()
            fold_importance_df["importance"] = clf.feature_importance()
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, valid_preds)))
        
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['label'], oof_preds))
    
    if not debug:
        train_df["prob"] = oof_preds
        train_df[['id', 'prob']].to_csv(model_dir + "pred_train.csv", index= False)

        all_test_preds = pd.concat(all_test_preds, axis=0)
        all_test_preds.to_csv(model_dir + "all_test_preds.csv", index=False)
        
        sub = pd.DataFrame()
        sub['id'] = all_test_preds.id.unique()
        sub.set_index("id", inplace=True)
        sub["prob"] = all_test_preds.groupby("id").prob.mean()
        sub.reset_index().to_csv(model_dir + "sub_test_random.txt", index=False)
    return

# 特征选择

In [7]:
X = df[df.label.notna()].drop(['id', 'label'], axis=1)
y = df[df.label.notna()].label
feature_name = X.columns.tolist()

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier


feature_score = pd.DataFrame(index=X.columns.tolist())
# pearson_cor
def pearson_cor(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    return np.abs(cor_list)
feature_score["pearson_cor"] = pearson_cor(X, y)
feature_score["pearson_cor_rank"] = feature_score.pearson_cor.rank(ascending=False)

# chi2
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2)
chi_selector.fit(X_norm, y)
chi_score = chi_selector.scores_
chi_feature = X.iloc[:,np.argsort(chi_score)[::-1]].columns.tolist()
feature_score["chi_2"] = chi_score
feature_score["chi_2"] = feature_score["chi_2"].fillna(0)
feature_score["chi_2_rank"] = feature_score.chi_2.rank(ascending=False)

# lgb
lgb = LGBMClassifier(
            nthread=20,
            #is_unbalance=True,
            n_estimators=100,
            learning_rate=0.02,
            num_leaves=8,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
            )
rfe_selector = RFE(lgb, step=0.1, verbose=1)
rfe_selector.fit(X, y)
feature_score["ref_rank"] = rfe_selector.ranking_
rank_count = feature_score.ref_rank.value_counts().sort_index().tolist()
feature_score["ref_rank"] = feature_score["ref_rank"].apply(lambda x: sum(rank_count[:x]))

# lr
lr = LogisticRegression(penalty="l2", solver="sag", n_jobs=20)
lr.fit(X_norm, y)
feature_score["lr"] = np.abs(lr.coef_)[0]
feature_score["lr_rank"] = feature_score.lr.rank(ascending=False)

# rf
rf = RandomForestClassifier(n_jobs=20)
rf.fit(X, y)
feature_score["rf"] = rf.feature_importances_
feature_score["rf_rank"] = feature_score.rf.rank(ascending=False)

# total rank
feature_score["rank_"] = feature_score.chi_2_rank + feature_score.pearson_cor_rank + feature_score.ref_rank + feature_score.rf_rank + feature_score.lr_rank

# feature_score.sort_values("rank_")

Fitting estimator with 60 features.
Fitting estimator with 54 features.
Fitting estimator with 48 features.
Fitting estimator with 42 features.
Fitting estimator with 36 features.


In [8]:
fs = feature_score.sort_values("rank_").head(9).index.tolist()

In [9]:
fs

['lgb_628_3_rank_prob',
 'xgb_652_1_prob',
 'xgb_572_3_prob',
 'lgb_628_4_rank_prob',
 'lgb_628_1_rank_prob',
 'lgb_572_3_prob',
 'xgb_652_3_prob',
 'lgb_572_1_prob',
 'lgb_628_2_rank_prob']

In [10]:
params = {'boosting_type': 'goss', 
          'colsample_bytree': 0.6188451188326409, 
          'learning_rate': 0.02278643912197006,
          'max_bin': 200, 
          'metric': 'auc', 
          'min_child_weight': 19.91986754624656,
          'num_leaves': 71, 
          'reg_alpha': 3.6554523524605216, 
          'reg_lambda': 2676.0505164555602, 
          'subsample': 1.0}
model_dir = "../model_output/random/stacking/"
result = cv(df[["id", "label"] + fs], 5, params, model_dir, "lr")

(15166, 9) (3793, 9) (6000, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold  1 AUC : 0.710556
Fold  2 AUC : 0.716254
Fold  3 AUC : 0.728293
Fold  4 AUC : 0.702893
Fold  5 AUC : 0.720742
Full AUC score 0.715684


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
