In [701]:
import pandas as pd
import numpy as np
import gc
import time
from contextlib import contextmanager
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import category_encoders as ce
import pickle
from sklearn.svm import SVC
import os
from sklearn.linear_model import LogisticRegression
warnings.simplefilter(action='ignore', category=FutureWarning)

In [702]:
# Read the training sample and the test ids; both files are tab-separated.
train = pd.read_csv("../data/sample_train.txt", sep="\t")
test = pd.read_csv("../data/test_id.txt", sep="\t")
# Stack the two frames row-wise into one table; cv() later separates the rows
# again by whether `label` is null.
df = pd.concat((train, test), axis=0)

In [703]:
# dp_knn = pd.read_csv("../features/knn/dp_knn_100.csv")
# df = df.merge(dp_knn, on="id", how="left")

# 加载训练好的模型结果

In [704]:

# Attach the predictions of five previously trained models (sub-directories
# 0..4 of model_dir) to `df` as stacking features.
model_dir = "../model_output//"
for i in range(5):
    pred_train = pd.read_csv(model_dir + str(i) + "/pred_train.csv")
    pred_test = pd.read_csv(model_dir + str(i) + "/all_test_preds.csv")

    # Percentile ranks (rank / number of ranked rows) put both score sets on a
    # (0, 1] scale. Test rows are ranked inside each fold's predictions, and
    # pct=True divides by that fold's own row count; the previous denominator
    # came from a `valid` frame that this notebook never defines.
    pred_train["rank_prob"] = pred_train.prob.rank(pct=True)
    pred_test["rank_prob"] = pred_test.groupby("fold_id").prob.rank(pct=True)
    # Collapse the per-fold test predictions to one averaged row per id.
    pred_test = pred_test.drop("fold_id", axis=1).groupby("id").mean().reset_index()

    # Diagnostic only: largest disagreement between ordering the test rows by
    # mean probability and ordering them by mean per-fold rank.
    a = pred_test.prob.rank()
    b = pred_test.rank_prob.rank()
    diff = a - b
    print(diff.max(), diff.min())

    d = pd.concat([pred_train, pred_test], axis=0)
    d = d.rename(columns={"prob": "42_%d_prob" % i, "rank_prob": "42_%d_rank_prob" % i})

    # Remove columns left behind by an earlier run of this cell so that
    # re-running it does not produce suffixed (_x / _y) duplicates.
    stale = [c for c in d.columns if c != "id" and c in df.columns]
    df = df.drop(stale, axis=1).merge(d, on="id", how="left")

182.0 -146.0
194.0 -174.0
265.0 -138.0
195.0 -206.0
213.0 -159.0


In [705]:
# Attach the predictions of the three best-scoring runs listed in
# params_41.csv to `df` as stacking features.
model_dir = "../model_output/B/"

paras = pd.read_csv(model_dir + "params_41.csv").sort_values("score", ascending=False)

# After the descending sort the first three rows are the highest scores; their
# `iteration` value names the sub-directory holding that run's predictions.
for i in paras.iloc[:3].iteration:
    pred_train = pd.read_csv(model_dir + str(i) + "/pred_train.csv")
    pred_test = pd.read_csv(model_dir + str(i) + "/all_test_preds.csv")

    # Percentile ranks (rank / number of ranked rows) put both score sets on a
    # (0, 1] scale. Test rows are ranked inside each fold's predictions, and
    # pct=True divides by that fold's own row count; the previous denominator
    # came from a `valid` frame that this notebook never defines.
    pred_train["rank_prob"] = pred_train.prob.rank(pct=True)
    pred_test["rank_prob"] = pred_test.groupby("fold_id").prob.rank(pct=True)
    # Collapse the per-fold test predictions to one averaged row per id.
    pred_test = pred_test.drop("fold_id", axis=1).groupby("id").mean().reset_index()

    # Diagnostic only: largest disagreement between ordering the test rows by
    # mean probability and ordering them by mean per-fold rank.
    a = pred_test.prob.rank()
    b = pred_test.rank_prob.rank()
    diff = a - b
    print(diff.max(), diff.min())

    d = pd.concat([pred_train, pred_test], axis=0)
    d = d.rename(columns={"prob": "41_%d_prob" % i, "rank_prob": "41_%d_rank_prob" % i})

    # Remove columns left behind by an earlier run of this cell so that
    # re-running it does not produce suffixed (_x / _y) duplicates.
    stale = [c for c in d.columns if c != "id" and c in df.columns]
    df = df.drop(stale, axis=1).merge(d, on="id", how="left")

198.0 -267.0
210.0 -312.0
238.0 -292.0


# 训练

In [706]:
def display_importances(feature_importance_df_):
    """Plot the top-40 features by mean importance and save the figure.

    Expects a frame with ``feature`` and ``importance`` columns. The figure is
    written to ``lgbm_importances01.png`` in the current working directory.
    """
    # Average each feature's importance, then keep the 40 largest averages.
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    ranked = mean_importance.sort_values(by="importance", ascending=False)
    top_features = ranked.head(40).index

    # Plot every input row that belongs to one of those features.
    is_top = feature_importance_df_.feature.isin(top_features)
    plot_rows = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [707]:
def process_feature(train_x, valid_x, test_df):
    """Return copies of the three frames without the ``id`` and ``label`` columns.

    The result is a list ordered as ``[train, valid, test]``; the inputs are
    left unmodified.
    """
    non_feature_cols = ['id', 'label']
    return [frame.drop(non_feature_cols, axis=1) for frame in (train_x, valid_x, test_df)]

In [708]:
def cv(df, num_folds, param, model_dir, classfier="lr", stratified=True, debug=False):
    """Cross-validate a second-level model on ``df`` and save its predictions.

    Rows with a non-null ``label`` are split into folds for training and
    validation; rows with a null ``label`` are scored by the model of every
    fold. The feature matrix is whatever ``process_feature`` returns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id`` and ``label`` columns.
    num_folds : int
        Number of cross-validation folds.
    param : dict
        Parameters passed to ``lgb.train``; only read when ``classfier == "lgb"``.
    model_dir : str
        Directory for the output files; created (with parents) if missing.
    classfier : {"lr", "lgb"}
        ``"lr"`` fits a LogisticRegression, ``"lgb"`` trains LightGBM.
    stratified : bool
        Use StratifiedKFold when True, plain KFold otherwise.
    debug : bool
        When True, no prediction files are written.

    Returns
    -------
    None
        Results are printed and, unless ``debug``, written to ``model_dir`` as
        ``pred_train.csv`` (out-of-fold predictions), ``all_test_preds.csv``
        (one row per test id and fold) and ``sub_test.txt`` (fold average).

    Raises
    ------
    ValueError
        If ``classfier`` is not ``"lr"`` or ``"lgb"``.
    """
    # Fail early with a clear message instead of a NameError deep in the loop.
    if classfier not in ("lgb", "lr"):
        raise ValueError("classfier must be 'lgb' or 'lr', got %r" % (classfier,))

    # makedirs also creates missing parent directories and accepts an existing one.
    os.makedirs(model_dir, exist_ok=True)

    train_df = df[df.label.notnull()]
    test_df = df[df.label.isnull()]

    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=num_folds, shuffle=True, random_state=1001)

    oof_preds = np.zeros(train_df.shape[0])
    all_test_preds = []
    # Collected per fold and concatenated once after the loop.
    fold_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_df['label'])):
        train_y = train_df['label'].iloc[train_idx]
        valid_y = train_df['label'].iloc[valid_idx]
        train_x, valid_x, test_x = process_feature(
            train_df.iloc[train_idx], train_df.iloc[valid_idx], test_df)
        if n_fold == 0:
            print(train_x.shape, valid_x.shape, test_x.shape)

        if classfier == "lgb":
            train_data = lgb.Dataset(train_x, label=train_y)
            validation_data = lgb.Dataset(valid_x, label=valid_y)

            # Use the `param` argument; the previous code read a notebook-level
            # `params` global and silently ignored what the caller passed in.
            # NOTE(review): early_stopping_rounds / verbose_eval are keyword
            # arguments of older LightGBM releases; newer ones expect callbacks
            # instead - confirm against the installed version.
            clf = lgb.train(param,
                            train_data,
                            num_boost_round=10000,
                            valid_sets=[train_data, validation_data],
                            valid_names=["train", "valid"],
                            early_stopping_rounds=200,
                            verbose_eval=100)

            valid_preds = clf.predict(valid_x, num_iteration=clf.best_iteration)
            test_preds = clf.predict(test_x, num_iteration=clf.best_iteration)

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = train_x.columns.tolist()
            fold_importance_df["importance"] = clf.feature_importance()
            fold_importance_df["fold"] = n_fold + 1
            fold_importances.append(fold_importance_df)
        else:
            clf = LogisticRegression(penalty="l2", solver="sag", n_jobs=32)
            clf.fit(train_x, train_y)

            valid_preds = clf.predict_proba(valid_x)[:, 1]
            test_preds = clf.predict_proba(test_x)[:, 1]

        # Copy first so the new columns land on an independent frame rather than
        # on a slice of `df` (which triggered SettingWithCopyWarning).
        fold_preds = test_df[["id"]].copy()
        fold_preds['prob'] = test_preds
        fold_preds['fold_id'] = n_fold + 1
        all_test_preds.append(fold_preds)

        oof_preds[valid_idx] = valid_preds
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, valid_preds)))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['label'], oof_preds))

    if not debug:
        # Out-of-fold predictions for the training rows.
        oof_df = train_df[['id']].copy()
        oof_df['prob'] = oof_preds
        oof_df.to_csv(os.path.join(model_dir, "pred_train.csv"), index=False)

        # Every fold's prediction for every test row.
        all_test_preds = pd.concat(all_test_preds, axis=0)
        all_test_preds.to_csv(os.path.join(model_dir, "all_test_preds.csv"), index=False)

        # Fold-averaged test prediction; sort=False keeps ids in first-seen order.
        sub = all_test_preds.groupby("id", sort=False).prob.mean().reset_index()
        sub.to_csv(os.path.join(model_dir, "sub_test.txt"), index=False)
    if classfier == "lgb":
        display_importances(pd.concat(fold_importances, axis=0))
    return

In [709]:
# LightGBM hyper-parameters. cv() only consults them in its "lgb" branch, so
# the logistic-regression run below does not use them.
params = {
    'boosting_type': 'goss',
    'colsample_bytree': 0.6188451188326409,
    'learning_rate': 0.02278643912197006,
    'max_bin': 200,
    'metric': 'auc',
    'min_child_weight': 19.91986754624656,
    'num_leaves': 71,
    'reg_alpha': 3.6554523524605216,
    'reg_lambda': 2676.0505164555602,
    'subsample': 1.0,
}
model_dir = "../model_output/stacking/"
# Five-fold logistic-regression stacker over the merged prediction columns.
# cv() ends with a bare `return`, so `result` is None.
result = cv(df, 5, params, model_dir, classfier="lr")

(15166, 16) (3793, 16) (4000, 16)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fold  1 AUC : 0.720431
Fold  2 AUC : 0.721972
Fold  3 AUC : 0.716434
Fold  4 AUC : 0.693365
Fold  5 AUC : 0.709078
Full AUC score 0.711951


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [710]:
# pred_test = pd.read_csv("../model_output/stacking/all_test_preds.csv")
# pred_test["rank_prob"] = pred_test.groupby("fold_id").prob.rank() / valid.shape[0]
# pred_test = pred_test.groupby("id").mean().reset_index().drop("fold_id", axis=1)
# pred_test.head()