In [63]:
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)


from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,roc_curve,auc, f1_score, recall_score, precision_score, classification_report
from sklearn.model_selection import train_test_split,GridSearchCV

In [51]:
path = './data/'

train_df = pd.read_csv(f'{path}dataTrain.csv')
no_label_df = pd.read_csv(f'{path}dataNoLabel.csv')
test_df = pd.read_csv(f'{path}dataA.csv')

print(train_df.shape, no_label_df.shape)

(59872, 48) (39884, 47)


In [52]:
import itertools

train_df['f3'] = train_df['f3'].map({'low': 1, 'mid': 2, 'high': 3})
no_label_df['f3'] = no_label_df['f3'].map({'low': 1, 'mid': 2, 'high': 3})
test_df['f3'] = test_df['f3'].map({'low': 1, 'mid': 2, 'high': 3})

# 暴力Feature 位置
loc_f = ['f1', 'f2', 'f4', 'f5', 'f6']
for df, i in itertools.product(tqdm([train_df, test_df,no_label_df]), range(len(loc_f))):
    for j in range(i + 1, len(loc_f)):
        df[f'{loc_f[i]}+{loc_f[j]}'] = df[loc_f[i]] + df[loc_f[j]]
        df[f'{loc_f[i]}-{loc_f[j]}'] = df[loc_f[i]] - df[loc_f[j]]
        df[f'{loc_f[i]}*{loc_f[j]}'] = df[loc_f[i]] * df[loc_f[j]]
        df[f'{loc_f[i]}/{loc_f[j]}'] = df[loc_f[i]] / (df[loc_f[j]] + 1)

# 暴力Feature 通话
com_f = ['f43', 'f44', 'f45', 'f46']
for df, i in itertools.product(tqdm([train_df, test_df,no_label_df]), range(len(com_f))):
    for j in range(i + 1, len(com_f)):
        df[f'{com_f[i]}+{com_f[j]}'] = df[com_f[i]] + df[com_f[j]]
        df[f'{com_f[i]}-{com_f[j]}'] = df[com_f[i]] - df[com_f[j]]
        df[f'{com_f[i]}*{com_f[j]}'] = df[com_f[i]] * df[com_f[j]]
        df[f'{com_f[i]}/{com_f[j]}'] = df[com_f[i]] / (df[com_f[j]] + 1)


100%|██████████| 3/3 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:00<?, ?it/s]


In [53]:
def model_train(model,train,y, model_name, kfold=5):
    oof_preds = np.zeros((train.shape[0]))
    test_preds = np.zeros(test.shape[0])
    skf = StratifiedKFold(n_splits=kfold)
    print(f"Model = {model_name}")
    for k, (train_index, test_index) in enumerate(skf.split(train, y)):
        x_train, x_test = train.iloc[train_index, :], train.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(x_train,y_train)

        y_pred = model.predict_proba(x_test)[:,1]
        oof_preds[test_index] = y_pred.ravel()
        auc = roc_auc_score(y_test,y_pred)
        print("- KFold = %d, val_auc = %.4f" % (k, auc))
        test_fold_preds = model.predict_proba(test)[:, 1]
        test_preds += test_fold_preds.ravel()
    print("Overall Model = %s, AUC = %.4f" % (model_name, roc_auc_score(y, oof_preds)))
    return test_preds / kfold

In [54]:
def lgb(train,y,test):
    KF = StratifiedKFold(n_splits=5, random_state=2022, shuffle=True)
    
#     feat_imp_df = pd.DataFrame({'feat': feature_columns, 'imp': 0})
    
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': 30,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': 2022,
        'bagging_seed': 2022,
        'feature_fraction_seed': 2022,
        'early_stopping_rounds': 100,

    }

    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros((len(test)))

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(train.values, y.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][feature_columns], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx][feature_columns], label=y.iloc[val_idx])
        
        num_round = 3000
        
        clf = lgb.train(
            params,
            trn_data,
            num_round,
            valid_sets=[trn_data, val_data],
            verbose_eval=100,
            early_stopping_rounds=50,
        )

        oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][feature_columns], num_iteration=clf.best_iteration)
        predictions_lgb[:] += clf.predict(test[feature_columns], num_iteration=clf.best_iteration) / 5
#         feat_imp_df['imp'] += clf.feature_importance() / 5
    return predictions_lgb

In [55]:
feature_columns = [col for col in train_df.columns if col not in ['id','label']]

# 训练集、测试集
train = train_df[feature_columns+['label']]
# y = train_df['label']
train = train[:50000]
# y = y[:50000]

test = test_df
train.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f1+f2,f1-f2,f1*f2,f1/f2,f1+f4,f1-f4,f1*f4,f1/f4,f1+f5,f1-f5,f1*f5,f1/f5,f1+f6,f1-f6,f1*f6,f1/f6,f2+f4,f2-f4,f2*f4,f2/f4,f2+f5,f2-f5,f2*f5,f2/f5,f2+f6,f2-f6,f2*f6,f2/f6,f4+f5,f4-f5,f4*f5,f4/f5,f4+f6,f4-f6,f4*f6,f4/f6,f5+f6,f5-f6,f5*f6,f5/f6,f43+f44,f43-f44,f43*f44,f43/f44,f43+f45,f43-f45,f43*f45,f43/f45,f43+f46,f43-f46,f43*f46,f43/f46,f44+f45,f44-f45,f44*f45,f44/f45,f44+f46,f44-f46,f44*f46,f44/f46,f45+f46,f45-f46,f45*f46,f45/f46,label
0,0,1,2,0,0,0,153,0,0,78,0,0,0,0,0,0,0,9,129,0,24,0,162,174,39,33,0,0,0,0,0,0,0,0,0,0,132,0,0,0,0,0,0,0,624,1539,1,-1,0,0.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,624,-624,0,0.0,1539,-1539,0,0.0,624,-624,0,0.0,1539,-1539,0,0.0,2163,-915,960336,0.405195,0
1,1,1,2,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,24,0,69,66,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,186,366,2,0,1,0.5,1,1,0,1.0,1,1,0,1.0,22,-20,21,0.045455,1,1,0,1.0,1,1,0,1.0,22,-20,21,0.045455,0,0,0,0.0,21,-21,0,0.0,21,-21,0,0.0,0,0,0,0.0,186,-186,0,0.0,366,-366,0,0.0,186,-186,0,0.0,366,-366,0,0.0,552,-180,68076,0.506812,0
2,0,0,3,36,36,120,0,0,0,0,0,0,0,0,0,0,0,0,156,0,0,0,156,162,15,15,0,0,0,0,0,0,0,0,0,0,105,0,0,0,0,0,0,0,24,48,0,0,0,0.0,36,-36,0,0.0,36,-36,0,0.0,120,-120,0,0.0,36,-36,0,0.0,36,-36,0,0.0,120,-120,0,0.0,72,0,1296,0.972973,156,-84,4320,0.297521,156,-84,4320,0.297521,0,0,0,0.0,24,-24,0,0.0,48,-48,0,0.0,24,-24,0,0.0,48,-48,0,0.0,72,-24,1152,0.489796,1
3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,117,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,9,2,0,1,0.5,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,3,-3,0,0.0,3,-3,0,0.0,9,-9,0,0.0,6,0,9,0.75,12,-6,27,0.3,12,-6,27,0.3,0
4,1,1,2,9,51,294,0,0,0,0,0,0,0,0,0,0,0,0,153,0,0,0,129,0,90,183,0,0,0,102,0,0,72,0,111,0,141,0,0,0,0,0,0,0,42,141,2,0,1,0.5,10,-8,9,0.1,52,-50,51,0.019231,295,-293,294,0.00339,10,-8,9,0.1,52,-50,51,0.019231,295,-293,294,0.00339,60,-42,459,0.173077,303,-285,2646,0.030508,345,-243,14994,0.172881,0,0,0,0.0,42,-42,0,0.0,141,-141,0,0.0,42,-42,0,0.0,141,-141,0,0.0,183,-99,5922,0.295775,0


In [56]:
# 训练集分层抽样，1w，用于伪标签训练、筛选
train['idx_cut'] = pd.cut(train.index, 50, labels=list(range(50)))
np.random.seed(seed=2)
train_sample = train.groupby(['idx_cut']).sample(200).iloc[:,:-1]
train_sample

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f1+f2,f1-f2,f1*f2,f1/f2,f1+f4,f1-f4,f1*f4,f1/f4,f1+f5,f1-f5,f1*f5,f1/f5,f1+f6,f1-f6,f1*f6,f1/f6,f2+f4,f2-f4,f2*f4,f2/f4,f2+f5,f2-f5,f2*f5,f2/f5,f2+f6,f2-f6,f2*f6,f2/f6,f4+f5,f4-f5,f4*f5,f4/f5,f4+f6,f4-f6,f4*f6,f4/f6,f5+f6,f5-f6,f5*f6,f5/f6,f43+f44,f43-f44,f43*f44,f43/f44,f43+f45,f43-f45,f43*f45,f43/f45,f43+f46,f43-f46,f43*f46,f43/f46,f44+f45,f44-f45,f44*f45,f44/f45,f44+f46,f44-f46,f44*f46,f44/f46,f45+f46,f45-f46,f45*f46,f45/f46,label
37,0,1,1,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,12,66,1,-1,0,0.0,0,0,0,0.0,0,0,0,0.0,6,-6,0,0.0,1,1,0,1.0,1,1,0,1.000,7,-5,6,0.142857,0,0,0,0.000000,6,-6,0,0.000000,6,-6,0,0.000000,3,-3,0,0.000000,12,-12,0,0.000000,66,-66,0,0.000000,15,-9,36,0.230769,69,-63,198,0.044776,78,-54,792,0.179104,0
726,0,1,2,0,39,96,0,0,0,0,0,0,0,0,0,0,0,0,63,0,0,0,0,21,30,30,0,0,0,0,0,0,0,0,0,0,63,0,0,0,0,0,21,105,396,1431,1,-1,0,0.0,0,0,0,0.0,39,-39,0,0.0,96,-96,0,0.0,1,1,0,1.0,40,-38,39,0.025,97,-95,96,0.010309,39,-39,0,0.000000,96,-96,0,0.000000,135,-57,3744,0.402062,126,-84,2205,0.198113,417,-375,8316,0.052897,1452,-1410,30051,0.014665,501,-291,41580,0.264484,1536,-1326,150255,0.073324,1827,-1035,566676,0.276536,0
846,0,0,1,24,24,42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87,93,0,0,0,0.0,24,-24,0,0.0,24,-24,0,0.0,42,-42,0,0.0,24,-24,0,0.0,24,-24,0,0.000,42,-42,0,0.000000,48,0,576,0.960000,66,-18,1008,0.558140,66,-18,1008,0.558140,0,0,0,0.000000,87,-87,0,0.000000,93,-93,0,0.000000,87,-87,0,0.000000,93,-93,0,0.000000,180,-6,8091,0.925532,1
295,0,1,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,24,0,0,0,33,30,78,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,15,114,330,1,-1,0,0.0,0,0,0,0.0,0,0,0,0.0,3,-3,0,0.0,1,1,0,1.0,1,1,0,1.000,4,-2,3,0.250000,0,0,0,0.000000,3,-3,0,0.000000,3,-3,0,0.000000,24,-6,135,0.562500,123,-105,1026,0.078261,339,-321,2970,0.027190,129,-99,1710,0.130435,345,-315,4950,0.045317,444,-216,37620,0.344411,0
924,0,0,1,0,0,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,72,-72,0,0.0,0,0,0,0.0,0,0,0,0.000,72,-72,0,0.000000,0,0,0,0.000000,72,-72,0,0.000000,72,-72,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49073,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,69,69,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,12,21,189,435,1,-1,0,0.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,1,1,0,1.0,1,1,0,1.000,1,1,0,1.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,33,-9,252,0.545455,201,-177,2268,0.063158,447,-423,5220,0.027523,210,-168,3969,0.110526,456,-414,9135,0.048165,624,-246,82215,0.433486,0
49199,0,0,3,42,72,123,0,66,0,0,0,0,0,0,0,0,0,0,60,0,0,0,45,15,84,81,0,0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,9,9,27,75,0,0,0,0.0,42,-42,0,0.0,72,-72,0,0.0,123,-123,0,0.0,42,-42,0,0.0,72,-72,0,0.000,123,-123,0,0.000000,114,-30,3024,0.575342,165,-81,5166,0.338710,195,-51,8856,0.580645,18,0,81,0.900000,36,-18,243,0.321429,84,-66,675,0.118421,36,-18,243,0.321429,84,-66,675,0.118421,102,-48,2025,0.355263,1
49036,0,0,1,15,87,129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,60,0,0,0,0.0,15,-15,0,0.0,87,-87,0,0.0,129,-129,0,0.0,15,-15,0,0.0,87,-87,0,0.000,129,-129,0,0.000000,102,-72,1305,0.170455,144,-114,1935,0.115385,216,-42,11223,0.669231,0,0,0,0.000000,6,-6,0,0.000000,60,-60,0,0.000000,6,-6,0,0.000000,60,-60,0,0.000000,66,-54,360,0.098361,1
49505,0,0,1,12,12,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,12,-12,0,0.0,12,-12,0,0.0,27,-27,0,0.0,12,-12,0,0.0,12,-12,0,0.000,27,-27,0,0.000000,24,0,144,0.923077,39,-15,324,0.428571,39,-15,324,0.428571,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,0,0,0,0.000000,1


In [57]:
no_label_df = no_label_df.iloc[:,1:]

train_,val_ = train_test_split(train_sample,test_size=0.8,random_state=1024)
val_y = val_.label
val_X = val_.drop(['label'],axis=1)

In [58]:
# pseudo label
# 给无标签数据打标签，每10个样本有1024种标签组合，运行1024次xgb，选取auc提升最大的组合

def pipeline(unlabel_data,model_name):
    this_train = pd.concat([train_,unlabel_data])
    y = this_train.label
    X = this_train.drop(['label'],axis=1)
    
    if model_name == 'xgb': 
        dtrain = xgb.DMatrix(X, label=y)
        dval = xgb.DMatrix(val_X)
        params={
            'booster':'gbtree',
            'objective': 'rank:pairwise',
            # 'scale_pos_weight': float(len(y)-sum(y))/float(sum(y)),
            'eval_metric': 'auc',
            'gamma':0.1,
            'max_depth':8,
            'lambda':600,
            'subsample':0.6,
            'colsample_bytree':0.3,
            'min_child_weight':0.3, 
            'eta': 0.04,
            'seed':1024,
            'nthread':-1
            }
        model = xgb.train(params,dtrain,num_boost_round=256,verbose_eval=False)
        val_y_pred = model.predict(dval)
    
    if model_name == 'lgb':
        dtrain = lgb.Dataset(X, label=y)
#         dval = lgb.Dataset(val_X)
        params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': -1,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
#         'num_boost_round': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': 2022,
        'bagging_seed': 2022,
        'feature_fraction_seed': 2022,
#         'early_stopping_rounds': 100,
    }
#         num_round = 3000
        model = lgb.train(
            params,
            dtrain,
#             num_round ,
#             valid_sets=[trn_data, val_data],
#             verbose_eval=100,
#             early_stopping_rounds=50,
        )
        
        val_y_pred = model.predict(val_X)

#         oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][feature_columns], num_iteration=clf.best_iteration)
#         predictions_lgb[:] += clf.predict(test[feature_columns], num_iteration=clf.best_iteration) / 5
        
        
    fpr,tpr,thresholds = roc_curve(val_y,val_y_pred,pos_label=1)
    return auc(fpr,tpr)

In [59]:
# 10条数据，2个类别，共1024个label组合
labels = []
get_bin = lambda x: format(x, 'b').zfill(10)
for i in range(1024):
    label_str = get_bin(i)
    label = [int(c) for c in label_str]
    labels.append(label)

In [64]:
for i in tqdm(range(1,4000)):
    uid_index = range(i*10,(i+1)*10)
    samples_selected = no_label_df.loc[uid_index]
    best_auc = 0
    best_label = []
    for label in labels:
        samples_selected['label'] = label
        this_auc = pipeline(samples_selected,'lgb')
        print(this_auc)
        if this_auc>best_auc:
            best_auc = this_auc
            best_label = label

    with open('label.csv','a') as f:
        f.writelines(f'{str(i)},' + ','.join([str(i) for i in best_label]) + ',' + str(best_auc) + '\n')

  0%|          | 0/3999 [00:00<?, ?it/s]

0.8924432385993104
0.8915980011723771
0.8921871984717389
0.8923331891561976
0.8919615765048478
0.8930711386803514
0.8920881951899571
0.8932245896454107
0.8926431411457887
0.8920288014642923
0.8920122734393154
0.891920153399856
0.8919981360014925
0.8925178414053156
0.8931586424135829
0.8926874082251532
0.8924545320627608
0.8922206666918916
0.8923385473687837
0.8918072599823708
0.8918068890291917
0.893049829481067
0.8913184673434666
0.8929936918999736
0.8927695125287788
0.8927526135506227
0.8930437293621232
0.8920969744151944
0.8918821100904952
0.892585973139198
0.892045741659468
0.8925120298055109
0.8917558211415451
0.8932430136533027
0.8927030294756924
0.8912393306652732
0.8914020966768283
0.8923563119043573
0.8916592908809569
0.8933124231148015
0.8915236044514712
0.8930538275319966
0.8919335077143011
0.8926712511533552
0.8915158144347115
0.8921135848742111
0.8934507474335605
0.892811924842248
0.8922736717794731
0.8916518306003564
0.8925009424271599
0.8919267893400585
0.892462528164620