In [99]:
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)


from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,roc_curve,auc, f1_score, recall_score, precision_score, classification_report
from sklearn.model_selection import train_test_split,GridSearchCV

In [100]:
# Directory holding the CSV inputs, relative to the notebook.
path = './data/'

# Read the three inputs in one pass: the labelled training rows, the
# unlabelled rows (used further down for self-training) and the rows that
# are predicted at the end of the notebook.
train_df, no_label_df, test_df = map(
    pd.read_csv,
    (f'{path}dataTrain.csv', f'{path}dataNoLabel.csv', f'{path}dataA.csv'),
)

# Quick sanity check: frame sizes and class balance of the target.
print(train_df.shape, no_label_df.shape)
print(train_df['label'].value_counts())

(59872, 48) (39884, 47)
0    44950
1    14922
Name: label, dtype: int64


In [101]:
import itertools

def add_pairwise_features(df, cols):
    """Add sum, difference, product and ratio columns for each pair in ``cols``.

    Pairs ``(a, b)`` are taken in list order (``a`` appears before ``b`` in
    ``cols``).  For every pair four columns named ``'a+b'``, ``'a-b'``,
    ``'a*b'`` and ``'a/b'`` are written into ``df``.  The ratio column holds
    ``a / (b + 1)``, not a plain division (the ``+ 1`` presumably guards
    against zero denominators).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.  It is modified in place.
    cols : list of str
        Names of existing columns of ``df`` to combine.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for left, right in itertools.combinations(cols, 2):
        df[f'{left}+{right}'] = df[left] + df[right]
        df[f'{left}-{right}'] = df[left] - df[right]
        df[f'{left}*{right}'] = df[left] * df[right]
        df[f'{left}/{right}'] = df[left] / (df[right] + 1)
    return df


# Ordinal encoding of the categorical column f3.
F3_LEVELS = {'low': 1, 'mid': 2, 'high': 3}

# Brute-force pairwise combinations; the original notebook labels the first
# group "location" columns and the second group "call" columns.
loc_f = ['f1', 'f2', 'f4', 'f5', 'f6']
com_f = ['f43', 'f44', 'f45', 'f46']

# tqdm wraps the list that is actually iterated, so the bar advances as each
# frame is processed (wrapping it inside itertools.product drained it up front).
for df in tqdm([train_df, test_df, no_label_df]):
    # Encode f3 only while it still holds the raw strings: mapping an already
    # encoded column would turn every value into NaN when the cell is re-run.
    if not pd.api.types.is_numeric_dtype(df['f3']):
        df['f3'] = df['f3'].map(F3_LEVELS)
    add_pairwise_features(df, loc_f)
    add_pairwise_features(df, com_f)


100%|██████████| 3/3 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:00<?, ?it/s]


In [102]:
# Cross-validation helper (original author's note: check the training set for noise).
def model_train(model,train,y, model_name, kfold=5):
    """Cross-validate ``model`` and average its predictions for the global ``test``.

    The same ``model`` object is refit on every fold.  After each fit it scores
    the held-out rows (printing the fold AUC) and the module-level ``test``
    frame.  ``test`` is read from the enclosing namespace, not passed in, so
    it must exist before this function is called.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    train : pandas.DataFrame
        Feature rows; indexed positionally with ``.iloc``.
    y : pandas.Series
        Binary target aligned with ``train``.
    model_name : str
        Label used in the printed log lines.
    kfold : int, default 5
        Number of stratified folds (splits are not shuffled).

    Returns
    -------
    numpy.ndarray
        Positive-class probabilities for ``test``, averaged over the folds.
    """
    oof_scores = np.zeros(train.shape[0])
    test_scores = np.zeros(test.shape[0])
    splitter = StratifiedKFold(n_splits=kfold)
    print(f"Model = {model_name}")

    for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(train, y)):
        model.fit(train.iloc[fit_rows, :], y.iloc[fit_rows])

        # Out-of-fold scores for the rows this fold did not train on.
        held_scores = model.predict_proba(train.iloc[held_rows, :])[:, 1]
        oof_scores[held_rows] = held_scores.ravel()
        fold_auc = roc_auc_score(y.iloc[held_rows], held_scores)
        print("- KFold = %d, val_auc = %.4f" % (fold_no, fold_auc))

        # Accumulate this fold's test scores; the sum is averaged on return.
        test_scores += model.predict_proba(test)[:, 1].ravel()

    print("Overall Model = %s, AUC = %.4f" % (model_name, roc_auc_score(y, oof_scores)))
    return test_scores / kfold

In [103]:
def lgb(train,y,test):
    """Train LightGBM with 5-fold stratified CV and return averaged test scores.

    Only the columns listed in the module-level ``feature_columns`` are used,
    both from ``train`` and from ``test``; that list must exist before the
    function is called.

    Note that this ``def`` rebinds the module-level name ``lgb``, which the
    notebook's import cell had bound to the ``lightgbm`` package.  The library
    is therefore imported again inside the function under a private alias;
    going through ``lgb`` here would resolve to this function and fail with
    ``AttributeError``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows containing at least ``feature_columns``.
    y : pandas.Series
        Binary target aligned with ``train``.
    test : pandas.DataFrame
        Rows to score, containing at least ``feature_columns``.

    Returns
    -------
    numpy.ndarray
        Predicted scores for ``test``, averaged over the folds.
    """
    import lightgbm as lgbm

    n_folds = 5
    KF = StratifiedKFold(n_splits=n_folds, random_state=2022, shuffle=True)

    # The number of boosting rounds and the early-stopping patience are set
    # here only.  The original call additionally passed 3000 rounds and a
    # patience of 50 as arguments to train(); LightGBM lets the entries found
    # in ``params`` win over those arguments, so the duplicates were dropped
    # to leave a single source of truth with the same effective values.
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': 30,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': 2022,
        'bagging_seed': 2022,
        'feature_fraction_seed': 2022,
        'early_stopping_rounds': 100,
    }

    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))
    test_features = test[feature_columns]

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(train.values, y.values)):
        print("fold n°{}".format(fold_))
        trn_features = train.iloc[trn_idx][feature_columns]
        val_features = train.iloc[val_idx][feature_columns]
        trn_data = lgbm.Dataset(trn_features, label=y.iloc[trn_idx])
        val_data = lgbm.Dataset(val_features, label=y.iloc[val_idx])

        # NOTE: ``verbose_eval`` is a LightGBM 3.x keyword; LightGBM 4.0
        # replaced it with the ``lightgbm.log_evaluation`` callback.
        clf = lgbm.train(
            params,
            trn_data,
            valid_sets=[trn_data, val_data],
            verbose_eval=100,
        )

        oof_lgb[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(test_features, num_iteration=clf.best_iteration) / n_folds

    # Report the out-of-fold AUC, in the same format model_train uses.
    print("Overall Model = %s, AUC = %.4f" % ("lgb", roc_auc_score(y, oof_lgb)))
    return predictions_lgb

In [104]:
# Model inputs: every column of the training frame except the row id and the target.
feature_columns = [name for name in train_df.columns if name not in ('id', 'label')]

# Training frame = features followed by the target, restricted to the first
# 50000 rows.  The test frame is used as loaded (no column filtering here).
train = train_df.loc[:, feature_columns + ['label']].iloc[:50000]
test = test_df

train.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f1+f2,f1-f2,f1*f2,f1/f2,f1+f4,f1-f4,f1*f4,f1/f4,f1+f5,f1-f5,f1*f5,f1/f5,f1+f6,f1-f6,f1*f6,f1/f6,f2+f4,f2-f4,f2*f4,f2/f4,f2+f5,f2-f5,f2*f5,f2/f5,f2+f6,f2-f6,f2*f6,f2/f6,f4+f5,f4-f5,f4*f5,f4/f5,f4+f6,f4-f6,f4*f6,f4/f6,f5+f6,f5-f6,f5*f6,f5/f6,f43+f44,f43-f44,f43*f44,f43/f44,f43+f45,f43-f45,f43*f45,f43/f45,f43+f46,f43-f46,f43*f46,f43/f46,f44+f45,f44-f45,f44*f45,f44/f45,f44+f46,f44-f46,f44*f46,f44/f46,f45+f46,f45-f46,f45*f46,f45/f46,label
0,0,1,2,0,0,0,153,0,0,78,0,0,0,0,0,0,0,9,129,0,24,0,162,174,39,33,0,0,0,0,0,0,0,0,0,0,132,0,0,0,0,0,0,0,624,1539,1,-1,0,0.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,624,-624,0,0.0,1539,-1539,0,0.0,624,-624,0,0.0,1539,-1539,0,0.0,2163,-915,960336,0.405195,0
1,1,1,2,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,24,0,69,66,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,186,366,2,0,1,0.5,1,1,0,1.0,1,1,0,1.0,22,-20,21,0.045455,1,1,0,1.0,1,1,0,1.0,22,-20,21,0.045455,0,0,0,0.0,21,-21,0,0.0,21,-21,0,0.0,0,0,0,0.0,186,-186,0,0.0,366,-366,0,0.0,186,-186,0,0.0,366,-366,0,0.0,552,-180,68076,0.506812,0
2,0,0,3,36,36,120,0,0,0,0,0,0,0,0,0,0,0,0,156,0,0,0,156,162,15,15,0,0,0,0,0,0,0,0,0,0,105,0,0,0,0,0,0,0,24,48,0,0,0,0.0,36,-36,0,0.0,36,-36,0,0.0,120,-120,0,0.0,36,-36,0,0.0,36,-36,0,0.0,120,-120,0,0.0,72,0,1296,0.972973,156,-84,4320,0.297521,156,-84,4320,0.297521,0,0,0,0.0,24,-24,0,0.0,48,-48,0,0.0,24,-24,0,0.0,48,-48,0,0.0,72,-24,1152,0.489796,1
3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,117,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,9,2,0,1,0.5,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,1,1,0,1.0,0,0,0,0.0,0,0,0,0.0,0,0,0,0.0,3,-3,0,0.0,3,-3,0,0.0,9,-9,0,0.0,6,0,9,0.75,12,-6,27,0.3,12,-6,27,0.3,0
4,1,1,2,9,51,294,0,0,0,0,0,0,0,0,0,0,0,0,153,0,0,0,129,0,90,183,0,0,0,102,0,0,72,0,111,0,141,0,0,0,0,0,0,0,42,141,2,0,1,0.5,10,-8,9,0.1,52,-50,51,0.019231,295,-293,294,0.00339,10,-8,9,0.1,52,-50,51,0.019231,295,-293,294,0.00339,60,-42,459,0.173077,303,-285,2646,0.030508,345,-243,14994,0.172881,0,0,0,0.0,42,-42,0,0.0,141,-141,0,0.0,42,-42,0,0.0,141,-141,0,0.0,183,-99,5922,0.295775,0


## 半监督-调包


In [105]:
# Keep only the model features of the unlabelled rows and tag them with
# label -1.  Selecting by name (instead of dropping the first column by
# position) makes the cell safe to re-run: a second execution yields the same
# frame rather than stripping one more column.
no_label_df = no_label_df[feature_columns].assign(label=-1)

# Stack labelled and unlabelled rows under a fresh 0..n-1 index; 'label' stays
# the last column, which the self-training cell relies on.
train_01 = pd.concat([train, no_label_df], ignore_index=True)

In [106]:
from sklearn.semi_supervised import SelfTrainingClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.semi_supervised.SelfTrainingClassifier.html?highlight=self


# Base learner wrapped by the self-training meta-estimator (default settings).
gbdt = GradientBoostingClassifier()

# A single self-training iteration with a 0.91 confidence threshold for
# accepting pseudo-labels; progress is printed because verbose=True.
self_training_model = SelfTrainingClassifier(
    gbdt,
    threshold=0.91,
    max_iter=1,
    verbose=True,
)

# Everything but the last column of train_01 is a feature; the last column is
# the target, where -1 marks the rows that came from the unlabelled file.
semi_features = train_01.iloc[:, :-1]
semi_labels = train_01['label']
self_training_model.fit(semi_features, semi_labels)

End of iteration 1, added 18197 new labels.


SelfTrainingClassifier(base_estimator=GradientBoostingClassifier(), max_iter=1,
                       threshold=0.91, verbose=True)

In [109]:
# Hard class predictions for the test rows.  Features are selected by name
# instead of "everything after the first column", so the call keeps working
# if `test` gains extra columns (for example a 'label' column added when a
# submission is written) and the cell is run again.
predictions_gbdt = self_training_model.predict(test[feature_columns])
predictions_gbdt

# To write a submission file (kept disabled); assign() avoids mutating `test`:
# test.assign(label=predictions_gbdt)[['id', 'label']].to_csv('sub_self_train.csv', index=False)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)