## Created by <a href="https://github.com/yunsuxiaozi">yunsuxiaozi</a> 2024/6/8

#### 这个notebook是模型的训练和推理。

In [1]:
#necessary
import polars as pl#和pandas类似,但是处理大型数据集有更好的性能.
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
import json#用于读取和写入json数据格式
#model lgb分类模型,日志评估,早停防止过拟合
from  lightgbm import LGBMClassifier,log_evaluation
#metric
from sklearn.metrics import roc_auc_score#导入roc_auc曲线
#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import StratifiedGroupKFold
import dill#对对象进行序列化和反序列化(例如保存和加载树模型)

In [2]:
#config
class Config:
    """Constants shared by the cells of this notebook."""

    # Name of the label column in the training features.
    TARGET_NAME = 'label'
    # Number of folds used for K-fold cross-validation.
    num_folds = 10
    # Random seed.
    seed = 2024
import random#提供了一些用于生成随机数的函数
# Fix the random seeds so that results can be reproduced.
def seed_everything(seed):
    """Seed Python's built-in ``random`` module and NumPy's global RNG with ``seed``."""
    random.seed(seed)  # Python built-in generator
    np.random.seed(seed)  # NumPy global generator
# Seed the random number generators once, using the configured seed.
seed_everything(Config.seed)

#### 这里还是对数据做了些特征工程

In [3]:
# Read the feature tables with polars, then hand them over to pandas.
train_feats = pl.read_csv("/kaggle/input/2024kdd-data/train_feats.csv").to_pandas()
valid_feats = pl.read_csv("/kaggle/input/2024kdd-data/valid_feats.csv").to_pandas()

# Authors 698 and 754 (author_count=2) and 512 and 151 (author_count>1800) are
# removed from training; per the original author's note, the test data contains
# no such authors.
outlier_authors = [512, 151, 698, 754]
train_feats = train_feats[~train_feats['authorid'].isin(outlier_authors)]

# Feature crosses.
def deal_df(df):
    """Add readability sums and pairwise feature crosses to ``df`` in place.

    Two aggregate columns (``title_readable`` and ``abs_readable``) are the
    sum of the three readability scores of the title / abstract.  Then, for
    every unordered pair of columns inside each feature group, four new
    columns named ``"<a>+<b>"``, ``"<a>-<b>"``, ``"<a>*<b>"`` and
    ``"<a>/<b>"`` are created.  Finally every +/-inf (e.g. produced by a
    division by zero) is replaced by NaN.

    Args:
        df: pandas DataFrame holding the readability and word-count columns
            referenced below.

    Returns:
        The same DataFrame object, modified in place.
    """
    title_scores = ['title_ari', 'title_McAlpine_EFLAW', 'title_CLRI']
    abs_scores = ['abs_ari', 'abs_McAlpine_EFLAW', 'abs_CLRI']
    word_counts = ['title_wordcount', 'abs_wordcount']

    df['title_readable'] = df[title_scores[0]] + df[title_scores[1]] + df[title_scores[2]]
    df['abs_readable'] = df[abs_scores[0]] + df[abs_scores[1]] + df[abs_scores[2]]

    # Column creation order matters for downstream column selection, so the
    # groups and the pairs inside each group are visited in a fixed order.
    for group in (title_scores, abs_scores, word_counts):
        for pos, left in enumerate(group):
            for right in group[pos + 1:]:
                lhs, rhs = df[left], df[right]
                df[f"{left}+{right}"] = lhs + rhs
                df[f"{left}-{right}"] = lhs - rhs
                df[f"{left}*{right}"] = lhs * rhs
                df[f"{left}/{right}"] = lhs / rhs

    # Infinite values are turned into missing values.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    return df
# Add the crossed features to both tables (deal_df modifies and returns its argument).
train_feats=deal_df(train_feats)
valid_feats=deal_df(valid_feats)

# Notebook display of the first rows of the training features.
train_feats.head()

Unnamed: 0,authorid,title_len,title_wordcount,title_ari,title_McAlpine_EFLAW,title_CLRI,title_word_maxlen,title_word_meanlen,title_word_stdlen,title_word_sumlen,...,abs_ari*abs_CLRI,abs_ari/abs_CLRI,abs_McAlpine_EFLAW+abs_CLRI,abs_McAlpine_EFLAW-abs_CLRI,abs_McAlpine_EFLAW*abs_CLRI,abs_McAlpine_EFLAW/abs_CLRI,title_wordcount+abs_wordcount,title_wordcount-abs_wordcount,title_wordcount*abs_wordcount,title_wordcount/abs_wordcount
0,0.0,4.795791,2.944439,18.97,3.610918,21.755556,2.70805,1.905419,1.506033,4.644391,...,950.222,0.461013,-44.301388,46.498612,-49.876998,-0.024199,2.944439,2.944439,0.0,
1,0.0,4.820282,2.772589,21.148235,3.555348,25.002354,2.564949,2.112231,1.519022,4.700481,...,950.222,0.461013,-44.301388,46.498612,-49.876998,-0.024199,2.772589,2.772589,0.0,
2,0.0,4.61512,2.639057,15.712857,3.091042,21.971428,2.70805,2.050171,1.525656,4.488637,...,214.409559,0.83603,21.254241,-10.774599,83.912694,0.327194,7.714231,-2.436117,13.393676,0.519993
3,0.0,4.644391,2.70805,20.222143,3.367296,25.345715,2.484907,2.005334,1.396379,4.510859,...,950.222,0.461013,-44.301388,46.498612,-49.876998,-0.024199,2.70805,2.70805,0.0,
4,0.0,4.89784,2.70805,24.082,3.157,32.38933,3.044522,2.258782,1.72164,4.795791,...,330.97664,0.835311,25.511142,-14.299998,111.582106,0.281608,8.14613,-2.730029,14.726593,0.497979


#### 相关性特征的构建

In [4]:
# Pearson correlation coefficient between two sets of values.
def pearson_corr(x1,x2,eps=1e-15):
    """Return the Pearson correlation of ``x1`` and ``x2``.

    Args:
        x1: array of values.
        x2: array of values, combined element-wise with ``x1``.
        eps: small constant added to the denominator so that a zero
            standard deviation does not cause a division by zero.

    Returns:
        mean of the products of the centred values, divided by
        ``std(x1) * std(x2) + eps``.
    """
    centred_1 = x1 - np.mean(x1)
    centred_2 = x2 - np.mean(x2)
    covariance = np.mean(centred_1 * centred_2)
    scale = np.std(x1) * np.std(x2) + eps
    return covariance / scale

def corr_feats(margin=0.1):
    """Add an aggregated, correlation-weighted feature ``corr_<margin>``.

    Works on the module-level ``train_feats`` and ``valid_feats`` frames.
    For every column of ``valid_feats`` (except ``authorid`` and columns whose
    name contains ``'corr'``) the Pearson correlation with ``label`` is
    computed on the training rows where both are present.  Columns whose
    absolute correlation exceeds ``margin`` are min-max scaled with the
    training min/max, multiplied by their correlation and summed into the
    new column of both frames.  The correlation of the new column with the
    label is printed at the end.

    Args:
        margin: absolute-correlation threshold a column must exceed to be
            included in the sum.
    """
    feat_name = f'corr_{margin}'
    train_feats[feat_name] = 0
    valid_feats[feat_name] = 0

    # Previously built corr_* columns (and the one just created) are not inputs.
    corr_cols = [name for name in valid_feats.columns if 'corr' in name]
    candidates = valid_feats.drop(['authorid'] + corr_cols, axis=1).columns

    for col in candidates:
        paired = train_feats[[col, 'label']].dropna().copy()
        pearson = pearson_corr(paired[col].values, paired['label'].values)
        hi, lo = float(paired[col].max()), float(paired[col].min())
        if abs(pearson) > margin:
            span = hi - lo
            train_feats[feat_name] += pearson * (train_feats[col] - lo) / span
            valid_feats[feat_name] += pearson * (valid_feats[col] - lo) / span

    scored = train_feats[[feat_name, 'label']].dropna().copy()
    score = pearson_corr(scored[feat_name].values, scored['label'].values)
    print(f"margin:{margin},len(tmp_df):{len(scored)},{score}")
# Build one aggregated correlation feature per threshold.
for margin in [0.1,0.05,0.025,0.01,0.005]:
    corr_feats(margin)
# Notebook display of the first rows, now including the corr_* columns.
train_feats.head()

margin:0.1,len(tmp_df):143896,0.2389044131464588
margin:0.05,len(tmp_df):72345,0.24274503270248282
margin:0.025,len(tmp_df):57259,0.2494189186709124
margin:0.01,len(tmp_df):57158,0.24808946098186307
margin:0.005,len(tmp_df):57150,0.2485095029879458


Unnamed: 0,authorid,title_len,title_wordcount,title_ari,title_McAlpine_EFLAW,title_CLRI,title_word_maxlen,title_word_meanlen,title_word_stdlen,title_word_sumlen,...,abs_McAlpine_EFLAW/abs_CLRI,title_wordcount+abs_wordcount,title_wordcount-abs_wordcount,title_wordcount*abs_wordcount,title_wordcount/abs_wordcount,corr_0.1,corr_0.05,corr_0.025,corr_0.01,corr_0.005
0,0.0,4.795791,2.944439,18.97,3.610918,21.755556,2.70805,1.905419,1.506033,4.644391,...,-0.024199,2.944439,2.944439,0.0,,4.998334,,,,
1,0.0,4.820282,2.772589,21.148235,3.555348,25.002354,2.564949,2.112231,1.519022,4.700481,...,-0.024199,2.772589,2.772589,0.0,,4.998334,,,,
2,0.0,4.61512,2.639057,15.712857,3.091042,21.971428,2.70805,2.050171,1.525656,4.488637,...,0.327194,7.714231,-2.436117,13.393676,0.519993,4.998334,11.488661,12.982249,12.887514,12.597896
3,0.0,4.644391,2.70805,20.222143,3.367296,25.345715,2.484907,2.005334,1.396379,4.510859,...,-0.024199,2.70805,2.70805,0.0,,4.998334,,,,
4,0.0,4.89784,2.70805,24.082,3.157,32.38933,3.044522,2.258782,1.72164,4.795791,...,0.281608,8.14613,-2.730029,14.726593,0.497979,4.998334,11.493188,13.052029,13.000854,12.714798


#### 模型的训练代码

In [5]:
# Custom evaluation metric, inspired by
# https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/500868
def weight_rocauc(authorid,y_true,y_pro):
    """Per-author ROC-AUC, weighted by each author's number of label-0 samples.

    For every distinct author the ROC-AUC of that author's samples is
    computed and multiplied by the number of samples with ``y_true == 0``;
    the sum is divided by the total number of ``y_true == 0`` samples.
    The number of authors with an AUC below 0.5 is printed as ``cnt``.

    Args:
        authorid: array-like of author ids, one per sample.
        y_true: array-like of 0/1 labels, one per sample.
        y_pro: array-like of predicted scores, one per sample.

    Returns:
        The weighted AUC (higher is better), or NaN when ``y_true`` contains
        no 0 at all, in which case every weight is zero and the metric is
        undefined.

    Raises:
        Whatever ``roc_auc_score`` raises for an author whose samples are
        all labelled 0 (AUC undefined, but the weight is non-zero).
    """
    authorid = np.asarray(authorid)
    y_true = np.asarray(y_true)
    y_pro = np.asarray(y_pro)

    total_negatives = np.sum(y_true == 0)
    total_weight_auc = 0
    cnt = 0
    # Iterate over the ids that actually occur (ids need not be contiguous)
    # instead of scanning every integer up to the maximum id.
    for author in np.unique(authorid):
        cur_idx = np.where(authorid == author)[0]
        n_negatives = np.sum(y_true[cur_idx] == 0)
        if n_negatives == 0:
            # Zero weight: the author cannot contribute, and the AUC of a
            # single-class group is undefined anyway.
            continue
        auc = roc_auc_score(y_true[cur_idx], y_pro[cur_idx])
        cnt += int(auc < 0.5)
        total_weight_auc += n_negatives * auc
    print(f"cnt:{cnt}")
    if total_negatives == 0:
        return float('nan')
    return total_weight_auc / total_negatives
# Persist a trained tree model: obj is the model, path is the destination file.
def pickle_dump(obj, path):
    """Serialize ``obj`` with dill (protocol 4) into the file at ``path``."""
    # Binary write mode, the file is closed when the block exits.
    with open(path, "wb") as sink:
        dill.dump(obj, sink, protocol=4)

# Feature columns fed to the models: every column of valid_feats except 'authorid'.
choose_cols = list(valid_feats.drop(['authorid'], axis=1).columns)

def fit_and_predict(train_feats=train_feats,test_feats=valid_feats,model=None,fold=10,seed=2024,name='lgb'):
    """Cross-validate ``model`` with author-grouped stratified folds and predict the test set.

    For each fold the model is fitted on the training part, its positive-class
    probabilities on the held-out part are stored as out-of-fold predictions,
    the full test set is predicted, and the fitted model is saved to
    ``/kaggle/working/{name}_fold{i}.model``.  After the last fold the
    weighted per-author ROC-AUC of the out-of-fold predictions is printed and
    the out-of-fold predictions are saved to ``{name}_oof_pred_pro.npy``.

    Args:
        train_feats: training DataFrame with the ``choose_cols`` features, the
            ``authorid`` column and the ``Config.TARGET_NAME`` label column.
        test_feats: DataFrame with the ``choose_cols`` features to predict.
        model: estimator exposing ``fit`` (accepting ``eval_set`` and
            ``callbacks``) and ``predict_proba``; required.
        fold: number of cross-validation folds.
        seed: random state of the fold splitter, making the split reproducible.
        name: prefix used in log lines and in the saved file names.

    Returns:
        ``(oof_pred_pro, test_preds)``: the out-of-fold positive-class
        probabilities for the training rows, and the positive-class
        probabilities for the test rows averaged over the folds.

    Raises:
        ValueError: if ``model`` is None.
    """
    if model is None:
        raise ValueError("fit_and_predict requires a model instance, got None")

    X=train_feats[choose_cols].copy()
    y=train_feats[Config.TARGET_NAME].copy()
    authorid=train_feats['authorid'].values
    test_X=test_feats[choose_cols].copy()
    oof_pred_pro=np.zeros((len(X)))
    test_pred_pro=np.zeros((fold,len(test_X),2))

    # Grouping by author keeps all samples of one author in the same fold.
    # random_state=seed makes the shuffled split depend on the `seed` argument
    # (it was previously accepted but never used).
    gkf = StratifiedGroupKFold(n_splits=fold,shuffle=True,random_state=seed)
    # A separate loop name avoids overwriting the `fold` parameter.
    for fold_idx, (train_index, valid_index) in enumerate(gkf.split(X,y,authorid)):
        print(f"name {name},fold:{fold_idx}")

        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # A custom per-author eval_metric based on weight_rocauc was tried
        # here before; the model's own metric is used instead.
        model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],
                      callbacks=[log_evaluation(100)],
                     )

        oof_pred_pro[valid_index]=model.predict_proba(X_valid)[:,1]
        # Predict the whole test set with this fold's model.
        test_pred_pro[fold_idx]=model.predict_proba(test_X)
        # Save the model trained on this fold.
        pickle_dump(model, f'/kaggle/working/{name}_fold{fold_idx}.model')
    print(f"weight_roc_auc:{weight_rocauc(authorid,y.values,oof_pred_pro)}")

    # Keep the out-of-fold predictions, e.g. to study whether the model does
    # better on authors with many papers or on authors with few papers.
    np.save(f'{name}_oof_pred_pro.npy', oof_pred_pro)

    # Average the folds and keep the positive-class probability.
    test_preds=test_pred_pro.mean(axis=0)[:,1]

    return oof_pred_pro,test_preds

# First LightGBM parameter set.
# NOTE(review): 'subsample' is set without 'subsample_freq'; per the LightGBM
# docs bagging is only enabled when the frequency is non-zero, so 'subsample'
# may have no effect here -- confirm.
lgb_params1 = {"boosting_type": "gbdt","objective": "binary","metric": "auc",       
                         'random_state': 2024, 'n_estimators': 1024,
                         'reg_alpha': 0.1, 'reg_lambda': 10, 
                         'colsample_bytree': 0.8, 'subsample': 0.8,
                         'learning_rate': 0.05, 'num_leaves': 64, 'min_child_samples': 62,
                         'max_bin':245, "extra_trees": True,
                         'device':'gpu','gpu_use_dp':True,# GPU-specific parameters
                    }

# Previously used parameter set: weight_roc_auc:0.6982612255710695
lgb_params2 = {"boosting_type": "gbdt","objective": "binary","metric": "auc",       
     'random_state': 2024, 'n_estimators': 1536,
     'reg_alpha': 2.6756579164398144, 'reg_lambda': 1.6187614490530422, 
     'colsample_bytree': 0.7, 'subsample': 0.7,
     'learning_rate': 0.030479973315991688, 'num_leaves': 50, 'min_child_samples': 62,
     'max_bin':245,
     'device':'gpu','gpu_use_dp':True,# GPU-specific parameters
}

# Train one cross-validated model per parameter set.
lgb_oof_pred_pro1,test_preds1=fit_and_predict(model=LGBMClassifier(**lgb_params1),fold=Config.num_folds,seed=2024,name='lgb1')

lgb_oof_pred_pro2,test_preds2=fit_and_predict(model=LGBMClassifier(**lgb_params2),fold=Config.num_folds,seed=42,name='lgb2')

# Ensemble: plain average of the two models' test predictions.
test_preds=(test_preds1+test_preds2)/2

print(test_preds[:10])

name lgb1,fold:0
[LightGBM] [Info] Number of positive: 113933, number of negative: 15568
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 293077
[LightGBM] [Info] Number of data points in the train set: 129501, number of used features: 1283
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 1261 dense feature groups (156.11 MB) transferred to GPU in 0.093319 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.879785 -> initscore=1.990393
[LightGBM] [Info] Start training from score 1.990393
[100]	valid_0's auc: 0.751934
[200]	valid_0's auc: 0.763989
[300]	valid_0's auc: 0.767503
[400]	valid_0's auc: 0.772023
[500]	valid_0's auc: 0.775528
[600]	valid_0's auc: 0.776258
[700]	valid_0's auc: 0.778621
[800]	valid_0's auc: 0.778396
[900]	valid_0's auc: 0.780198
[1000]	valid_0's auc: 0.781493
name lgb1,fold:1
[LightGBM] [Info] Number of positive: 115490, number of negative: 15280
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 293319
[LightGBM] [Info] Number of data points in the train set: 130770, number of used features: 1283
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[Li

#### 保存为json的提交文件。

In [6]:
# Fill the submission template with the ensembled predictions and save it as JSON.
path='/kaggle/input/'
with open(path+"2024kddcupwhoiswho/2024kddcupwhoiswho/ind_test_author_submit.json", encoding='utf-8') as f:
    submission=json.load(f)

# Predictions are matched to template entries purely by position, so a length
# mismatch would otherwise produce a silently misaligned submission.
n_slots=sum(len(names) for names in submission.values())
if n_slots!=len(test_preds):
    raise ValueError(
        f"submission template has {n_slots} entries but there are "
        f"{len(test_preds)} predictions"
    )

cnt=0
for author_id,names in submission.items():
    for name in names:
        # Only existing keys are overwritten, so iterating `names` stays valid.
        # float() stores a plain Python float in the JSON document.
        names[name]=float(test_preds[cnt])
        cnt+=1
with open('2024kddmodel.json', 'w', encoding='utf-8') as f:
    json.dump(submission, f, ensure_ascii=False, indent=4)

#### 这里是用optuna找参数的代码.

In [7]:
# #这里是找参数的.
# import optuna#自动超参数优化软件框架
# choose_cols=[ col for col in valid_feats.drop(['authorid'],axis=1).columns]
# print(f"choose_cols:{choose_cols}")


# def weight_rocauc(authorid,y_true,y_pro):
#     max_authorid=np.max(authorid).astype(int)+1
#     total_weight_auc=0
#     for id in range(max_authorid):
#         cur_idx=np.where(authorid==id)[0]
#         auc=roc_auc_score(y_true[cur_idx],y_pro[cur_idx])
#         weight_auc=np.sum(y_true[cur_idx]==0)*auc
#         total_weight_auc+=weight_auc
#     total_weight_auc/=np.sum(y_true==0)
#     return total_weight_auc


# def objective(trial):
#     param = {
#         "boosting_type": "gbdt",
#         "objective": "binary",
#         "metric": "auc",
#         'random_state': trial.suggest_int('random_state',2024,2024),
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 2000),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),#对数分布的建议值
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),#浮点数
#         'subsample': trial.suggest_float('subsample', 0.3, 0.8),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.5, log=True),
#         'num_leaves' : trial.suggest_int('num_leaves', 8, 64),#整数
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#         'max_bin':225,
#         'device':'gpu','gpu_use_dp':True,#GPU环境的参数
#     }
#     model = LGBMClassifier(**param)  
    
#     X=train_feats[choose_cols].copy()
#     y=train_feats[Config.TARGET_NAME].copy()
#     oof_pred_pro=np.zeros((len(X),2))
    
#     #10折交叉验证
#     gkf = GroupKFold(n_splits=Config.num_folds)

#     for fold, (train_index, valid_index) in (enumerate(gkf.split(X,y,train_feats['authorid'].values))):
#         print(f"fold:{fold}")

#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
     
#         model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],
#                       callbacks=[log_evaluation(500)]#,early_stopping(100)
#                      )
        
#         oof_pred_pro[valid_index]=model.predict_proba(X_valid)
        
#     weight_roc_auc=weight_rocauc(train_feats['authorid'].values,y.values,oof_pred_pro[:,1])
    
    
#     return weight_roc_auc
# lgb_params = {"boosting_type": "gbdt","objective": "binary","metric": "auc",       
#      'random_state': 2024, 'n_estimators': 1536,
#      'reg_alpha': 2.6756579164398144, 'reg_lambda': 1.6187614490530422, 
#      'colsample_bytree': 0.6631684151930536, 'subsample': 0.6162557596563916, 
#      'learning_rate': 0.030479973315991688, 'num_leaves': 50, 'min_child_samples': 62,
#      'max_bin':225,
#      'device':'gpu','gpu_use_dp':True,#GPU环境的参数
# }
# # Create a named study; the direction is 'maximize' because a higher weighted ROC-AUC is better.
# study = optuna.create_study(direction='maximize', study_name='Optimize boosting hyperparameters')
# #目标函数,尝试的次数
# study.optimize(objective, n_trials=20)
# lgbm_params=study.best_trial.params

# #输出最佳的参数
# print('lgbm_params=', lgbm_params)