In [8]:
# Cell 1: 导入依赖和初始设置
import gc
import os
import random
import warnings
import joblib
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from utils import Logger, evaluate, gen_sub

# Fix the seed first: it is reused below as the LightGBM `random_state`.
seed = 2020
random.seed(seed)

# Silence all warnings to keep cell output readable.
# NOTE(review): this is a blanket filter, so pandas copy warnings and library
# deprecation warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [9]:
# Cell 2: 参数配置和日志记录
# Run mode: 'valid' trains and evaluates offline, anything else runs online inference.
mode = 'valid'  # 'valid' or 'online'

# The log directory must exist before the logger opens its file.
log_path = '../user_data/log/notebook.log'
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# NOTE(review): re-running this cell builds a new Logger each time; if Logger
# attaches handlers without de-duplicating them, every message is emitted
# once per run of the cell — confirm against utils.Logger.
log = Logger(log_path).logger
log.info(f'lightgbm 排序，mode: {mode}')

2025-04-19 19:35:56,518 - /tmp/ipykernel_978832/3530196446.py[line:5] - INFO: lightgbm 排序，mode: valid
2025-04-19 19:35:56,518 - /tmp/ipykernel_978832/3530196446.py[line:5] - INFO: lightgbm 排序，mode: valid


In [10]:
# Cell 3: 定义模型训练函数
def train_model(df_feature, df_query, n_splits=5, model_dir='../user_data/model'):
    """Train a LightGBM classifier with user-grouped K-fold CV and score the test rows.

    Rows of ``df_feature`` whose ``label`` is non-null form the training set;
    rows whose ``label`` is null form the test set. For every fold the model is
    refit, out-of-fold predictions are collected, the test rows are scored and
    the fitted model is dumped to ``{model_dir}/lgb{fold_id}.pkl``.

    Args:
        df_feature: DataFrame with at least ``user_id``, ``article_id`` and
            ``label`` columns. Every column other than ``label``,
            ``created_at_datetime`` and ``click_datetime`` is used as a feature
            (this includes ``user_id`` and ``article_id``).
        df_query: Not used by this function; kept so existing callers keep working.
        n_splits: Number of GroupKFold splits. The test prediction is the mean
            over exactly this many fold models.
        model_dir: Directory the per-fold models are written to. Created if missing.

    Returns:
        A tuple ``(prediction, oof, df_importance_list)`` where
        ``prediction`` is a DataFrame (``user_id``, ``article_id``, ``pred``)
        holding the fold-averaged positive-class probability for the test rows,
        ``oof`` is a list with one DataFrame per fold (``user_id``,
        ``article_id``, ``label``, ``pred``) for the held-out rows, and
        ``df_importance_list`` is a list with one DataFrame per fold
        (``feature_name``, ``importance``).
    """
    ycol = 'label'
    df_train = df_feature[df_feature[ycol].notnull()]
    df_test = df_feature[df_feature[ycol].isnull()]

    # Drop this function's reference to the full frame. The caller's own
    # reference (if any) is unaffected.
    del df_feature
    gc.collect()

    non_feature_cols = {ycol, 'created_at_datetime', 'click_datetime'}
    feature_names = sorted(col for col in df_train.columns if col not in non_feature_cols)

    model = lgb.LGBMClassifier(
        num_leaves=64,
        max_depth=10,
        learning_rate=0.05,
        n_estimators=10000,
        # NOTE(review): LightGBM only applies row subsampling when
        # subsample_freq (bagging_freq) is > 0; it is left at its default
        # here, so this setting presumably has no effect — confirm.
        subsample=0.8,
        feature_fraction=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
        random_state=seed,
        importance_type='gain',
        metric=None,
        device_type='gpu',
        gpu_device_id=0,
        gpu_use_dp=True,
        boost_from_average=True
    )

    oof = []
    df_importance_list = []

    # Work on an explicit copy so adding `pred` never writes into a slice of df_test.
    prediction = df_test[['user_id', 'article_id']].copy()
    prediction['pred'] = 0.0

    # The test feature matrix is the same for every fold; slice it once.
    X_test = df_test[feature_names]
    has_test_rows = len(X_test) > 0

    # joblib.dump does not create missing parent directories.
    os.makedirs(model_dir, exist_ok=True)

    # Group by user so that one user's rows never sit on both sides of a split.
    kfold = GroupKFold(n_splits=n_splits)
    splits = kfold.split(df_train[feature_names], df_train[ycol], df_train['user_id'])
    for fold_id, (trn_idx, val_idx) in enumerate(splits):
        fold_train = df_train.iloc[trn_idx]
        fold_valid = df_train.iloc[val_idx]
        X_train, Y_train = fold_train[feature_names], fold_train[ycol]
        X_val, Y_val = fold_valid[feature_names], fold_valid[ycol]

        callbacks = [
            lgb.log_evaluation(period=100),
            lgb.early_stopping(stopping_rounds=100)
        ]

        # fit() returns the estimator itself, so `lgb_model` and `model` are
        # the same object, refit from scratch for this fold.
        lgb_model = model.fit(
            X_train, Y_train,
            eval_set=[(X_train, Y_train), (X_val, Y_val)],
            eval_names=['train', 'valid'],
            eval_metric='auc',
            callbacks=callbacks
        )

        pred_val = lgb_model.predict_proba(X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
        df_oof = fold_valid[['user_id', 'article_id', ycol]].copy()
        df_oof['pred'] = pred_val
        oof.append(df_oof)

        # Skip scoring when there are no unlabeled rows to score.
        if has_test_rows:
            pred_test = lgb_model.predict_proba(X_test, num_iteration=lgb_model.best_iteration_)[:, 1]
            # Divide by the real number of folds so the result stays a mean.
            prediction['pred'] += pred_test / n_splits

        df_importance = pd.DataFrame({
            'feature_name': feature_names,
            'importance': lgb_model.feature_importances_,
        })
        df_importance_list.append(df_importance)

        joblib.dump(model, f'{model_dir}/lgb{fold_id}.pkl')

    return prediction, oof, df_importance_list

In [11]:
# Cell 4: 定义在线预测函数
def online_predict(df_test, n_folds=5, model_dir='../user_data/model'):
    """Score ``df_test`` with the saved per-fold models and average the results.

    Loads ``{model_dir}/lgb{fold_id}.pkl`` for ``fold_id`` in ``range(n_folds)``
    and averages the positive-class probability over those models.

    Args:
        df_test: DataFrame with ``user_id`` and ``article_id`` columns. Every
            column other than ``label``, ``created_at_datetime`` and
            ``click_datetime`` is passed to the models as a feature.
        n_folds: Number of saved fold models to load and average over.
        model_dir: Directory that holds the ``lgb{fold_id}.pkl`` files.

    Returns:
        A new DataFrame (``user_id``, ``article_id``, ``pred``); ``df_test``
        itself is not modified.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be >= 1, got {n_folds}')

    ycol = 'label'
    non_feature_cols = {ycol, 'created_at_datetime', 'click_datetime'}
    feature_names = sorted(col for col in df_test.columns if col not in non_feature_cols)

    # Work on an explicit copy so adding `pred` never writes into a slice of df_test.
    prediction = df_test[['user_id', 'article_id']].copy()
    prediction['pred'] = 0.0

    # The feature matrix is identical for every fold; slice it once.
    X_test = df_test[feature_names]

    for fold_id in tqdm(range(n_folds)):
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load model files produced by a trusted training run.
        model = joblib.load(f'{model_dir}/lgb{fold_id}.pkl')
        pred_test = model.predict_proba(X_test)[:, 1]
        # Divide by the real number of models so the result stays a mean.
        prediction['pred'] += pred_test / n_folds

    return prediction

In [12]:
# Cell 5: 数据加载与预处理
def load_and_process_data(mode='valid'):
    """Load the pickled feature table for ``mode`` and run the matching pipeline.

    Args:
        mode: ``'valid'`` loads the offline feature and query tables,
            label-encodes every object-dtype feature column and trains via
            ``train_model``. Any other value loads the online feature table
            and scores it via ``online_predict``.

    Returns:
        For ``mode == 'valid'``: the tuple
        ``(prediction, oof, importance_list, df_query)``.
        Otherwise: only ``prediction``.
    """
    if mode != 'valid':
        # NOTE(review): unlike the 'valid' branch, object-dtype columns are
        # passed on without label encoding here — confirm that is intended.
        online_features = pd.read_pickle('../user_data/data/online/feature.pkl')
        return online_predict(online_features)

    features = pd.read_pickle('../user_data/data/offline/feature.pkl')
    queries = pd.read_pickle('../user_data/data/offline/query.pkl')

    # Replace each object-dtype column with integer codes; values are cast
    # to str first and a fresh encoder is fitted per column.
    object_columns = features.select_dtypes('object').columns
    for column in object_columns:
        encoder = LabelEncoder()
        features[column] = encoder.fit_transform(features[column].astype(str))

    prediction, oof, importance_list = train_model(features, queries)
    return prediction, oof, importance_list, queries

In [13]:
# Cell 6: 生成提交文件(可单独运行)
def generate_submission(prediction):
    """Build the submission table from ``prediction`` and write it as CSV.

    Args:
        prediction: Passed unchanged to ``gen_sub``.

    Returns:
        The table returned by ``gen_sub``, sorted in place by ``user_id``.
        The same table is written to ``../prediction_result/result.csv``
        without its index.
    """
    output_dir = '../prediction_result'
    output_path = '../prediction_result/result.csv'

    submission = gen_sub(prediction)
    submission.sort_values(by=['user_id'], inplace=True)

    # Create the output directory on first use.
    os.makedirs(output_dir, exist_ok=True)
    submission.to_csv(output_path, index=False)
    return submission

In [14]:

# Cell 7: 执行主流程
if mode != 'valid':
    # Online mode: only the averaged prediction of the saved fold models is produced.
    prediction = load_and_process_data(mode='online')
else:
    prediction, oof, importance_list, df_query = load_and_process_data(mode='valid')
    # Compute evaluation metrics on the concatenated out-of-fold predictions.
    df_oof = pd.concat(oof)
    # Number of distinct users among query rows whose click_article_id is not -1
    # (presumably -1 marks rows without a known clicked article — confirm).
    total = df_query.loc[df_query['click_article_id'] != -1, 'user_id'].nunique()
    metrics = evaluate(df_oof, total)
    print("评估指标:", metrics)



FileNotFoundError: [Errno 2] No such file or directory: '../user_data/data/offline/feature.pkl'

In [None]:
# Cell 8: 生成提交文件(按需运行)
# Build the submission table from `prediction` and write it to disk.
# NOTE: `prediction` must already exist in the kernel, i.e. the main-flow cell
# above has to have completed successfully before this cell is run.
df_submission = generate_submission(prediction)
# Last expression of the cell: display the first rows as a quick sanity check.
df_submission.head()

100%|██████████| 50000/50000 [03:45<00:00, 222.17it/s]


Unnamed: 0,user_id,article_1,article_2,article_3,article_4,article_5
49999,200000,194935,336221,195087,195645,59681
49998,200001,272143,64329,199198,324823,166581
49997,200002,202701,208596,205982,203288,206711
49996,200003,277107,158772,235105,50494,156807
49995,200004,218028,289003,157478,57966,202355
