In [None]:
# 数据导入
import pandas as pd
import warnings, gc

from sklearn.model_selection import train_test_split  # 划分数据集
from sklearn.model_selection import StratifiedKFold  # 分层k折交叉验证

import lightgbm as lgb
import xgboost as xgb

# Ignore every warning for the rest of the session so library notices do not
# clutter the cell output.
# NOTE(review): this also hides warnings that may point at real problems
# (deprecated arguments, dtype coercions); consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Data loading
# Read the four raw CSV tables in one pass, in this order:
#   user_log   - user behaviour log
#   user_info  - user profile
#   train_data - training data
#   test_data  - test data
user_log, user_info, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/zhuzijie/Downloads/data_format1/user_log_format1.csv',
        '/Users/zhuzijie/Downloads/data_format1/user_info_format1.csv',
        '/Users/zhuzijie/Downloads/data_format1/train_format1.csv',
        '/Users/zhuzijie/Downloads/data_format1/test_format1.csv',
    )
)

In [None]:
# Inspect the loaded tables.
# DataFrame.info() writes its own summary (row count, columns, dtypes, memory
# usage) to stdout and returns None, so it is called directly: wrapping it in
# print() would emit a stray "None" line after every table.
print('-- data info --')
for data in [user_log, user_info, train_data, test_data]:
    data.info()

In [None]:
# Data integration
# Tag every row with the split it came from, stack train and test into one
# frame, drop the 'prob' column, then left-join the user profile on user_id.
all_data = (
    pd.concat(
        [train_data.assign(origin='train'), test_data.assign(origin='test')],
        ignore_index=True,
        sort=False,
    )
    .drop(columns=['prob'])
    .merge(user_info, on='user_id', how='left')
)

# Rename seller_id in the behaviour log to merchant_id so the log can be
# grouped and joined on the same key name that all_data uses.
user_log.rename(columns={'seller_id': 'merchant_id'}, inplace=True)

# The source frames are not referenced again in this notebook; free them.
del train_data, test_data, user_info
gc.collect()

In [None]:
# Dtype conversion for the behaviour log
# brand_id is the only column filled before casting: missing brands become 0.
user_log['brand_id'] = user_log['brand_id'].fillna(0)

# Store all identifier columns as 32-bit integers.
for id_column in ('user_id', 'item_id', 'cat_id', 'merchant_id', 'brand_id'):
    user_log[id_column] = user_log[id_column].astype('int32')

# NOTE(review): the raw value is parsed as hour+minute ('%H%M'). If time_stamp
# actually encodes month+day (mmdd), this format and the `.seconds // 3600`
# features derived from it need to change together - confirm against the
# dataset description.
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
user_log['action_type'] = user_log['action_type'].astype('int8')
user_log.head()

In [None]:
# Fill missing profile fields in all_data: age_range -> 0, gender -> 2.
# Then show the remaining null count per column.
all_data = all_data.fillna(value={'age_range': 0, 'gender': 2})
all_data.isna().sum()

In [None]:
# Dtype conversion for all_data
# Note that 'label' is cast to str, so the modelling cells receive string
# labels unless they cast them back themselves.
# NOTE(review): rows without a label (presumably the test rows) become a
# literal string rather than NaN after this cast - confirm this is intended.
all_data = all_data.astype({
    'user_id': 'int32',
    'merchant_id': 'int32',
    'label': 'str',
    'age_range': 'int8',
    'gender': 'int8',
})
all_data.info()

In [None]:
# User-level feature engineering (columns u1..u10), aggregated per user_id
groups = user_log.groupby(['user_id'])

# u1: number of logged interactions per user
user_feats = groups.size().reset_index(name='u1')
all_data = all_data.merge(user_feats, on='user_id', how='left')

# u2..u5: number of distinct items, categories, merchants and brands per user
user_feats = groups.agg(
    u2=('item_id', 'nunique'),
    u3=('cat_id', 'nunique'),
    u4=('merchant_id', 'nunique'),
    u5=('brand_id', 'nunique'),
).reset_index()
all_data = all_data.merge(user_feats, on='user_id', how='left')

# u6: span between the user's first and last parsed time_stamp, in whole hours.
# Timedelta.seconds is only the sub-day component of the difference.
user_feats = groups['time_stamp'].agg(
    u6=lambda stamps: (stamps.max() - stamps.min()).seconds // 3600
).reset_index()
all_data = all_data.merge(user_feats, on='user_id', how='left')

# u7..u10: number of interactions of action_type 0, 1, 2 and 3 per user
user_feats = (
    groups['action_type']
    .value_counts()
    .unstack()
    .rename(columns={0: 'u7', 1: 'u8', 2: 'u9', 3: 'u10'})
    .reset_index()
)
all_data = all_data.merge(user_feats, on='user_id', how='left')

del user_feats, groups
gc.collect()

In [None]:
# Disabled helper: before (re)running the merchant feature cell, drop any
# m1..m5 columns that already exist so the merges do not create duplicates.
# feature_cols = ['m1', 'm2', 'm3', 'm4', 'm5']
# existing_cols = [col for col in feature_cols if col in all_data.columns]
# if existing_cols:
#     all_data = all_data.drop(existing_cols, axis=1)

# Preview the first rows of all_data.
all_data.head()

In [None]:
# Merchant-level feature engineering (columns m1..m9), aggregated per merchant_id
groups = user_log.groupby(['merchant_id'])

# m1: number of logged interactions per merchant
merchant_feats = groups.size().reset_index(name='m1')
all_data = all_data.merge(merchant_feats, on='merchant_id', how='left')

# m2..m5: number of distinct users, items, categories and brands per merchant
merchant_feats = groups.agg(
    m2=('user_id', 'nunique'),
    m3=('item_id', 'nunique'),
    m4=('cat_id', 'nunique'),
    m5=('brand_id', 'nunique'),
).reset_index()
all_data = all_data.merge(merchant_feats, on='merchant_id', how='left')

# m6..m9: number of interactions of action_type 0, 1, 2 and 3 per merchant
# (these are counts per action type, not unique-value counts)
merchant_feats = (
    groups['action_type']
    .value_counts()
    .unstack()
    .rename(columns={0: 'm6', 1: 'm7', 2: 'm8', 3: 'm9'})
    .reset_index()
)
all_data = all_data.merge(merchant_feats, on='merchant_id', how='left')

del merchant_feats, groups
gc.collect()

In [None]:
# Joint user x merchant feature engineering (columns um1..um9)
pair_keys = ['user_id', 'merchant_id']
groups = user_log.groupby(pair_keys)

# um1: number of logged interactions for each (user, merchant) pair
pair_feats = groups.size().reset_index(name='um1')
all_data = all_data.merge(pair_feats, on=pair_keys, how='left')

# um2..um4: number of distinct items, categories and brands per pair
pair_feats = groups.agg(
    um2=('item_id', 'nunique'),
    um3=('cat_id', 'nunique'),
    um4=('brand_id', 'nunique'),
).reset_index()
all_data = all_data.merge(pair_feats, on=pair_keys, how='left')

# um5..um8: number of interactions of action_type 0, 1, 2 and 3 per pair
pair_feats = (
    groups['action_type']
    .value_counts()
    .unstack()
    .rename(columns={0: 'um5', 1: 'um6', 2: 'um7', 3: 'um8'})
    .reset_index()
)
all_data = all_data.merge(pair_feats, on=pair_keys, how='left')

# um9: span between the pair's first and last parsed time_stamp, in whole
# hours (Timedelta.seconds is only the sub-day component of the difference)
pair_feats = groups['time_stamp'].agg(
    um9=lambda stamps: (stamps.max() - stamps.min()).seconds // 3600
).reset_index()
all_data = all_data.merge(pair_feats, on=pair_keys, how='left')

del pair_feats, groups
gc.collect()

In [None]:
# Ratio features: count of action_type 2 divided by count of action_type 0.
# The original author labels these purchase-to-click ratios; presumably
# action_type 0 = click and 2 = purchase - confirm against the data schema.
all_data = all_data.assign(
    r1=all_data['u9'] / all_data['u7'],    # per user
    r2=all_data['m8'] / all_data['m6'],    # per merchant
    r3=all_data['um7'] / all_data['um5'],  # per (user, merchant) pair
)
all_data.head()

In [None]:
# Replace every remaining missing value (e.g. counts for action types a
# user/merchant never produced) with 0.
all_data = all_data.fillna(0)

In [None]:
# One-hot encode the age and gender codes.
# get_dummies with `columns=` removes the two source columns and appends the
# indicator columns (age_<code>, then gender_<code>) at the end of the frame.
all_data = pd.get_dummies(
    all_data,
    columns=['age_range', 'gender'],
    prefix=['age', 'gender'],
)
gc.collect()

In [None]:
# Persist the engineered features
# Separate the rows again by their 'origin' tag, dropping the tag itself.
origin_tag = all_data['origin']
train_data = all_data.loc[origin_tag.eq('train')].drop(columns='origin')
test_data = all_data.loc[origin_tag.eq('test')].drop(columns='origin')

# Write both splits to CSV in the working directory, without the index.
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [None]:
# Modelling: features and target for the LightGBM hold-out run
train_X = train_data.drop(columns=['label'])
train_Y = train_data['label']

# Hold out 20% of the rows for validation. The seed is fixed so that the
# split, and therefore the reported validation AUC, is the same after a
# kernel restart.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_X, train_Y, test_size=0.2, random_state=42
)

In [None]:
# LGBM模型训练


def lgb_train(x_train, y_train, x_valid, y_valid, verbose=True):
    """
    Train a LightGBM classifier with early stopping on a validation set.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_valid: Validation features.
        y_valid: Validation labels.
        verbose: When True, log evaluation results every 100 rounds and run
            LightGBM at verbosity 0; when False, run it at verbosity -1.

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    # Stop when there has been no improvement for 10 consecutive rounds.
    callbacks = [lgb.early_stopping(stopping_rounds=10)]
    if verbose:
        callbacks.append(lgb.log_evaluation(period=100))

    model_lgb = lgb.LGBMClassifier(
        max_depth=10,           # maximum tree depth
        n_estimators=5000,      # upper bound on boosting rounds
        min_child_weight=100,   # minimum sum of instance weight in a leaf
        colsample_bytree=0.7,   # fraction of features sampled per tree
        subsample=0.9,          # fraction of rows sampled when bagging
        # LightGBM ignores `subsample` unless bagging is enabled with a
        # non-zero frequency; resample the rows at every iteration.
        subsample_freq=1,
        learning_rate=0.1,
        verbose=0 if verbose else -1,
    )

    model_lgb.fit(
        x_train,
        y_train,
        eval_metric='auc',
        # The training pair is passed as the very same objects given to fit();
        # the validation pair is the second entry and is reported as 'valid_1'.
        eval_set=[(x_train, y_train), (x_valid, y_valid)],
        callbacks=callbacks,
    )

    # Report the best AUC reached on the validation split.
    print(f"最佳验证集AUC得分: {model_lgb.best_score_['valid_1']['auc']}")
    return model_lgb

In [None]:
# Pass verbose=True to see the training log.
# The DataFrames are passed as-is (not `.values`) so the model is fitted on the
# same named columns it later receives at prediction time, matching how the
# cross-validation cell feeds LightGBM.
model_lgb = lgb_train(X_train, Y_train, X_valid, Y_valid, verbose=False)

In [None]:
# Score the test rows with the hold-out LightGBM model; column 1 of
# predict_proba is the probability of the second class in model_lgb.classes_.
prob = model_lgb.predict_proba(test_data.drop(columns=['label']))

# Re-read the original test file as the submission template and fill in 'prob'.
submission = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/test_format1.csv')
submission['prob'] = prob[:, 1]
submission.to_csv('submission.csv', index=False)

del submission
gc.collect()

In [None]:
# Modelling: features and target for the XGBoost hold-out run.
# 'label' is stored as a string column, so it is converted back to an integer
# class id (via float) before being handed to XGBoost.
train_X = train_data.drop(columns=['label'])
train_Y = train_data['label'].astype(float).astype(int)

# Hold out 20% of the rows for validation. The seed is fixed so that the
# split, and therefore the reported validation AUC, is the same after a
# kernel restart.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_X, train_Y, test_size=0.2, random_state=42
)

In [None]:
# XGBoost模型
def xgb_train(x_train, y_train, x_valid, y_valid, verbose=True):
    """
    Train a booster with XGBoost's native API rather than the sklearn wrapper.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_valid: Validation features.
        y_valid: Validation labels.
        verbose: Forwarded to ``xgb.train`` as ``verbose_eval``.

    Returns:
        The booster returned by ``xgb.train``.
    """
    booster_params = dict(
        max_depth=10,
        eta=0.1,
        min_child_weight=300,
        colsample_bytree=0.7,
        subsample=0.9,
        objective='binary:logistic',
        eval_metric='auc',
    )

    # The native API consumes DMatrix objects.
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    model = xgb.train(
        booster_params,
        dtrain,
        num_boost_round=5000,
        evals=watchlist,
        early_stopping_rounds=10,
        verbose_eval=verbose,
    )

    # Report the best round and its validation AUC.
    print(f"最佳迭代轮数: {model.best_iteration}")
    print(f"最佳验证集AUC得分: {model.best_score}")

    return model

In [None]:
# Train the hold-out XGBoost model on the current split; verbose=False is
# forwarded as verbose_eval, so per-round evaluation output is turned off.
model_xgb = xgb_train(X_train, Y_train, X_valid, Y_valid, verbose=False)

In [None]:
# Build the DMatrix used for prediction.
dtest = xgb.DMatrix(test_data.drop(['label'], axis=1))

# A booster trained through the native API with early stopping still contains
# the rounds added after the best one, and Booster.predict uses all trees by
# default. Limit prediction to the trees up to and including best_iteration.
prob = model_xgb.predict(
    dtest, iteration_range=(0, model_xgb.best_iteration + 1)
)

# Re-read the original test file as the submission template and fill in 'prob'.
submission = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/test_format1.csv')
submission['prob'] = pd.Series(prob)
submission.to_csv('submission2.csv', index=False)

del submission
gc.collect()

In [None]:
# 交叉验证多轮建模
def get_train_test_datas(train_df, label_df, n_splits=10, random_state=None):
    """
    Split features and labels into stratified, shuffled K-fold partitions.

    Args:
        train_df: Feature frame; rows are selected positionally with ``.iloc``.
        label_df: Labels aligned row-for-row with ``train_df``; also used as
            the stratification target.
        n_splits: Number of folds (default 10, the previously hard-coded value).
        random_state: Seed forwarded to ``StratifiedKFold`` for the shuffle.
            The default ``None`` keeps the previous unseeded behaviour; pass an
            int to get the same folds on every run.

    Returns:
        A tuple ``(xtrain, ytrain, xtest, ytest)`` of four lists, each of
        length ``n_splits``. Entry ``k`` of the first two lists is the training
        part of fold ``k``; entry ``k`` of the last two is its held-out part.
    """
    skv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    xtrain, ytrain, xtest, ytest = [], [], [], []
    # split() yields positional indices: rows to train on, rows held out.
    for train_index, test_index in skv.split(train_df, label_df):
        xtrain.append(train_df.iloc[train_index])
        ytrain.append(label_df.iloc[train_index])
        xtest.append(train_df.iloc[test_index])
        ytest.append(label_df.iloc[test_index])
    return xtrain, ytrain, xtest, ytest


In [None]:
# LightGBM: K-fold training, averaging the per-fold test predictions
train_X, train_Y = train_data.drop(['label'], axis=1), train_data['label']

# One (train, validation) partition per fold.
trainX, trainY, validX, validY = get_train_test_datas(train_X, train_Y)
print('-- 训练数据的长度 --', len(trainX))
print('-- 验证数据的长度 --', len(validX))

# The test features are the same for every fold; build them once and keep the
# column names so prediction sees the same named columns as training.
test_features = test_data.drop(['label'], axis=1)

pred_lgbms = []  # one column of test probabilities per fold

# Iterate over however many folds were produced instead of a hard-coded 10.
for i in range(len(trainX)):
    print(f'LGB第{i}轮训练')

    # Stop when there has been no improvement for 10 consecutive rounds.
    callbacks = [
        lgb.early_stopping(stopping_rounds=10)
    ]

    # `silent` is no longer a constructor argument in current LightGBM
    # releases; verbose=-1 alone controls the output level.
    model_lgb = lgb.LGBMClassifier(
        max_depth=10,
        n_estimators=1000,
        min_child_weight=100,
        colsample_bytree=0.7,
        subsample=0.9,
        # `subsample` only takes effect when bagging is enabled with a
        # non-zero frequency.
        subsample_freq=1,
        learning_rate=0.1,
        verbose=-1
    )

    model_lgb.fit(
        trainX[i],
        trainY[i],
        eval_metric='auc',
        # Second entry is the validation fold, reported as 'valid_1'.
        eval_set=[(trainX[i], trainY[i]), (validX[i], validY[i])],
        callbacks=callbacks
    )

    print(f"最佳验证集AUC得分: {model_lgb.best_score_['valid_1']['auc']}")

    # Column 1 of predict_proba is the probability of the second class.
    pred = model_lgb.predict_proba(test_features)
    pred = pd.DataFrame(pred[:, 1])
    pred_lgbms.append(pred)

# Average the per-fold probabilities row-wise and write the submission.
pred_lgbms = pd.concat(pred_lgbms, axis=1)
submission = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/test_format1.csv')
submission['prob'] = pred_lgbms.mean(axis=1)
submission.to_csv('submissionLgb.csv', index=False)

In [None]:
# XGBoost: K-fold training, averaging the per-fold test predictions
# 'label' is stored as a string column; convert it to an integer class id
# (via float), as the hold-out XGBoost cell does, instead of relying on
# XGBoost to coerce string labels.
train_X = train_data.drop(['label'], axis=1)
train_Y = train_data['label'].astype(float).astype(int)

# One (train, validation) partition per fold.
trainX, trainY, validX, validY = get_train_test_datas(train_X, train_Y)
print('-- 训练数据的长度 --', len(trainX))
print('-- 验证数据的长度 --', len(validX))

# Booster parameters and the test DMatrix do not change between folds, so
# they are built once instead of inside the loop.
params = {
    'max_depth': 10,
    'eta': 0.1,
    'min_child_weight': 200,
    'colsample_bytree': 0.7,
    'subsample': 0.9,
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}
test_features = xgb.DMatrix(test_data.drop(['label'], axis=1))

pred_xgbms = []  # one column of test probabilities per fold

# Iterate over however many folds were produced instead of a hard-coded 10.
for i in range(len(trainX)):
    print(f'XGB第{i}轮训练')

    # The native API consumes DMatrix objects.
    train = xgb.DMatrix(trainX[i], label=trainY[i])
    valid = xgb.DMatrix(validX[i], label=validY[i])

    model_xgb = xgb.train(
        params,
        train,
        num_boost_round=5000,
        evals=[(train, 'train'), (valid, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=False
    )

    print(f"最佳验证集AUC得分: {model_xgb.best_score}")

    # The returned booster still holds the rounds trained after the best one
    # and Booster.predict uses all trees by default; restrict prediction to
    # the trees up to and including best_iteration.
    pred = model_xgb.predict(
        test_features, iteration_range=(0, model_xgb.best_iteration + 1)
    )
    pred = pd.DataFrame(pred)
    pred_xgbms.append(pred)

# Average the per-fold probabilities row-wise and write the submission.
pred_xgbms = pd.concat(pred_xgbms, axis=1)
submission = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/test_format1.csv')
submission['prob'] = pred_xgbms.mean(axis=1)
submission.to_csv('submissionXgb.csv', index=False)