In [64]:
# 数据导入
import pandas as pd
import warnings, gc

from sklearn.model_selection import train_test_split  # 划分数据集
from sklearn.model_selection import StratifiedKFold  # 分层k折交叉验证
from xgboost.callback import EarlyStopping

import lightgbm as lgb
import xgboost as xgb

warnings.filterwarnings('ignore')

In [None]:
# 数据加载
# 用户行为日志
user_log = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/user_log_format1.csv')
# 用户画像
user_info = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/user_info_format1.csv')
# 训练数据和测试数据
train_data = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/train_format1.csv')
test_data = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/test_format1.csv')

In [None]:
# 查看数据
# print('-- data shape --')
# for data in [user_log, user_info, train_data, test_data]:
#     print(data.shape)

print('-- data info --')
for data in [user_log, user_info, train_data, test_data]:
    print(data.info())

In [4]:
# 数据集成
train_data['origin'] = 'train'
test_data['origin'] = 'test'

# 合并数据集
all_data = pd.concat([train_data, test_data], ignore_index=True, sort=False)
all_data.drop(['prob'], axis=1, inplace=True)

# 连接user_info，通过user_id连接
all_data = all_data.merge(user_info, on='user_id', how='left')

# 修改user_log表的seller_id列名
user_log.rename(columns={'seller_id': 'merchant_id'}, inplace=True)

del train_data, test_data, user_info
gc.collect()

705

In [5]:
# 数据类型转换
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log.fillna({'brand_id': 0}, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
user_log['action_type'] = user_log['action_type'].astype('int8')
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661,1900-01-01 08:29:00,0
1,328862,844400,1271,2882,2661,1900-01-01 08:29:00,0
2,328862,575153,1271,2882,2661,1900-01-01 08:29:00,0
3,328862,996875,1271,2882,2661,1900-01-01 08:29:00,0
4,328862,1086186,1271,1253,1049,1900-01-01 08:29:00,0


In [6]:
# all_data数据填充
all_data.fillna({'age_range': 0}, inplace=True)
all_data.fillna({'gender': 2}, inplace=True)
all_data.isnull().sum()

user_id             0
merchant_id         0
label          261477
origin              0
age_range           0
gender              0
dtype: int64

In [7]:
# all_data表的数据类型转换
all_data['user_id'] = all_data['user_id'].astype('int32')
all_data['merchant_id'] = all_data['merchant_id'].astype('int32')
all_data['label'] = all_data['label'].astype('str')
all_data['age_range'] = all_data['age_range'].astype('int8')
all_data['gender'] = all_data['gender'].astype('int8')
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522341 entries, 0 to 522340
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      522341 non-null  int32 
 1   merchant_id  522341 non-null  int32 
 2   label        522341 non-null  object
 3   origin       522341 non-null  object
 4   age_range    522341 non-null  int8  
 5   gender       522341 non-null  int8  
dtypes: int32(2), int8(2), object(2)
memory usage: 13.0+ MB


In [8]:
# 用户特征工程
groups = user_log.groupby(['user_id'])

# 用户交互行为数量
temp = groups.size().reset_index().rename(columns={0: 'u1'})
all_data = all_data.merge(temp, on='user_id', how='left')

temp = groups['item_id'].nunique().reset_index(name='u2')
all_data = all_data.merge(temp, on='user_id', how='left')

temp = groups['cat_id'].nunique().reset_index(name='u3')
all_data = all_data.merge(temp, on='user_id', how='left')

temp = groups['merchant_id'].nunique().reset_index(name='u4')
all_data = all_data.merge(temp, on='user_id', how='left')

temp = groups['brand_id'].nunique().reset_index(name='u5')
all_data = all_data.merge(temp, on='user_id', how='left')

# 购物时间间隔特征
temp = groups['time_stamp'].agg([('u6', lambda x: (x.max() - x.min()).seconds // 3600)]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')

# 统计操作类型为0，1，2，3的个数
temp = groups['action_type'].value_counts().unstack().reset_index()
temp.rename(columns={0: 'u7', 1: 'u8', 2: 'u9', 3: 'u10'}, inplace=True)
all_data = all_data.merge(temp, on='user_id', how='left')

del temp, groups
gc.collect()

0

In [11]:
# 在特征工程前，移除可能存在的特征列
# feature_cols = ['m1', 'm2', 'm3', 'm4', 'm5']
# existing_cols = [col for col in feature_cols if col in all_data.columns]
# if existing_cols:
#     all_data = all_data.drop(existing_cols, axis=1)

all_data.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,...,u10,m1,m2,m3,m4,m5,m6,m7,m8,m9
0,34176,3906,0.0,train,6,0,451,256,45,109,...,7.0,16269,5819,308,20,2,14870.0,28.0,410.0,961.0
1,34176,121,0.0,train,6,0,451,256,45,109,...,7.0,79865,10931,1179,26,2,72265.0,121.0,4780.0,2699.0
2,34176,4356,1.0,train,6,0,451,256,45,109,...,7.0,7269,2281,67,15,2,6094.0,16.0,963.0,196.0
3,34176,2217,0.0,train,6,0,451,256,45,109,...,7.0,60202,16870,377,5,2,52230.0,101.0,3721.0,4150.0
4,230784,4818,0.0,train,0,0,54,31,17,20,...,,48089,7500,461,27,2,43268.0,129.0,2733.0,1959.0


In [10]:
# 商家特征处理
groups = user_log.groupby(['merchant_id'])

# 商家交互行为数量
temp = groups.size().reset_index().rename(columns={0: 'm1'})
all_data = all_data.merge(temp, on='merchant_id', how='left')

# 统计商家被交互的user_id,item_id,cat_id,brand_id的唯一值
temp = pd.DataFrame({
    'm2': groups['user_id'].nunique(),
    'm3': groups['item_id'].nunique(),
    'm4': groups['cat_id'].nunique(),
    'm5': groups['brand_id'].nunique()
}).reset_index()
all_data = all_data.merge(temp, on='merchant_id', how='left')

# 统计商家被交互的action_type唯一值
temp = groups['action_type'].value_counts().unstack().reset_index()
temp.rename(columns={0: 'm6', 1: 'm7', 2: 'm8', 3: 'm9'}, inplace=True)
all_data = all_data.merge(temp, on='merchant_id', how='left')

del temp, groups
gc.collect()

12

In [12]:
# 用户店铺联合特征
groups = user_log.groupby(['user_id', 'merchant_id'])

# 用户在不同商家交互统计
temp = groups.size().reset_index().rename(columns={0: 'um1'})
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 统计用户在不同商家交互的item_id,cat_id,brand_id的唯一值
temp = pd.DataFrame({
    'um2': groups['item_id'].nunique(),
    'um3': groups['cat_id'].nunique(),
    'um4': groups['brand_id'].nunique()
}).reset_index()
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 统计用户在不同的商家交互的action_type唯一值
temp = groups['action_type'].value_counts().unstack().reset_index()
temp.rename(columns={0: 'um5', 1: 'um6', 2: 'um7', 3: 'um8'}, inplace=True)
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')

# 统计用户在不同商家交互的时间间隔
temp = groups['time_stamp'].agg([('um9', lambda x: (x.max() - x.min()).seconds // 3600)]).reset_index()
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')

del temp, groups
gc.collect()

0

In [13]:
# 购买点击比
all_data['r1'] = all_data['u9'] / all_data['u7']  # 用户购买点击比
all_data['r2'] = all_data['m8'] / all_data['m6']  # 商家购买点击比
all_data['r3'] = all_data['um7'] / all_data['um5']  # 不同用户不同商家购买点击比
all_data.head()

Unnamed: 0,user_id,merchant_id,label,origin,age_range,gender,u1,u2,u3,u4,...,um3,um4,um5,um6,um7,um8,um9,r1,r2,r3
0,34176,3906,0.0,train,6,0,451,256,45,109,...,6,1,36.0,,1.0,2.0,0,0.082927,0.027572,0.027778
1,34176,121,0.0,train,6,0,451,256,45,109,...,1,1,13.0,,1.0,,0,0.082927,0.066145,0.076923
2,34176,4356,1.0,train,6,0,451,256,45,109,...,1,1,12.0,,6.0,,0,0.082927,0.158024,0.5
3,34176,2217,0.0,train,6,0,451,256,45,109,...,1,1,1.0,,1.0,,0,0.082927,0.071243,1.0
4,230784,4818,0.0,train,0,0,54,31,17,20,...,1,1,7.0,,1.0,,0,0.148936,0.063164,0.142857


In [14]:
# 空数据的填充
all_data.fillna(0, inplace=True)

In [15]:
# 年龄性别类型的转换
# 修改age_range字段名称为age_0,age_1,...,age_8
# 独热编码
temp = pd.get_dummies(all_data['age_range'], prefix='age')
all_data = pd.concat([all_data, temp], axis=1)
temp = pd.get_dummies(all_data['gender'], prefix='gender')
all_data = pd.concat([all_data, temp], axis=1)
# 删除原始数据
all_data.drop(['age_range', 'gender'], axis=1, inplace=True)
del temp
gc.collect()

12

In [16]:
# 数据存储
train_data = all_data[all_data['origin'] == 'train'].drop(columns=['origin'], axis=1)
test_data = all_data[all_data['origin'] == 'test'].drop(columns=['origin'], axis=1)

train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [17]:
# 算法建模预测
# 训练数据和目标值
train_X, train_Y = train_data.drop(['label'], axis=1), train_data['label']

# 数据划分保留20%作为验证集
X_train, X_valid, Y_train, Y_valid = train_test_split(train_X, train_Y, test_size=0.2)

In [47]:
# LGBM模型训练
def lgb_train(x_train, y_train, x_valid, y_valid, verbose=True):
    """
    LightGBM 模型训练函数

    参数:
        x_train: 训练集特征
        y_train: 训练集标签
        x_valid: 验证集特征
        y_valid: 验证集标签
        verbose: 是否显示训练信息，默认为True

    返回:
        训练好的LightGBM模型
    """
    # 创建回调函数列表
    callbacks = [
        lgb.early_stopping(stopping_rounds=10)  # 早停策略，10轮内无改善则停止
    ]

    # 仅当verbose为True时添加日志回调
    if verbose:
        callbacks.append(lgb.log_evaluation(period=100))

    # 创建LightGBM分类器
    model_lgb = lgb.LGBMClassifier(
        max_depth=10,  # 树的最大深度
        n_estimators=5000,  # 最大迭代次数
        min_child_weight=100,  # 叶子节点最小样本权重和
        colsample_bytree=0.7,  # 每棵树随机选择70%的特征
        subsample=0.9,  # 每棵树随机选择90%的样本
        learning_rate=0.1,  # 学习率
        verbose=-1 if not verbose else 0  # -1完全禁止输出，0只显示警告和错误
    )

    # 训练模型
    model_lgb.fit(
        x_train,
        y_train,
        eval_metric='auc',  # 评估指标为AUC
        eval_set=[(x_train, y_train), (x_valid, y_valid)],  # 评估数据集
        callbacks=callbacks  # 回调函数列表
    )

    # 打印验证集的最佳AUC得分
    print(f"最佳验证集AUC得分: {model_lgb.best_score_['valid_1']['auc']}")
    return model_lgb

In [50]:
# verbose=True时显示训练信息
model_lgb = lgb_train(X_train.values, Y_train, X_valid.values, Y_valid, verbose=False)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[104]	training's auc: 0.728575	training's binary_logloss: 0.210499	valid_1's auc: 0.682499	valid_1's binary_logloss: 0.215529
最佳验证集AUC得分: 0.6824988455792353


In [56]:
prob = model_lgb.predict_proba(test_data.drop(['label'], axis=1).values)

submission = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/test_format1.csv')
submission['prob'] = pd.Series(prob[:, 1])
submission.to_csv('submission.csv', index=False)

del submission
gc.collect()

20

In [73]:
# 算法建模预测
# 训练数据和目标值
train_X, train_Y = train_data.drop(['label'], axis=1), train_data['label'].astype(float).astype(int)

# 数据划分保留20%作为验证集
X_train, X_valid, Y_train, Y_valid = train_test_split(train_X, train_Y, test_size=0.2)

In [95]:
# XGBoost模型
def xgb_train(x_train, y_train, x_valid, y_valid, verbose=True):
    """
    使用XGBoost原生API而非sklearn包装器
    """
    # 转换为DMatrix格式
    train = xgb.DMatrix(x_train, label=y_train)
    valid = xgb.DMatrix(x_valid, label=y_valid)

    # 参数设置
    params = {
        'max_depth': 10,
        'eta': 0.1,
        'min_child_weight': 300,
        'colsample_bytree': 0.7,
        'subsample': 0.9,
        'objective': 'binary:logistic',
        'eval_metric': 'auc'
    }

    model = xgb.train(
        params,
        train,
        num_boost_round=5000,
        evals=[(train, 'train'), (valid, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=verbose
    )

    print(f"最佳迭代轮数: {model.best_iteration}")
    print(f"最佳验证集AUC得分: {model.best_score}")

    return model

In [96]:
model_xgb = xgb_train(X_train, Y_train, X_valid, Y_valid, verbose=False)

最佳迭代轮数: 97
最佳验证集AUC得分: 0.6809273845416097


In [99]:
# 创建DMatrix用于预测
dtest = xgb.DMatrix(test_data.drop(['label'], axis=1))
prob = model_xgb.predict(dtest)

submission = pd.read_csv('/Users/zhuzijie/Downloads/data_format1/test_format1.csv')
submission['prob'] = pd.Series(prob)
submission.to_csv('submission2.csv', index=False)

del submission
gc.collect()

6461