In [1]:
import json

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.metrics import classification_report, roc_auc_score

# 假设df是你的DataFrame
# 首先，导入数据
df = pd.read_csv('./data/w_0812_0827.train.csv')  # 如果数据在CSV文件中

# 检查缺失值
print(df.isnull().sum())
print(f'====columns: {df.columns}')
print(df.dtypes)

traceId                0
sceneId                0
eid                    0
itemId                 0
actionTime             0
actionValue            0
age               135672
gender            135672
categoryLevel1         0
categoryLevel2     80924
duration            3873
publisherName          0
publishTime            0
source             11204
collectionID      159662
dtype: int64
====columns: Index(['traceId', 'sceneId', 'eid', 'itemId', 'actionTime', 'actionValue',
       'age', 'gender', 'categoryLevel1', 'categoryLevel2', 'duration',
       'publisherName', 'publishTime', 'source', 'collectionID'],
      dtype='object')
traceId            object
sceneId             int64
eid                 int64
itemId              int64
actionTime          int64
actionValue         int64
age               float64
gender             object
categoryLevel1     object
categoryLevel2     object
duration          float64
publisherName      object
publishTime         int64
source             object


In [2]:
# 将观看时长转换为二进制变量
df['target'] = (df['actionValue'] > 10).astype(int)

# 定义特征和目标变量
features = df.drop(columns=['actionValue', 'target'])
target = df['target']

from feature import  process_time_features,create_cross_features
# def process_time_features(df):
#     df['actionTime'] = pd.to_datetime(df['actionTime'])
#     df['publishTime'] = pd.to_datetime(df['publishTime'])
#     df['action_hour'] = df['actionTime'].dt.hour
#     df['action_weekday'] = df['actionTime'].dt.weekday
#     df['publish_hour'] = df['publishTime'].dt.hour
#     df['publish_weekday'] = df['publishTime'].dt.weekday
#     return df.drop(['actionTime', 'publishTime'], axis=1)


numeric_features = [
    'duration',
    'action_hour',
    'action_weekday',
    'publish_hour',
    'publish_weekday'
]

# 定义分类和数值特征
categorical_features = [
    'gender',
    'categoryLevel1',
    'categoryLevel2',
    'publisherName',
    'source',
    'collectionID'
]

# 定义交叉特征
cross_features = [
    ('gender_categoryLevel1', ['gender', 'categoryLevel1']),
    ('publisherName_source', ['publisherName', 'source'])
]

# 定义预处理器
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features + [f[0] for f in cross_features])
    ])

# 使用LightGBM
import lightgbm as lgb

pipeline = Pipeline(steps=[
    ('time_processor', FunctionTransformer(process_time_features, validate=False)),
    ('cross_feature_creator',
     FunctionTransformer(create_cross_features, kw_args={'cross_features': cross_features}, validate=False)),
    # ('age_binning', FunctionTransformer(bin_age, validate=False)),
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier())
])


In [3]:
# 准备训练和测试数据
X = df.drop(['actionValue', 'target'], axis=1)
print(X.columns)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Index(['traceId', 'sceneId', 'eid', 'itemId', 'actionTime', 'age', 'gender',
       'categoryLevel1', 'categoryLevel2', 'duration', 'publisherName',
       'publishTime', 'source', 'collectionID'],
      dtype='object')


In [4]:
# 训练模型
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 8682, number of negative: 127317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 135999, number of used features: 470
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063839 -> initscore=-2.685428
[LightGBM] [Info] Start training from score -2.685428


In [5]:
# 预测测试集
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print(X_test)
print(y_pred)
print(y_pred_proba)

# 评估模型
print(classification_report(y_test, y_pred))
print("ROC AUC score:", roc_auc_score(y_test, y_pred_proba))

                                           traceId  sceneId              eid  \
63890    9541b5df-1fca-4f54-8c67-99a8482e98df-6069       14  584070995160197   
21954    f1f196bb-d09a-4f16-bd74-ee6edc8803bd-6067       14  582985158147077   
125032  1cdd73ae-0736-429b-afef-c3e088851831-39993       14  583395540822981   
82290    ee1f793d-4262-496a-8207-ccb94c4d9f97-9083       14  583135584328645   
117830  2d8985ac-de43-4970-a3ca-2fd090f9b0f9-10399       14  583440698181765   
...                                            ...      ...              ...   
64522    43c466b9-0873-4ba8-b765-e9c98138521f-6069       14  583317000297605   
157721   e57ed265-ffba-44eb-94e9-740c7f6a1d1b-9083       14  583310296300485   
79811    9002fd85-2f57-42d2-b98b-189f651f95da-9083       14  583189433510853   
94708    fea1b64b-8778-4d8f-8064-b5c22ab5ad8a-6072       14  584096422529989   
11461    c9628259-5d45-45c3-a0f6-4fe2b731c213-6069       14  583407209257925   

        itemId                    actio

In [6]:
import joblib

# 保存模型
joblib.dump(pipeline, 'model.pkl')

['model.pkl']

In [9]:
# 加载模型
from joblib import load

pipeline = load('model.pkl')

data = [
    {
        'itemId': '7928',
        'age': 25,
        'gender': '女',
        'actionTime': 1723776282810,
        'categoryLevel1': '动物',
        'categoryLevel2': '',
        'duration': 1200,
        'publisherName': 'MovieCentral',
        'publishTime': 1723776282810,
        'source': 'Web',
        'collectionID': ''
    },
    {
        'itemId': '7928',
        'age': 25,
        'gender': '女',
        'actionTime': 1723776282810,
        'categoryLevel1': '动物',
        'categoryLevel2': '',
        'duration': 1200,
        'publisherName': 'MovieCentral',
        'publishTime': 1723776282810,
        'source': 'Web',
        'collectionID': ''
    }
]
import json
print(json.dumps(data,ensure_ascii=False,indent=4))
df = pd.DataFrame(data)
prediction = pipeline.predict(df)
prediction_prob = pipeline.predict_proba(df)[:, 1]
print(prediction)
print(prediction_prob)

[
    {
        "itemId": "7928",
        "age": 25,
        "gender": "女",
        "actionTime": 1723776282810,
        "categoryLevel1": "动物",
        "categoryLevel2": "",
        "duration": 1200,
        "publisherName": "MovieCentral",
        "publishTime": 1723776282810,
        "source": "Web",
        "collectionID": ""
    },
    {
        "itemId": "7928",
        "age": 25,
        "gender": "女",
        "actionTime": 1723776282810,
        "categoryLevel1": "动物",
        "categoryLevel2": "",
        "duration": 1200,
        "publisherName": "MovieCentral",
        "publishTime": 1723776282810,
        "source": "Web",
        "collectionID": ""
    }
]
[0 0]
[0.10539439 0.10539439]
