In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report, roc_auc_score

# 假设df是你的DataFrame
# 首先，导入数据
df = pd.read_csv('./data/w_0812_0827.train.csv')  # 如果数据在CSV文件中

# 检查缺失值
print(df.isnull().sum())
print(f'====columns: {df.columns}')

traceId                0
sceneId                0
eid                    0
itemId                 0
actionTime             0
actionValue            0
age               135672
gender            135672
categoryLevel1         0
categoryLevel2     80924
duration            3873
publisherName          0
publishTime            0
source             11204
collectionID      159662
dtype: int64
====columns: Index(['traceId', 'sceneId', 'eid', 'itemId', 'actionTime', 'actionValue',
       'age', 'gender', 'categoryLevel1', 'categoryLevel2', 'duration',
       'publisherName', 'publishTime', 'source', 'collectionID'],
      dtype='object')


In [51]:


# 处理缺失值，这里以填充中位数为例
imputer = SimpleImputer(strategy='median')
df['age'] = imputer.fit_transform(df[['age']])
df['duration'] = imputer.fit_transform(df[['duration']])

# 将观看时长转换为二进制变量
df['duration_gt_10'] = (df['duration'] > 10).astype(int)

print(df.isnull().sum())

# 定义数值和类别特征
numeric_features = ['age']
categorical_features = ['itemId', 'gender', 'categoryLevel1', 'categoryLevel2', 'publisherName', 'source',
                        'collectionID']

# 创建数值特征的预处理器
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 创建类别特征的预处理器
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 组合预处理器
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 创建模型管道
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

# 使用LightGBM
# import lightgbm as lgb
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', lgb.LGBMClassifier())
# ])


traceId                0
sceneId                0
eid                    0
itemId                 0
actionTime             0
actionValue            0
age                    0
gender            135672
categoryLevel1         0
categoryLevel2     80924
duration               0
publisherName          0
publishTime            0
source             11204
collectionID      159662
duration_gt_10         0
dtype: int64


In [44]:
# 准备训练和测试数据
X = df.drop(['duration', 'duration_gt_10'], axis=1)
print(X.columns)
y = df['duration_gt_10']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Index(['traceId', 'sceneId', 'eid', 'itemId', 'actionTime', 'actionValue',
       'age', 'gender', 'categoryLevel1', 'categoryLevel2', 'publisherName',
       'publishTime', 'source', 'collectionID'],
      dtype='object')


In [52]:
# 训练模型
pipeline.fit(X_train, y_train)

In [53]:
# 预测测试集
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# 评估模型
print(classification_report(y_test, y_pred))
print("ROC AUC score:", roc_auc_score(y_test, y_pred_proba))

              precision    recall  f1-score   support

           0       1.00      0.42      0.59        43
           1       1.00      1.00      1.00     33957

    accuracy                           1.00     34000
   macro avg       1.00      0.71      0.79     34000
weighted avg       1.00      1.00      1.00     34000

ROC AUC score: 0.9989833243274154


In [54]:
import joblib

# 保存模型
joblib.dump(pipeline, 'model.pkl')

['model.pkl']

In [49]:
# 加载模型
from joblib import load

pipeline = load('model.pkl')

data = {
    'sceneId': '14',
    'eid': '574942950007813',
    'itemId': '7928',
    'age': 25,
    'gender': '女',
    'categoryLevel1': 'Entertainment',
    'categoryLevel2': 'Movies',
    'duration': 120,
    'publisherName': 'MovieCentral',
    'publisherTime': '2024-09-13T10:00:00Z',
    'source': 'Web',
    'collectionID':''
}
df = pd.DataFrame([data])
prediction = pipeline.predict(df)
print(prediction)

[1]
