# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Load the training data into a DataFrame (the data is expected in a CSV file).
df = pd.read_csv('./data/w_0812_0827.train.csv')

# Report the number of missing values in every column.
print(df.isnull().sum())

# NOTE: 'age' is intentionally NOT imputed here. Fitting an imputer on the
# whole DataFrame before the train/test split would leak statistics of the
# held-out rows into training. Missing values are handled by the imputers
# inside the model pipeline, which are fit on the training split only.

# Turn the watch duration into a binary target: 1 when duration > 10, else 0.
# NOTE(review): a missing 'duration' compares as False and is therefore
# labelled 0 -- confirm this is the intended treatment of such rows.
df['duration_gt_10'] = (df['duration'] > 10).astype(int)

# Numeric and categorical feature columns fed to the model.
# 'duration' must not be listed as a feature: the target 'duration_gt_10' is
# derived from it (label leakage), and it is dropped from X before training,
# so ColumnTransformer would fail on the missing column during fit.
numeric_features = ['age']
categorical_features = ['gender', 'categoryLevel1', 'categoryLevel2', 'publisherName', 'source']

# Preprocessing for numeric features: median imputation, then standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features: most-frequent imputation, then
# one-hot encoding. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full model pipeline: preprocessing followed by logistic regression.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

# Separate the target from the features. 'duration' is removed together with
# the target column because the target is computed from it.
y = df['duration_gt_10']
X = df.drop(columns=['duration', 'duration_gt_10'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train, y_train)

# Class predictions and second-class (column 1) probabilities on the test split.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# Evaluate: per-class report first, then ROC AUC from the probabilities.
report = classification_report(y_test, y_pred)
print(report)
auc = roc_auc_score(y_test, y_pred_proba)
print("ROC AUC score:", auc)