# In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
def run(fold):
    """Train and evaluate a one-hot + logistic-regression model on one fold.

    Reads ``cat_train_folds.csv``, treats every column other than ``id``,
    ``target`` and ``kfold`` as a categorical feature, one-hot encodes those
    features, fits a logistic regression on the rows whose ``kfold`` value
    differs from ``fold``, and prints the ROC AUC measured on the rows whose
    ``kfold`` value equals ``fold``.

    Args:
        fold: Value of the ``kfold`` column that selects the validation rows.
    """
    # Load the data set that already carries a fold assignment per row.
    df = pd.read_csv("cat_train_folds.csv")

    # Every column except the identifier, the label and the fold marker.
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    for col in features:
        # Fill missing values BEFORE casting: astype(str) turns NaN into the
        # string "nan", after which fillna has nothing left to replace.
        # Plain column assignment replaces the column outright, so a numeric
        # column can become strings without an in-place dtype conflict.
        df[col] = df[col].fillna("NONE").astype(str)

    # Training rows: every fold except the requested one.
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # Validation rows: only the requested fold.
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Fit the encoder on training + validation rows together so that a
    # category appearing only in the validation rows is known at transform
    # time.
    ohe = preprocessing.OneHotEncoder()
    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)
    ohe.fit(full_data)

    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])

    model = linear_model.LogisticRegression(max_iter=10000)
    model.fit(x_train, df_train.target.values)

    # Column 1 of predict_proba is the probability of model.classes_[1].
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # Report the validation ROC AUC for this fold.
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(auc)
if __name__ == "__main__":
    # Evaluate each of the five folds (0 through 4) in turn.
    for fold_id in range(5):
        run(fold_id)

0.7855490345138398
0.7883183892018131
0.7838280318052777
0.7866314229555562
0.7858323005965283
