In [1]:
!pip install plotly

!pip install lightgbm

!pip install pyarrow

# 科学计算模块
import matplotlib
import numpy as np
import pandas as pd

# 绘图模块
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly
from sklearn.model_selection import train_test_split, KFold
import numpy as np

from sklearn.metrics import accuracy_score
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting plotly
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a8/07/72953cf70e3bd3a24cbc3e743e6f8539abe6e3e6d83c3c0c83426eaffd39/plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: plotly
Successfully installed plotly-5.18.0
[0mLooking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting lightgbm
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a6/11/5171f6a1ecf7f008648fef6ef780d92414763ff5ba50a796657b9275dc1e/lightgbm-4.2.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-4.2.0
[0mLooking in indexes: https://pypi.tuna.tsinghua.edu.cn/si

In [2]:
train_data_original = pd.read_feather('./input/train.feather')
labels = pd.read_csv('/openbayes/input/input0/train_labels.csv')
test_data = pd.read_feather('./input/test.feather')

In [3]:
lgb_config = {
    'lgb_params': {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'dart',
        'max_depth': -1,
        'num_leaves': 64,
        'learning_rate': 0.035,
        'bagging_freq': 5,
        'bagging_fraction': 0.75,
        'feature_fraction': 0.05,
        'min_data_in_leaf': 256,
        'max_bin': 63,
        'min_data_in_bin': 256,
        'tree_learner': 'serial',
        'boost_from_average': 'false',
        'lambda_l1': 0.1,
        'lambda_l2': 30,
        'num_threads': 24,
        'verbosity': 1,
    },
    'rounds': 4500,
    #'early_stopping_rounds': 100,
    'verbose_eval': 50,
}
# 定义早停函数
#early_stopping = lgb.early_stopping(lgb_config['early_stopping_rounds'], verbose=lgb_config['verbose_eval'])


In [4]:
def optimized_preprocessing(df, mode):
    cat_features = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"]
    ignore_features = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68",
                       'S_2', 'customer_ID', 'target']
    numeric_features = [col for col in df.columns if col not in ignore_features]
    target_features = ['target']

    # print("This is in [2], Columns before processing:"+",".join(df.columns))

    # 使用更高效的方式填充缺失值
    df[numeric_features] = df[numeric_features].fillna(df[numeric_features].median())
    df[cat_features] = df[cat_features].fillna(df[cat_features].mode().iloc[0])

    # 类别特征的独热编码
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(df[cat_features]).toarray()
    new_feature_names = encoder.get_feature_names_out(cat_features)
    encoded_df = pd.DataFrame(encoded_data, columns=new_feature_names)
    final_df = pd.concat([df.drop(cat_features, axis=1), encoded_df], axis=1)

    # 数值特征处理
    final_df[numeric_features] = np.floor(df[numeric_features] * 100)

    # 时间特征处理
    final_df["S_2"] = pd.to_datetime(df["S_2"])

    # 聚合特征
    if mode == 0:
        agg_funcs = {
            "S_2": ['count'],
            **{name: ['last', 'nunique'] for name in new_feature_names},
            **{name: ['mean', 'std', 'min', 'max'] for name in numeric_features},
            label_name: ['last']  # 添加对标签的聚合操作
        }
    else:
        agg_funcs = {
            "S_2": ['count'],
            **{name: ['last', 'nunique'] for name in new_feature_names},
            **{name: ['mean', 'std', 'min', 'max'] for name in numeric_features},
        }

    final_agg_df = final_df.groupby("customer_ID", sort=False).agg(agg_funcs).reset_index(drop=False)

    new_feature_names = ['_'.join(x) for x in final_agg_df.columns]
    final_agg_df.columns = new_feature_names
    # 如果 'target_last' 存在于列名中，将其重命名为 'target'

    if 'target_last' in final_agg_df.columns:
        final_agg_df = final_agg_df.rename(columns={'target_last': 'target'})

    if 'customer_ID_' in final_agg_df.columns:
        final_agg_df = final_agg_df.rename(columns={'customer_ID_': 'customer_ID'})

    customer_ids = final_agg_df['customer_ID']
    final_agg_df.reset_index(drop=False, inplace=True)
    final_agg_df = final_agg_df.drop(columns=['customer_ID'])

    # print("This is in [2], final_agg_df.columns: "+",".join(final_agg_df.columns))

    return final_agg_df, new_feature_names, customer_ids

In [5]:
def My_best_model_training(train_data, new_feature_names, lgb_config):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    models = []  # 存储每个折的模型

    for train_index, test_index in kf.split(train_data):
        X_train_fold = train_data.iloc[train_index].drop(columns=['target'])
        y_train_fold = train_data.iloc[train_index]['target']
        X_valid_fold = train_data.iloc[test_index].drop(columns=['target'])
        y_valid_fold = train_data.iloc[test_index]['target']

        # 将特征名称更新为新的特征名称
        lgb_config['feature_name'] = new_feature_names

        # 将训练数据和标签转换为LightGBM的Dataset对象
         # Create LightGBM Dataset objects for training and validation
        dtrain = lgb.Dataset(X_train_fold, label=y_train_fold)
        dvalid = lgb.Dataset(X_valid_fold, label=y_valid_fold)

        My_best_model = lgb.train(
             lgb_config['lgb_params'],
            dtrain,
            num_boost_round=lgb_config['rounds'],
            valid_sets=[dvalid],
            verbose_eval=lgb_config['verbose_eval']
        )

        # 获取验证集上的预测概率
        probs = My_best_model.predict(X_valid_fold)
        pred_labels = (probs >= 0.5).astype(int)

        # 计算精度
        accuracy = accuracy_score(y_valid_fold, pred_labels)
        accuracies.append(accuracy)
        models.append(My_best_model)

    # 返回交叉验证的平均精度和每个折的模型
    return models, np.mean(accuracies)

In [6]:
train_data = pd.merge(train_data_original, labels, on='customer_ID', how='left')
label_name = 'target'

train_data, new_feature_names, _ = optimized_preprocessing(train_data, 0)

# 移除 'customer_ID_'
if 'customer_ID_' in new_feature_names:
    new_feature_names.remove('customer_ID_')
if 'S_2_count' in new_feature_names:
    new_feature_names.remove('S_2_count')

# 将 'target_last' 替换为 'target'
if 'target_last' in new_feature_names:
    index = new_feature_names.index('target_last')
    new_feature_names[index] = 'target'

test_data, useless, customer_ids = optimized_preprocessing(test_data, 1)

In [7]:
lgb_train_data = lgb.Dataset(train_data.drop(columns=['target']), label=train_data["target"])

models,mean_accuracy=My_best_model_training(train_data,new_feature_names,lgb_config)

print("Mean accuracy:",mean_accuracy)

[LightGBM] [Info] Number of positive: 95285, number of negative: 271845
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.084082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28554
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 773




[LightGBM] [Info] Number of positive: 95001, number of negative: 272129
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28589
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 774




[LightGBM] [Info] Number of positive: 95047, number of negative: 272083
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28551
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 774




[LightGBM] [Info] Number of positive: 94965, number of negative: 272166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28546
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 774




[LightGBM] [Info] Number of positive: 95014, number of negative: 272117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28667
[LightGBM] [Info] Number of data points in the train set: 367131, number of used features: 774




Mean accuracy: 0.899767491684339


In [8]:
# 找出训练数据和测试数据中特征的不一致
train_features = set(train_data.columns)
test_features = set(test_data.columns)

# 在训练数据中存在但在测试数据中不存在的特征
missing_in_test = train_features - test_features

# 在测试数据中存在但在训练数据中不存在的特征
missing_in_train = test_features - train_features

# 输出缺失的特征
print("Missing in Test: ", missing_in_test)
print("Missing in Train: ", missing_in_train)

# 处理缺失的特征
# 您可以根据具体情况选择添加缺失的特征或删除多余的特征
# 为测试集中缺失的特征添加默认值
for feature in missing_in_test:
    if feature != 'target':  # 排除 'target'，因为它不应该在测试集中
        
        test_data[feature] = 0

Missing in Test:  {'D_64_1_nunique', 'D_68_0.0_last', 'D_64_1_last', 'D_68_0.0_nunique', 'D_66_0.0_nunique', 'target', 'D_66_0.0_last'}
Missing in Train:  set()


In [9]:
# 对测试数据进行概率预测
test_probs = [model.predict(test_data) for model in models]
test_prob = np.mean(test_probs, axis=0)


In [10]:
# 创建一个包含 customer_ID 和预测概率的 DataFrame
results_df = pd.DataFrame({
    'customer_ID': customer_ids,
    'prediction': test_prob
})

# 将 DataFrame 保存为 CSV 文件
result_csv = results_df.to_csv('predictions.csv', index=False)

In [14]:
print(models)

[<lightgbm.basic.Booster object at 0x7f0111401940>, <lightgbm.basic.Booster object at 0x7f01114011f0>, <lightgbm.basic.Booster object at 0x7f114c771250>, <lightgbm.basic.Booster object at 0x7f008818fa60>, <lightgbm.basic.Booster object at 0x7f114c7c6880>]


In [15]:
for index,model in enumerate(models):
    model.save_model(f'my_best_lightgbm_model{index}.txt')

In [None]:
train_data_original = pd.read_feather('./input/train.feather')
labels = pd.read_csv('/openbayes/input/input0/train_labels.csv')

cross_validation_train_data=train_data_original.iloc[0:3616080]
cross_validation_train_lable=labels.iloc[0:300000]
cross_validation_test_data=train_data_original.iloc[3616080:]
cross_validation_test_label=labels.iloc[300000:]