In [1]:
import warnings
warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [2]:
!pip install pyarrow
# 读取数据
train_data_origin = pd.read_feather('./input/train.feather')
labels_origin = pd.read_csv('/openbayes/input/input0/train_labels.csv')
train_data = pd.merge(train_data_origin, labels_origin, on='customer_ID', how='left')  # 合并标签列到训练数据集中
test_data = pd.read_feather('./input/test.feather')

#train_data=train_data.iloc[0:3616080]
#labels=labels_origin.iloc[0:300000]

cross_validation_test_data = train_data.iloc[3616080:]
cross_validation_test_lable = labels_origin.iloc[300000:]

# 标签
label_name = 'target'

# 标准化数值特征
numerical_cols = [col for col in train_data.columns if col not in ['customer_ID', 'S_2', label_name] and train_data[col].dtype != 'O']

# 使用均值填充数值特征的缺失值
train_data[numerical_cols] = train_data[numerical_cols].fillna(train_data[numerical_cols].mean())
test_data[numerical_cols] = test_data[numerical_cols].fillna(train_data[numerical_cols].mean())

# 数值标准化
scaler = StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

# 类别特征处理 - 使用 Embedding 层替代 DNN 处理
cat_features = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"]

class CategoricalEmbedding(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(CategoricalEmbedding, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=-1)

    def forward(self, x):
        return self.embedding(x)

# 处理缺失值和类别特征编码
for col in cat_features:
# # 使用-1填充缺失值
#     train_data[col] = train_data[col].astype('category').cat.codes.replace(-1, np.nan)
#     test_data[col] = test_data[col].astype('category').cat.codes.replace(-1, np.nan)

 # 使用众数填充缺失值
    train_data[col] = train_data[col].astype('category').cat.codes.replace(np.nan,train_data[col].value_counts().idxmax())
    test_data[col] = test_data[col].astype('category').cat.codes.replace(np.nan,test_data[col].value_counts().idxmax())

    # 使用训练集中的编码
    unique_values_train = set(train_data[col].dropna().unique())
    unique_values_test = set(test_data[col].dropna().unique())

    # 新增的类别值
    new_values = unique_values_test - unique_values_train

    # 如果测试集中有新的类别值，将它们映射为一个特殊的编码
    if new_values:
        mapping = {val: len(unique_values_train) for val in new_values}
        test_data[col] = test_data[col].apply(lambda x: mapping.get(x, x))

    embedding_dim = min(50, len(unique_values_train) + 1)  # +1 for the padding index
    embedding_layer = CategoricalEmbedding(len(unique_values_train) + 1, embedding_dim)

    # Apply the embedding to the input data
    embedded_values = embedding_layer(torch.LongTensor(train_data[col].values))
    # Concatenate the embedded values to the dataframe
    train_data = pd.concat([train_data, pd.DataFrame(embedded_values.detach().numpy(), columns=[f'{col}_{i}' for i in range(embedding_dim)])], axis=1)

    # Transform on test data
    embedded_test_values = embedding_layer(torch.LongTensor(test_data[col].values))
    test_data = pd.concat([test_data, pd.DataFrame(embedded_test_values.detach().numpy(), columns=[f'{col}_{i}' for i in range(embedding_dim)])], axis=1)

    
# 删除原始的类别特征列
train_data = train_data.drop(cat_features, axis=1)
test_data = test_data.drop(cat_features, axis=1)

# 时间特征处理
time_features = ["S_2"]
for col in time_features:
    train_data[col] = pd.to_datetime(train_data[col])
    test_data[col] = pd.to_datetime(test_data[col])

# 划分数据集
#features = train_data.iloc[:, :-1]
#labels = train_data.iloc[:, -1]

train_features, val_features = train_test_split(train_data, test_size=0.2, random_state=None)

#train_features, val_features, train_labels, val_labels = train_test_split(
#    features, labels, test_size=0.2, random_state=None
#)



Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[0m

In [3]:
def prepare_data(data):
    print("Original columns:", data.columns)
    
    data[time_features] = data[time_features].apply(lambda x: x.astype(np.int64) // 10**9)

    if label_name not in data.columns:
        data[label_name] = 0

    prepare_data = data.drop(time_features + [label_name], axis=1)
    
    print("Remaining columns after drop:", prepare_data.columns)

    prepare_data = prepare_data.groupby('customer_ID').mean().reset_index(drop=False)
    prepare_data = prepare_data.drop_duplicates('customer_ID')
    
    return prepare_data

# Call the function and check the output
train_data = prepare_data(train_features)
val_data = prepare_data(val_features)
test_data = prepare_data(test_data)


Original columns: Index(['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41',
       'B_3',
       ...
       'D_66_3', 'D_68_0', 'D_68_1', 'D_68_2', 'D_68_3', 'D_68_4', 'D_68_5',
       'D_68_6', 'D_68_7', 'D_68_8'],
      dtype='object', length=246)
Remaining columns after drop: Index(['customer_ID', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3',
       'D_42',
       ...
       'D_66_3', 'D_68_0', 'D_68_1', 'D_68_2', 'D_68_3', 'D_68_4', 'D_68_5',
       'D_68_6', 'D_68_7', 'D_68_8'],
      dtype='object', length=244)
Original columns: Index(['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41',
       'B_3',
       ...
       'D_66_3', 'D_68_0', 'D_68_1', 'D_68_2', 'D_68_3', 'D_68_4', 'D_68_5',
       'D_68_6', 'D_68_7', 'D_68_8'],
      dtype='object', length=246)
Remaining columns after drop: Index(['customer_ID', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3',
       'D_42',
       ...
       'D_66_3', 'D_68_0', 'D_68_1'

In [4]:
test_data

Unnamed: 0,customer_ID,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_66_3,D_68_0,D_68_1,D_68_2,D_68_3,D_68_4,D_68_5,D_68_6,D_68_7,D_68_8
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,-0.228935,-0.309824,-0.514206,0.692461,-0.326011,-0.214822,-0.270553,-0.523909,-0.772038,...,0.697428,-0.006463,0.134079,-0.105446,-0.047264,-0.324971,0.313887,-0.388357,-0.306510,-0.284751
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.846948,0.001273,-0.336411,0.955427,-0.326011,-0.621559,-0.270553,-0.536277,0.000000,...,0.697428,-0.079935,-0.247890,0.793585,-1.149349,-1.290671,0.959498,-0.676958,0.099538,-0.968860
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.379691,0.103811,-0.525092,0.700552,-0.326011,-0.370469,-0.270553,-0.523181,0.000000,...,0.053490,-0.256046,0.041284,0.301178,-0.907900,-0.809500,0.958093,-0.215818,-0.226445,-0.772873
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,-0.747408,1.177617,0.752133,-1.323117,-0.326011,0.396018,0.002960,2.276050,0.000000,...,0.697428,-0.701480,0.296459,-2.431327,-0.256233,1.174154,0.959400,1.993503,-0.503849,0.783549
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,-1.363050,0.741828,1.572170,-1.443847,0.353591,-0.348443,-0.270553,2.063242,0.000000,...,0.697428,-0.318991,-0.038525,-0.446766,-0.805843,-0.342661,0.959460,0.350142,-0.132534,-0.294856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,-0.100194,-0.548447,-0.546862,0.482088,-0.326011,-0.577508,-0.270553,-0.457702,0.000000,...,0.697428,-0.079935,-0.247890,0.793585,-1.149349,-1.290671,0.959498,-0.676958,0.099538,-0.968860
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,-0.444322,-0.357612,-0.354553,-0.522081,0.523491,3.479588,-0.270553,-0.402045,0.784503,...,0.697428,-0.318991,-0.038525,-0.446766,-0.805843,-0.342661,0.959460,0.350142,-0.132534,-0.294856
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,-1.716650,0.294647,-0.434380,0.696720,0.183690,0.092067,1.104608,-0.444606,0.588267,...,0.697428,-0.079935,-0.247890,0.793585,-1.149349,-1.290671,0.959498,-0.676958,0.099538,-0.968860
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,-0.393808,-0.155383,-0.176758,-1.210052,-0.326011,-0.128188,-0.270553,-0.055006,0.000000,...,0.697428,-0.605857,0.212713,-1.935187,-0.393636,0.794950,0.959415,1.582663,-0.411020,0.513947


In [4]:
#test_data_subset = test_data.iloc[3616080:]
#customer_ids = test_data_subset['customer_ID']
customer_ids = test_data['customer_ID']
test_data_matrix = test_data.drop(['customer_ID'], axis=1)
#test_data_matrix['S_2'] = test_data_matrix['S_2'].astype(int) // 10**9#

In [7]:
test_data_matrix

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_66_3,D_68_0,D_68_1,D_68_2,D_68_3,D_68_4,D_68_5,D_68_6,D_68_7,D_68_8
0,-0.228935,-0.309824,-0.514206,0.692461,-0.326011,-0.214822,-0.270553,-0.523909,-0.772038,-0.186265,...,-0.885904,-0.583866,-0.402981,0.005626,-0.172955,0.766475,0.206203,-0.001868,-1.139044,-0.549376
1,0.846948,0.001273,-0.336411,0.955427,-0.326011,-0.621559,-0.270553,-0.536277,0.000000,0.000000,...,-0.885903,-0.270490,-1.259770,0.349445,-0.700824,0.767135,-0.889310,-0.098858,-2.123584,-0.316407
2,0.379691,0.103811,-0.525092,0.700552,-0.326011,-0.370469,-0.270553,-0.523181,0.000000,-0.140376,...,-0.403017,-0.137458,-0.832398,0.102010,-0.447246,0.574049,-0.461374,0.018268,-1.626397,-0.082246
3,-0.747408,1.177617,0.752133,-1.323117,-0.326011,0.396018,0.002960,2.276050,0.000000,0.820201,...,-0.885903,0.933979,0.095609,0.550512,-1.574607,-0.056243,-0.295412,-0.138850,-0.718565,-0.399623
4,-1.363050,0.741828,1.572170,-1.443847,0.353591,-0.348443,-0.270553,2.063242,0.000000,0.406679,...,-0.885903,0.192767,-0.738470,0.426778,-1.036895,0.450451,-0.660888,-0.114239,-1.583192,-0.348413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
924616,-0.100194,-0.548447,-0.546862,0.482088,-0.326011,-0.577508,-0.270553,-0.457702,0.000000,-0.614203,...,-0.885903,-0.270490,-1.259770,0.349445,-0.700824,0.767135,-0.889310,-0.098858,-2.123584,-0.316407
924617,-0.444322,-0.357612,-0.354553,-0.522081,0.523491,3.479588,-0.270553,-0.402045,0.784503,1.302643,...,-0.885903,0.192767,-0.738470,0.426778,-1.036895,0.450451,-0.660888,-0.114239,-1.583192,-0.348413
924618,-1.716650,0.294647,-0.434380,0.696720,0.183690,0.092067,1.104608,-0.444606,0.588267,0.083615,...,-0.885903,-0.270490,-1.259770,0.349445,-0.700824,0.767135,-0.889310,-0.098858,-2.123584,-0.316407
924619,-0.393808,-0.155383,-0.176758,-1.210052,-0.326011,-0.128188,-0.270553,-0.055006,0.000000,0.290376,...,-0.885903,0.748676,-0.112911,0.519579,-1.440179,0.070430,-0.386781,-0.132697,-0.934722,-0.386820


In [5]:
X_train = train_features.drop([label_name, 'customer_ID','S_2'], axis=1)
#X_train['S_2'] = X_train['S_2'].astype(np.int64) // 10**9
y_train = train_features[label_name]
X_val = val_features.drop([label_name, 'customer_ID','S_2'], axis=1)
#X_val['S_2'] = X_val['S_2'].astype(np.int64) // 10**9
y_val = val_features[label_name]

### XGB

In [14]:
!pip install xgboost
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Build XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)  # For binary classification

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_xgb = xgb_model.predict(X_val)

# Evaluate the performance
accuracy_xgb = accuracy_score(y_val, y_pred_xgb)
print(f"XGBoost Model Accuracy: {accuracy_xgb}")

# Print other classification metrics
print(classification_report(y_val, y_pred_xgb))


Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[0mXGBoost Model Accuracy: 0.8827144033531864
              precision    recall  f1-score   support

           0       0.92      0.92      0.92    831111
           1       0.76      0.76      0.76    275180

    accuracy                           0.88   1106291
   macro avg       0.84      0.84      0.84   1106291
weighted avg       0.88      0.88      0.88   1106291



In [10]:
customer_ids

0         00000469ba478561f23a92a868bd366de6f6527a684c9a...
1         00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...
2         0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...
3         00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...
4         00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...
                                ...                        
924616    ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...
924617    ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...
924618    ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...
924619    ffffddef1fc3643ea179c93245b68dca0f36941cd83977...
924620    fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61...
Name: customer_ID, Length: 924621, dtype: object

In [16]:
# Make predictions on the validation set with probabilities
y_pred_proba_xgb = xgb_model.predict_proba(X_val)

# Extract the probability for class 1 (positive class)
y_pred_proba_class_1 = y_pred_proba_xgb[:, 1]

# Make predictions on the test set with probabilities
predictions_proba = xgb_model.predict_proba(test_data_matrix)[:, 1]

# 创建包含预测概率和 customer_ID 的 DataFrame
submission_proba = pd.DataFrame({'customer_ID': customer_ids, 'prediction_proba': predictions_proba})

# 如果需要保存预测结果到 CSV 文件
submission_proba.to_csv('./baseline_model_result/baseline_XGB_submission_proba.csv', index=False)


In [15]:
predictions = xgb_model.predict(test_data_matrix)

# 创建包含预测结果和 customer_ID 的 DataFrame
submission = pd.DataFrame({'customer_ID': customer_ids, 'prediction': predictions})

# 如果需要保存预测结果到 CSV 文件
submission.to_csv('./baseline_model_result/baseline_XGB_submission_prob).csv', index=False)

In [12]:
# 检查列名是否一致
print(set(test_data_matrix.columns) - set(xgb_model.get_booster().feature_names))


set()


### SGD

In [17]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build SGD classifier
sgd_model = SGDClassifier(loss='log_loss', random_state=42)  # 'log' for logistic regression

# Train the model
sgd_model.fit(X_train_scaled, y_train)

# Make predictions on the validation set
y_pred_sgd = sgd_model.predict(X_val_scaled)

# Evaluate the performance
accuracy_sgd = accuracy_score(y_val, y_pred_sgd)
print(f"SGD Model Accuracy: {accuracy_sgd}")

# Print other classification metrics
print(classification_report(y_val, y_pred_sgd))


SGD Model Accuracy: 0.872129484918525
              precision    recall  f1-score   support

           0       0.91      0.92      0.92    831111
           1       0.75      0.73      0.74    275180

    accuracy                           0.87   1106291
   macro avg       0.83      0.82      0.83   1106291
weighted avg       0.87      0.87      0.87   1106291



In [18]:
# Make predictions on the test set with probabilities
probability_estimates = sgd_model.predict_proba(test_data_matrix)

# Extract the probability for class 1 (positive class)
predictions_proba = probability_estimates[:, 1]

# 创建包含预测概率和 customer_ID 的 DataFrame
submission_proba = pd.DataFrame({'customer_ID': customer_ids, 'prediction_proba': predictions_proba})

# 如果需要保存预测概率结果到 CSV 文件
submission_proba.to_csv('./baseline_model_result/baseline_SGD_submission_proba.csv', index=False)


In [9]:
predictions = sgd_model.predict(test_data_matrix)

# 创建包含预测结果和 customer_ID 的 DataFrame
submission = pd.DataFrame({'customer_ID': customer_ids, 'prediction': predictions})

# 如果需要保存预测结果到 CSV 文件
submission.to_csv('./baseline_model_result/baseline_SGD_submission.csv', index=False)

### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# 构建随机森林模型
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 在训练集上训练模型
rf_model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report
y_pred = rf_model.predict(X_val)

# 评估分类准确性
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

# 输出其他分类指标
print(classification_report(y_val, y_pred))



Accuracy: 0.9363323031643573
              precision    recall  f1-score   support

           0       0.96      0.96      0.96    831080
           1       0.88      0.87      0.87    275211

    accuracy                           0.94   1106291
   macro avg       0.92      0.91      0.91   1106291
weighted avg       0.94      0.94      0.94   1106291



In [12]:
predictions = rf_model.predict(test_data_matrix)

#customer_ids_subset = test_data.iloc[350000:458000]['customer_ID']
# 创建包含预测结果和 customer_ID 的 DataFrame
submission = pd.DataFrame({'customer_ID': customer_ids, 'prediction': predictions})

# 如果需要保存预测结果到 CSV 文件
submission.to_csv('./baseline_model_result/baseline_Random_Forest_submission.csv', index=False)

In [8]:
# Make predictions on the test set with probabilities
probability_estimates = rf_model.predict_proba(test_data_matrix)

# Extract the probability for class 1 (positive class)
predictions_proba = probability_estimates[:, 1]

# 创建包含预测概率和 customer_ID 的 DataFrame
submission_proba = pd.DataFrame({'customer_ID': customer_ids, 'prediction': predictions_proba})

# 如果需要保存预测概率结果到 CSV 文件
submission_proba.to_csv('./baseline_model_result/baseline_Random_Forest_submission_proba.csv', index=False)
