In [1]:
import pandas as pd
# baic transformer Decoder model
import torch
import torch.nn as nn
import torch.nn.functional as Fun
import torch.optim as optim
import numpy as np
import xformers.ops as xops
import math 
from typing import Optional, Union
from torch import Tensor
import random

main_df = pd.read_csv('adult.csv')
main_df.head()
DEVICE = 'cuda'
# DEVICE = 'cpu'
torch.random.manual_seed(42)
random.seed(42)
np.random.seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True)

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
def POOL_preprocess(df, N_BINS = 100):
    '''
    Preprocess the DataFrame 
    Args:
        df: DataFrame
        N_BINS: number of bins for each numerical column (will not be the exact number of bins, differ by distribution)
    Return:
        X_trans: DataFrame after preprocessing
        ct: ColumnTransformer object, for inference and inverse transform
        NUM_vs_CAT: tuple, (number of numerical columns, number of categorical columns - 1) "in feature field, do not include label column"
        existing_values: dict, {column name: sorted list of existing values}
    '''
    
    CAT = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
    NUM = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    
    num_CAT = len(CAT)
    num_NUM = len(NUM)  
    
    ct = ColumnTransformer([
        ("age", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["age"]),
        ("fnlwgt", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='quantile', subsample=None), ["fnlwgt"]),
        ("educational-num", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='quantile', subsample=None), ["educational-num"]),
        ("capital-gain", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["capital-gain"]),
        ("capital-loss", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["capital-loss"]),
        ("hours-per-week", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["hours-per-week"]),
         ],remainder = 'passthrough', verbose_feature_names_out = False) # make sure columns are unique
    ct.set_output(transform = 'pandas')
    X_trans = ct.fit_transform(df) 
    
    # store the numrical columns' existing values for identifying unseen values
    existing_values = {}
    for column in NUM:
        existing_values[column] = sorted(X_trans[column].unique().astype(int))
    for column in CAT:
        existing_values[column] = sorted(X_trans[column].unique().astype(str))
    
    # apply Ordinal encoding on columns
    from sklearn.preprocessing import OrdinalEncoder
    OE_list = {}
    for column in NUM + CAT:
        OE = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1)
        X_trans[column] = OE.fit_transform(X_trans[[column]])
        OE_list[column] = OE
    
    # make all columns' catagory unique
    # 7/19: each NUM column has its own number of unique values, plus 1 for unseen values
    # each column has it's own number of unique values. '+1' is for unseen values
    offset = 0
    for column in NUM + CAT:
        X_trans[column] = X_trans[column].apply(lambda x: x + offset)
        offset += (X_trans[column].max() - X_trans[column].min() + 1) + 1
    
    X_trans = X_trans.astype(int).reset_index(drop = True)
    return X_trans, (ct, OE_list, NUM, CAT, existing_values), (num_NUM, num_CAT - 1)
    # -1 is for the income column (label)
X_trans, inference_package , _  = POOL_preprocess(main_df)
X_trans



Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,8,146,181,191,215,304,366,373,393,404,416,422,427,468,472
1,21,89,183,191,215,314,366,383,391,402,413,424,427,468,472
2,11,165,186,191,215,304,364,379,391,408,413,424,427,468,473
3,27,115,184,198,215,304,366,387,391,404,413,422,427,468,473
4,1,93,184,191,215,294,362,387,393,397,416,424,426,468,472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,10,154,186,191,215,302,366,379,391,410,418,424,426,468,472
48838,23,113,183,191,215,304,366,383,391,404,413,424,427,468,473
48839,41,112,183,191,215,304,366,383,395,398,417,424,426,468,472
48840,5,137,183,191,215,284,366,383,393,398,416,424,427,468,472


## 5 FOLD (transformed input)

In [3]:
import numpy as np
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import roc_auc_score
# import labelencoder
from sklearn.preprocessing import LabelEncoder

kf = KFold(n_splits=5, shuffle=True)
AUCS = []
main_df = pd.read_csv('adult.csv')

# 進行5-fold交叉驗證
for index, (train_index, test_index) in enumerate(kf.split(main_df)):
    X_train, X_test = main_df.loc[train_index], main_df.loc[test_index]
    
    le = LabelEncoder()
    Y_train, Y_test = X_train['income'] , X_test['income']
    Y_train = le.fit_transform(Y_train)
    Y_test = le.transform(Y_test)
    
    X_train, X_test = X_train.drop(columns='income'), X_test.drop(columns='income')
    
    CAT = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
    le2 = LabelEncoder()
    le2.fit(main_df[CAT].astype(str).values.flatten())
    for column in CAT:
        le = LabelEncoder()
        X_train[column] = le2.transform(X_train[column].astype(str))
        X_test[column] = le2.transform(X_test[column].astype(str))
    
    print(f'Fold {index}:')
    print(f'X_train: {X_train.shape}, X_test: {X_test.shape}')
    print(f'Y_train: {Y_train.shape}, Y_test: {Y_test.shape}')
    
    model = xgb.XGBClassifier(random_state=42, n_jobs=-1)
    model.fit(X_train, Y_train)
    Y_prob = model.predict_proba(X_test)  # 预测概率
    auc = roc_auc_score(Y_test, Y_prob[:, 1])
    AUCS.append(auc)
    print(f'auc: {auc}')
print(f'Average AUC: {np.mean(AUCS)}')

Fold 0:
X_train: (39073, 14), X_test: (9769, 14)
Y_train: (39073,), Y_test: (9769,)
auc: 0.929907058541208
Fold 1:
X_train: (39073, 14), X_test: (9769, 14)
Y_train: (39073,), Y_test: (9769,)
auc: 0.9291356104330563
Fold 2:
X_train: (39074, 14), X_test: (9768, 14)
Y_train: (39074,), Y_test: (9768,)
auc: 0.92712635692001
Fold 3:
X_train: (39074, 14), X_test: (9768, 14)
Y_train: (39074,), Y_test: (9768,)
auc: 0.9242047156100273
Fold 4:
X_train: (39074, 14), X_test: (9768, 14)
Y_train: (39074,), Y_test: (9768,)
auc: 0.9263723581549314
Average AUC: 0.9273492199318467


## 5-FOLD (original input)

In [4]:
kf = KFold(n_splits=5, shuffle=True)
AUCS = []
# 進行5-fold交叉驗證
for index, (train_index, test_index) in enumerate(kf.split(main)):
    X_train, X_test = X_trans.loc[train_index], X_trans.loc[test_index]
    Y_train, Y_test = X_train['income'] - X_train['income'].min(), X_test['income'] - X_test['income'].min()
    X_train, X_test = X_train.drop(columns='income'), X_test.drop(columns='income')
    print(f'Fold {index}:')
    print(f'X_train: {X_train.shape}, X_test: {X_test.shape}')
    print(f'Y_train: {Y_train.shape}, Y_test: {Y_test.shape}')
    
    model = xgb.XGBClassifier()
    model.fit(X_train, Y_train)
    Y_prob = model.predict_proba(X_test)  # 预测概率
    auc = roc_auc_score(Y_test, Y_prob[:, 1])
    AUCS.append(auc)
    print(f'auc: {auc}')
print(f'Average AUC: {np.mean(AUCS)}')

NameError: name 'main' is not defined

main_df: Average AUC: 0.9204860054219773 <br>
processed:  Average AUC: 0.9200331315953896