In [77]:
# XGBoost

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

original_data = pd.read_csv('data/original_data.csv')
synthetic_data = pd.read_csv('data/synthetic_data.csv')

categorical_features = ['SEX', 'ETHGP']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ], remainder='passthrough')

X = original_data.drop('CRFIN', axis=1)
y = original_data['CRFIN']
X_synthetic = preprocessor.fit_transform(synthetic_data.drop('CRFIN', axis=1))
y_synthetic = synthetic_data['CRFIN']

# define K-Fold
kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = {
    'train_r2': [],
    'train_mse': [],
    'train_mae': [],
    
    'test_r2': [],
    'test_mse': [],
    'test_mae': [],

    'synthetic_r2': [],
    'synthetic_mse': [],
    'synthetic_mae': []
}

# K-Fold 
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # preprocess
    X_train_encoded = preprocessor.fit_transform(X_train)
    X_test_encoded = preprocessor.transform(X_test)

    model = XGBRegressor(
        objective='reg:squarederror',
        max_depth=1, 
        learning_rate=0.05,
        n_estimators=100,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.01,
        reg_lambda=1 # L2 regularization
    )

    model.fit(X_train_encoded, y_train)

    # train
    train_preds = model.predict(X_train_encoded)
    scores['train_r2'].append(r2_score(y_train, train_preds))
    scores['train_mse'].append(mean_squared_error(y_train, train_preds))
    scores['train_mae'].append(mean_absolute_error(y_train, train_preds))

    # test
    test_preds = model.predict(X_test_encoded)
    scores['test_r2'].append(r2_score(y_test, test_preds))
    scores['test_mse'].append(mean_squared_error(y_test, test_preds))
    scores['test_mae'].append(mean_absolute_error(y_test, test_preds))

    # syn
    synthetic_preds = model.predict(X_synthetic)
    scores['synthetic_r2'].append(r2_score(y_synthetic, synthetic_preds))
    scores['synthetic_mse'].append(mean_squared_error(y_synthetic, synthetic_preds))
    scores['synthetic_mae'].append(mean_absolute_error(y_synthetic, synthetic_preds))

# print
print("average_train_R2:", np.mean(scores['train_r2']))
print("average_train_MSE:", np.mean(scores['train_mse']))
print("average_train_MAE:", np.mean(scores['train_mae']))

print("\naverage_test_R2:", np.mean(scores['test_r2']))
print("average_test_MSE:", np.mean(scores['test_mse']))
print("average_test_MAE:", np.mean(scores['test_mae']))

print("\naverage_syn_R2:", np.mean(scores['synthetic_r2']))
print("average_syn_MSE:", np.mean(scores['synthetic_mse']))
print("average_syn_MAE:", np.mean(scores['synthetic_mae']))


average_train_R2: 0.16520199683349482
average_train_MSE: 127.91212178299577
average_train_MAE: 8.866787149322397

average_test_R2: 0.10103277367638963
average_test_MSE: 137.18853000591912
average_test_MAE: 9.170173167048162

average_syn_R2: 0.06290046788187435
average_syn_MSE: 144.0185448249643
average_syn_MAE: 9.547253402074178


In [36]:
# Catboost

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

original_data = pd.read_csv('data/original_data.csv')
synthetic_data = pd.read_csv('data/synthetic_data.csv')

X = original_data.drop('CRFIN', axis=1)
y = original_data['CRFIN']
X_synthetic = synthetic_data.drop('CRFIN', axis=1)
y_synthetic = synthetic_data['CRFIN']

categorical_features_indices = [X.columns.get_loc(col) for col in ['SEX', 'ETHGP']]

kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = {
    'train_r2': [],
    'train_mse': [],
    'train_mae': [],
    'test_r2': [],
    'test_mse': [],
    'test_mae': [],
    'synthetic_r2': [],
    'synthetic_mse': [],
    'synthetic_mae': []
}

# K-Fold 
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


    model = CatBoostRegressor(
        iterations=100,
        learning_rate=0.05,
        depth=3, 
        loss_function='RMSE',
        verbose=False,
        cat_features=categorical_features_indices,
        l2_leaf_reg=10,
        subsample=0.8
    )
    
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    scores['train_r2'].append(r2_score(y_train, train_preds))
    scores['train_mse'].append(mean_squared_error(y_train, train_preds))
    scores['train_mae'].append(mean_absolute_error(y_train, train_preds))

    test_preds = model.predict(X_test)
    scores['test_r2'].append(r2_score(y_test, test_preds))
    scores['test_mse'].append(mean_squared_error(y_test, test_preds))
    scores['test_mae'].append(mean_absolute_error(y_test, test_preds))

    synthetic_preds = model.predict(X_synthetic)
    scores['synthetic_r2'].append(r2_score(y_synthetic, synthetic_preds))
    scores['synthetic_mse'].append(mean_squared_error(y_synthetic, synthetic_preds))
    scores['synthetic_mae'].append(mean_absolute_error(y_synthetic, synthetic_preds))

print("average_train_R2:", np.mean(scores['train_r2']))
print("average_train_MSE:", np.mean(scores['train_mse']))
print("average_train_MAE:", np.mean(scores['train_mae']))

print("\naverage_test_R2:", np.mean(scores['test_r2']))
print("average_test_MSE:", np.mean(scores['test_mse']))
print("average_test_MAE:", np.mean(scores['test_mae']))

print("\naverage_syn_R2:", np.mean(scores['synthetic_r2']))
print("average_syn_MSE:", np.mean(scores['synthetic_mse']))
print("average_syn_MAE:", np.mean(scores['synthetic_mae']))


average_train_R2: 0.18424996923223938
average_train_MSE: 124.98970011843429
average_train_MAE: 8.785449268351341

average_test_R2: 0.08907896436779202
average_test_MSE: 139.03768182922892
average_test_MAE: 9.274475409292446

average_syn_R2: 0.07330053030054447
average_syn_MSE: 142.4202067570323
average_syn_MAE: 9.49164071195605


In [58]:
# Random forest

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

original_data = pd.read_csv('data/original_data.csv')
synthetic_data = pd.read_csv('data/synthetic_data.csv')

categorical_features = ['SEX', 'ETHGP']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ], remainder='passthrough')

X_original = original_data.drop('CRFIN', axis=1)
y_original = original_data['CRFIN'].values
X_synthetic = synthetic_data.drop('CRFIN', axis=1)
y_synthetic = synthetic_data['CRFIN'].values

kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = {
    'train_r2': [],
    'train_mse': [],
    'train_mae': [],
    'test_r2': [],
    'test_mse': [],
    'test_mae': [],
    'synthetic_r2': [],
    'synthetic_mse': [],
    'synthetic_mae': []
}

for train_index, test_index in kf.split(X_original):
    X_train, X_test = X_original.iloc[train_index], X_original.iloc[test_index]
    y_train, y_test = y_original[train_index], y_original[test_index]
    
    X_train_encoded = preprocessor.fit_transform(X_train)
    X_test_encoded = preprocessor.transform(X_test)
    X_synthetic_encoded = preprocessor.transform(X_synthetic)
    
    random_forest_model = RandomForestRegressor(
        n_estimators=100,  
        min_samples_split=50,  
        min_samples_leaf=50,  
        random_state=42
    )
    random_forest_model.fit(X_train_encoded, y_train)

    train_predictions = random_forest_model.predict(X_train_encoded)
    scores['train_r2'].append(r2_score(y_train, train_predictions))
    scores['train_mse'].append(mean_squared_error(y_train, train_predictions))
    scores['train_mae'].append(mean_absolute_error(y_train, train_predictions))

    test_predictions = random_forest_model.predict(X_test_encoded)
    scores['test_r2'].append(r2_score(y_test, test_predictions))
    scores['test_mse'].append(mean_squared_error(y_test, test_predictions))
    scores['test_mae'].append(mean_absolute_error(y_test, test_predictions))

    synthetic_predictions = random_forest_model.predict(X_synthetic_encoded)
    scores['synthetic_r2'].append(r2_score(y_synthetic, synthetic_predictions))
    scores['synthetic_mse'].append(mean_squared_error(y_synthetic, synthetic_predictions))
    scores['synthetic_mae'].append(mean_absolute_error(y_synthetic, synthetic_predictions))

print("average_train_R2:", np.mean(scores['train_r2']))
print("average_train_MSE:", np.mean(scores['train_mse']))
print("average_train_MAE:", np.mean(scores['train_mae']))

print("\naverage_test_R2:", np.mean(scores['test_r2']))
print("average_test_MSE:", np.mean(scores['test_mse']))
print("average_test_MAE:", np.mean(scores['test_mae']))

print("\naverage_syn_R2:", np.mean(scores['synthetic_r2']))
print("average_syn_MSE:", np.mean(scores['synthetic_mse']))
print("average_syn_MAE:", np.mean(scores['synthetic_mae']))


average_train_R2: 0.1725967891483908
average_train_MSE: 126.7695318891721
average_train_MAE: 8.838799457915867

average_test_R2: 0.0904079290002161
average_test_MSE: 138.8447022222304
average_test_MAE: 9.263940561146024

average_syn_R2: 0.061437932398678965
average_syn_MSE: 144.24331522002424
average_syn_MAE: 9.537745898618706


In [56]:
# LGBMRegressor

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

original_data = pd.read_csv('data/original_data.csv')
synthetic_data = pd.read_csv('data/synthetic_data.csv')

categorical_features = ['SEX', 'ETHGP']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ], remainder='passthrough')

X_original = original_data.drop('CRFIN', axis=1)
y_original = original_data['CRFIN'].values
X_synthetic = synthetic_data.drop('CRFIN', axis=1)
y_synthetic = synthetic_data['CRFIN'].values

kf = KFold(n_splits=5, random_state=42, shuffle=True)

scores = {
    'train_r2': [],
    'train_mse': [],
    'train_mae': [],
    'test_r2': [],
    'test_mse': [],
    'test_mae': [],
    'synthetic_r2': [],
    'synthetic_mse': [],
    'synthetic_mae': []
}

for train_index, test_index in kf.split(X_original):
    X_train, X_test = X_original.iloc[train_index], X_original.iloc[test_index]
    y_train, y_test = y_original[train_index], y_original[test_index]
    
    X_train_encoded = preprocessor.fit_transform(X_train)
    X_test_encoded = preprocessor.transform(X_test)
    X_synthetic_encoded = preprocessor.transform(X_synthetic)

    lgbm_model = LGBMRegressor(
        objective='regression',
        num_leaves=15,
        learning_rate=0.01,
        n_estimators=100,
        lambda_l1=0.01,
        lambda_l2=1,
        min_data_in_leaf=30,
        min_sum_hessian_in_leaf=1e-3,
        bagging_fraction=0.8,
        bagging_freq=5, 
        feature_fraction=0.8,
    )
    
    lgbm_model.fit(X_train_encoded, y_train)

    train_predictions = lgbm_model.predict(X_train_encoded)
    scores['train_r2'].append(r2_score(y_train, train_predictions))
    scores['train_mse'].append(mean_squared_error(y_train, train_predictions))
    scores['train_mae'].append(mean_absolute_error(y_train, train_predictions))

    test_predictions = lgbm_model.predict(X_test_encoded)
    scores['test_r2'].append(r2_score(y_test, test_predictions))
    scores['test_mse'].append(mean_squared_error(y_test, test_predictions))
    scores['test_mae'].append(mean_absolute_error(y_test, test_predictions))

    synthetic_predictions = lgbm_model.predict(X_synthetic_encoded)
    scores['synthetic_r2'].append(r2_score(y_synthetic, synthetic_predictions))
    scores['synthetic_mse'].append(mean_squared_error(y_synthetic, synthetic_predictions))
    scores['synthetic_mae'].append(mean_absolute_error(y_synthetic, synthetic_predictions))

print("average_train_R2:", np.mean(scores['train_r2']))
print("average_train_MSE:", np.mean(scores['train_mse']))
print("average_train_MAE:", np.mean(scores['train_mae']))

print("\naverage_test_R2:", np.mean(scores['test_r2']))
print("average_test_MSE:", np.mean(scores['test_mse']))
print("average_test_MAE:", np.mean(scores['test_mae']))

print("\naverage_syn_R2:", np.mean(scores['synthetic_r2']))
print("average_syn_MSE:", np.mean(scores['synthetic_mse']))
print("average_syn_MAE:", np.mean(scores['synthetic_mae']))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 568
[LightGBM] [Info] Number of data points in the train set: 844, number of used features: 13
[LightGBM] [Info] Start training from score 83.759479
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 564
[LightGBM] [Info] Number of data points in the train set: 845, number of used features: 13
[LightGBM] [Info] Start training from score 83.704142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y