In [1]:
import pandas as pd

# Load the training and test tables from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [2]:
# Count missing values per column of the training data.
train_df.isnull().sum()

ID                        0
Age                       0
Gender                    0
Education_Status          0
Employment_Status         0
Working_Week (Yearly)     0
Industry_Status           0
Occupation_Status         0
Race                      0
Hispanic_Origin           0
Martial_Status            0
Household_Status          0
Household_Summary         0
Citizenship               0
Birth_Country             0
Birth_Country (Father)    0
Birth_Country (Mother)    0
Tax_Status                0
Gains                     0
Losses                    0
Dividends                 0
Income_Status             0
Income                    0
dtype: int64

In [3]:
# Frequency of each Birth_Country value in the training data (most frequent first).
train_df['Birth_Country'].value_counts()

Birth_Country
US                              17825
Mexico                            540
Unknown                           330
Puerto-Rico                       117
Philippines                       112
Germany                            90
Canada                             75
El-Salvador                        68
Cuba                               58
India                              55
Dominican-Republic                 49
England                            47
Poland                             45
Jamaica                            45
Columbia                           36
Italy                              35
South Korea                        31
Vietnam                            31
Ecuador                            31
Japan                              30
Portugal                           29
Nicaragua                          28
China                              28
Guatemala                          28
Haiti                              25
Iran                               2

In [4]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced by integer codes.
columns_to_encode = [
    "Employment_Status",
    "Industry_Status",
    "Occupation_Status",
    "Race",
    "Hispanic_Origin",
    "Martial_Status",
    "Household_Summary",
    "Citizenship",
    "Tax_Status",
    "Income_Status",
    "Education_Status",
    "Birth_Country",
]

# One encoder per column: learn the categories from the training data only,
# then apply the same category -> code mapping to both train and test.
# NOTE(review): LabelEncoder.transform is expected to fail on a test value that
# never occurs in train — confirm the test set has no such categories.
for col_name in columns_to_encode:
    encoder = LabelEncoder()
    encoder.fit(train_df[col_name])
    train_df[col_name] = encoder.transform(train_df[col_name])
    test_df[col_name] = encoder.transform(test_df[col_name])

# Preview the encoded frames.
train_df.head(), test_df.head()


(            ID  Age Gender  Education_Status  Employment_Status  \
 0  TRAIN_00000   63      M                15                  2   
 1  TRAIN_00001   37      M                 1                  2   
 2  TRAIN_00002   58      F                12                  2   
 3  TRAIN_00003   44      M                12                  2   
 4  TRAIN_00004   37      F                12                  2   
 
    Working_Week (Yearly)  Industry_Status  Occupation_Status  Race  \
 0                      4               20                 11     4   
 1                     52                6                 11     4   
 2                     52               11                  0     1   
 3                     52               19                 12     4   
 4                     52               19                 10     4   
 
    Hispanic_Origin  ...  Citizenship Birth_Country  Birth_Country (Father)  \
 0                0  ...            2            39                      US   
 1  

In [5]:
# Binary-encode gender: male ('M') -> 1, female ('F') -> 0.
# Values outside the mapping become NaN, so this cell must run only once per load.
gender_codes = {'M': 1, 'F': 0}
for frame in (train_df, test_df):
    frame['Gender'] = frame['Gender'].map(gender_codes)

# Preview both frames after the conversion.
train_df.head(), test_df.head()

(            ID  Age  Gender  Education_Status  Employment_Status  \
 0  TRAIN_00000   63       1                15                  2   
 1  TRAIN_00001   37       1                 1                  2   
 2  TRAIN_00002   58       0                12                  2   
 3  TRAIN_00003   44       1                12                  2   
 4  TRAIN_00004   37       0                12                  2   
 
    Working_Week (Yearly)  Industry_Status  Occupation_Status  Race  \
 0                      4               20                 11     4   
 1                     52                6                 11     4   
 2                     52               11                  0     1   
 3                     52               19                 12     4   
 4                     52               19                 10     4   
 
    Hispanic_Origin  ...  Citizenship Birth_Country  Birth_Country (Father)  \
 0                0  ...            2            39                      US  

In [6]:
import numpy as np

def add_interaction_features(df):
    """Add the derived numeric feature columns to ``df`` in place.

    The new columns combine Age, Working_Week (Yearly), Gains, Losses and
    Dividends through sums, differences and products. The input columns are
    left unchanged; nine columns are appended in a fixed order.
    """
    age = df['Age']
    weeks = df['Working_Week (Yearly)']
    gains = df['Gains']
    losses = df['Losses']
    dividends = df['Dividends']

    df['Age_Working_Week'] = age + weeks
    df['Gains_Losses'] = gains - losses
    df['Age_Dividends'] = age * dividends
    df['Age_Losses'] = age - losses
    df['Working_Week_Dividends'] = weeks * dividends
    df['Gains_Dividends'] = gains * dividends
    df['Working_Week_Losses'] = weeks - losses
    # Multiplication binds tighter than +/-: Age + (weeks * gains).
    df['Age_Working_Week_Gains'] = age + weeks * gains
    # Likewise: (weeks * gains) - losses.
    df['Working_Week_Gains_Losses'] = weeks * gains - losses


# Create the same derived features on the training and the test data.
for frame in (train_df, test_df):
    add_interaction_features(frame)




In [7]:
# Keep the regression target (Income) aside before removing it from the features.
target = train_df['Income']

# Columns excluded from the model inputs in both datasets: the row identifier,
# Household_Status and the parents' birth countries (none of these were encoded).
unused_columns = ['ID', 'Household_Status', 'Birth_Country (Father)', 'Birth_Country (Mother)']

# The training frame additionally loses the target column.
train_df.drop(columns=unused_columns + ['Income'], inplace=True)
test_df.drop(columns=unused_columns, inplace=True)

In [8]:
# List the feature columns that remain after dropping the unused ones.
train_df.columns

Index(['Age', 'Gender', 'Education_Status', 'Employment_Status',
       'Working_Week (Yearly)', 'Industry_Status', 'Occupation_Status', 'Race',
       'Hispanic_Origin', 'Martial_Status', 'Household_Summary', 'Citizenship',
       'Birth_Country', 'Tax_Status', 'Gains', 'Losses', 'Dividends',
       'Income_Status', 'Age_Working_Week', 'Gains_Losses', 'Age_Dividends',
       'Age_Losses', 'Working_Week_Dividends', 'Gains_Dividends',
       'Working_Week_Losses', 'Age_Working_Week_Gains',
       'Working_Week_Gains_Losses'],
      dtype='object')

In [10]:
import optuna
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Hold out 20% of the training rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df,
    target,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: validation RMSE of a HistGradientBoostingRegressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE on ``(X_valid, y_valid)``, computed after clipping negative
        predictions to 0.
    """
    # Hyperparameter search space.
    param = {
        "loss": "squared_error",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_iter": trial.suggest_int("max_iter", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 100),
        "l2_regularization": trial.suggest_float("l2_regularization", 0.1, 1.0),
    }

    # Fixed seed: with more than 10,000 training rows the estimator's default
    # early stopping draws a random internal validation split, so an unseeded
    # model makes trial scores vary for reasons unrelated to the hyperparameters.
    model = HistGradientBoostingRegressor(**param, random_state=42)
    model.fit(X_train, y_train)

    # Predict on the hold-out set; negative predictions are replaced by 0.
    preds = np.maximum(0, model.predict(X_valid))

    # RMSE as sqrt(MSE). The `squared=False` argument of mean_squared_error is
    # deprecated in scikit-learn 1.4 and removed in 1.6; this form works on all versions.
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds)))

    return rmse

# Seed the TPE sampler so that hyperparameter proposals are not an additional
# source of run-to-run variation in the search.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Report the best trial: its validation RMSE and the sampled hyperparameters.
print("Best trial:")
trial_ = study.best_trial

print(f"Value: {trial_.value}")
print("Params: ")
for key, value in trial_.params.items():
    print(f"    {key}: {value}")


[I 2024-04-05 23:55:23,854] A new study created in memory with name: no-name-09d1d34b-fe6f-46e6-b35e-0039306037ab
[I 2024-04-05 23:55:24,131] Trial 0 finished with value: 591.6586225354546 and parameters: {'learning_rate': 0.24444448013801195, 'max_iter': 567, 'max_depth': 6, 'min_samples_leaf': 61, 'l2_regularization': 0.105885635560516}. Best is trial 0 with value: 591.6586225354546.
[I 2024-04-05 23:55:24,353] Trial 1 finished with value: 593.5768287376264 and parameters: {'learning_rate': 0.24788410099113709, 'max_iter': 394, 'max_depth': 14, 'min_samples_leaf': 100, 'l2_regularization': 0.8739269591416992}. Best is trial 0 with value: 591.6586225354546.
[I 2024-04-05 23:55:24,585] Trial 2 finished with value: 592.5767407904804 and parameters: {'learning_rate': 0.2815781753935534, 'max_iter': 121, 'max_depth': 7, 'min_samples_leaf': 32, 'l2_regularization': 0.8591044011590605}. Best is trial 0 with value: 591.6586225354546.
[I 2024-04-05 23:55:24,769] Trial 3 finished with value: 5

[I 2024-04-05 23:55:39,487] Trial 30 finished with value: 590.837922135618 and parameters: {'learning_rate': 0.03466454066387548, 'max_iter': 534, 'max_depth': 4, 'min_samples_leaf': 66, 'l2_regularization': 0.6704573832344676}. Best is trial 19 with value: 586.8777383914102.
[I 2024-04-05 23:55:39,965] Trial 31 finished with value: 589.1761554679288 and parameters: {'learning_rate': 0.06254175073155767, 'max_iter': 999, 'max_depth': 13, 'min_samples_leaf': 14, 'l2_regularization': 0.153722758940614}. Best is trial 19 with value: 586.8777383914102.
[I 2024-04-05 23:55:40,407] Trial 32 finished with value: 589.1149518760392 and parameters: {'learning_rate': 0.07514110383989586, 'max_iter': 579, 'max_depth': 10, 'min_samples_leaf': 36, 'l2_regularization': 0.2669243289100485}. Best is trial 19 with value: 586.8777383914102.
[I 2024-04-05 23:55:40,731] Trial 33 finished with value: 588.8026743162774 and parameters: {'learning_rate': 0.07014219944242951, 'max_iter': 675, 'max_depth': 14, '

[I 2024-04-05 23:55:55,784] Trial 60 finished with value: 590.2245260495594 and parameters: {'learning_rate': 0.07636291221071777, 'max_iter': 514, 'max_depth': 20, 'min_samples_leaf': 85, 'l2_regularization': 0.4704220546391351}. Best is trial 19 with value: 586.8777383914102.
[I 2024-04-05 23:55:56,428] Trial 61 finished with value: 586.7153355196892 and parameters: {'learning_rate': 0.05179152914131646, 'max_iter': 306, 'max_depth': 18, 'min_samples_leaf': 42, 'l2_regularization': 0.11347395893080851}. Best is trial 61 with value: 586.7153355196892.
[I 2024-04-05 23:55:57,318] Trial 62 finished with value: 587.5041251897662 and parameters: {'learning_rate': 0.04181098019184003, 'max_iter': 427, 'max_depth': 18, 'min_samples_leaf': 52, 'l2_regularization': 0.2138574040940851}. Best is trial 61 with value: 586.7153355196892.
[I 2024-04-05 23:55:57,734] Trial 63 finished with value: 589.0166656754933 and parameters: {'learning_rate': 0.05523172487079482, 'max_iter': 301, 'max_depth': 1

[I 2024-04-05 23:56:21,772] Trial 90 finished with value: 588.4196923731715 and parameters: {'learning_rate': 0.03808732236778886, 'max_iter': 879, 'max_depth': 15, 'min_samples_leaf': 47, 'l2_regularization': 0.6910224555560152}. Best is trial 89 with value: 586.509569276224.
[I 2024-04-05 23:56:22,176] Trial 91 finished with value: 589.5432064754455 and parameters: {'learning_rate': 0.05059929695299323, 'max_iter': 955, 'max_depth': 16, 'min_samples_leaf': 50, 'l2_regularization': 0.5774990319035643}. Best is trial 89 with value: 586.509569276224.
[I 2024-04-05 23:56:22,638] Trial 92 finished with value: 590.6550752805047 and parameters: {'learning_rate': 0.07942864253694754, 'max_iter': 997, 'max_depth': 16, 'min_samples_leaf': 40, 'l2_regularization': 0.6203000758699525}. Best is trial 89 with value: 586.509569276224.
[I 2024-04-05 23:56:23,221] Trial 93 finished with value: 588.4808005627158 and parameters: {'learning_rate': 0.04392080232953072, 'max_iter': 937, 'max_depth': 17, '

Best trial:
Value: 586.509569276224
Params: 
    learning_rate: 0.04612756986536774
    max_iter: 931
    max_depth: 16
    min_samples_leaf: 41
    l2_regularization: 0.5793182507718159


In [12]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Best hyperparameters found by the Optuna search (copied from its output).
opt_params = {
    'learning_rate': 0.04612756986536774,
    'max_iter': 931,
    'max_depth': 16,
    'min_samples_leaf': 41,
    'l2_regularization': 0.5793182507718159
}

# Build the final model with the tuned hyperparameters. The seed is fixed because,
# with more than 10,000 training rows, the estimator's default early stopping draws
# a random internal validation split; unseeded, the predictions change on every run.
model = HistGradientBoostingRegressor(**opt_params, random_state=42)

# Train on the full training set (features: train_df, target: Income).
model.fit(train_df, target)

# Predict Income for the test set.
predictions = model.predict(test_df)

# Show the raw predictions (negative values are clipped in a later cell).
print(predictions)


[  4.20905289  -6.12032428 412.01796411 ... 382.67262307   3.91555808
 622.2831993 ]


In [13]:
# Display the raw prediction array; it can still contain negative values here.
predictions

array([  4.20905289,  -6.12032428, 412.01796411, ..., 382.67262307,
         3.91555808, 622.2831993 ])

In [14]:
# Income is treated as non-negative: raise every negative prediction to 0.
predictions = predictions.clip(min=0)

# Show the clipped predictions.
print(predictions)

[  4.20905289   0.         412.01796411 ... 382.67262307   3.91555808
 622.2831993 ]


In [15]:
# Load the submission template that the predictions are written into.
submit = pd.read_csv(filepath_or_buffer="sample_submission.csv")

In [16]:
# Put the clipped predictions into the Income column of the template and display it.
submit = submit.assign(Income=predictions)
submit

Unnamed: 0,ID,Income
0,TEST_0000,4.209053
1,TEST_0001,0.000000
2,TEST_0002,412.017964
3,TEST_0003,732.650984
4,TEST_0004,3.183613
...,...,...
9995,TEST_9995,876.974534
9996,TEST_9996,859.063761
9997,TEST_9997,382.672623
9998,TEST_9998,3.915558


In [17]:
# Write the submission file without the DataFrame index column.
submit.to_csv(path_or_buf='hgb(optuna).csv', index=False)