In [1]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df= pd.read_csv('test.csv')

In [2]:
train_df.isna().sum()

ID                        0
Age                       0
Gender                    0
Education_Status          0
Employment_Status         0
Working_Week (Yearly)     0
Industry_Status           0
Occupation_Status         0
Race                      0
Hispanic_Origin           0
Martial_Status            0
Household_Status          0
Household_Summary         0
Citizenship               0
Birth_Country             0
Birth_Country (Father)    0
Birth_Country (Mother)    0
Tax_Status                0
Gains                     0
Losses                    0
Dividends                 0
Income_Status             0
Income                    0
dtype: int64

In [3]:
train_df['Birth_Country'].value_counts()

Birth_Country
US                              17825
Mexico                            540
Unknown                           330
Puerto-Rico                       117
Philippines                       112
Germany                            90
Canada                             75
El-Salvador                        68
Cuba                               58
India                              55
Dominican-Republic                 49
England                            47
Poland                             45
Jamaica                            45
Columbia                           36
Italy                              35
South Korea                        31
Vietnam                            31
Ecuador                            31
Japan                              30
Portugal                           29
Nicaragua                          28
China                              28
Guatemala                          28
Haiti                              25
Iran                               2

In [4]:
from sklearn.preprocessing import LabelBinarizer

# Initialize the binary encoder
binarizer = LabelBinarizer()

# Function to apply binary encoding to a specified column in both train and test dataframes
def binary_encode_column(column_name):
    # Combine the column from both dataframes for fitting
    combined = pd.concat([train_df[column_name], test_df[column_name]], axis=0)
    
    # Fit the binarizer and transform the data
    combined_encoded = binarizer.fit_transform(combined)
    
    # Split the transformed data back into train and test sets
    train_encoded, test_encoded = combined_encoded[:len(train_df)], combined_encoded[len(train_df):]
    
    # Create new dataframe for the encoded columns
    train_encoded_df = pd.DataFrame(train_encoded, columns=[f"{column_name}_{i}" for i in range(train_encoded.shape[1])])
    test_encoded_df = pd.DataFrame(test_encoded, columns=[f"{column_name}_{i}" for i in range(test_encoded.shape[1])])
    
    # Drop the original column from train and test dataframes
    train_df.drop(columns=[column_name], inplace=True)
    test_df.drop(columns=[column_name], inplace=True)
    
    # Concatenate the new encoded dataframe to the original dataframes
    new_train_df = pd.concat([train_df, train_encoded_df], axis=1)
    new_test_df = pd.concat([test_df, test_encoded_df], axis=1)
    
    return new_train_df, new_test_df

# Apply binary encoding to the specified columns
for column in ["Birth_Country", "Birth_Country (Father)", "Birth_Country (Mother)"]:
    train_df, test_df = binary_encode_column(column)

# Display the first few rows of the modified datasets to verify changes
train_df.head(), test_df.head()


(            ID  Age Gender                Education_Status Employment_Status  \
 0  TRAIN_00000   63      M                    Middle (7-8)         Full-Time   
 1  TRAIN_00001   37      M  Associates degree (Vocational)         Full-Time   
 2  TRAIN_00002   58      F                   High graduate         Full-Time   
 3  TRAIN_00003   44      M                   High graduate         Full-Time   
 4  TRAIN_00004   37      F                   High graduate         Full-Time   
 
    Working_Week (Yearly)              Industry_Status  \
 0                      4              Social Services   
 1                     52                Entertainment   
 2                     52  Manufacturing (Non-durable)   
 3                     52                       Retail   
 4                     52                       Retail   
 
                   Occupation_Status   Race Hispanic_Origin  ...  \
 0                          Services  White       All other  ...   
 1                        

In [5]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# train 데이터셋의 Education_Status 열을 라벨 인코딩
train_df['Education_Status'] = label_encoder.fit_transform(train_df['Education_Status'])

# test 데이터셋의 Education_Status 열을 라벨 인코딩
test_df['Education_Status'] = label_encoder.transform(test_df['Education_Status'])

# Display the first few rows of the modified datasets to verify changes
train_df.head(), test_df.head()

(            ID  Age Gender  Education_Status Employment_Status  \
 0  TRAIN_00000   63      M                15         Full-Time   
 1  TRAIN_00001   37      M                 1         Full-Time   
 2  TRAIN_00002   58      F                12         Full-Time   
 3  TRAIN_00003   44      M                12         Full-Time   
 4  TRAIN_00004   37      F                12         Full-Time   
 
    Working_Week (Yearly)              Industry_Status  \
 0                      4              Social Services   
 1                     52                Entertainment   
 2                     52  Manufacturing (Non-durable)   
 3                     52                       Retail   
 4                     52                       Retail   
 
                   Occupation_Status   Race Hispanic_Origin  ...  \
 0                          Services  White       All other  ...   
 1                          Services  White       All other  ...   
 2  Admin Support (include Clerical)  Blac

In [6]:
# 성별 변수 변환: 남성('M')은 1, 여성('F')은 0
train_df['Gender'] = train_df['Gender'].map({'M': 1, 'F': 0})
test_df['Gender'] = test_df['Gender'].map({'M': 1, 'F': 0})

# 변환 후 데이터 확인
train_df.head(), test_df.head()

(            ID  Age  Gender  Education_Status Employment_Status  \
 0  TRAIN_00000   63       1                15         Full-Time   
 1  TRAIN_00001   37       1                 1         Full-Time   
 2  TRAIN_00002   58       0                12         Full-Time   
 3  TRAIN_00003   44       1                12         Full-Time   
 4  TRAIN_00004   37       0                12         Full-Time   
 
    Working_Week (Yearly)              Industry_Status  \
 0                      4              Social Services   
 1                     52                Entertainment   
 2                     52  Manufacturing (Non-durable)   
 3                     52                       Retail   
 4                     52                       Retail   
 
                   Occupation_Status   Race Hispanic_Origin  ...  \
 0                          Services  White       All other  ...   
 1                          Services  White       All other  ...   
 2  Admin Support (include Clerical)

In [7]:
columns_to_encode = ["Employment_Status", "Industry_Status", "Occupation_Status", 
                     "Race", "Hispanic_Origin", "Martial_Status", "Household_Summary", 
                     "Citizenship", "Tax_Status","Income_Status"]

from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder 객체 생성
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')

# 주어진 변수들을 선택하여 train 데이터셋에 대해 원핫인코딩 적용
train_encoded = one_hot_encoder.fit_transform(train_df[columns_to_encode])

# 생성된 원핫인코딩된 배열을 DataFrame으로 변환
train_encoded_df = pd.DataFrame(train_encoded, columns=one_hot_encoder.get_feature_names_out(columns_to_encode))

# 기존 train 데이터셋에서 선택한 변수들 제거
train_df.drop(columns=columns_to_encode, inplace=True)

# 새로운 원핫인코딩된 DataFrame과 기존 train 데이터셋을 결합
train_df = pd.concat([train_df, train_encoded_df], axis=1)

# test 데이터셋에 대해 같은 변환 적용
test_encoded = one_hot_encoder.transform(test_df[columns_to_encode])
test_encoded_df = pd.DataFrame(test_encoded, columns=one_hot_encoder.get_feature_names_out(columns_to_encode))
test_df.drop(columns=columns_to_encode, inplace=True)
test_df = pd.concat([test_df, test_encoded_df], axis=1)

# 변환 후 데이터 확인
train_df.head(), test_df.head()

(            ID  Age  Gender  Education_Status  Working_Week (Yearly)  \
 0  TRAIN_00000   63       1                15                      4   
 1  TRAIN_00001   37       1                 1                     52   
 2  TRAIN_00002   58       0                12                     52   
 3  TRAIN_00003   44       1                12                     52   
 4  TRAIN_00004   37       0                12                     52   
 
         Household_Status  Gains  Losses  Dividends  Income  ...  \
 0            Householder      0       0          0     425  ...   
 1  Nonfamily householder      0       0          0       0  ...   
 2            Householder   3411       0          0     860  ...   
 3  Nonfamily householder      0       0          0     850  ...   
 4            Householder      0       0          0     570  ...   
 
    Citizenship_Native  Citizenship_Native (Born Abroad)  \
 0                 1.0                               0.0   
 1                 1.0        

In [8]:
from sklearn.preprocessing import StandardScaler

# StandardScaler 객체 생성
scaler = StandardScaler()

# train 데이터셋에 대해 표준화 수행
train_df[['Age', 'Working_Week (Yearly)', 'Gains', 'Losses', 'Dividends']] = scaler.fit_transform(train_df[['Age', 'Working_Week (Yearly)', 'Gains', 'Losses', 'Dividends']])

# test 데이터셋에 대해 같은 변환 적용
test_df[['Age', 'Working_Week (Yearly)', 'Gains', 'Losses', 'Dividends']] = scaler.transform(test_df[['Age', 'Working_Week (Yearly)', 'Gains', 'Losses', 'Dividends']])


In [9]:
import numpy as np

# 새로운 변수 생성
train_df['Age_Working_Week'] = train_df['Age'] + train_df['Working_Week (Yearly)']
train_df['Gains_Losses'] = train_df['Gains'] - train_df['Losses']
train_df['Age_Dividends'] = train_df['Age'] * train_df['Dividends']
train_df['Working_Week_Gains'] = train_df['Working_Week (Yearly)'] / train_df['Gains']
train_df['Age_Losses'] = train_df['Age'] - train_df['Losses']
train_df['Working_Week_Dividends'] = train_df['Working_Week (Yearly)'] * train_df['Dividends']
train_df['Age_Gains'] = train_df['Age'] / train_df['Gains']
train_df['Gains_Dividends'] = train_df['Gains'] * train_df['Dividends']
train_df['Age_Gains_Losses'] = train_df['Age'] * train_df['Gains'] / train_df['Losses']
train_df['Working_Week_Losses'] = train_df['Working_Week (Yearly)'] - train_df['Losses']
train_df['Age_Working_Week_Gains'] = train_df['Age'] + train_df['Working_Week (Yearly)'] * train_df['Gains']
train_df['Working_Week_Gains_Losses'] = train_df['Working_Week (Yearly)'] * train_df['Gains'] - train_df['Losses']
train_df['Age_Working_Week_Dividends'] = train_df['Age'] - train_df['Working_Week (Yearly)'] / train_df['Dividends']
train_df['Gains_Losses_Dividends'] = train_df['Gains'] / train_df['Losses'] * train_df['Dividends']
train_df['Age_Working_Week_Gains_Losses'] = train_df['Age'] * train_df['Working_Week (Yearly)'] / train_df['Gains'] + train_df['Losses']

# test 데이터셋에도 같은 작업을 수행
test_df['Age_Working_Week'] = test_df['Age'] + test_df['Working_Week (Yearly)']
test_df['Gains_Losses'] = test_df['Gains'] - test_df['Losses']
test_df['Age_Dividends'] = test_df['Age'] * test_df['Dividends']
test_df['Working_Week_Gains'] = test_df['Working_Week (Yearly)'] / test_df['Gains']
test_df['Age_Losses'] = test_df['Age'] - test_df['Losses']
test_df['Working_Week_Dividends'] = test_df['Working_Week (Yearly)'] * test_df['Dividends']
test_df['Age_Gains'] = test_df['Age'] / test_df['Gains']
test_df['Gains_Dividends'] = test_df['Gains'] * test_df['Dividends']
test_df['Age_Gains_Losses'] = test_df['Age'] * test_df['Gains'] / test_df['Losses']
test_df['Working_Week_Losses'] = test_df['Working_Week (Yearly)'] - test_df['Losses']
test_df['Age_Working_Week_Gains'] = test_df['Age'] + test_df['Working_Week (Yearly)'] * test_df['Gains']
test_df['Working_Week_Gains_Losses'] = test_df['Working_Week (Yearly)'] * test_df['Gains'] - test_df['Losses']
test_df['Age_Working_Week_Dividends'] = test_df['Age'] - test_df['Working_Week (Yearly)'] / test_df['Dividends']
test_df['Gains_Losses_Dividends'] = test_df['Gains'] / test_df['Losses'] * test_df['Dividends']
test_df['Age_Working_Week_Gains_Losses'] = test_df['Age'] * test_df['Working_Week (Yearly)'] / test_df['Gains'] + test_df['Losses']

In [10]:
# train_df에서 Income 변수를 y_train으로 할당
y_train = train_df['Income']

# train_df에서 ID, Household_Status, Income 변수 제거
train_df.drop(columns=['ID', 'Household_Status', 'Income'], inplace=True)

# test_df에서 ID, Household_Status 변수 제거
test_df.drop(columns=['ID', 'Household_Status'], inplace=True)

In [11]:
from catboost import CatBoostRegressor

# CatBoost 회귀 모델 정의
catboost_model = CatBoostRegressor(random_state=2024, n_estimators=100, max_depth=10)

# 모델 학습
catboost_model.fit(train_df, y_train)


Learning rate set to 0.427309
0:	learn: 642.3970756	total: 217ms	remaining: 21.5s
1:	learn: 617.6833654	total: 271ms	remaining: 13.3s
2:	learn: 603.2986584	total: 311ms	remaining: 10.1s
3:	learn: 595.3495096	total: 348ms	remaining: 8.34s
4:	learn: 588.8414652	total: 385ms	remaining: 7.32s
5:	learn: 584.5929327	total: 421ms	remaining: 6.59s
6:	learn: 580.4369381	total: 456ms	remaining: 6.05s
7:	learn: 575.8509949	total: 492ms	remaining: 5.66s
8:	learn: 571.5362649	total: 527ms	remaining: 5.32s
9:	learn: 568.2374598	total: 561ms	remaining: 5.05s
10:	learn: 565.6118259	total: 596ms	remaining: 4.82s
11:	learn: 562.1663320	total: 631ms	remaining: 4.63s
12:	learn: 560.2428685	total: 670ms	remaining: 4.48s
13:	learn: 559.1148775	total: 704ms	remaining: 4.33s
14:	learn: 555.6730229	total: 738ms	remaining: 4.18s
15:	learn: 553.7604789	total: 771ms	remaining: 4.05s
16:	learn: 552.2516971	total: 806ms	remaining: 3.94s
17:	learn: 549.3242445	total: 841ms	remaining: 3.83s
18:	learn: 546.8768730	tot

<catboost.core.CatBoostRegressor at 0x1e108808d90>

In [12]:
# 테스트 세트에 대한 예측 수행
y_pred = catboost_model.predict(test_df)

In [13]:
y_pred

array([-44.40838773,   5.79259318, 412.3001848 , ..., 360.95477258,
        -5.19716657, 647.16590599])

In [14]:
submit = pd.read_csv("sample_submission.csv")

In [15]:
submit['Income'] =y_pred
submit

Unnamed: 0,ID,Income
0,TEST_0000,-44.408388
1,TEST_0001,5.792593
2,TEST_0002,412.300185
3,TEST_0003,910.106734
4,TEST_0004,-2.726184
...,...,...
9995,TEST_9995,976.053662
9996,TEST_9996,725.264769
9997,TEST_9997,360.954773
9998,TEST_9998,-5.197167


In [16]:
submit.to_csv('catboost1.csv', index=False)