# 02. Data Preprocessing & Feature Engineering
## Spaceship Titanic - Kaggle Competition

이 노트북에서는 결측치 처리, Feature Engineering, 데이터 전처리를 수행합니다.

## 1. 라이브러리 Import 및 데이터 로드

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
import pickle
import os

warnings.filterwarnings('ignore')

# 저장 디렉토리 생성
os.makedirs('../data/processed', exist_ok=True)

In [2]:
# 데이터 로드
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# 원본 데이터 백업
train_original = train.copy()
test_original = test.copy()

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (8693, 14)
Test shape: (4277, 13)


In [3]:
# Train/Test 구분을 위한 플래그
train['is_train'] = 1
test['is_train'] = 0

# 데이터 합치기 (일관된 전처리를 위해)
df = pd.concat([train, test], axis=0, ignore_index=True)
print(f"Combined shape: {df.shape}")

Combined shape: (12970, 15)


## 2. Feature Engineering - 기본 변수 생성

### 2.1 PassengerId에서 Group 정보 추출

In [4]:
# Group 번호 추출
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

# 그룹 내 번호
df['GroupNum'] = df['PassengerId'].apply(lambda x: x.split('_')[1]).astype(int)

# 그룹 크기
df['GroupSize'] = df.groupby('Group')['Group'].transform('count')

# 혼자 여행 여부
df['IsSolo'] = (df['GroupSize'] == 1).astype(int)

print("Group 관련 변수 생성 완료")
print(df[['PassengerId', 'Group', 'GroupNum', 'GroupSize', 'IsSolo']].head(10))

Group 관련 변수 생성 완료
  PassengerId  Group  GroupNum  GroupSize  IsSolo
0     0001_01      1         1          1       1
1     0002_01      2         1          1       1
2     0003_01      3         1          2       0
3     0003_02      3         2          2       0
4     0004_01      4         1          1       1
5     0005_01      5         1          1       1
6     0006_01      6         1          2       0
7     0006_02      6         2          2       0
8     0007_01      7         1          1       1
9     0008_01      8         1          3       0


### 2.2 Cabin 분해

In [5]:
# Cabin 분해 (Deck/Num/Side)
df['Deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if pd.notna(x) else np.nan)
df['CabinNum'] = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if pd.notna(x) else np.nan)
df['Side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if pd.notna(x) else np.nan)

print("Cabin 분해 완료")
print(df[['Cabin', 'Deck', 'CabinNum', 'Side']].head(10))

Cabin 분해 완료
   Cabin Deck  CabinNum Side
0  B/0/P    B       0.0    P
1  F/0/S    F       0.0    S
2  A/0/S    A       0.0    S
3  A/0/S    A       0.0    S
4  F/1/S    F       1.0    S
5  F/0/P    F       0.0    P
6  F/2/S    F       2.0    S
7  G/0/S    G       0.0    S
8  F/3/S    F       3.0    S
9  B/1/P    B       1.0    P


### 2.3 Name에서 성(Last Name) 추출

In [6]:
# Last Name 추출
df['LastName'] = df['Name'].apply(lambda x: x.split()[-1] if pd.notna(x) else np.nan)

# 가족 크기 (같은 성을 가진 사람 수)
df['FamilySize'] = df.groupby('LastName')['LastName'].transform('count')
df.loc[df['LastName'].isna(), 'FamilySize'] = np.nan

print("Name 관련 변수 생성 완료")
print(df[['Name', 'LastName', 'FamilySize']].head(10))

Name 관련 변수 생성 완료
                 Name     LastName  FamilySize
0     Maham Ofracculy    Ofracculy         3.0
1        Juanna Vines        Vines         4.0
2       Altark Susent       Susent         7.0
3        Solam Susent       Susent         7.0
4   Willy Santantines  Santantines         9.0
5   Sandie Hinetthews   Hinetthews        10.0
6  Billex Jacostaffey  Jacostaffey         9.0
7  Candra Jacostaffey  Jacostaffey         9.0
8       Andona Beston       Beston         5.0
9      Erraiam Flatic       Flatic         3.0


### 2.4 지출 관련 변수

In [7]:
# 지출 변수 목록
expense_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# 총 지출액
df['TotalExpenditure'] = df[expense_cols].sum(axis=1)

# 지출 유무
df['HasExpenditure'] = (df['TotalExpenditure'] > 0).astype(int)

# 각 지출 항목 유무
for col in expense_cols:
    df[f'Has{col}'] = (df[col] > 0).astype(int)

# 지출 항목 수
df['NumExpenseCategories'] = df[[f'Has{col}' for col in expense_cols]].sum(axis=1)

# 럭셔리 지출 (Spa + VRDeck + RoomService)
df['LuxuryExpense'] = df['Spa'] + df['VRDeck'] + df['RoomService']

# 필수 지출 (FoodCourt + ShoppingMall)
df['BasicExpense'] = df['FoodCourt'] + df['ShoppingMall']

print("지출 관련 변수 생성 완료")
print(df[['TotalExpenditure', 'HasExpenditure', 'NumExpenseCategories', 'LuxuryExpense', 'BasicExpense']].describe())

지출 관련 변수 생성 완료
       TotalExpenditure  HasExpenditure  NumExpenseCategories  LuxuryExpense  \
count      12970.000000    12970.000000          12970.000000   12167.000000   
mean        1433.221049        0.579260              1.732691     838.904578   
std         2807.369708        0.493697              1.643448    1851.468620   
min            0.000000        0.000000              0.000000       0.000000   
25%            0.000000        0.000000              0.000000       0.000000   
50%          716.000000        1.000000              2.000000      66.000000   
75%         1442.000000        1.000000              3.000000     867.000000   
max        35987.000000        1.000000              5.000000   28600.000000   

       BasicExpense  
count  12379.000000  
mean     626.786170  
std     1692.918191  
min        0.000000  
25%        0.000000  
50%        3.000000  
75%      620.000000  
max    29813.000000  


### 2.5 나이 관련 변수

In [8]:
# 나이 그룹
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100], 
                        labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

# 미성년자 여부
df['IsMinor'] = (df['Age'] < 18).astype(int)

print("나이 관련 변수 생성 완료")
print(df['AgeGroup'].value_counts())

나이 관련 변수 생성 완료
AgeGroup
Adult     6255
Middle    3358
Teen      1603
Child      897
Senior     327
Name: count, dtype: int64


## 3. 결측치 처리

In [9]:
# 결측치 현황 확인
def check_missing(dataframe):
    missing = dataframe.isnull().sum()
    missing_pct = (missing / len(dataframe)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing %': missing_pct
    })
    return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)

print("결측치 현황:")
check_missing(df)

결측치 현황:


Unnamed: 0,Missing Count,Missing %
Transported,4277,32.976099
LuxuryExpense,803,6.19121
BasicExpense,591,4.556669
AgeGroup,530,4.086353
CryoSleep,310,2.390131
ShoppingMall,306,2.359291
Side,299,2.30532
Cabin,299,2.30532
Deck,299,2.30532
CabinNum,299,2.30532


### 3.1 CryoSleep과 지출 관계를 활용한 결측치 처리

In [10]:
# CryoSleep이 True면 모든 지출이 0이어야 함
# 지출이 있으면 CryoSleep은 False

# CryoSleep 결측치 중 지출이 있는 경우 False로 채우기
mask_cryo_null = df['CryoSleep'].isna()
mask_has_expense = df['TotalExpenditure'] > 0
df.loc[mask_cryo_null & mask_has_expense, 'CryoSleep'] = False

# CryoSleep이 True인 경우 지출 변수 0으로 채우기
mask_cryo_true = df['CryoSleep'] == True
for col in expense_cols:
    df.loc[mask_cryo_true & df[col].isna(), col] = 0

print(f"CryoSleep 기반 결측치 처리 후 CryoSleep 결측: {df['CryoSleep'].isna().sum()}")

CryoSleep 기반 결측치 처리 후 CryoSleep 결측: 136


### 3.2 그룹 정보를 활용한 결측치 처리

In [11]:
# 같은 그룹은 같은 HomePlanet, Destination, Deck를 공유할 가능성이 높음
group_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']

for col in group_cols:
    # 그룹별 최빈값으로 채우기
    group_mode = df.groupby('Group')[col].transform(lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else np.nan)
    df[col] = df[col].fillna(group_mode)
    print(f"{col} - 그룹 기반 처리 후 결측: {df[col].isna().sum()}")

HomePlanet - 그룹 기반 처리 후 결측: 157


Destination - 그룹 기반 처리 후 결측: 154


Deck - 그룹 기반 처리 후 결측: 162


Side - 그룹 기반 처리 후 결측: 162


### 3.3 나머지 결측치 처리

In [12]:
# HomePlanet - 최빈값
df['HomePlanet'] = df['HomePlanet'].fillna(df['HomePlanet'].mode()[0])

# Destination - 최빈값
df['Destination'] = df['Destination'].fillna(df['Destination'].mode()[0])

# CryoSleep - 최빈값
df['CryoSleep'] = df['CryoSleep'].fillna(df['CryoSleep'].mode()[0])

# VIP - 최빈값
df['VIP'] = df['VIP'].fillna(df['VIP'].mode()[0])

# Age - 중앙값
df['Age'] = df['Age'].fillna(df['Age'].median())

# 지출 변수 - 0 또는 중앙값
for col in expense_cols:
    # CryoSleep이 True인 경우 0, 아닌 경우 중앙값
    median_val = df[df['CryoSleep'] == False][col].median()
    df.loc[df[col].isna() & (df['CryoSleep'] == True), col] = 0
    df.loc[df[col].isna() & (df['CryoSleep'] == False), col] = median_val
    df[col] = df[col].fillna(0)  # 남은 결측치는 0으로

# Deck, Side - 최빈값
df['Deck'] = df['Deck'].fillna(df['Deck'].mode()[0])
df['Side'] = df['Side'].fillna(df['Side'].mode()[0])

# CabinNum - 중앙값
df['CabinNum'] = df['CabinNum'].fillna(df['CabinNum'].median())

# FamilySize - 1 (혼자)
df['FamilySize'] = df['FamilySize'].fillna(1)

# AgeGroup 재계산
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100], 
                        labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

# IsMinor 재계산
df['IsMinor'] = (df['Age'] < 18).astype(int)

print("\n결측치 처리 완료. 남은 결측치:")
remaining_missing = check_missing(df)
if len(remaining_missing) == 0:
    print("없음!")
else:
    print(remaining_missing)


결측치 처리 완료. 남은 결측치:
               Missing Count  Missing %
Transported             4277  32.976099
LuxuryExpense            803   6.191210
BasicExpense             591   4.556669
Cabin                    299   2.305320
Name                     294   2.266769
LastName                 294   2.266769
AgeGroup                 260   2.004626


### 3.4 지출 관련 변수 재계산

In [13]:
# 결측치 처리 후 지출 관련 변수 재계산
df['TotalExpenditure'] = df[expense_cols].sum(axis=1)
df['HasExpenditure'] = (df['TotalExpenditure'] > 0).astype(int)

for col in expense_cols:
    df[f'Has{col}'] = (df[col] > 0).astype(int)

df['NumExpenseCategories'] = df[[f'Has{col}' for col in expense_cols]].sum(axis=1)
df['LuxuryExpense'] = df['Spa'] + df['VRDeck'] + df['RoomService']
df['BasicExpense'] = df['FoodCourt'] + df['ShoppingMall']

print("지출 변수 재계산 완료")

지출 변수 재계산 완료


## 4. 추가 Feature Engineering

In [14]:
# Log 변환 (지출 변수들은 왜곡된 분포를 가지므로)
for col in expense_cols + ['TotalExpenditure', 'LuxuryExpense', 'BasicExpense']:
    df[f'{col}_log'] = np.log1p(df[col])

print("Log 변환 완료")

Log 변환 완료


In [15]:
# 상호작용 변수
df['CryoSleep_VIP'] = df['CryoSleep'].astype(int) * df['VIP'].astype(int)
df['Solo_Expenditure'] = df['IsSolo'] * df['TotalExpenditure']

# Deck과 Side 결합
df['DeckSide'] = df['Deck'] + '_' + df['Side']

# HomePlanet과 Destination 결합
df['Route'] = df['HomePlanet'] + '_' + df['Destination']

print("상호작용 변수 생성 완료")

상호작용 변수 생성 완료


### 4.2 그룹 통계 피처 (Group Statistics)

같은 그룹(가족/동행자)의 통계 정보를 활용하여 개인의 특성을 파악합니다.
- 그룹 내 평균/최대 지출
- 그룹 내 CryoSleep 비율
- 개인 지출과 그룹 평균의 차이

In [16]:
# 그룹 통계 피처 생성
print("그룹 통계 피처 생성 중...")

# 1. 그룹별 지출 통계
df['Group_MeanExpense'] = df.groupby('Group')['TotalExpenditure'].transform('mean')
df['Group_MaxExpense'] = df.groupby('Group')['TotalExpenditure'].transform('max')
df['Group_SumExpense'] = df.groupby('Group')['TotalExpenditure'].transform('sum')

# 2. 개인 지출과 그룹 평균의 차이 (편차)
df['Expense_Deviation'] = df['TotalExpenditure'] - df['Group_MeanExpense']

# 3. 그룹 내 지출 비율 (내 지출 / 그룹 총 지출)
df['Expense_Ratio_InGroup'] = df['TotalExpenditure'] / (df['Group_SumExpense'] + 1)

# 4. 그룹별 CryoSleep 비율
df['Group_CryoSleepRatio'] = df.groupby('Group')['CryoSleep'].transform('mean')

# 5. 그룹별 나이 통계
df['Group_MeanAge'] = df.groupby('Group')['Age'].transform('mean')
df['Group_MinAge'] = df.groupby('Group')['Age'].transform('min')
df['Group_MaxAge'] = df.groupby('Group')['Age'].transform('max')
df['Age_Deviation'] = df['Age'] - df['Group_MeanAge']

# 6. 그룹 내 미성년자 수
df['Group_MinorCount'] = df.groupby('Group')['IsMinor'].transform('sum')

# 7. 그룹 내 VIP 수
df['Group_VIPCount'] = df.groupby('Group')['VIP'].transform('sum')

# 8. 그룹 내 지출 유무 비율
df['Group_HasExpenseRatio'] = df.groupby('Group')['HasExpenditure'].transform('mean')

# 9. 솔로가 아닌 경우, 그룹 내 나의 순서 관련 피처
df['IsFirstInGroup'] = (df['GroupNum'] == 1).astype(int)
df['IsLastInGroup'] = (df['GroupNum'] == df['GroupSize']).astype(int)

print("\n생성된 그룹 통계 피처:")
group_stat_cols = [
    'Group_MeanExpense', 'Group_MaxExpense', 'Group_SumExpense',
    'Expense_Deviation', 'Expense_Ratio_InGroup',
    'Group_CryoSleepRatio', 
    'Group_MeanAge', 'Group_MinAge', 'Group_MaxAge', 'Age_Deviation',
    'Group_MinorCount', 'Group_VIPCount', 'Group_HasExpenseRatio',
    'IsFirstInGroup', 'IsLastInGroup'
]

for col in group_stat_cols:
    print(f"  - {col}")

print(f"\n총 {len(group_stat_cols)}개 그룹 통계 피처 생성 완료!")
print("\n샘플 데이터:")
print(df[['PassengerId', 'Group', 'GroupSize', 'TotalExpenditure', 'Group_MeanExpense', 'Expense_Deviation']].head(10))

그룹 통계 피처 생성 중...

생성된 그룹 통계 피처:
  - Group_MeanExpense
  - Group_MaxExpense
  - Group_SumExpense
  - Expense_Deviation
  - Expense_Ratio_InGroup
  - Group_CryoSleepRatio
  - Group_MeanAge
  - Group_MinAge
  - Group_MaxAge
  - Age_Deviation
  - Group_MinorCount
  - Group_VIPCount
  - Group_HasExpenseRatio
  - IsFirstInGroup
  - IsLastInGroup

총 15개 그룹 통계 피처 생성 완료!

샘플 데이터:
  PassengerId  Group  GroupSize  TotalExpenditure  Group_MeanExpense  \
0     0001_01      1          1               0.0                0.0   
1     0002_01      2          1             736.0              736.0   
2     0003_01      3          2           10383.0             7779.5   
3     0003_02      3          2            5176.0             7779.5   
4     0004_01      4          1            1091.0             1091.0   
5     0005_01      5          1             774.0              774.0   
6     0006_01      6          2            1584.0              792.0   
7     0006_02      6          2               0.0 

## 5. 범주형 변수 인코딩

In [17]:
# 인코딩할 범주형 변수
cat_cols_to_encode = ['HomePlanet', 'Destination', 'Deck', 'Side', 'AgeGroup', 'DeckSide', 'Route']

# Boolean 변수 변환
df['CryoSleep'] = df['CryoSleep'].astype(int)
df['VIP'] = df['VIP'].astype(int)

# Label Encoding
label_encoders = {}
for col in cat_cols_to_encode:
    le = LabelEncoder()
    df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
    print(f"{col}: {len(le.classes_)} classes")

# Label Encoder 저장
with open('../data/processed/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

print("\nLabel Encoding 완료")

HomePlanet: 3 classes
Destination: 3 classes
Deck: 8 classes
Side: 2 classes
AgeGroup: 6 classes
DeckSide: 16 classes
Route: 9 classes

Label Encoding 완료


### 5.2 Target Encoding

Target Encoding은 범주형 변수를 해당 범주의 타겟 평균값으로 변환합니다.
- Label Encoding보다 타겟과의 관계를 직접 반영
- Smoothing을 적용하여 샘플이 적은 범주의 과적합 방지
- **중요**: Train 데이터로만 인코딩을 학습하여 Data Leakage 방지

In [18]:
# Target Encoding 구현 (with Smoothing)
def target_encode(train_df, test_df, col, target, smoothing=10):
    """
    Target Encoding with smoothing to prevent overfitting
    
    smoothing: 샘플 수가 적을수록 전체 평균에 가깝게 조정
    """
    # 전체 타겟 평균
    global_mean = train_df[target].mean()
    
    # 컬럼을 문자열로 변환 (Categorical 타입 처리)
    train_col = train_df[col].astype(str)
    test_col = test_df[col].astype(str)
    
    # 범주별 통계
    agg = train_df.groupby(train_col)[target].agg(['mean', 'count'])
    
    # Smoothing 적용: (count * mean + smoothing * global_mean) / (count + smoothing)
    smooth_mean = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
    
    # Train/Test에 적용
    train_encoded = train_col.map(smooth_mean)
    test_encoded = test_col.map(smooth_mean)
    
    # Test에서 새로운 범주가 있으면 전체 평균으로 대체
    test_encoded = test_encoded.fillna(global_mean)
    train_encoded = train_encoded.fillna(global_mean)
    
    return train_encoded, test_encoded, smooth_mean.to_dict()

# Train/Test 임시 분리 (Target Encoding은 Train 데이터로만 학습)
train_temp = df[df['is_train'] == 1].copy()
test_temp = df[df['is_train'] == 0].copy()

# Target Encoding 적용할 변수
target_encode_cols = ['HomePlanet', 'Destination', 'Deck', 'Side', 'AgeGroup', 'DeckSide', 'Route']

# Target Encoder 저장용 딕셔너리
target_encoders = {}

print("Target Encoding 적용:")
print("-" * 50)

for col in target_encode_cols:
    train_encoded, test_encoded, encoder_dict = target_encode(
        train_temp, test_temp, col, 'Transported', smoothing=10
    )
    
    # Train 데이터에 적용
    df.loc[df['is_train'] == 1, f'{col}_target'] = train_encoded.values
    # Test 데이터에 적용
    df.loc[df['is_train'] == 0, f'{col}_target'] = test_encoded.values
    
    # 인코더 저장
    target_encoders[col] = encoder_dict
    
    # 결과 출력
    print(f"\n{col}:")
    for cat, val in sorted(encoder_dict.items(), key=lambda x: x[1], reverse=True):
        print(f"  {cat}: {val:.4f}")

# Target Encoder 저장
with open('../data/processed/target_encoders.pkl', 'wb') as f:
    pickle.dump(target_encoders, f)

print("\n" + "=" * 50)
print("Target Encoding 완료!")

Target Encoding 적용:
--------------------------------------------------

HomePlanet:
  Europa: 0.6592
  Mars: 0.5231
  Earth: 0.4252

Destination:
  55 Cancri e: 0.6095
  PSO J318.5-22: 0.5019
  TRAPPIST-1e: 0.4721

Deck:
  B: 0.7296
  C: 0.6784
  G: 0.5158
  A: 0.5038
  F: 0.4399
  D: 0.4347
  T: 0.4024
  E: 0.3621

Side:
  S: 0.5530
  P: 0.4520

AgeGroup:
  nan: 0.7927
  Child: 0.6662
  Teen: 0.5370
  Middle: 0.4872
  Senior: 0.4741
  Adult: 0.4691

DeckSide:
  B_S: 0.7764
  C_S: 0.7572
  B_P: 0.6668
  G_S: 0.5841
  C_P: 0.5799
  A_S: 0.5469
  F_S: 0.4679
  D_S: 0.4633
  T_S: 0.4578
  A_P: 0.4548
  G_P: 0.4475
  T_P: 0.4312
  F_P: 0.4119
  D_P: 0.4108
  E_S: 0.3799
  E_P: 0.3463

Route:
  Europa_55 Cancri e: 0.6864
  Europa_PSO J318.5-22: 0.6564
  Europa_TRAPPIST-1e: 0.6368
  Mars_55 Cancri e: 0.6089
  Mars_TRAPPIST-1e: 0.5142
  Earth_55 Cancri e: 0.5097
  Earth_PSO J318.5-22: 0.5000
  Mars_PSO J318.5-22: 0.4506
  Earth_TRAPPIST-1e: 0.3903

Target Encoding 완료!


## 6. 최종 Feature 선택

In [19]:
# 사용할 Feature 목록
feature_cols = [
    # 기본 변수
    'CryoSleep', 'VIP', 'Age',
    
    # 지출 변수
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalExpenditure', 'HasExpenditure', 'NumExpenseCategories',
    'LuxuryExpense', 'BasicExpense',
    
    # Log 변환 지출 변수
    'RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log',
    'TotalExpenditure_log', 'LuxuryExpense_log', 'BasicExpense_log',
    
    # 그룹/가족 변수
    'GroupSize', 'IsSolo', 'FamilySize',
    
    # Cabin 변수
    'CabinNum',
    
    # 나이 관련
    'IsMinor',
    
    # 상호작용 변수
    'CryoSleep_VIP', 'Solo_Expenditure',
    
    # Label Encoding된 범주형 변수
    'HomePlanet_encoded', 'Destination_encoded', 'Deck_encoded', 'Side_encoded',
    'AgeGroup_encoded', 'DeckSide_encoded', 'Route_encoded',
    
    # Target Encoding된 범주형 변수
    'HomePlanet_target', 'Destination_target', 'Deck_target', 'Side_target',
    'AgeGroup_target', 'DeckSide_target', 'Route_target',
    
    # 개별 지출 유무
    'HasRoomService', 'HasFoodCourt', 'HasShoppingMall', 'HasSpa', 'HasVRDeck'
]

# 그룹 통계 피처는 성능 하락으로 제외
# 'Group_MeanExpense', 'Group_MaxExpense', 'Group_SumExpense', ...

print(f"총 Feature 수: {len(feature_cols)}")
print("\nFeatures:")
for i, col in enumerate(feature_cols, 1):
    print(f"{i:2d}. {col}")

총 Feature 수: 47

Features:
 1. CryoSleep
 2. VIP
 3. Age
 4. RoomService
 5. FoodCourt
 6. ShoppingMall
 7. Spa
 8. VRDeck
 9. TotalExpenditure
10. HasExpenditure
11. NumExpenseCategories
12. LuxuryExpense
13. BasicExpense
14. RoomService_log
15. FoodCourt_log
16. ShoppingMall_log
17. Spa_log
18. VRDeck_log
19. TotalExpenditure_log
20. LuxuryExpense_log
21. BasicExpense_log
22. GroupSize
23. IsSolo
24. FamilySize
25. CabinNum
26. IsMinor
27. CryoSleep_VIP
28. Solo_Expenditure
29. HomePlanet_encoded
30. Destination_encoded
31. Deck_encoded
32. Side_encoded
33. AgeGroup_encoded
34. DeckSide_encoded
35. Route_encoded
36. HomePlanet_target
37. Destination_target
38. Deck_target
39. Side_target
40. AgeGroup_target
41. DeckSide_target
42. Route_target
43. HasRoomService
44. HasFoodCourt
45. HasShoppingMall
46. HasSpa
47. HasVRDeck


## 7. Train/Test 분리 및 저장

In [20]:
# Train/Test 분리
train_processed = df[df['is_train'] == 1].copy()
test_processed = df[df['is_train'] == 0].copy()

# Feature와 Target 분리
X_train = train_processed[feature_cols]
y_train = train_processed['Transported'].astype(int)
X_test = test_processed[feature_cols]

# PassengerId 저장 (제출용)
test_ids = test_processed['PassengerId']

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (8693, 47)
y_train shape: (8693,)
X_test shape: (4277, 47)


In [21]:
# 데이터 저장
X_train.to_csv('../data/processed/X_train.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
test_ids.to_csv('../data/processed/test_ids.csv', index=False)

# Feature 목록 저장
with open('../data/processed/feature_cols.pkl', 'wb') as f:
    pickle.dump(feature_cols, f)

print("데이터 저장 완료!")
print("\n저장된 파일:")
print("- ../data/processed/X_train.csv")
print("- ../data/processed/y_train.csv")
print("- ../data/processed/X_test.csv")
print("- ../data/processed/test_ids.csv")
print("- ../data/processed/feature_cols.pkl")
print("- ../data/processed/label_encoders.pkl")
print("- ../data/processed/target_encoders.pkl  (NEW!)")

데이터 저장 완료!

저장된 파일:
- ../data/processed/X_train.csv
- ../data/processed/y_train.csv
- ../data/processed/X_test.csv
- ../data/processed/test_ids.csv
- ../data/processed/feature_cols.pkl
- ../data/processed/label_encoders.pkl
- ../data/processed/target_encoders.pkl  (NEW!)


## 8. 최종 데이터 확인

In [22]:
# 최종 데이터 확인
print("X_train 정보:")
print(X_train.info())
print("\nX_train 통계:")
X_train.describe()

X_train 정보:
<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0 to 8692
Data columns (total 47 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CryoSleep             8693 non-null   int32  
 1   VIP                   8693 non-null   int32  
 2   Age                   8693 non-null   float64
 3   RoomService           8693 non-null   float64
 4   FoodCourt             8693 non-null   float64
 5   ShoppingMall          8693 non-null   float64
 6   Spa                   8693 non-null   float64
 7   VRDeck                8693 non-null   float64
 8   TotalExpenditure      8693 non-null   float64
 9   HasExpenditure        8693 non-null   int32  
 10  NumExpenseCategories  8693 non-null   int64  
 11  LuxuryExpense         8693 non-null   float64
 12  BasicExpense          8693 non-null   float64
 13  RoomService_log       8693 non-null   float64
 14  FoodCourt_log         8693 non-null   float64
 15  ShoppingMall_l

Unnamed: 0,CryoSleep,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalExpenditure,HasExpenditure,...,Deck_target,Side_target,AgeGroup_target,DeckSide_target,Route_target,HasRoomService,HasFoodCourt,HasShoppingMall,HasSpa,HasVRDeck
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,...,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,0.349362,0.022892,28.790291,220.035316,448.499022,169.598067,304.670309,298.305303,1441.108018,0.586564,...,0.50358,0.503626,0.503158,0.50348,0.50314,0.350627,0.364316,0.346256,0.380076,0.36075
std,0.476796,0.149568,14.341404,660.510429,1595.772461,597.999899,1125.540731,1134.115037,2803.061854,0.492478,...,0.105195,0.050492,0.066636,0.116457,0.105867,0.477194,0.481266,0.475803,0.485433,0.480246
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.362052,0.452015,0.469141,0.346324,0.390329,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.439889,0.452015,0.469141,0.411871,0.390329,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,717.0,1.0,...,0.50381,0.553018,0.474071,0.467865,0.509719,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,37.0,41.0,61.0,22.0,53.0,40.0,1441.0,1.0,...,0.515795,0.553018,0.487206,0.584065,0.608871,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,35987.0,1.0,...,0.729573,0.553018,0.792746,0.776367,0.686378,1.0,1.0,1.0,1.0,1.0


In [23]:
# 결측치 최종 확인
print("X_train 결측치:")
print(X_train.isnull().sum().sum())
print("\nX_test 결측치:")
print(X_test.isnull().sum().sum())

X_train 결측치:
0

X_test 결측치:
0


In [24]:
print("전처리 완료!")
print("다음 단계: 03_Modeling.ipynb에서 모델링 수행")

전처리 완료!
다음 단계: 03_Modeling.ipynb에서 모델링 수행
