In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [2]:
# Load the competition's training and test CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/2024-4-big-data-analytics-certification-kr/train.csv')
test = pd.read_csv('/kaggle/input/2024-4-big-data-analytics-certification-kr/test.csv')

In [3]:
# Show column dtypes and non-null counts for the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              15000 non-null  int64  
 1   Sex             15000 non-null  object 
 2   Length          15000 non-null  float64
 3   Diameter        15000 non-null  float64
 4   Height          15000 non-null  float64
 5   Weight          15000 non-null  float64
 6   Shucked Weight  15000 non-null  float64
 7   Viscera Weight  15000 non-null  float64
 8   Shell Weight    15000 non-null  float64
 9   Age             15000 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.1+ MB


In [4]:
# Show column dtypes and non-null counts for the raw test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              10000 non-null  int64  
 1   Sex             10000 non-null  object 
 2   Length          10000 non-null  float64
 3   Diameter        10000 non-null  float64
 4   Height          10000 non-null  float64
 5   Weight          10000 non-null  float64
 6   Shucked Weight  10000 non-null  float64
 7   Viscera Weight  10000 non-null  float64
 8   Shell Weight    10000 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 703.2+ KB


In [5]:
# Preview the first two training rows.
train.head(2)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,F,1.375,1.05,0.2875,20.879407,9.823102,4.819415,5.10291,9.0
1,1,M,1.2375,0.95,0.3125,16.839603,7.540967,3.713785,4.394172,10.0


In [6]:
# Preview the first two test rows.
test.head(2)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,15000,I,1.325,1.0625,0.375,21.970862,10.602713,4.562909,5.244657
1,15001,M,0.925,0.7125,0.25,7.555142,2.721552,1.502523,2.126212


In [7]:
# One-hot encode the Sex column in both frames.
train = pd.get_dummies(train, columns=['Sex'])
test = pd.get_dummies(test, columns=['Sex'])

# The two frames are encoded independently, so a Sex category that is absent
# from one of them would produce a different set (or order) of dummy columns
# and break scaler.transform / model.predict later on. Force the test frame to
# carry exactly the training feature columns (everything except the target):
# dummy columns missing from test are added as all-False, and columns unknown
# to train are dropped.
feature_columns = [col for col in train.columns if col != 'Age']
test = test.reindex(columns=feature_columns, fill_value=False)

In [8]:
# Re-inspect the training frame after one-hot encoding.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              15000 non-null  int64  
 1   Length          15000 non-null  float64
 2   Diameter        15000 non-null  float64
 3   Height          15000 non-null  float64
 4   Weight          15000 non-null  float64
 5   Shucked Weight  15000 non-null  float64
 6   Viscera Weight  15000 non-null  float64
 7   Shell Weight    15000 non-null  float64
 8   Age             15000 non-null  float64
 9   Sex_F           15000 non-null  bool   
 10  Sex_I           15000 non-null  bool   
 11  Sex_M           15000 non-null  bool   
dtypes: bool(3), float64(8), int64(1)
memory usage: 1.1 MB


In [9]:
# The id column is not a feature: discard it from train, and detach it from
# test while keeping the values in test_id for later use.
test_id = test.pop('id')
train = train.drop(columns=['id'])

In [10]:
# Split the training frame into the target (Age) and the feature matrix
# (every remaining column).
y = train['Age']
X = train.drop(columns=['Age'])

In [11]:
# Feature scaling.
# StandardScaler : rescales data to mean 0 and variance 1; suited to data that follows a normal distribution.
# MinMaxScaler   : rescales data so the minimum is 0 and the maximum is 1; suited to data that must lie within a fixed range.
# RobustScaler   : rescales data using the median and IQR; suited to data that is sensitive to outliers.
#
# NOTE(review): the scaler is fitted on the full X, i.e. before the
# train/validation split performed in a later cell, so validation rows
# contribute to the scaling statistics.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)        # X becomes the scaler's transformed output
test = scaler.transform(test)  # reuse the statistics learned from X

In [12]:
# Hold out 20% of the rows as a validation set; the fixed seed makes the
# shuffled split repeatable.
random_state = 42
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=random_state,
)

In [13]:
# Random forest regressor for the Age target.
# random_state: reuse the seed defined for the train/validation split so the
#   forest (bootstrap samples and feature sub-sampling) is identical on every
#   run; without it the validation score and the submission change each run.
# n_jobs=-1: build the trees on all available cores; this does not change the
#   fitted model for a fixed seed.
model = RandomForestRegressor(random_state=random_state, n_jobs=-1)

In [14]:
# Fit the model on the training portion of the split.
model.fit(X_train, y_train)

In [15]:
# Predict on the validation data.
preds = model.predict(X_val)

In [16]:
# Mean squared error of the validation predictions; the bare name on the last
# line displays the value as the cell output.
mse = mean_squared_error(y_val, preds)
mse

4.116695543037037

In [17]:
# Predict on the test data.
# Note: this rebinds `preds`, replacing the validation predictions.
preds = model.predict(test)

In [18]:
# Pair each test id with its prediction and write the submission file.
# NOTE(review): the prediction column is named 'yield' although the training
# target is 'Age' — confirm this matches the competition's sample submission.
submission = pd.DataFrame({'id': test_id,
                           'yield': preds})
submission.to_csv('submission.csv', index=False)

In [19]:
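# Scores recorded for each scaler choice
# (presumably the competition leaderboard score of each submission — confirm):
# MinMaxScaler   : 1.39015
# StandardScaler : 1.39836
# RobustScaler   : 1.39884
# no scaler      : 1.39972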