In [1]:
# 다음 데이터 세트를 이용하여 다이아몬드 가격예측(회귀) 

# url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv" diamonds = pd.read_csv(url) 

# 1. ML (RF, DT, LR) 수행

# 2. Dense layer만 이용 FNCC 구현

# 3. 순환 데이터 변환후 CNN 구현



# 각 단계별로 수행완료후

# 검사받은 이후에 다음 단계 진행

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# 1. 데이터 로드 및 전처리
df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv")

In [4]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [5]:
# 원-핫 인코딩
df = pd.get_dummies(df, columns=['cut','color','clarity'])

In [6]:
df

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,False,False,True,...,False,False,False,False,False,True,False,False,False,False
1,0.21,59.8,61.0,326,3.89,3.84,2.31,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,0.23,56.9,65.0,327,4.05,4.07,2.31,False,True,False,...,False,False,False,False,False,False,True,False,False,False
3,0.29,62.4,58.0,334,4.20,4.23,2.63,False,False,False,...,True,False,False,False,False,False,False,True,False,False
4,0.31,63.3,58.0,335,4.34,4.35,2.75,False,True,False,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,False,False,True,...,False,False,False,False,True,False,False,False,False,False
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,False,True,False,...,False,False,False,False,True,False,False,False,False,False
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,False,False,False,...,False,False,False,False,True,False,False,False,False,False
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [7]:
# 특성과 타겟 분리
X = df.drop(['price'], axis=1)
y = df['price']

In [8]:
# 이상치 제거
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [9]:
# 주요 특성들에 대해 이상치 제거
columns_to_clean = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
for column in columns_to_clean:
    df = remove_outliers(df, column)

In [10]:
# 스케일링
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

In [11]:
# CNN 입력을 위한 reshape
X_reshaped = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)

In [12]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped, y_scaled, test_size=0.2, random_state=42
)

In [13]:
# 2. CNN 모델 구성
model = Sequential([    
    # Dense 레이어
    Flatten(),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1)
])

In [14]:
# 3. 모델 컴파일
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

In [15]:
# 4. 모델 학습
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.3875 - mae: 0.4412 - val_loss: 0.0432 - val_mae: 0.1409
Epoch 2/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.1124 - mae: 0.2323 - val_loss: 0.0374 - val_mae: 0.1305
Epoch 3/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0879 - mae: 0.1969 - val_loss: 0.0303 - val_mae: 0.1071
Epoch 4/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0803 - mae: 0.1887 - val_loss: 0.0293 - val_mae: 0.1066
Epoch 5/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0667 - mae: 0.1736 - val_loss: 0.0295 - val_mae: 0.1047
Epoch 6/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0651 - mae: 0.1695 - val_loss: 0.0284 - val_mae: 0.1103
Epoch 7/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [18]:
# 5. 모델 평가
y_pred_scaled = model.predict(X_test)

[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [19]:
# 예측값을 원래 스케일로 변환
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_orig = scaler_y.inverse_transform(y_test)

In [20]:
# 성능 지표 계산
mse = mean_squared_error(y_test_orig, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_orig, y_pred)

In [21]:
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'R2 Score: {r2:.4f}')

MSE: 379732.4826
RMSE: 616.2244
R2 Score: 0.9761
