## 데이터 전처리

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
df = pd.read_csv('csv/train.csv')

In [17]:
# 차트 한글폰트
plt.rc('font', family='Malgun Gothic')

In [18]:
def prep(df):
    drop_cols = ['ID','압축천연가스(CNG)','경유','가솔린','하이브리드','액화석유가스(LPG)','판매구역','판매도시']

    # '연료' 컬럼 생성 및 값 할당
    df['연료'] = 0
    # '액화석유가스(LPG)'에 해당하는 경우 4 할당
    df.loc[df['액화석유가스(LPG)'] == 1, '연료'] = 0
    # '경유'에 해당하는 경우 1 할당
    df.loc[df['경유'] == 1, '연료'] = 1
    # '가솔린'에 해당하는 경우 2 할당
    df.loc[df['가솔린'] == 1, '연료'] = 2
    # '압축천연가스(CNG)'에 해당하는 경우 3 할당
    df.loc[df['압축천연가스(CNG)'] == 1, '연료'] = 3
    # '하이브리드'에 해당하는 경우 4 할당
    df.loc[df['하이브리드'] == 1, '연료'] = 4

    # 기존 컬럼 제거
    df.drop(columns=drop_cols, inplace=True)
    df = df[df['주행거리'] != df['주행거리'].max()]
    df = df[df["주행거리"] >= 200]
    df = df.reset_index(drop=True)
    
    # 레이블 인코딩(Label Encoding) - 차량모델명 컬럼
    label_encoder = LabelEncoder()
    df['차량모델명'] = label_encoder.fit_transform(df['차량모델명'])
    # df['브랜드'] = label_encoder.fit_transform(df['브랜드'])
    
    # 원핫인코딩 - 브랜드 컬럼
    df = pd.get_dummies(df)
    
    df.info()
    return df

In [19]:
df = prep(df)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56698 entries, 0 to 56697
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   생산년도               56698 non-null  int64  
 1   모델출시년도             56698 non-null  int64  
 2   차량모델명              56698 non-null  int32  
 3   주행거리               56698 non-null  int64  
 4   배기량                56698 non-null  int64  
 5   가격                 56698 non-null  float64
 6   연료                 56698 non-null  int64  
 7   브랜드_audi           56698 non-null  uint8  
 8   브랜드_bmw            56698 non-null  uint8  
 9   브랜드_citroen        56698 non-null  uint8  
 10  브랜드_fiat           56698 non-null  uint8  
 11  브랜드_ford           56698 non-null  uint8  
 12  브랜드_honda          56698 non-null  uint8  
 13  브랜드_hyundai        56698 non-null  uint8  
 14  브랜드_kia            56698 non-null  uint8  
 15  브랜드_mazda          56698 non-null  uint8  
 16  브랜드_mercedes-benz  566

## 스케일링

In [20]:
# # 로그 스케일링
# df['주행거리'] = np.log(df['주행거리'])

In [21]:
# # 스탠다드 스케일링
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(df)
# scaled_df = pd.DataFrame(scaled_data, columns=df.columns)

In [22]:
y_car_df = df['가격']
X_car_df = df.drop('가격', axis=1)

In [23]:
X_train, X_test, y_train, y_test= train_test_split(df, y_car_df, test_size=0.2)

## 선형회귀 - 분할된 학습

In [24]:
# 분할된 학습
condition_train = (df['가격'] >= 100)
condition_test = (df['가격'] >= 100)

X_train_condition_satisfied = X_train[condition_train].drop('가격', axis=1)
X_train_condition_not_satisfied = X_train[~condition_train].drop('가격', axis=1)
y_train_condition_satisfied = y_train[condition_train]
y_train_condition_not_satisfied = y_train[~condition_train]

X_test_condition_satisfied = X_test[condition_test].drop('가격', axis=1)
X_test_condition_not_satisfied = X_test[~condition_test].drop('가격', axis=1)
y_test_condition_satisfied = y_test[condition_test]
y_test_condition_not_satisfied = y_test[~condition_test]

# 모델 생성
model_condition_satisfied = LinearRegression()
model_condition_not_satisfied = LinearRegression()
# 모델 학습
model_condition_satisfied.fit(X_train_condition_satisfied, y_train_condition_satisfied)
model_condition_not_satisfied.fit(X_train_condition_not_satisfied, y_train_condition_not_satisfied)

# 예측
prediction_satisfied = model_condition_satisfied.predict(X_test_condition_satisfied)
prediction_not_satisfied = model_condition_not_satisfied.predict(X_test_condition_not_satisfied)

# R2 스코어 계산
r2_condition_satisfied = r2_score(y_test_condition_satisfied, prediction_satisfied)
r2_condition_not_satisfied = r2_score(y_test_condition_not_satisfied, prediction_not_satisfied)

# MSE 계산
mse_condition_satisfied = mean_squared_error(y_test_condition_satisfied, prediction_satisfied)
mse_condition_not_satisfied = mean_squared_error(y_test_condition_not_satisfied, prediction_not_satisfied)

# MAE 계산
mae_condition_satisfied = mean_absolute_error(y_test_condition_satisfied, prediction_satisfied)
mae_condition_not_satisfied = mean_absolute_error(y_test_condition_not_satisfied, prediction_not_satisfied)

# RMSE 계산
rmse_condition_satisfied = np.sqrt(mse_condition_satisfied)
rmse_condition_not_satisfied = np.sqrt(mse_condition_not_satisfied)


# 결과 출력

print("가격 100이상 - R2 Score:", r2_condition_satisfied)
print("가격 100이상 - MSE:", mse_condition_satisfied)
print("가격 100이상 - MAE:", mae_condition_satisfied)
print("가격 100이상 - RMSE:", rmse_condition_satisfied)
print("그외 - R2 Score:", r2_condition_not_satisfied)
print("그외 - MSE:", mse_condition_not_satisfied)
print("그외 - MAE:", mae_condition_not_satisfied)
print("그외 - RMSE:", rmse_condition_not_satisfied)

가격 100이상 - R2 Score: 0.15449038140291627
가격 100이상 - MSE: 219.6142941914061
가격 100이상 - MAE: 12.077298811526596
가격 100이상 - RMSE: 14.81938913017018
그외 - R2 Score: 0.7827147744573981
그외 - MSE: 130.53612051249803
그외 - MAE: 8.626305538030014
그외 - RMSE: 11.425240501297907


  X_train_condition_satisfied = X_train[condition_train].drop('가격', axis=1)
  X_train_condition_not_satisfied = X_train[~condition_train].drop('가격', axis=1)
  X_test_condition_satisfied = X_test[condition_test].drop('가격', axis=1)
  X_test_condition_not_satisfied = X_test[~condition_test].drop('가격', axis=1)


## 피처 확인

In [25]:
df_feature_importances = pd.DataFrame({'Features':X_test_condition_satisfied.columns,'계수':model_condition_satisfied.coef_})
df_feature_importances

Unnamed: 0,Features,계수
0,생산년도,2.10614
1,모델출시년도,0.478492
2,차량모델명,0.100178
3,주행거리,-4.7e-05
4,배기량,0.008506
5,연료,1.417227
6,브랜드_audi,13.44923
7,브랜드_bmw,3.942094
8,브랜드_citroen,4.116259
9,브랜드_fiat,-16.180634


In [26]:
print('절편 값 : ' ,model_condition_satisfied.intercept_)

절편 값 :  -5113.390423128641


In [27]:
# 회기 계수가 큰 값 순으로 정렬
coeff = pd.Series(data=np.round(model_condition_satisfied.coef_, 1), index=X_test_condition_satisfied.columns)
coeff.sort_values(ascending=False)

브랜드_mercedes-benz    14.3
브랜드_audi             13.4
브랜드_peugeot          13.1
브랜드_citroen           4.1
브랜드_bmw               3.9
브랜드_volkswagen        2.5
생산년도                  2.1
브랜드_honda             1.9
연료                    1.4
브랜드_mazda             1.1
브랜드_seat              0.5
모델출시년도                0.5
차량모델명                 0.1
배기량                   0.0
주행거리                 -0.0
브랜드_mitsubishi       -0.5
브랜드_ford             -1.1
브랜드_volvo            -1.3
브랜드_skoda            -1.5
브랜드_toyota           -2.4
브랜드_nissan           -3.5
브랜드_kia              -5.3
브랜드_opel             -7.5
브랜드_renault          -7.5
브랜드_hyundai          -8.1
브랜드_fiat            -16.2
dtype: float64

## KFOLD

In [28]:
# # cross_val_score()로 5 fold 세트로 MSE를 구한 뒤 이를 기반으로 다시 RMSE를 구함.
# neg_mse_scores = cross_val_score(linear_model, X_car_df, y_car_df, scoring="neg_mean_squared_error", cv=5)
# rmse_scores = np.sqrt(-1 * neg_mse_scores)
# avg_rmse = np.mean(rmse_scores)

# print(f'''
#       5 folds의 개별 Negative MSE 스코어 : {np.round(neg_mse_scores, 2)}
#       5 folds의 개별 rmse 스코어 : {np.round(rmse_scores,2)}
#       5 folds의 평균 rmse {avg_rmse}
#       ''')