# Data 분석
- 목적 : 중고차 예상 판매가격 예측

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/car_data.csv')
df

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


In [2]:
# name을 index로 전환
df.set_index('Car_Name', inplace = True)

## Feature
- Present_Price, Kms_Driven을 제외한 나머지 Feature은 범주형
- 두 가지는 연속형
## Label
- Selling_Price, 중고차 판매가격으로 예측해야되는 연속적인 값

# Data 전처리
1. Feature, Label 분리
2. Feature
  - 범주형의 경우 pandas의 get_dummies() 함수 이용해 One-Hot Encoding
  - 연속형의 경우 앙상블을 위해 Feature Scaling  

In [3]:
ohe = ['Year','Fuel_Type','Seller_Type', 'Transmission', 'Owner']
df2 = pd.get_dummies(df, columns= ohe)     
df2
y = df2['Selling_Price']
X= df2.drop(columns='Selling_Price')

In [4]:
from sklearn.model_selection import train_test_split

# X(Feature), y(Label) 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((225, 28), (225,), (76, 28), (76,))

In [5]:
from sklearn.preprocessing import StandardScaler

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 0.20587776,  0.56585189, -0.09470274, ...,  0.16552118,
        -0.15075567, -0.06681531],
       [-0.74959057, -0.7716353 , -0.09470274, ...,  0.16552118,
        -0.15075567, -0.06681531],
       [-0.20644237, -0.32054944, -0.09470274, ...,  0.16552118,
        -0.15075567, -0.06681531],
       ...,
       [ 0.20587776,  0.17024008, -0.09470274, ...,  0.16552118,
        -0.15075567, -0.06681531],
       [-0.72462848,  4.22754942, -0.09470274, ...,  0.16552118,
        -0.15075567, -0.06681531],
       [-0.11840645, -0.52529866, -0.09470274, ...,  0.16552118,
        -0.15075567, -0.06681531]])

## 모델 선정
- 모델 선정을 위해 Ensemble Voting 방식 이용해 검증 및 상관관계 판단

In [6]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [7]:
# 각 model 생성
knn = KNeighborsRegressor(n_neighbors=5)
rf = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=0)
lr = LinearRegression()
xgb = XGBRegressor(max_depth=2, random_state=0)

In [8]:
# 학습
knn.fit(X_train_scaled, y_train)   
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lr.fit(X_train_scaled, y_train)
# 평가
pred_train_knn = knn.predict(X_train_scaled)
pred_train_rf = rf.predict(X_train)
pred_train_xgb = xgb.predict(X_train)
pred_train_lr = lr.predict(X_train_scaled)

pred_test_knn = knn.predict(X_test_scaled)
pred_test_rf = rf.predict(X_test)
pred_test_xgb = xgb.predict(X_test)
pred_test_lr = lr.predict(X_test_scaled)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [9]:
# 평가 요소 관련 함수 정의
from sklearn.metrics import mean_squared_error, r2_score
def print_regression_metrics(y, pred, title=None):
    mse = mean_squared_error(y, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y, pred)
    if title:
        print(title)
    print(f"MSE: {mse}, RMSE: {rmse}, R2: {r2}")

In [10]:
# train, test set 평가
print_regression_metrics(y_train, pred_train_knn, "Train KNN")
print_regression_metrics(y_test, pred_test_knn, "Test KNN")
print('-'*100)
print_regression_metrics(y_train, pred_train_rf, "Train RF")
print_regression_metrics(y_test, pred_test_rf, "Test RF")
print('-'*100)
# print_regression_metrics(y_train, pred_train_lr, "Train LogisticRegression")
# print_regression_metrics(y_test, pred_test_lr, "Test LogisticRegression")
print_regression_metrics(y_train, pred_train_lr, 'Train lr')
print_regression_metrics(y_test, pred_test_lr, 'Test lr')
print('-'*100)
print_regression_metrics(y_train, pred_train_xgb, 'Train XGB')
print_regression_metrics(y_test, pred_test_xgb, 'Test XGB')

Train KNN
MSE: 3.8468891377777785, RMSE: 1.961348805740014, R2: 0.8361350499630428
Test KNN
MSE: 5.1136786842105275, RMSE: 2.2613444417448942, R2: 0.8400262578579532
----------------------------------------------------------------------------------------------------
Train RF
MSE: 0.9196642334055922, RMSE: 0.9589912582529583, R2: 0.9608252985047449
Test RF
MSE: 3.632671057422685, RMSE: 1.905956730207348, R2: 0.8863573527172579
----------------------------------------------------------------------------------------------------
Train lr
MSE: 2.2714504583902033, RMSE: 1.5071331919874247, R2: 0.9032436073552802
Test lr
MSE: 4.107933903937594, RMSE: 2.0268038642003803, R2: 0.8714894697795155
----------------------------------------------------------------------------------------------------
Train XGB
MSE: 0.2059643375824475, RMSE: 0.4538329401690092, R2: 0.9912265899331741
Test XGB
MSE: 2.826120227592253, RMSE: 1.681106845977451, R2: 0.9115890816630252


### 상관관계 분석

In [11]:
df = pd.DataFrame(np.c_[pred_train_knn,pred_train_rf, pred_train_xgb,pred_train_lr], 
                  columns=[ "KNN", 'RandomForest', 'XGBoost','lr'])
df.corr()

Unnamed: 0,KNN,RandomForest,XGBoost,lr
KNN,1.0,0.931315,0.921375,0.874068
RandomForest,0.931315,1.0,0.989343,0.94184
XGBoost,0.921375,0.989343,1.0,0.956381
lr,0.874068,0.94184,0.956381,1.0


- XGBoost 기준으로 상관관계 낮은 KNN까지 두 가지 
  - KNN의 경우 최종 모델로 사용하기 부적합하기에 XGBoost 선정

# XGBOOST

In [17]:
import xgboost 
from sklearn.metrics import mean_squared_error, r2_score

# xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)xgb = XGBRegressor(n_estimators = 200, learning_rate = 0.1, random_state = 2000)
xgb = xgboost.XGBRegressor()

- XGBoost, Random Search 이용해 모델 학습

In [18]:
from sklearn.model_selection import RandomizedSearchCV
params = {
 "learning_rate" : [0.01,0.05,0.1, 1.5, 2, 2.5, 3],
 "max_depth" : [3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7, 9, 11, 13, 15, 17],
 "gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 ],
 "n_estimators" : [10, 20, 30, 50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900 ,1000]  # 반복횟수,
#  "subsample" = 
}
n_iter_search = 60
xgb_random = RandomizedSearchCV(xgb,    # 모델
                                param_distributions=params,   # 파라미터 조합
                                n_iter=n_iter_search,   
                                scoring= 'r2',
                                cv=4,   # cross validation의 fold 개수
                                n_jobs=-1) 
xgb_random.fit(X_train,y_train)

  3.05259667e-01 -3.99876223e-02 -1.37675461e+16  5.47173924e-01
  7.58885642e-01 -9.82419675e-02  8.52557438e-01 -1.32751614e+06
  1.36924622e-01  6.70183671e-01  5.33590058e-01 -1.18170296e+06
  8.43298250e-01  3.10262054e-01  7.59128767e-01             nan
 -5.41176627e+57  7.52372203e-01  7.49804361e-01             nan
  8.24284742e-01             nan  1.31747051e-01             nan
  3.83653490e-01 -2.74776011e-01  6.86888114e-01  7.96204632e-01
  4.93501979e-01  5.11340346e-01             nan -1.49269139e+11
  7.65923077e-01  7.02218482e-01 -4.29212755e+57  4.88530981e-01
  2.65334291e-01             nan  3.50663282e-01  8.21796245e-01
 -3.00211972e+28  4.12526810e-01 -1.21102857e+06 -4.31225032e+57
  7.69919589e-01  7.50039029e-01 -1.32468285e+58  6.97410962e-01
  2.87541914e-01 -1.16240483e+33  2.96740095e-01  4.86551327e-01
  2.72385992e-01  8.74892681e-01 -5.84350716e+05  5.32195611e-01]
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [19]:
# best model 조회
xgb_random.best_score_

0.8748926809864681

In [20]:
# best parameger 조회
xgb_random.best_params_

{'n_estimators': 600,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.05,
 'gamma': 0.0}

- 검증 결과 조회

In [21]:
best = xgb_random.best_estimator_

pred_train = best.predict(X_train)
pred_test = best.predict(X_test)

print('train_mse : ',mean_squared_error(y_train, pred_train))
print('train_r2 : ',r2_score(y_train, pred_train))
print('test_mse : ',mean_squared_error(y_test, pred_test))
print('test_r2 : ',r2_score(y_test, pred_test))

train_mse :  0.0822948459980462
train_r2 :  0.9964945075501814
test_mse :  2.902094114953689
test_r2 :  0.9092123529288139
