# Regression 종합실습 : Car seat sales
유아용 카시트 매출액을 예측해 봅시다.

* 카시트에 대해서 지역 매장 별 매출액을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [2]:
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

**변수설명**
> * Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [3]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


## 3.데이터 준비

### (1) 데이터 정리

In [142]:
data['Price_diff'] =  data['CompPrice'] - data['Price'] 
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,Price_diff
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes,18
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes,28
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes,33
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes,20
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No,13


### (2) 데이터분할1 : x, y 나누기

In [143]:
target = 'Sales'
x = data.drop(target, axis=1)
y = data[target]

### (3) NA 조치

In [144]:
data.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
Price_diff     0
dtype: int64

### (4) 가변수화

In [145]:
var = ['ShelveLoc', 'Education', 'Urban', 'US']
x = pd.get_dummies(x, columns=var, drop_first=True)

### (5) 데이터분할2 : train : validation 나누기

In [146]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 2022)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [147]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler1 = MinMaxScaler()
x_train_s1 = scaler1.fit_transform(x_train)
x_val_s1 = scaler1.transform(x_val)

scaler2 = StandardScaler()
x_train_s2 = scaler2.fit_transform(x_train)
x_val_s2 = scaler2.transform(x_val)

## 4.모델링 : 선형회귀

* 변수를 조절하며 최소 2개 이상의 모델을 생성하고 예측하고 평가해 봅시다.

In [148]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

* 모델1 : ['ShelvLoc_Good', 'Price']

In [156]:
features = ['ShelveLoc_Good']
x_train1 = x_train[features]
x_val1 = x_val[features]

model1 = LinearRegression()
model1.fit(x_train1, y_train)
pred1 = model1.predict(x_val1)

In [157]:
print(mean_squared_error(y_val, pred1, squared=False))
print(mean_absolute_error(y_val, pred1))
print(mean_absolute_percentage_error(y_val, pred1))
print(1-mean_absolute_percentage_error(y_val, pred1))

2.5126619621549557
2.0659154240447344
0.7946856361158002
0.20531436388419977


* 모델2 : 전체

In [151]:
model2 = LinearRegression()
model2.fit(x_train, y_train)
pred2 = model2.predict(x_val)

In [152]:
print(mean_squared_error(y_val, pred2, squared=False))
print(mean_absolute_error(y_val, pred2))
print(mean_absolute_percentage_error(y_val, pred2))
print(1-mean_absolute_percentage_error(y_val, pred2))

1.0417209287542293
0.8291702856537149
0.21735487590453373
0.7826451240954663


## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 최소 3가지 이상 생성하시오.

In [131]:
from sklearn.neighbors import KNeighborsRegressor

In [132]:
x.shape

(400, 19)

* 모델3

In [133]:
model3 = KNeighborsRegressor(n_neighbors=10)
model3.fit(x_train_s2, y_train)
pred3 = model3.predict(x_val_s2)

In [134]:
print(mean_squared_error(y_val, pred3, squared=False))
print(mean_absolute_error(y_val, pred3))
print(mean_absolute_percentage_error(y_val, pred3))
print(1-mean_absolute_percentage_error(y_val, pred3))

2.2055317159360914
1.781
0.6497432279296568
0.3502567720703432


* 모델4

In [135]:
model4 = KNeighborsRegressor(n_neighbors=5)
model4.fit(x_train_s2, y_train)
pred4 = model4.predict(x_val_s2)

In [136]:
print(mean_squared_error(y_val, pred4, squared=False))
print(mean_absolute_error(y_val, pred4))
print(mean_absolute_percentage_error(y_val, pred4))
print(1-mean_absolute_percentage_error(y_val, pred4))

2.0308625507404483
1.61045
0.6302908295274053
0.36970917047259466


* 모델5

In [158]:
model5 = KNeighborsRegressor(n_neighbors=6, metric = 'euclidean')
model5.fit(x_train_s2, y_train)
pred5 = model5.predict(x_val_s2)

print(mean_squared_error(y_val, pred5, squared=False))
print(mean_absolute_error(y_val, pred5))
print(mean_absolute_percentage_error(y_val, pred5))
print(1-mean_absolute_percentage_error(y_val, pred5))

2.02800437851087
1.6560277777777779
0.6247951550075317
0.3752048449924683


In [160]:
model6 = KNeighborsRegressor(n_neighbors=3, metric = 'manhattan')
model6.fit(x_train_s2, y_train)
pred6 = model5.predict(x_val_s2)

print(mean_squared_error(y_val, pred6, squared=False))
print(mean_absolute_error(y_val, pred6))
print(mean_absolute_percentage_error(y_val, pred6))
print(1-mean_absolute_percentage_error(y_val, pred6))

2.02800437851087
1.6560277777777779
0.6247951550075317
0.3752048449924683


## 6.성능비교

In [161]:
model_no, rmse, mae, mape = [], [], [], []
pred = [pred1, pred2, pred3, pred4, pred5, pred6]
for i, p in enumerate(pred):
    model_no.append(i+1)
    rmse.append(mean_squared_error(y_val, p, squared=False))
    mae.append(mean_absolute_error(y_val, p))
    mape.append(mean_absolute_percentage_error(y_val, p))

perf = pd.DataFrame({'model_no' : model_no,
                     'rmse' : rmse,
                     'mse' : mae,
                     'mape' : mape,              
                    })
perf

Unnamed: 0,model_no,rmse,mse,mape
0,1,2.512662,2.065915,0.794686
1,2,1.041721,0.82917,0.217355
2,3,2.205532,1.781,0.649743
3,4,2.030863,1.61045,0.630291
4,5,2.028004,1.656028,0.624795
5,6,2.028004,1.656028,0.624795
