# 회귀 기반 추천 시스템

**데이터 구조**

![](https://d.pr/i/YEs8M6+)


여행스타일 8가지 (7단계)
(매우 선호 - 중간 선호 - 약간 선호 - 중립 - 약간 선호 - 중간 선호 - 매우 선호)
- `TRAVEL_STYL_1` 자연 vs 도시
- `TRAVEL_STYL_2` 숙박 vs 당일
- `TRAVEL_STYL_3` 새로운 지역 vs 익숙한 지역
- `TRAVEL_STYL_4` 편하지만 비싼 숙소 vs 불편하지만 저렴한 숙소
- `TRAVEL_STYL_5` 휴양/휴식 vs 체험활동
- `TRAVEL_STYL_6` 잘 알려지지 않은 방문지 vs 알려진 방문지
- `TRAVEL_STYL_7` 계획에 따른 여행 vs 상황에 따른 여행
- `TRAVEL_STYL_8` 사진촬영 중요하지 않음 vs 사진촬영 중요

여행동기 3가지 (10단계)
- `TRAVEL_MOTIVE_1` 여행의 주요 목적
- `TRAVEL_MOTIVE_2` 여행의 부수적 목적1
- `TRAVEL_MOTIVE_3` 여행의 부수적 목적2

1. 일상적인 환경에서의 탈출
2. 육체적 정신적 휴식
3. 여행 동반자와의 친밀감 증진
4. 자아찾기
5. … (확인 안됨)

In [None]:
# !pip install catboost



In [2]:
import pandas as pd

In [3]:
# Read the travel records from CSV, then report the table size and preview
# the first five rows.
travel_df = pd.read_csv(filepath_or_buffer='./data/travel.csv')
print(travel_df.shape)
travel_df.head(n=5)

(34572, 15)


Unnamed: 0,GENDER,AGE_GRP,TRAVEL_STYL_1,TRAVEL_STYL_2,TRAVEL_STYL_3,TRAVEL_STYL_4,TRAVEL_STYL_5,TRAVEL_STYL_6,TRAVEL_STYL_7,TRAVEL_STYL_8,TRAVEL_MOTIVE_1,TRAVEL_COMPANIONS_NUM,VISIT_AREA_NM,MVMN_NM,DGSTFN
0,남,30.0,1.0,4.0,2.0,2.0,6.0,2.0,2.0,7.0,3.0,3.0,미스틱3도,자가용,5.0
1,남,20.0,4.0,1.0,5.0,1.0,1.0,4.0,1.0,6.0,3.0,1.0,에스제이렌트카,대중교통 등,4.0
2,여,50.0,4.0,1.0,2.0,4.0,3.0,3.0,2.0,3.0,1.0,3.0,법환식당,대중교통 등,5.0
3,남,30.0,1.0,1.0,1.0,5.0,6.0,3.0,5.0,7.0,7.0,3.0,에코랜드호텔,자가용,5.0
4,여,20.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,1.0,2.0,윤스타피자앤파스타,자가용,4.0


In [4]:
# These columns are read from the CSV as floats (e.g. 30.0); cast them to
# integers in place. The column list is named once so the selection on both
# sides of the assignment cannot drift apart.
int_columns = [
    'AGE_GRP',
    'TRAVEL_STYL_1',
    'TRAVEL_STYL_2',
    'TRAVEL_STYL_3',
    'TRAVEL_STYL_4',
    'TRAVEL_STYL_5',
    'TRAVEL_STYL_6',
    'TRAVEL_STYL_7',
    'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1',
    'TRAVEL_COMPANIONS_NUM',
]
travel_df[int_columns] = travel_df[int_columns].astype(int)

travel_df.head()

Unnamed: 0,GENDER,AGE_GRP,TRAVEL_STYL_1,TRAVEL_STYL_2,TRAVEL_STYL_3,TRAVEL_STYL_4,TRAVEL_STYL_5,TRAVEL_STYL_6,TRAVEL_STYL_7,TRAVEL_STYL_8,TRAVEL_MOTIVE_1,TRAVEL_COMPANIONS_NUM,VISIT_AREA_NM,MVMN_NM,DGSTFN
0,남,30,1,4,2,2,6,2,2,7,3,3,미스틱3도,자가용,5.0
1,남,20,4,1,5,1,1,4,1,6,3,1,에스제이렌트카,대중교통 등,4.0
2,여,50,4,1,2,4,3,3,2,3,1,3,법환식당,대중교통 등,5.0
3,남,30,1,1,1,5,6,3,5,7,7,3,에코랜드호텔,자가용,5.0
4,여,20,5,3,3,3,3,3,3,5,1,2,윤스타피자앤파스타,자가용,4.0


In [5]:
from sklearn.model_selection import train_test_split

# 'DGSTFN' is the regression target; every other column is used as a feature.
y = travel_df['DGSTFN']
X = travel_df.drop(columns="DGSTFN")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
from catboost import Pool

# Every feature column is declared categorical, including the integer-coded
# survey answers, so CatBoost handles them as categories rather than numbers.
cat_features = [
    'GENDER',
    'AGE_GRP',
    'TRAVEL_STYL_1',
    'TRAVEL_STYL_2',
    'TRAVEL_STYL_3',
    'TRAVEL_STYL_4',
    'TRAVEL_STYL_5',
    'TRAVEL_STYL_6',
    'TRAVEL_STYL_7',
    'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1',
    'TRAVEL_COMPANIONS_NUM',
    'VISIT_AREA_NM',
    'MVMN_NM',
]

# Wrap the train and test splits (features + target) in CatBoost pools.
X_train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
X_test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [7]:
from catboost import CatBoostRegressor

# Hyperparameters for the CatBoost regressor.
cb_params = dict(
    n_estimators=500,      # number of boosting iterations (internal estimators)
    depth=5,               # maximum depth of each individual tree
    learning_rate=0.03,
    loss_function='RMSE',  # loss function (default)
    eval_metric='RMSE',    # evaluation metric (default)
)
cb_reg = CatBoostRegressor(**cb_params)

# NOTE(review): the held-out test pool is also passed as eval_set, so the
# metric logged every 100 iterations, and any best-iteration selection based on
# it, is computed on the test data rather than on a separate validation split.
cb_reg.fit(X_train_pool, eval_set=X_test_pool, verbose=100)

0:	learn: 0.8346704	test: 0.8359469	best: 0.8359469 (0)	total: 227ms	remaining: 1m 53s
100:	learn: 0.7988190	test: 0.7908820	best: 0.7908820 (100)	total: 4.01s	remaining: 15.9s
200:	learn: 0.7867175	test: 0.7774156	best: 0.7774156 (200)	total: 7.36s	remaining: 11s
300:	learn: 0.7825145	test: 0.7736886	best: 0.7736843 (299)	total: 11.3s	remaining: 7.44s
400:	learn: 0.7794152	test: 0.7712615	best: 0.7712615 (400)	total: 15.2s	remaining: 3.74s
499:	learn: 0.7768232	test: 0.7695659	best: 0.7695624 (497)	total: 19.2s	remaining: 0us

bestTest = 0.7695623604
bestIteration = 497

Shrink model to first 498 iterations.


<catboost.core.CatBoostRegressor at 0x1fd32a49eb0>

In [8]:
# Pair each training column with the importance score of the fitted model,
# as a two-column table ('column', 'importance') with a default integer index.
col_importance = (
    pd.Series(cb_reg.feature_importances_, index=X_train.columns)
    .rename_axis('column')
    .reset_index(name='importance')
)

col_importance

Unnamed: 0,column,importance
0,GENDER,0.373585
1,AGE_GRP,8.654944
2,TRAVEL_STYL_1,6.916474
3,TRAVEL_STYL_2,7.216198
4,TRAVEL_STYL_3,4.894502
5,TRAVEL_STYL_4,9.159067
6,TRAVEL_STYL_5,9.944409
7,TRAVEL_STYL_6,7.010335
8,TRAVEL_STYL_7,7.473501
9,TRAVEL_STYL_8,11.8442


##### 추천 시스템 구축

1. 방문지 목록을 생성
2. 사용자 특성 입력
3. 가상 만족도 예측
4. 만족도가 높은 순으로 추천

In [9]:
# Distinct visit-area names in order of first appearance; the cell displays
# how many there are.
visit_areas = pd.unique(travel_df['VISIT_AREA_NM'])
visit_areas.shape

(10711,)

In [10]:
# Feature order of the model input (same as the training columns):
# 'GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
# 'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
# 'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'TRAVEL_COMPANIONS_NUM',
# 'VISIT_AREA_NM', 'MVMN_NM'

# Profile of the user to recommend for. The 'VISIT_AREA_NM' slot holds a
# placeholder that is replaced by every candidate visit area below.
user_input = ['여', 60, 4, 4, 4, 4, 4, 4, 4, 4, 1, 2, '방문지', '자가용']

# Build one row per candidate visit area (the same profile repeated) and set
# the area by column name, so the template list itself is never mutated and
# nothing depends on the position of 'VISIT_AREA_NM'.
candidates = pd.DataFrame(
    [user_input] * len(visit_areas),
    columns=X_train.columns,
)
candidates['VISIT_AREA_NM'] = visit_areas

# Score all candidates in a single batched predict call instead of one call
# per area; .tolist() keeps pred_results a plain list of floats.
pred_results = cb_reg.predict(candidates).tolist()

pred_results[:10]

[4.401964984917985,
 4.179443458185306,
 4.378276066571079,
 4.200209037595552,
 4.162203566447283,
 4.158752819945988,
 4.133033908069257,
 4.193716386232936,
 4.269295763590878,
 4.224832315153722]

In [11]:
# One row per candidate visit area together with its predicted satisfaction.
result_df = pd.DataFrame(
    {'VISIT_AREA_NM': visit_areas, 'DGSTFN_PRED': pred_results}
)

# Rank from highest to lowest predicted score and show the ten best areas.
ranked = result_df.sort_values(by='DGSTFN_PRED', ascending=False)
ranked.iloc[:10]

Unnamed: 0,VISIT_AREA_NM,DGSTFN_PRED
129,스누피가든,4.583687
153,목장카페 밭디,4.556824
2300,그랜드하얏트제주,4.553043
47,보래드베이커스,4.55057
481,하라케케,4.548637
3054,대정오일시장,4.548467
1485,청파식당횟집,4.541439
251,김녕해수욕장,4.531456
1492,한라산아래첫마을,4.526219
54,연돈,4.520506
