In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Data Processing
import pandas as pd
import numpy as np
import requests

# Data Regex
import re

# Data Visualization
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
%matplotlib inline

# Machine Learning
from sklearn.metrics.pairwise import cosine_similarity 


# !

###### to-do-list
1. server 어디를 이용할 지
2. back-up 자료를 풍성하게 준비
3. 군집화를 하고 customer's review 준비
    - 임의로 점수를 넣는 것보다 좋다 안좋다로 하는 것이 좋을 수 있다.
    - 데이터 줄이기 (약 3000개)  (이미지 없는거 빼기)

# Purpose 


**wine recommendation system based on coffee preference for wine beginners that desire more economical alternatives**


- https://www.businessinsider.com/wine-reccomendations-based-on-coffee-2015-9
- https://www.google.com/search?q=NY+Mulino+a+Vino&oq=NY+Mulino+a+Vino&aqs=chrome..69i57j0i7i8i30.3828j0j9&sourceid=chrome&ie=UTF-8

# Data crawling 
- data crawled from www.wine21.com
- wine product data filtered
    - price : under ₩ 200,000
    - wine types limited to red, white, rose, sparkling and porto
- omitted any data w/o
    - image
    - title
    - alcohol level
    - either sweetness, body rate, acidity, tannin rate
    - price
    - maker's note (for aroma categorization)
- use only items being sold in off-line stores like E-mart, Lotte, Costco, and etc.
- add 'aroma' column later

In [3]:
df_1 = pd.read_csv('./data/df_aroma.csv')
df_1.drop(['Unnamed: 0'], axis=1, inplace=True)
df_1

Unnamed: 0,title,image_link,winery,country,province,grape_type,wine_type,occasion,alcohol,sweetness,acidity,body_rate,tannin_rate,food_pairing,price,vintage,size,link,aroma_y
0,트리풀라 피에몬테 로쏘,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,몬도 델 비노,이탈리아,피에몬테,"['바르베라', '네비올로']",레드,테이블 와인,13.5,1,3,3,3,"피자, 파스타, 붉은 육류, 그리고 치즈 플레이트와 잘 어울리며, 특유의 산도가...",68000,2017,750,https://www.wine21.com/13_search/wine_view.htm...,"다채로운 꽃 향과 오디의 과실향이 완벽하게 어우러져 있으며, 비교적 낮은 산도와 부..."
1,아케시 로사토 브뤼,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,몬도 델 비노,이탈리아,피에몬테,"['바르베라', '돌체토']",스파클링,테이블 와인,11.5,1,3,3,1,식전 와인으로 적합하며 전체요리나 기름진 생선요리와 잘 어울린다.,55000,NV,750,https://www.wine21.com/13_search/wine_view.htm...,"피에몬테 지역에서 재배된 바르베라 70%, 돌체토 30%로 만든 스파클링 와인으로 ..."
2,엘리자베스 로제 샤도네이,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,고스트 블락 와이너리,미국,캘리포니아,['샤르도네'],화이트,테이블 와인,13.5,1,4,3,1,"치즈, 해산물, 샐러드 등과 잘 어울린다.",135000,2018,750,https://www.wine21.com/13_search/wine_view.htm...,"밝고 투명한 볏짚색을 띠고 신선한 하얀 배와 리치, 애플리콧, 시트러스의 아로마가 ..."
3,"맥매니스, 캘리포니아 피노 누아",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,맥매니스 와이너리,미국,캘리포니아,['피노 누아'],레드,테이블 와인,13.5,1,4,3,3,"치즈, 오리고기 등과 잘 어울린다.",65000,2019,750,https://www.wine21.com/13_search/wine_view.htm...,짙은 루비색을 띠고 달콤한 딸기잼과 신선한 체리의 아로마가 느껴진다. 입 안에서는 ...
4,"백하우스, 피노 누아",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,백하우스,미국,캘리포니아,['피노 누아'],레드,테이블 와인,13.5,1,4,3,3,"치즈, BBQ, 구운 고기, 버섯 요리 등과 잘 어울린다.",48000,2018,750,https://www.wine21.com/13_search/wine_view.htm...,"루비색을 띠고 완숙한 블랙베리, 산딸기 등의 과실향, 은은한 바닐라, 오크의 아로마..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1905,"샹파뉴 델라모뜨, 브뤼",http://img.wine21.com/WINE_MST/TITLE/0154000/W...,샹파뉴 들라모뜨,프랑스,샹파뉴,"['샤르도네', '피노 누아', '피노 뮈니에']",스파클링,테이블 와인,12.0,1,5,3,1,너무 차지 않게 해 식전주로 마시거나 살구 타르트 등의 과일 디저트와 곁들이면 잘 ...,110000,NV,750,https://www.wine21.com/13_search/wine_view.htm...,"신선한 과일 향과 섬세한 질감, 우아하면서도 무게감이 있는 전형적인 브륏 스타일의 ..."
1906,칭퀘테레 샤케트라,http://img.wine21.com/WINE_MST/TITLE/0154000/W...,깐티나 칭퀘테레,이탈리아,리구리아,"['보스코', '알라롤라', '베르멘티노']",화이트,테이블 와인,9.5,5,2,4,1,"와인 자체로도 완벽하지만, 각종 디져트와 향이 강한 치즈와 잘 어울린다.",200000,2011,375,https://www.wine21.com/13_search/wine_view.htm...,"호박색이 감도는 영롱한 토파즈 색깔을 띠고, 오렌지 사탕, 말린 무화과, 말린 살구..."
1907,칭퀘테레 뻬르골레 스파르세,http://img.wine21.com/WINE_MST/TITLE/0154000/W...,깐티나 칭퀘테레,이탈리아,리구리아,['보스코'],화이트,테이블 와인,8.5,1,3,4,1,"신선한 셀러드, 각종 파스타류 또는 풍미가 강하지 않은 생선과 잘 어울린다.\n",80000,2013,750,https://www.wine21.com/13_search/wine_view.htm...,"녹색이 살짝 감도는 금빛 반짝이는 노란색을 띠고, 지중해의 각종 허브 향과 프루티한..."
1908,칭퀘테레 코스타 데 캄푸,http://img.wine21.com/WINE_MST/TITLE/0154000/W...,깐티나 칭퀘테레,이탈리아,리구리아,"['보스코', '알라롤라', '베르멘티노']",화이트,테이블 와인,8.5,1,3,3,1,"신선한 샐러드, 각종 파스타류 또는 풍미가 강하지 않은 생선과 잘 어울린다.",80000,2013,750,https://www.wine21.com/13_search/wine_view.htm...,"금빛 반짝이는 노란색을 띠고, 은은히 느껴지는 세이지, 허브 향과 시트러스, 야생 ..."


# Data Processing

In [4]:
df = df_1.copy()

## grape_type : drop all aside from first in list

In [5]:
for i in range(len(df)):
    if ',' in df['grape_type'][i]:
        df['grape_type'][i] = df['grape_type'][i][2:-2].split(',')[0][:-1]
    else:
        df['grape_type'][i] = df['grape_type'][i][2:-2]
df['grape_type']

0        바르베라
1        바르베라
2        샤르도네
3       피노 누아
4       피노 누아
        ...  
1905     샤르도네
1906      보스코
1907      보스코
1908      보스코
1909      보스코
Name: grape_type, Length: 1910, dtype: object

In [6]:
df['grape_type'].unique()

array(['바르베라', '샤르도네', '피노 누아', '카베르네 소비뇽', '메를로', '마르산느', '산지오베제',
       '비오시뉴', '투리가 프란체사', '시라/쉬라즈', '말벡', '슈냉 블랑', '진판델', '모나스트렐',
       '소비뇽 블랑', '그르나슈', '쌩쏘', '모스카토', '블랜드', '템프라니요', '네로 다볼라', '프리미티보',
       '네비올로', '카르메네르', '투리가 나시오날', '자렐로', '베르데호', '아시르티코', '시노마브로',
       '마브루드', '아기오르기티코', '로디티스', '빌라나', '까리냥', '모스카텔 세투발', '가르나차', '피아노',
       '마카베오', '틴타 네그라 몰레', '아이렌', '리슬링', '글레라', '오미자', '코르비나 베로네제',
       '쁘띠 시라', '누라구스', '소비뇽 그리', '베르디키오 비앙코', '몬테풀치아노', '라크리마', '비오니에',
       '모스카텔', '푸르민트', '젤렌', '트라하두라', '그르나슈 블랑', '멘시아', '보발', '돌체토',
       '코르테제', '아르네이스', '뮈스까 까넬리', '세르시알', '알리아니코', '피노 네로', '세미용',
       '베르멘티노', '카베르네 프랑', '믈롱 드 부르고뉴', '피노 그리지오', '트라미너', '비달', '피노 뮈니에',
       '가르나차 틴토레라', '레불라', '쁘띠 메슬리에', '모작', '네로 디 트로이아', '레포스코', '투르비아노',
       '말바시아 디 칸디다', '클레렛', '피노 그리', '페르나오 피레스', '아라고네즈', '코르비나', '론디넬라',
       '롤', '카스텔라옹', '비칼', '사그란티노', '고데요', '트레비아노', '알바리뇨', '플라바치 말리',
       '포십', '게뷔르츠트라미너', '모스까뗄 데 알레한드리아', '오르테가', '틴타 로리즈', '청포도 즙',
    

# !
- 사과로 만든 과실주를 제거해야하는데 그러면 하단에 있는 one hot encoding이 안돌아간다
- 그래서 일단 스킵함

In [7]:
# dropped 'wine' made from apple
# df.drop(index=df[df['grape_type']=='사과'].index, inplace=True)

## one_hot encoding

### food pairing categories
- conversion of string values(sentences) in food_pairing into category dummies

- add meat / cheese / seafood / vegetables / dessert columns

- meat : 고기, 육류, 돼지, 소, 비프,  닭, 오리, 양, 스테이크, 보쌈, 냉채, 바비큐, 하몽, 불고기, 햄, 쇠고기, 소시지, 칠면조, 치킨
- cheese : 치즈
- seafood : 해산물, 조개, 새우, 갑각류, 연어, 참치, 초밥, 생선, 굴, 회, 사시미, 관자
- vegetables : 버섯, 딸기, 올리브, 아몬드, 샐러드, 아보카도, 과일, 호두, 멜론, 무화과, 야채, 견과류
- dessert : 디저트, 수플레, 케익, 케이크, 파이, 마카롱, 아이스크림

In [8]:
meat_ls = ['고기', '육류', '돼지', '소', '비프', '닭', '오리', '양', '스테이크', '보쌈', '냉채', '바비큐', 
           '하몽', '불고기', '햄', '쇠고기', '소시지', '칠면조', '치킨']
cheese_ls = ['치즈']
seafood_ls = ['해산물', '조개', '새우', '갑각류', '연어', '참치', '초밥', '생선', '굴', '회', '사시미', '관자']
vegetables_ls = ['버섯', '딸기', '올리브', '아몬드', '샐러드', '아보카도', '과일', '호두', '멜론', '무화과', '야채', '견과류']
dessert_ls = ['디저트', '수플레', '케익', '케이크', '파이', '마카롱', '아이스크림']

In [9]:
# append food category columns to df
df['meat'] = 0
df['cheese'] = 0
df['seafood'] = 0
df['vegetables'] = 0
df['dessert'] = 0

for i in range(len(df['food_pairing'])):
    for meat in meat_ls:
        if meat in df['food_pairing'][i]:
            df['meat'][i] = 1
    for cheese in cheese_ls:
        if cheese in df['food_pairing'][i]:
            df['cheese'][i] = 1
    for seafood in seafood_ls:
        if seafood in df['food_pairing'][i]:
            df['seafood'][i] = 1
    for vegetables in vegetables_ls:
        if vegetables in df['food_pairing'][i]:
            df['vegetables'][i] = 1
    for dessert in dessert_ls:
        if dessert in df['food_pairing'][i]:
            df['dessert'][i] = 1

In [10]:
df[['food_pairing', 'meat', 'cheese', 'seafood', 'vegetables', 'dessert']]

Unnamed: 0,food_pairing,meat,cheese,seafood,vegetables,dessert
0,"피자, 파스타, 붉은 육류, 그리고 치즈 플레이트와 잘 어울리며, 특유의 산도가...",1,1,0,0,0
1,식전 와인으로 적합하며 전체요리나 기름진 생선요리와 잘 어울린다.,0,0,1,0,0
2,"치즈, 해산물, 샐러드 등과 잘 어울린다.",0,1,1,1,0
3,"치즈, 오리고기 등과 잘 어울린다.",1,1,0,0,0
4,"치즈, BBQ, 구운 고기, 버섯 요리 등과 잘 어울린다.",1,1,0,1,0
...,...,...,...,...,...,...
1905,너무 차지 않게 해 식전주로 마시거나 살구 타르트 등의 과일 디저트와 곁들이면 잘 ...,0,0,0,1,1
1906,"와인 자체로도 완벽하지만, 각종 디져트와 향이 강한 치즈와 잘 어울린다.",0,1,0,0,0
1907,"신선한 셀러드, 각종 파스타류 또는 풍미가 강하지 않은 생선과 잘 어울린다.\n",0,0,1,0,0
1908,"신선한 샐러드, 각종 파스타류 또는 풍미가 강하지 않은 생선과 잘 어울린다.",0,0,1,1,0


### aroma categories
- append aroma category dummies based on string values in aroma_y column
- floral / fruit / cologne / oriental_leather / oriental_spice / earth*
* aroma criteria based off of wine aroma wheel from
https://www.kellyandjones.com/blogs/news/perfumers-guide-to-wine-aromas

In [11]:
floral = ['로럴', '백합', '자스민', '제라니움', '쟈스민', '피오니', '꽃', '아이리스', '부케', '아카시아', '장미', '클로버', '로즈힙',
         '플라워', '카모마일']
fruit = ['트로피칼', '복숭아', '과일','과육', '열매', '스트로베리', '딸기', '블루베리', '베리', '아로니아', '과실', '라스베리', '자두', '체리', 
         '카시스', '사과', '배향', '커런트', '멜론', '애플', '프룻', '매실', '프루티', '살구', '애플리콧', '배', '무화과', '플럼', 
         '패션후르츠', '오디', '모과', '파파야', '망고', '석류', '버찌', '코코넛', '바나나', '호박']
cologne = ['시트러스', '씨트런향', '자몽', '오렌지', '레몬', '귤', '라임', '탠저린', '베르가못']

oriental_leather = ['에스프레소', '담배', '가죽', '쵸콜릿', '바닐라', '카라멜', '초콜릿', '캬라멜', '스모크', '흑연', '모카', 
                    '스파이', '꿀',  '헤이즐넛', '코코아', '타바코', '시가', '레더', '아몬드', '호두','허니', '크림']
oriental_spice = ['향신료', '세이지', '감초', '씨나몬', '라벤다', '라벤더', '발사믹','스파이스' '스파이시', '계피', '정향', '페퍼', 
                  '후추']
earth = ['낙엽', '흙내', '커피', '삼나무', '로즈마리', '나무', '버섯', '허브', '유칼립투스', '토양', '밀짚','회향', '젖은 돌', 
         '머쉬룸', '트러플']

In [12]:
# convert data type from object to string type data
df.aroma_y = df.aroma_y.apply(lambda x: str(x))

In [13]:
# add aroma category columns to df

df['floral'] = 0
df['fruit'] = 0
df['cologne'] = 0
df['oriental_spice'] = 0
df['oriental_leather'] = 0
df['earth'] = 0

for i in range(len(df)):
    for floral_ in floral:
        if floral_ in df['aroma_y'][i]:
            df['floral'][i] = 1
    for fruit_ in fruit:
        if fruit_ in df['aroma_y'][i]:
            df['fruit'][i] = 1
    for cologne_ in cologne:
        if cologne_ in df['aroma_y'][i]:
            df['cologne'][i] = 1
    for oriental_spice_ in oriental_spice:
        if oriental_spice_ in df['aroma_y'][i]:
            df['oriental_spice'][i] = 1
    for oriental_leather_ in oriental_leather:
        if oriental_leather_ in df['aroma_y'][i]:
            df['oriental_leather'][i] = 1
    for earth_ in earth:
        if earth_ in df['aroma_y'][i]:
            df['earth'][i] = 1

In [14]:
for name in ['floral', 'fruit', 'cologne', 'oriental_spice', 'oriental_leather','earth']:
    print(df[name].value_counts())

0    1453
1     457
Name: floral, dtype: int64
1    1688
0     222
Name: fruit, dtype: int64
0    1508
1     402
Name: cologne, dtype: int64
0    1498
1     412
Name: oriental_spice, dtype: int64
0    1139
1     771
Name: oriental_leather, dtype: int64
0    1478
1     432
Name: earth, dtype: int64


In [15]:
df.columns

Index(['title', 'image_link', 'winery', 'country', 'province', 'grape_type',
       'wine_type', 'occasion', 'alcohol', 'sweetness', 'acidity', 'body_rate',
       'tannin_rate', 'food_pairing', 'price', 'vintage', 'size', 'link',
       'aroma_y', 'meat', 'cheese', 'seafood', 'vegetables', 'dessert',
       'floral', 'fruit', 'cologne', 'oriental_spice', 'oriental_leather',
       'earth'],
      dtype='object')

#### dark/fruity aroma
- aroma category combinations created for filtering wine recommendation list based on coffee type preference

In [16]:
# fruity aroma
# aroma is considered overall fruity if it is in either of the following categories
# 'floral', 'fruit', 'cologne'
# and not in any of the following categories:
# 'oriental_spice', 'oriental_leather', 'earth'

df[((df['floral'] ==1) | (df['fruit'] == 1) | (df['cologne'] == 1)) & (df['oriental_spice'] ==0) & 
   (df['oriental_leather'] ==0) & (df['earth'] ==0)]

Unnamed: 0,title,image_link,winery,country,province,grape_type,wine_type,occasion,alcohol,sweetness,...,cheese,seafood,vegetables,dessert,floral,fruit,cologne,oriental_spice,oriental_leather,earth
0,트리풀라 피에몬테 로쏘,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,몬도 델 비노,이탈리아,피에몬테,바르베라,레드,테이블 와인,13.5,1,...,1,0,0,0,1,1,0,0,0,0
1,아케시 로사토 브뤼,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,몬도 델 비노,이탈리아,피에몬테,바르베라,스파클링,테이블 와인,11.5,1,...,0,1,0,0,1,1,0,0,0,0
2,엘리자베스 로제 샤도네이,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,고스트 블락 와이너리,미국,캘리포니아,샤르도네,화이트,테이블 와인,13.5,1,...,1,1,1,0,0,1,1,0,0,0
5,"백하우스, 샤도네이",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,백하우스,미국,캘리포니아,샤르도네,화이트,테이블 와인,13.5,1,...,0,1,1,0,0,1,1,0,0,0
7,"올리비에 르플레브, 부르고뉴 “옹끌 뱅썽”",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,올리비에 르플레브,프랑스,부르고뉴,샤르도네,화이트,테이블 와인,13.5,1,...,0,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1898,글라디움 템프라니요,http://img.wine21.com/WINE_MST/TITLE/0154000/1...,보데가스 깜뽀스 레알레스,스페인,라,템프라니요,레드,테이블 와인,8.5,1,...,1,0,1,0,0,1,0,0,0,0
1900,글라디움 아이렌,http://img.wine21.com/WINE_MST/TITLE/0154000/1...,보데가스 깜뽀스 레알레스,스페인,라,아이렌,화이트,테이블 와인,8.0,1,...,0,1,0,0,0,1,0,0,0,0
1902,"떼누따 델레 떼레, 에뜨나 비앙코",http://img.wine21.com/WINE_MST/TITLE/0154000/1...,떼누따 델레 떼레 네레,이탈리아,시칠리아,카리칸테,화이트,테이블 와인,13.0,1,...,0,1,0,0,1,0,0,0,0,0
1905,"샹파뉴 델라모뜨, 브뤼",http://img.wine21.com/WINE_MST/TITLE/0154000/W...,샹파뉴 들라모뜨,프랑스,샹파뉴,샤르도네,스파클링,테이블 와인,12.0,1,...,0,0,1,1,1,1,1,0,0,0


In [17]:
# dark aroma
# aroma is considered overall dark if it is in either of the following categories
# 'oriental_spice', 'oriental_leather', 'earth'
# and not in any of the following categories:
# 'floral', 'fruit', 'cologne'

df[((df['oriental_spice'] == 1) | (df['oriental_leather'] == 1) | (df['earth'] ==1)) & 
  (df['floral'] == 0) & (df['fruit'] == 0) & (df['cologne'] == 0)]

Unnamed: 0,title,image_link,winery,country,province,grape_type,wine_type,occasion,alcohol,sweetness,...,cheese,seafood,vegetables,dessert,floral,fruit,cologne,oriental_spice,oriental_leather,earth
139,"아나이, 파운더스 콜렉션 시라",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,아나이,칠레,라펠,시라/쉬라즈,레드,테이블 와인,13.5,1,...,0,0,0,0,0,0,0,1,0,0
149,리저브 생 마르땡 까베르네 소비뇽,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,리저브 생 마르땡,프랑스,서던,카베르네 소비뇽,레드,테이블 와인,8.5,1,...,0,0,0,0,0,0,0,1,0,1
294,"에구렌 우가르떼, 크리안자",http://img.wine21.com/WINE_MST/TITLE/0166000/W...,에구렌 우가르떼,스페인,리오하,템프라니요,레드,테이블 와인,13.5,1,...,0,1,0,0,0,0,0,0,1,1
295,"에구렌 우가르떼, 코세차",http://img.wine21.com/WINE_MST/TITLE/0166000/W...,에구렌 우가르떼,스페인,리오하,템프라니요,레드,테이블 와인,13.5,1,...,0,1,0,0,0,0,0,0,1,1
296,"에구렌 우가르떼, 레세르바",http://img.wine21.com/WINE_MST/TITLE/0166000/W...,에구렌 우가르떼,스페인,리오하,템프라니요,레드,테이블 와인,14.5,1,...,0,1,0,0,0,0,0,0,1,1
320,"리버비, 싱글 빈야드 샤르도네",http://img.wine21.com/WINE_MST/TITLE/0166000/W...,리버비 에스테이트,뉴질랜드,사우스,샤르도네,화이트,테이블 와인,12.5,1,...,0,0,0,0,0,0,0,0,0,1
447,"로랑 페리에, 하모니 드미 섹",,로랑 페리에,프랑스,샹파뉴,샤르도네,스파클링,테이블 와인,12.5,3,...,0,0,0,1,0,0,0,0,1,0
584,"도멘 드 라 호예르, 루베롱 블랑",http://img.wine21.com/WINE_MST/TITLE/0164000/W...,도멘 드 라 호예르,프랑스,론,그르나슈 블랑,화이트,테이블 와인,13.5,1,...,1,1,1,0,0,0,0,0,1,1
685,오르넬로,http://img.wine21.com/WINE_MST/TITLE/0163000/W...,까스텔라레 디 까스텔리나,이탈리아,토스카나,산지오베제,레드,테이블 와인,14.5,1,...,1,0,0,0,0,0,0,0,1,0
715,"테사리, 아브스",http://img.wine21.com/WINE_MST/TITLE/0163000/0...,테사리,이탈리아,베네토,가르가네가,화이트,테이블 와인,12.5,1,...,0,1,0,0,0,0,0,0,1,1


In [18]:
df['dark_aroma'] = 0
df['fruity_aroma'] = 0

for i in df[(df.oriental_spice==1) | (df.oriental_leather==1) | (df.earth==1)].index:
    df['dark_aroma'][i] = 1

for i in df[(df.floral==1) | (df.fruit==1) | (df.cologne==1)].index:
    df['fruity_aroma'][i] = 1

In [19]:
df[['dark_aroma', 'fruity_aroma']]

Unnamed: 0,dark_aroma,fruity_aroma
0,0,1
1,0,1
2,0,1
3,1,1
4,1,1
...,...,...
1905,0,1
1906,0,1
1907,1,1
1908,1,1


In [20]:
# 134 wines neither considered solely dark nor fruity
len(df[(df['dark_aroma'] == 0) & (df['fruity_aroma'] == 0)])

134

In [21]:
# these were cosnidered inappropriate for our purpose
# leading to the deletion of these datas
df = df[(df['dark_aroma'] == 1) | (df['fruity_aroma'] == 1)]
df.reset_index(inplace=True)
len(df)

1776

### continent

- grouping countries based on their geographic location and climate
- necessary to reduce bias due to overfitting

- Europe_A : 프랑스, 오스트리아, 몰도바, 독일, 헝가리, 슬로베니아, 불가리아, 조지아, 루마니아
- Europe_B : 이탈리아, 포르투갈, 스페인, 그리스, 크로아티아, 이스라엘, 레바논
- North_America : 미국, 캐나다
- South_America : 아르헨티나, 칠레, 우루과이
- New_World : 호주, 뉴질랜드, 남아프리카
- Korea : 대한민국

In [22]:
europe_a = ['프랑스', '오스트리아', '몰도바', '독일', '헝가리', '슬로베니아', '불가리아', '조지아', '루마니아']
europe_b = ['이탈리아', '포르투갈', '스페인', '그리스', '크로아티아', '이스라엘', '레바논']
north_america = ['미국', '캐나다']
south_america = ['아르헨티나', '칠레', '우루과이'] 
new_world = ['호주', '뉴질랜드', '남아프리카']
korea = ['대한민국']

In [23]:
df['europe_a'] = 0
df['europe_b'] = 0
df['north_america'] = 0
df['south_america'] = 0
df['new_world'] = 0
df['korea'] = 0

for i in range(len(df['country'])):
    for a in europe_a:
        if a in df['country'][i]:
            df['europe_a'][i] = 1
    for b in europe_b:
        if b in df['country'][i]:
            df['europe_b'][i] = 1
    for north in north_america:
        if north in df['country'][i]:
            df['north_america'][i] = 1
    for south in south_america:
        if south in df['country'][i]:
            df['south_america'][i] = 1
    for new in new_world:
        if new in df['country'][i]:
            df['new_world'][i] = 1
    for k in korea:
        if k in df['country'][i]:
            df['korea'][i] = 1
            
df[['country', 'europe_a', 'europe_b', 'north_america', 'south_america', 'new_world', 'korea']]

Unnamed: 0,country,europe_a,europe_b,north_america,south_america,new_world,korea
0,이탈리아,0,1,0,0,0,0
1,이탈리아,0,1,0,0,0,0
2,미국,0,0,1,0,0,0
3,미국,0,0,1,0,0,0
4,미국,0,0,1,0,0,0
...,...,...,...,...,...,...,...
1771,프랑스,1,0,0,0,0,0
1772,이탈리아,0,1,0,0,0,0
1773,이탈리아,0,1,0,0,0,0
1774,이탈리아,0,1,0,0,0,0


In [24]:
# count european wines
print('europe_a wine_count : ', len(df[(df['europe_a'] == 1)]))
print('europe_b wine_count : ', len(df[(df['europe_b'] == 1)]))
print('north_america wine_count : ', len(df[(df['north_america'] == 1)]))
print('south_america wine_count : ', len(df[(df['south_america'] == 1)]))
print('new_world wine_count : ', len(df[(df['new_world'] == 1)]))
print('korea wine_count : ', len(df[(df['korea'] == 1)]))

europe_a wine_count :  384
europe_b wine_count :  728
north_america wine_count :  244
south_america wine_count :  229
new_world wine_count :  189
korea wine_count :  2


### wine_type

- wine type varieties reduced to either red or white based on customer perception
- instead of on grape type

- '로제' & '스파클링' -> '화이트'
- '주정강화' -> '레드' or '화이트'
    - port wines in '주정강화' -> '레드'
    - the others (셰리와인) -> '삭제'

In [25]:
df['wine_type'].unique()

array(['레드', '스파클링', '화이트', '주정강화', '로제'], dtype=object)

In [26]:
df[(df['wine_type'] == '로제') | (df['wine_type'] == '스파클링')]['wine_type'] = '화이트'

In [27]:
for i in range(len(df)):
    if '포트' in df['title'][i]:
        df['wine_type'][i] = '레드'

# ! 왜 백퍼가 아니죠?

In [28]:
print("Red wine : " , round((len(df[df['wine_type'] == '레드']) / len(df)) * 100, 2), "%")
print("White wine : " , round((len(df[df['wine_type'] == '화이트']) / len(df)) * 100, 2), "%")

Red Counts :  57.6 %
White Counts :  27.25 %


In [29]:
for i in range(len(df)):
    if df['wine_type'][i] == '레드':
        df['wine_type'][i] = 'red'
    elif df['wine_type'][i] == '화이트':
        df['wine_type'][i] = 'white'
    else:
        df['wine_type'][i] = 'etc'

In [30]:
# deletion of etc. wines (sherry wine)
for i in range(len(df)):
    if df['wine_type'][i] == 'etc':
        df2 = df.drop([i], inplace=True)

In [31]:
# 403 row datas deleted up to this point
len(df)

1507

In [32]:
df.columns

Index(['index', 'title', 'image_link', 'winery', 'country', 'province',
       'grape_type', 'wine_type', 'occasion', 'alcohol', 'sweetness',
       'acidity', 'body_rate', 'tannin_rate', 'food_pairing', 'price',
       'vintage', 'size', 'link', 'aroma_y', 'meat', 'cheese', 'seafood',
       'vegetables', 'dessert', 'floral', 'fruit', 'cologne', 'oriental_spice',
       'oriental_leather', 'earth', 'dark_aroma', 'fruity_aroma', 'europe_a',
       'europe_b', 'north_america', 'south_america', 'new_world', 'korea'],
      dtype='object')

In [47]:
# one_hot encoding for wine_type
df['red'], df['white'] = 0, 0
df[['red', 'white']] = pd.get_dummies(df['wine_type'])

In [49]:
print(df['red'].value_counts())
print(df['white'].value_counts())

1    1023
0     484
Name: red, dtype: int64
0    1023
1     484
Name: white, dtype: int64


In [50]:
df.columns

Index(['index', 'title', 'image_link', 'winery', 'country', 'province',
       'grape_type', 'wine_type', 'occasion', 'alcohol', 'sweetness',
       'acidity', 'body_rate', 'tannin_rate', 'food_pairing', 'price',
       'vintage', 'size', 'link', 'aroma_y', 'meat', 'cheese', 'seafood',
       'vegetables', 'dessert', 'floral', 'fruit', 'cologne', 'oriental_spice',
       'oriental_leather', 'earth', 'dark_aroma', 'fruity_aroma', 'europe_a',
       'europe_b', 'north_america', 'south_america', 'new_world', 'korea',
       'red', 'white'],
      dtype='object')

## data reduction

### columns

In [51]:
df.drop('occasion', axis=1, inplace=True)

In [55]:
df.drop(['index'], axis=1, inplace=True)

### indices

In [52]:
df.reset_index(inplace=True)

In [53]:
df[(df['winery'] == '테스코 파이니스트') | (df['winery'] == '테스코 그룹  (테스코 심플리 와인) ')].index

Int64Index([1034, 1035, 1372, 1406, 1407], dtype='int64')

# ! 이거 아예 data processing 앞부분으로 빼버릴까요? 
- df_1.copy()하기 이전에

In [58]:
df.iloc[1034]

level_0              NaN
title               None
image_link          None
winery              None
country             None
province            None
grape_type          None
wine_type           None
alcohol              NaN
sweetness            NaN
acidity              NaN
body_rate            NaN
tannin_rate          NaN
food_pairing        None
price                NaN
vintage             None
size                 NaN
link                None
aroma_y             None
meat                 NaN
cheese               NaN
seafood              NaN
vegetables           NaN
dessert              NaN
floral               NaN
fruit                NaN
cologne              NaN
oriental_spice       NaN
oriental_leather     NaN
earth                NaN
dark_aroma           NaN
fruity_aroma         NaN
europe_a             NaN
europe_b             NaN
north_america        NaN
south_america        NaN
new_world            NaN
korea                NaN
red                  NaN
white                NaN


In [56]:
# Tesco does not exist in Korea
# which leads to the deletion of Tesco wines
# [1034, 1035, 1372, 1406, 1407]

df[(df['winery'] == '테스코 파이니스트') | (df['winery'] == '테스코 그룹  (테스코 심플리 와인) ')] = df[(df['winery'] == '테스코 파이니스트') | (df['winery'] == '테스코 그룹  (테스코 심플리 와인) ')].drop([1034, 1035, 1372, 1406, 1407], inplace=True)

In [57]:
len(df)

1507

In [59]:
# delete image nan

df = df[df.image_link.notnull()]
df

Unnamed: 0,level_0,title,image_link,winery,country,province,grape_type,wine_type,alcohol,sweetness,...,dark_aroma,fruity_aroma,europe_a,europe_b,north_america,south_america,new_world,korea,red,white
0,0.0,트리풀라 피에몬테 로쏘,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,몬도 델 비노,이탈리아,피에몬테,바르베라,red,13.5,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,엘리자베스 로제 샤도네이,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,고스트 블락 와이너리,미국,캘리포니아,샤르도네,white,13.5,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,3.0,"맥매니스, 캘리포니아 피노 누아",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,맥매니스 와이너리,미국,캘리포니아,피노 누아,red,13.5,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,4.0,"백하우스, 피노 누아",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,백하우스,미국,캘리포니아,피노 누아,red,13.5,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,5.0,"백하우스, 샤도네이",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,백하우스,미국,캘리포니아,샤르도네,white,13.5,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,1768.0,"떼누따 델레 떼레, 에뜨나 비앙코",http://img.wine21.com/WINE_MST/TITLE/0154000/1...,떼누따 델레 떼레 네레,이탈리아,시칠리아,카리칸테,white,13.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1503,1772.0,칭퀘테레 샤케트라,http://img.wine21.com/WINE_MST/TITLE/0154000/W...,깐티나 칭퀘테레,이탈리아,리구리아,보스코,white,9.5,5.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1504,1773.0,칭퀘테레 뻬르골레 스파르세,http://img.wine21.com/WINE_MST/TITLE/0154000/W...,깐티나 칭퀘테레,이탈리아,리구리아,보스코,white,8.5,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1505,1774.0,칭퀘테레 코스타 데 캄푸,http://img.wine21.com/WINE_MST/TITLE/0154000/W...,깐티나 칭퀘테레,이탈리아,리구리아,보스코,white,8.5,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
df.to_csv('df_aroma_processing.csv', encoding='utf-8-sig')

In [60]:
df.columns

Index(['level_0', 'title', 'image_link', 'winery', 'country', 'province',
       'grape_type', 'wine_type', 'alcohol', 'sweetness', 'acidity',
       'body_rate', 'tannin_rate', 'food_pairing', 'price', 'vintage', 'size',
       'link', 'aroma_y', 'meat', 'cheese', 'seafood', 'vegetables', 'dessert',
       'floral', 'fruit', 'cologne', 'oriental_spice', 'oriental_leather',
       'earth', 'dark_aroma', 'fruity_aroma', 'europe_a', 'europe_b',
       'north_america', 'south_america', 'new_world', 'korea', 'red', 'white'],
      dtype='object')

In [61]:
df.drop('level_0', axis=1, inplace=True)

In [62]:
df.reset_index(inplace=True)
df['wine_id'] = df.index
df['wine_id']

0          0
1          1
2          2
3          3
4          4
        ... 
1229    1229
1230    1230
1231    1231
1232    1232
1233    1233
Name: wine_id, Length: 1234, dtype: int64

# Questionnaire for service user

###### Survey

> 1) Which coffee alternative do you prefer to drink?
1. black coffee (e.g. americano, espresso)
2. coffee with milk & sugar (eg. frappuccino, vanila latte, etc.)
3. I prefer tea

> 2) Which type of coffee bean (or tea) do you prefer?
1. acidic type
2. full body type

###### cases (100 cases per type)
- dark flavor : Earth, Oriental / fruity flavor : floral, fruity, cologne
1. type A : black - acidic : Dry / light / high acidity / dark flavors
2. type B : black - bodied : Dry / bold / low acidity / dark flavors
3. type C : cafe latte with sugar - acidic : sweet / bold / high acidity 
4. type D : cafe latte with sugar - bodied : sweet / bold / low acidity 
5. type E : Tea - acidic : Dry / light / high acidity / fruity flavors 
6. type F : Tea - bodied : Dry / bold / low acidity / fruity flavors

## Data Clustering

> 1) Which coffee alternative do you prefer to drink?
- if black or tea: sweetness below 3 -> type a, b, e or f
- if cafe latte w/ sugar: sweetness greater or equal to 3 -> type c or d

> 2) Which type of coffee bean (or tea) do you prefer?
1-1. acidic type: body rate below 3 & acidity greater than 3 -> type a or e
1-2. acidic type: body rate greater than 3 & acidity equal to or lower than 3 -> type c
2. full body type: body rate greater than 3 * acidity equal to or lower than 3 -> type b, d or f
    
dark aroma checked -> type a or b
fruity aroma checked -> type e or f

# ! 타입 c 하고 d 확인

In [63]:
# set-up criteria again
# 함수로 function('a') resampling 

type_a = df[(df['sweetness'] < 3) & (df['body_rate'] < 3) & (df['acidity'] > 3) & (df['dark_aroma'] == 1)]
type_b = df[(df['sweetness'] < 3) & (df['body_rate'] > 2) & (df['acidity'] <= 3) & (df['dark_aroma'] == 1)]
type_c = df[(df['sweetness'] > 2) & (df['body_rate'] > 3) & (df['acidity'] > 3)]
type_d = df[(df['sweetness'] > 2) & (df['body_rate'] > 3) & (df['acidity'] <= 3)]
type_e = df[(df['sweetness'] < 3) & (df['body_rate'] < 3) & (df['acidity'] > 3) & (df['fruity_aroma'] == 1)]
type_f = df[(df['sweetness'] < 3) & (df['body_rate'] > 2) & (df['acidity'] <= 3) & (df['fruity_aroma'] == 1)]
len(type_a), len(type_b), len(type_c), len(type_d), len(type_e), len(type_f)

(35, 478, 2, 17, 94, 659)

In [65]:
types = [type_a, type_b, type_c, type_d, type_e, type_f]

In [84]:
result = pd.DataFrame(columns=['index', 'title', 'image_link', 'winery', 'country', 'province',
       'grape_type', 'wine_type', 'alcohol', 'sweetness', 'acidity',
       'body_rate', 'tannin_rate', 'food_pairing', 'price', 'vintage', 'size',
       'link', 'aroma_y', 'meat', 'cheese', 'seafood', 'vegetables', 'dessert',
       'floral', 'fruit', 'cologne', 'oriental_spice', 'oriental_leather',
       'earth', 'dark_aroma', 'fruity_aroma', 'europe_a', 'europe_b',
       'north_america', 'south_america', 'new_world', 'korea', 'red', 'white',
       'wine_id'])

In [85]:
def make_100(type_, n=50):
    global result
    for i in range(100):
        result = result.append(type_.sample(n), ignore_index=True)

    return result

In [87]:
make_100(type_b)

Unnamed: 0,index,title,image_link,winery,country,province,grape_type,wine_type,alcohol,sweetness,...,fruity_aroma,europe_a,europe_b,north_america,south_america,new_world,korea,red,white,wine_id
0,1487,"오 봉 클리마, 샤도네이 로스 알라모스",http://img.wine21.com/WINE_MST/TITLE/0154000/W...,오 봉 클리마,미국,캘리포니아,샤르도네,white,9.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1217
1,509,"페우디, 프리미티보 디 만두리아",http://img.wine21.com/WINE_MST/TITLE/0164000/W...,페우디 (페우디 디 산 그레고리오),이탈리아,뿔리아,프리미티보,red,9.5,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,484
2,74,까바예로 템프라니요,http://img.wine21.com/WINE_MST/TITLE/0167000/W...,보데가스 파라 히메네즈,스페인,,템프라니요,red,13.5,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,72
3,600,"간세도, 제스탈",http://img.wine21.com/WINE_MST/TITLE/0163000/W...,보데가 간세도,스페인,비에르소,멘시아,red,14.5,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,568
4,1351,돈파스칼 타나 로블 크리안자,http://img.wine21.com/WINE_MST/TITLE/0156000/W...,에스타브레시미엔토 주아니코,우루과이,,따나,red,13.5,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,436,"세인트 할렛, 부쳐스 카트",http://img.wine21.com/WINE_MST/TITLE/0164000/W...,세인트 할렛 와인즈,호주,사우스,시라/쉬라즈,red,9.5,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,415
4996,92,"몰 와인즈, 바이올렌토 말벡",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,몰 와인즈,아르헨티나,멘도사,말벡,red,14.5,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,90
4997,886,발렌시소 레세르바,http://img.wine21.com/WINE_MST/TITLE/0161000/W...,발렌시소,스페인,리오하,템프라니요,red,9.5,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,833
4998,984,카발로 로코 그랑 크뤼 리마리,http://img.wine21.com/WINE_MST/TITLE/0161000/W...,발디비에소,칠레,코킴보,시라/쉬라즈,red,15.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,913


In [89]:
len(result)

5000

### food_pairing

In [None]:
# data clustering performed on user-based ratings dataframe(result)

In [208]:
from sklearn.metrics.pairwise import cosine_similarity 

# 음식 버튼 클릭
class food_selection:
    food_list = []
    customer_df = pd.DataFrame()
    best_df = pd.DataFrame()
    best_fit_item = pd.DataFrame(columns=['title', 'alcohol', 'sweetness', 'acidity', 'body_rate', 'tannin_rate', 
                                              'meat', 'cheese', 'seafood', 'vegetables', 'dessert', 'floral', 'fruit', 'cologne', 'oriental_spice',
                                              'oriental_leather', 'earth', 'europe_a', 'europe_b', 'north_america', 'south_america', 
                                              'new_world', 'korea'])
    
    def __init__(self, food_list, customer_df):
        self.food_list = food_list
        self.customer_df = customer_df
        self.best_df = best_df
        self.best_fit_item = best_fit_item
    
    # 음식 카테고리 선택
    def food_button(self, btn_clicked):
        self.food_list.append(btn_clicked)
        return self.food_list
    
    # 선택된 음식 카테고리 데이터프레임 반환
    def food_df(self):
        for food in self.food_list:
            self.customer_df = self.customer_df.append(result[result[food]==1]['title'].value_counts())
        return self.customer_df.T
    
    # 음식 기준 반환된 고객 평가 데이터프레임의 인덱스(
    def get_best_match(self):
        # 선택된 음식 카테고리 기준으로 필터링된 전체 와인 데이터프레임 
        for food in self.food_list:
            self.best_df = self.best_df.append(result[result[food]==1])
        self.best_df['counts'] = 0     
    
        # 음식 기준 반환된 고객 평가 데이터프레임의 인덱스(와인명)와 전체 df의 title이 일치할 경우
        # 전체 df의 카운트를 해당 와인명의 만족고객카운트를 등록해준다
        for i in range(len(self.best_df)):
            for n in range(len(self.customer_df)):
                if self.best_df['title'].iloc[i,] == self.customer_df.index[n]:
                    self.best_df['counts'].iloc[i,] = self.customer_df[n]
                    
        self.best_df = self.best_df.sort_values(by='counts', ascending=False)
        self.best_df.drop_duplicates(keep='first', inplace=True)
    
        # flatten df into single dimension vector
        data =list(self.best_df.iloc[0,:][['title','alcohol', 'sweetness', 'acidity', 'body_rate', 'tannin_rate', 
                                         'meat', 'cheese', 'seafood', 'vegetables', 'dessert', 'floral', 'fruit', 'cologne', 'oriental_spice',
                                         'oriental_leather', 'earth', 'europe_a', 'europe_b', 'north_america', 'south_america', 
                                         'new_world', 'korea']])

        data = pd.Series(data, index=best_fit_item.columns)

        self.best_fit_item = self.best_fit_item.append(data, ignore_index=True)
        self.best_fit_item.set_index('title', drop=False, inplace=True)
        return self.best_fit_item
    
        def set_price_range(self, lower_price_range=5000, upper_price_range=200000):
            df_in_price_range = self.best_df[(lower_price_range <= self.best_df['price']) & (self.best_df['price'] <= upper_price_range)]

            # flatten to single dimension vector
            df_in_price_range = df_in_price_range[['title', 'alcohol', 'sweetness', 'acidity', 'body_rate', 'tannin_rate', 
                                                   'meat', 'cheese', 'seafood', 'vegetables', 'dessert', 'floral', 'fruit', 'cologne', 'oriental_spice',
                                                   'oriental_leather', 'earth', 'europe_a', 'europe_b', 'north_america', 'south_america', 
                                                   'new_world', 'korea']]

            self.best_fit_item = self.best_fit_item.append(df_in_price_range)
            self.best_fit_item.set_index('title', inplace=True)
            self.best_fit_item
        
        def retrieve_sim_item(self):
            # compare cosine similarity for each items to another
            sim_item = cosine_similarity(self.best_fit_item, self.best_fit_item)
            sim_item_df = pd.DataFrame(data=sim_item, index = self.best_fit_item.index, columns=self.best_fit_item.index)
            return sim_item_df.iloc[0,:].sort_values(ascending=False)

In [202]:
# ! 예시

In [203]:
a = food_selection([], pd.DataFrame())

In [204]:
a.food_button('meat')

['meat']

In [205]:
a.food_button('seafood')

['meat', 'seafood']

In [206]:
x = a.food_df()
x

Unnamed: 0,title,title.1
조엘 고트 워싱턴 레드,15.0,
1000스토리 까베르네 소비뇽,15.0,
1000스토리 진판델,11.0,
"19 크라임스, 매지스트레이트",6.0,
"19 크라임스, 업라이징 레드",5.0,
...,...,...
클렌드넌 패밀리 빈야드 스탈리온 소비뇽 블랑,,12.0
"타라파카, 그란 레세르바 샤르도네",,15.0
파고 데 로스 카페자네스 오 루아르 두 실 바리카,,11.0
"파고 데 로스 카페자네스, '오 루아르 두 실' 리아스",,12.0


In [189]:
fish_price = result[result['seafood']==1]
fish_price['counts'] = 0

In [190]:
fish_price

Unnamed: 0,index,title,image_link,winery,country,province,grape_type,wine_type,alcohol,sweetness,...,europe_a,europe_b,north_america,south_america,new_world,korea,red,white,wine_id,counts
0,1487,"오 봉 클리마, 샤도네이 로스 알라모스",http://img.wine21.com/WINE_MST/TITLE/0154000/W...,오 봉 클리마,미국,캘리포니아,샤르도네,white,9.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1217,0
12,1439,"얄리, 레세르바 샤르도네",http://img.wine21.com/WINE_MST/TITLE/0154000/1...,얄리,칠레,아콩카구아,샤르도네,white,13.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1190,0
69,671,"카바이, 루이자",http://img.wine21.com/WINE_MST/TITLE/0163000/W...,카바이,슬로베니아,고리스카,리볼라 지알라,white,13.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,636,0
71,136,"래번, 샤르도네 러시안 리버 밸리",http://img.wine21.com/WINE_MST/TITLE/0167000/W...,래번,미국,캘리포니아,샤르도네,white,14.5,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,132,0
72,1023,라 뷔띠니에르 샤르도네,http://img.wine21.com/WINE_MST/TITLE/0161000/W...,꺄브 안느 드 조와이유스,프랑스,서던,샤르도네,white,9.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,947,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4975,604,파고 에레다드 데 우루에냐 토르알토,http://img.wine21.com/WINE_MST/TITLE/0163000/W...,파고 에레다드 데 우루에냐,스페인,토로,템프라니요,red,15.5,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,572,0
4985,1039,"위라 위라, 애들레이드 샤도네이",http://img.wine21.com/WINE_MST/TITLE/0160000/W...,위라 위라 빈야즈,호주,사우스,샤르도네,white,8.5,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,958,0
4988,1504,칭퀘테레 뻬르골레 스파르세,http://img.wine21.com/WINE_MST/TITLE/0154000/W...,깐티나 칭퀘테레,이탈리아,리구리아,보스코,white,8.5,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1231,0
4993,1439,"얄리, 레세르바 샤르도네",http://img.wine21.com/WINE_MST/TITLE/0154000/1...,얄리,칠레,아콩카구아,샤르도네,white,13.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1190,0


In [None]:
fish_price_30000 = fish_price[fish_price['price'] <= 30000]
fish_price_30000 = fish_price_30000[['title', 'alcohol', 'sweetness', 'acidity', 'body_rate', 'tannin_rate', 
                      'meat', 'cheese', 'seafood', 'vegetables', 'dessert', 'floral', 'fruit', 'cologne', 'oriental_spice',
                     'oriental_leather', 'earth', 'europe_a', 'europe_b', 'north_america', 'south_america', 
                      'new_world', 'korea']]
best_fish_item = best_fish_item.append(fish_price_30000)
best_fish_item.set_index('title', inplace=True)
best_fish_item

In [None]:
from sklearn.metrics.pairwise import cosine_similarity 

item_sim = cosine_similarity(best_fish_item, best_fish_item)
items_sim_df = pd.DataFrame(data=item_sim, index = best_fish_item.index, columns=best_fish_item.index)
print(items_sim_df.shape)
items_sim_df

In [None]:
items_sim_df.iloc[0,:].sort_values(ascending=False)

# User-based Recommendation
- recommend top-3 wines, considering cosign similarity of customer's type, price_range, food, aroma and continent

## split the data by customer's type

## Cosign Similarity

### food you have with

### preferred aroma

### afforable price

### continent 

# Kakao Plus Friends Chatbot