In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
wine_data_df = pd.read_csv('./data/vivino_data_241107.csv')
wine_data_df.head() 

Unnamed: 0,vintage_id,vintage_name,vintage_year,vintage_price,vintage_ratings_average,vintage_ratings_count,vintage_wine_id,vintage_wine_name,vintage_winery,vintage_country,vintage_region,vintage_wine_type_id,acidity,fizziness,intensity,sweetness,tannin,flavor,foods
0,177404982,Carl Loewen 1896 Riesling 2023,2023,82045,5.0,115,1945087,1896 Riesling 2023,Carl Loewen,Germany,Mosel,2,4.61,,3.32,2.05,,tree_fruit,"['Pork', 'Shellfish', 'Spicy food', 'Poultry',..."
1,1510217,Château Haut-Brion Pessac-Léognan (Premier Gra...,1989,3244496,4.8,1538,1152755,Pessac-Léognan (Premier Grand Cru Classé) 1989,Château Haut-Brion,France,Pessac-Léognan,1,4.22,,4.08,1.67,4.23,oak,"['Beef', 'Lamb', 'Game (deer, venison)', 'Poul..."
2,2611979,Château Latour Grand Vin Pauillac (Premier Gra...,1982,2759686,4.8,1445,1655970,Grand Vin Pauillac (Premier Grand Cru Classé) ...,Château Latour,France,Pauillac,1,4.12,,4.12,1.68,4.13,oak,"['Beef', 'Lamb', 'Game (deer, venison)', 'Poul..."
3,170413977,Domaines Ott Clos Mireille Rosé (Coeur de Grai...,2022,59594,4.8,581,1382222,Clos Mireille Rosé (Coeur de Grain) 2022,Domaines Ott,France,Côtes de Provence,4,3.77,,2.5,1.33,,citrus_fruit,"['Pork', 'Shellfish', 'Vegetarian', 'Poultry']"
4,1688597,Krug Clos du Mesnil 2000,2000,2175423,4.8,347,79635,Clos du Mesnil 2000,Krug,France,Champagne,3,4.54,4.22,3.97,,,non_oak,"['Pork', 'Rich fish (salmon, tuna etc)', 'Shel..."


In [3]:
# 와인 타입 숫자에서 문자열로 변경
# vintage_wine_type_id 
# 1-red, 2-white, 3-sparkling, 4-rose, 7-dessert, 24-fortified
vintage_wine_type_id_dic = {
    1 : 'Red',
    2 : 'White',
    3 : 'Sparkling',
    4 : 'Rose',
    7 : 'Dessert',
    24 : 'Fortified',
    25 : 'Unknown'
}

wine_data_df['vintage_wine_type_id'] = wine_data_df['vintage_wine_type_id'].apply(lambda x: vintage_wine_type_id_dic[x])

# 컬럼명 'vintage_wine_type_id'에서 'vintage_wine_type' 으로 변경
wine_data_df = wine_data_df.rename(columns={'vintage_wine_type_id': 'vintage_wine_type'})

In [6]:
# 중복된 행 찾기
duplicates = wine_data_df[wine_data_df.duplicated()]
duplicates.sort_values(by='vintage_id', ascending=False)
wine_data_df = wine_data_df.drop_duplicates(keep=False)

In [7]:
wine_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 0 to 1999
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   vintage_id               2000 non-null   int64  
 1   vintage_name             2000 non-null   object 
 2   vintage_year             2000 non-null   object 
 3   vintage_price            2000 non-null   int64  
 4   vintage_ratings_average  2000 non-null   float64
 5   vintage_ratings_count    2000 non-null   int64  
 6   vintage_wine_id          2000 non-null   int64  
 7   vintage_wine_name        2000 non-null   object 
 8   vintage_winery           1999 non-null   object 
 9   vintage_country          2000 non-null   object 
 10  vintage_region           2000 non-null   object 
 11  vintage_wine_type        2000 non-null   object 
 12  acidity                  1947 non-null   float64
 13  fizziness                105 non-null    float64
 14  intensity                1947

In [8]:
# catboost 사용을 위해 속성 컬럼 정수형으로 변경
# 소수점 이하 반올림 처리
# NaN 값은 0 으로 변경
wine_data_df[['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin']] = wine_data_df[['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin', ]].fillna(0).round().astype(int)
wine_data_df.head()

Unnamed: 0,vintage_id,vintage_name,vintage_year,vintage_price,vintage_ratings_average,vintage_ratings_count,vintage_wine_id,vintage_wine_name,vintage_winery,vintage_country,vintage_region,vintage_wine_type,acidity,fizziness,intensity,sweetness,tannin,flavor,foods
0,177404982,Carl Loewen 1896 Riesling 2023,2023,82045,5.0,115,1945087,1896 Riesling 2023,Carl Loewen,Germany,Mosel,White,5,0,3,2,0,tree_fruit,"['Pork', 'Shellfish', 'Spicy food', 'Poultry',..."
1,1510217,Château Haut-Brion Pessac-Léognan (Premier Gra...,1989,3244496,4.8,1538,1152755,Pessac-Léognan (Premier Grand Cru Classé) 1989,Château Haut-Brion,France,Pessac-Léognan,Red,4,0,4,2,4,oak,"['Beef', 'Lamb', 'Game (deer, venison)', 'Poul..."
2,2611979,Château Latour Grand Vin Pauillac (Premier Gra...,1982,2759686,4.8,1445,1655970,Grand Vin Pauillac (Premier Grand Cru Classé) ...,Château Latour,France,Pauillac,Red,4,0,4,2,4,oak,"['Beef', 'Lamb', 'Game (deer, venison)', 'Poul..."
3,170413977,Domaines Ott Clos Mireille Rosé (Coeur de Grai...,2022,59594,4.8,581,1382222,Clos Mireille Rosé (Coeur de Grain) 2022,Domaines Ott,France,Côtes de Provence,Rose,4,0,2,1,0,citrus_fruit,"['Pork', 'Shellfish', 'Vegetarian', 'Poultry']"
4,1688597,Krug Clos du Mesnil 2000,2000,2175423,4.8,347,79635,Clos du Mesnil 2000,Krug,France,Champagne,Sparkling,5,4,4,0,0,non_oak,"['Pork', 'Rich fish (salmon, tuna etc)', 'Shel..."


In [9]:
# vintage_winery 와 flavor 컬럼 내 NaN 셀에 None 값 추가
wine_data_df[['vintage_winery','flavor']] = wine_data_df[['vintage_winery', 'flavor']].fillna('None')
wine_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 0 to 1999
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   vintage_id               2000 non-null   int64  
 1   vintage_name             2000 non-null   object 
 2   vintage_year             2000 non-null   object 
 3   vintage_price            2000 non-null   int64  
 4   vintage_ratings_average  2000 non-null   float64
 5   vintage_ratings_count    2000 non-null   int64  
 6   vintage_wine_id          2000 non-null   int64  
 7   vintage_wine_name        2000 non-null   object 
 8   vintage_winery           2000 non-null   object 
 9   vintage_country          2000 non-null   object 
 10  vintage_region           2000 non-null   object 
 11  vintage_wine_type        2000 non-null   object 
 12  acidity                  2000 non-null   int32  
 13  fizziness                2000 non-null   int32  
 14  intensity                2000

In [51]:
from sklearn.model_selection import train_test_split
# 데이터 분리 
X = wine_data_df[['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin', 'vintage_wine_type']]
y = wine_data_df['vintage_ratings_average']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1600, 6), (400, 6), (1600,), (400,))

In [52]:
X_train.columns

Index(['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin',
       'vintage_wine_type'],
      dtype='object')

In [53]:
# CatBoost 데이터 준비
# - 범주형 데이터 처리를 위한 Pool 객체 생성
from catboost import Pool

# 카테고리 컬럼명 선언 (연속형 데이터 컬럼은 제외)
cat_features = ['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin', 'vintage_wine_type']


X_train_pool = Pool(X_train, y_train, cat_features=cat_features)
X_test_pool = Pool(X_test, y_test, cat_features=cat_features)

In [54]:
# CatBoostRegressor 회귀 모델 학습
from catboost import CatBoostRegressor

cb_reg = CatBoostRegressor(
    n_estimators=5000, # 반복횟수 (내부 예측기 개수) 
    learning_rate=0.03, # 학습률  
    depth=3, 
    loss_function='RMSE', # 손실함수 (기본값)
    eval_metric='RMSE' # 평가지표 (기본값)
)

# 훈련
cb_reg.fit(X_train_pool, eval_set=X_test_pool, verbose=200) # 100번째마다 출력


0:	learn: 0.1890221	test: 0.2043260	best: 0.2043260 (0)	total: 7.25ms	remaining: 36.3s
200:	learn: 0.1752807	test: 0.1971757	best: 0.1971757 (200)	total: 1.48s	remaining: 35.3s
400:	learn: 0.1736427	test: 0.1968726	best: 0.1968718 (397)	total: 2.82s	remaining: 32.4s
600:	learn: 0.1725451	test: 0.1966090	best: 0.1966049 (596)	total: 4.18s	remaining: 30.6s
800:	learn: 0.1716050	test: 0.1963393	best: 0.1963393 (800)	total: 5.53s	remaining: 29s
1000:	learn: 0.1707308	test: 0.1961283	best: 0.1961256 (998)	total: 6.88s	remaining: 27.5s
1200:	learn: 0.1698889	test: 0.1960466	best: 0.1960254 (1187)	total: 8.28s	remaining: 26.2s
1400:	learn: 0.1690442	test: 0.1959013	best: 0.1959013 (1400)	total: 9.65s	remaining: 24.8s
1600:	learn: 0.1682279	test: 0.1956885	best: 0.1956677 (1587)	total: 11s	remaining: 23.4s
1800:	learn: 0.1675684	test: 0.1956393	best: 0.1956038 (1748)	total: 12.5s	remaining: 22.1s
2000:	learn: 0.1669223	test: 0.1955251	best: 0.1955238 (1991)	total: 13.8s	remaining: 20.7s
2200:	

<catboost.core.CatBoostRegressor at 0x24dd53c9250>

In [55]:
# 가장 영향력이 있는 특성은?
pd.DataFrame({
    'column': X_train.columns,
    'importance': cb_reg.feature_importances_
})

Unnamed: 0,column,importance
0,acidity,15.255086
1,fizziness,7.406104
2,intensity,25.438052
3,sweetness,21.374633
4,tannin,13.387924
5,vintage_wine_type,17.138202


In [88]:
wine_types = wine_data_df['vintage_wine_type']
print(wine_types.shape)
wine_types

(2000,)


0           White
1             Red
2             Red
3            Rose
4       Sparkling
          ...    
1995          Red
1996        White
1997      Dessert
1998          Red
1999        White
Name: vintage_wine_type, Length: 2000, dtype: object

In [95]:
# 사용자 데이터 입력
# 'acidity', 'fizziness', 'intensity', 'sweetness', 'tannin', 'vintage_wine_type'
user_input = ['3','1','1','1','1','Red']
pred_results = []

for wine_type in wine_types:
    user_input[-2] = wine_type
    vintage_ratings_average_pred = cb_reg.predict(user_input)
    pred_results.append(vintage_ratings_average_pred)

pred_results

[4.133845301438231,
 4.144627478872043,
 4.144627478872043,
 4.155547432803078,
 4.110527908922457,
 4.133845301438231,
 4.1456272223427435,
 4.1581761643731605,
 4.144627478872043,
 4.1581761643731605,
 4.144627478872043,
 4.144627478872043,
 4.1581761643731605,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.110527908922457,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.1581761643731605,
 4.110527908922457,
 4.144627478872043,
 4.1581761643731605,
 4.144627478872043,
 4.144627478872043,
 4.144627478872043,
 4.110527908922457,
 4.144627478872043,
 4.144627478872043,
 4.110527908922457,
 4.1456272223427435,
 4.144627478872043,
 4.110527908922457,
 4.144627478872043,
 4.144627478872043,
 4.110527908922457,
 4.144627478872043,
 4.133845301438231,
 4.1446274788

In [98]:
result_df = pd.DataFrame({
    'wine_name': wine_data_df['vintage_wine_name'],
    'year': wine_data_df['vintage_year'],
    'country': wine_data_df['vintage_country'],
    'region': wine_data_df['vintage_region'],
    'wine_types': wine_types,
    'acidity': wine_data_df['acidity'],
    'fizziness': wine_data_df['fizziness'],
    'intensity': wine_data_df['intensity'],
    'sweetness': wine_data_df['sweetness'],
    'tannin': wine_data_df['tannin'],
    'flavor': wine_data_df['flavor'],
    'ratings': wine_data_df['vintage_ratings_average'],
    'price' : wine_data_df['vintage_price'],
    'vintage_ratings_average_pred': pred_results,
})
result_df = result_df.sort_values(by='vintage_ratings_average_pred', ascending=False).head(10)
result_df

Unnamed: 0,wine_name,year,country,region,wine_types,acidity,fizziness,intensity,sweetness,tannin,flavor,ratings,price,vintage_ratings_average_pred
1448,Gran Duque d'Alba Gran Reserva N.V.,N.V.,Spain,Aragón,Unknown,0,0,0,0,0,dried_fruit,4.1,60377,4.184484
1663,Sauternes 1929,1929,France,Sauternes,Dessert,4,0,5,5,0,earth,4.0,683701,4.158176
1167,Eiswein 2004,2004,Germany,Rheinhessen,Dessert,0,0,0,0,0,tree_fruit,4.2,58220,4.158176
1280,Sauternes 1932,1932,France,Sauternes,Dessert,4,0,5,5,0,earth,4.1,652628,4.158176
1936,Aria Late Picked Riesling 2020,2020,New Zealand,Waipara,Dessert,5,0,4,5,0,tree_fruit,4.0,36996,4.158176
403,Barsac (Premier Grand Cru Classé) 2007,2007,France,Barsac,Dessert,4,0,5,5,0,earth,4.4,141714,4.158176
1572,Saperavi - Muscat Semi-Sweet 2009,2009,Georgia,Kakheti,Dessert,0,0,0,0,0,red_fruit,4.1,112128,4.158176
1931,Gemma del Sole Greco Bianco Passito 2013,2013,Italy,Calabria,Dessert,0,0,0,0,0,dried_fruit,4.0,62533,4.158176
1794,Late Harvest Zinfandel 2005,2005,United States,Dry Creek Valley,Dessert,4,0,4,5,0,oak,4.0,155254,4.158176
442,Sauternes (Premier Grand Cru Classé) 1986,1986,France,Sauternes,Dessert,4,0,5,5,0,earth,4.4,93233,4.158176


In [47]:
def recommend_wine(acidity, fizziness, intensity, sweetness, tannin, n_recommendations=5):
    input_data = pd.DataFrame({
        'acidity': [acidity],
        'fizziness': [fizziness],
        'intensity': [intensity],
        'sweetness': [sweetness],
        'tannin': [tannin]
    })

    # Predict the target (e.g., rating or preference score) for the input data
    predicted_score = cb_reg.predict(input_data)

    wine_data_df['similarity'] = np.abs(wine_data_df['vintage_ratings_average'] - predicted_score) 
    recommendations = wine_data_df.sort_values('similarity').head(n_recommendations)

    # 추천 와인
    return recommendations[['vintage_wine_name', 'vintage_year', 'vintage_country', 'vintage_region', 'vintage_wine_type', 'flavor', 'acidity', 'fizziness', 'intensity', 'sweetness', 'tannin', 'vintage_price']] 

# 예제
user_acidity = int(input("Enter acidity level: "))
user_fizziness = int(input("Enter fizziness level: "))
user_intensity = int(input("Enter intensity level: "))
user_sweetness = int(input("Enter sweetness level: "))
user_tannin = int(input("Enter tannin level: "))

recommendations = recommend_wine(user_acidity, user_fizziness, user_intensity, user_sweetness, user_tannin)
recommendations


Unnamed: 0,vintage_wine_name,vintage_year,vintage_country,vintage_region,vintage_wine_type,flavor,acidity,fizziness,intensity,sweetness,tannin,vintage_price
1410,Gran Corte 2007,2007,Argentina,Vista Flores,Red,oak,3,0,4,3,3,45282
1473,The Holy Trinity Red Blend 2018,2018,Australia,Barossa,Red,oak,3,0,5,2,3,68139
1472,Pessac-Léognan Blanc 2012,2012,France,Pessac-Léognan,White,oak,3,0,4,2,0,404003
1471,Reserve Chardonnay 2019,2019,New Zealand,Waiheke Island,White,oak,4,0,3,1,0,41108
1470,Barbera d'Alba Conca Tre Pile 2008,2008,Italy,Barbera d'Alba,Red,oak,4,0,4,2,2,88409
