# lightgbm 활용

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
wine_data_df = pd.read_csv('./data/vivino_data_241107.csv')
wine_data_df.head() 

Unnamed: 0,vintage_id,vintage_name,vintage_year,vintage_price,vintage_ratings_average,vintage_ratings_count,vintage_wine_id,vintage_wine_name,vintage_winery,vintage_country,vintage_region,vintage_wine_type_id,acidity,fizziness,intensity,sweetness,tannin,flavor,foods
0,177404982,Carl Loewen 1896 Riesling 2023,2023,82045,5.0,115,1945087,1896 Riesling 2023,Carl Loewen,Germany,Mosel,2,4.61,,3.32,2.05,,tree_fruit,"['Pork', 'Shellfish', 'Spicy food', 'Poultry',..."
1,1510217,Château Haut-Brion Pessac-Léognan (Premier Gra...,1989,3244496,4.8,1538,1152755,Pessac-Léognan (Premier Grand Cru Classé) 1989,Château Haut-Brion,France,Pessac-Léognan,1,4.22,,4.08,1.67,4.23,oak,"['Beef', 'Lamb', 'Game (deer, venison)', 'Poul..."
2,2611979,Château Latour Grand Vin Pauillac (Premier Gra...,1982,2759686,4.8,1445,1655970,Grand Vin Pauillac (Premier Grand Cru Classé) ...,Château Latour,France,Pauillac,1,4.12,,4.12,1.68,4.13,oak,"['Beef', 'Lamb', 'Game (deer, venison)', 'Poul..."
3,170413977,Domaines Ott Clos Mireille Rosé (Coeur de Grai...,2022,59594,4.8,581,1382222,Clos Mireille Rosé (Coeur de Grain) 2022,Domaines Ott,France,Côtes de Provence,4,3.77,,2.5,1.33,,citrus_fruit,"['Pork', 'Shellfish', 'Vegetarian', 'Poultry']"
4,1688597,Krug Clos du Mesnil 2000,2000,2175423,4.8,347,79635,Clos du Mesnil 2000,Krug,France,Champagne,3,4.54,4.22,3.97,,,non_oak,"['Pork', 'Rich fish (salmon, tuna etc)', 'Shel..."


In [33]:
# 와인 타입 숫자에서 문자열로 변경
# vintage_wine_type_id 
# 1-red, 2-white, 3-sparkling, 4-rose, 7-dessert, 24-fortified
vintage_wine_type_id_dic = {
    1 : 'Red',
    2 : 'White',
    3 : 'Sparkling',
    4 : 'Rose',
    7 : 'Dessert',
    24 : 'Fortified',
    25 : 'Unknown'
}

wine_data_df['vintage_wine_type_id'] = wine_data_df['vintage_wine_type_id'].apply(lambda x: vintage_wine_type_id_dic[x])

# 컬럼명 'vintage_wine_type_id'에서 'vintage_wine_type' 으로 변경
wine_data_df = wine_data_df.rename(columns={'vintage_wine_type_id': 'vintage_wine_type'})

In [46]:
wine_data_df[['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin']] = wine_data_df[['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin', ]].fillna(0)

In [47]:
# 속성, 타겟 설정
features = ['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin']
target = 'vintage_ratings_average'

In [48]:
from sklearn.model_selection import train_test_split

X = wine_data_df[features]
y = wine_data_df[target]

# 학습/데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# 학습데이터를 다시 학습/검증 분리
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, random_state=0, test_size=0.1)
X_tr.shape, X_val.shape, X_test.shape, y_tr.shape, y_val.shape, y_test.shape

((2646, 5), (294, 5), (735, 5), (2646,), (294,), (735,))

In [49]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, classification_report

# LGBMRegressor 사용
lgbm = LGBMRegressor()
eval_set = [(X_tr, y_tr), (X_val, y_val)]

# 모델 학습
lgbm.fit(X_tr, y_tr, eval_set=eval_set)

# 예측
y_train_pred = lgbm.predict(X_train)
y_test_pred = lgbm.predict(X_test)

# 평가 지표 출력
print('학습 MSE: ', mean_squared_error(y_train, y_train_pred))
print('테스트 MSE: ', mean_squared_error(y_test, y_test_pred))

# R^2 값 출력
print('학습 R^2: ', r2_score(y_train, y_train_pred))
print('테스트 R^2: ', r2_score(y_test, y_test_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 722
[LightGBM] [Info] Number of data points in the train set: 2646, number of used features: 5
[LightGBM] [Info] Start training from score 4.079025
학습 MSE:  0.008465016406708832
테스트 MSE:  0.014431528989761765
학습 R^2:  0.818479397273819
테스트 R^2:  0.7142271225300335


In [50]:
def recommend_top_wines(acidity, fizziness, intensity, sweetness, tannin, top_n=5):
    # 사용자 입력 값 준비
    user_input = np.array([[acidity, fizziness, intensity, sweetness, tannin]])

    # 데이터의 모든 와인에 대해 예측 점수를 계산
    feature_cols = ['acidity', 'fizziness', 'intensity', 'sweetness', 'tannin']
    wine_data_df['predicted_score'] = lgbm.predict(wine_data_df[feature_cols])

    # 예측 점수 기반 상위 추천 와인 선택
    top_wines = wine_data_df.nlargest(top_n, 'predicted_score')[['vintage_wine_name', 'vintage_year', 'vintage_country', 'vintage_region', 'vintage_wine_type', 'flavor', 'acidity', 'fizziness', 'intensity', 'sweetness', 'tannin']]

    return top_wines

# 사용자 입력 예시
acidity = float(input("Acidity: "))
fizziness = float(input("Fizziness: "))
intensity = float(input("Intensity: "))
sweetness = float(input("Sweetness: "))
tannin = float(input("Tannin: "))

# 상위 추천 와인 목록 표시
top_recommended_wines = recommend_top_wines(acidity, fizziness, intensity, sweetness, tannin)
print("Top recommended wines:")
top_recommended_wines

Top recommended wines:


Unnamed: 0,vintage_wine_name,vintage_year,vintage_country,vintage_region,vintage_wine_type,flavor,acidity,fizziness,intensity,sweetness,tannin
11,Chateauneuf-du-Pape Réserve des Célestins 1990,1990,France,Châteauneuf-du-Pape,Red,earth,3.14,0.0,4.48,2.01,3.03
44,Chateauneuf-du-Pape Réserve des Célestins 2007,2007,France,Châteauneuf-du-Pape,Red,earth,3.14,0.0,4.48,2.01,3.03
114,Chateauneuf-du-Pape Réserve des Célestins 2011,2011,France,Châteauneuf-du-Pape,Red,earth,3.14,0.0,4.48,2.01,3.03
131,Chateauneuf-du-Pape Réserve des Célestins 2005,2005,France,Châteauneuf-du-Pape,Red,earth,3.14,0.0,4.48,2.01,3.03
2,Grand Vin Pauillac (Premier Grand Cru Classé) ...,1982,France,Pauillac,Red,oak,4.12,0.0,4.12,1.68,4.13
