In [60]:
import os
import sys
import gc
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# 0. 데이터 불러오기

In [4]:
with open('./data/train_v2_201130.json', 'r') as f:
    train = json.load(f)

In [5]:
with open('./data/test_v2_201130.json', 'r') as f:
    test = json.load(f)

In [114]:
with open('./data/User_Meta_v1_201129.json', 'r') as f:
    user_meta = json.load(f)

In [20]:
with open('./data/Wine_Meta_v1_201129.json', 'r') as f:
    wine = json.load(f)

In [33]:
with open('./data/Wine_Token_v1_201129.json', 'r') as f:
    token = json.load(f)

In [68]:
with open('./data/RAW_v2_201130.json', 'r') as f:
    raw = json.load(f)

# 1. DNN train/test set 만들기

In [70]:
df = (pd.DataFrame(zip(
    raw['wine_id'].values(),
    raw['vintage_id'].values(),
    raw['userID'].values(),
    raw['rating_per_user'].values()),
                   columns = ['wine_id', 'vintage_id', 'userID', 'rating'])
     )

In [71]:
df

Unnamed: 0,wine_id,vintage_id,userID,rating
0,1141133,164942680,19484511,4.0
1,1141133,164942680,352674,4.0
2,1141133,164942680,2148498,1.5
3,1141133,164942680,3450270,4.5
4,1141133,164942680,17786617,4.0
...,...,...,...,...
944094,63654,2435472,11274168,4.0
944095,5602,2293611,11274168,4.5
944096,1396664,8169599,11274168,3.0
944097,1218423,1590767,11274168,4.0


In [72]:
print('wine 개수:', df.wine_id.nunique())
print('vintage 개수:', df.vintage_id.nunique())
print('user 개수:', df.userID.nunique())

wine 개수: 50861
vintage 개수: 307405
user 개수: 6343


In [73]:
user_ids = df['userID'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

wine_ids = df["wine_id"].unique().tolist()
wine2wine_encoded = {x: i for i, x in enumerate(wine_ids)}
wine_encoded2wine = {i: x for i, x in enumerate(wine_ids)}

In [74]:
df_copy = df.copy()

In [75]:
df['userID'] = df["userID"].map(user2user_encoded)
df["wine_id"] = df["wine_id"].map(wine2wine_encoded)

In [76]:
df

Unnamed: 0,wine_id,vintage_id,userID,rating
0,0,164942680,0,4.0
1,0,164942680,1,4.0
2,0,164942680,2,1.5
3,0,164942680,3,4.5
4,0,164942680,4,4.0
...,...,...,...,...
944094,12120,2435472,5425,4.0
944095,359,2293611,5425,4.5
944096,39281,8169599,5425,3.0
944097,9125,1590767,5425,4.0


In [77]:
df_copy

Unnamed: 0,wine_id,vintage_id,userID,rating
0,1141133,164942680,19484511,4.0
1,1141133,164942680,352674,4.0
2,1141133,164942680,2148498,1.5
3,1141133,164942680,3450270,4.5
4,1141133,164942680,17786617,4.0
...,...,...,...,...
944094,63654,2435472,11274168,4.0
944095,5602,2293611,11274168,4.5
944096,1396664,8169599,11274168,3.0
944097,1218423,1590767,11274168,4.0


In [82]:
user = df[['userID']].drop_duplicates().reset_index(drop = True).sort_values('userID')
user = user.rename(columns = {'userID' : 'enc_userID'})
user

Unnamed: 0,userID
0,0
1,1
2,2
3,3
4,4
...,...
6338,6338
6339,6339
6340,6340
6341,6341


In [87]:
def makedataset(df, user):
    # 모든 user의 소비 wine_id 넣어주기
    history = {}
    recent = []
    label = []
    for u in tqdm(user.userID.unique()):
        history[u] = list(df.loc[df['userID'] == u, 'wine_id']) # 모든 구매내역
        label.append(history[u][-1]) # 마지막 구매
        recent.append(history[u][-11:-1]) # 최근 10개
    
    # 모든 user의 평점&최신 순 wine 뽑기
    like = {}
    dislike = {}
    for u in tqdm(user.userID.unique()):
        temp = df.loc[df['userID']== u].sort_index().iloc[:-1] # label 제외
        like[u] = list(temp.loc[temp['rating'] > 4].reset_index().sort_values(['rating', 'index'], ascending = [False, False])['wine_id'][:10])
        dislike[u] = list(temp.loc[temp['rating'] < 4].reset_index().sort_values(['rating', 'index'], ascending = [True, False])['wine_id'][:10])

    user['history'] = list(history.values())
    user['recent'] = recent
    user['label'] = label
    user['like'] = list(like.values())
    user['dislike'] = list(dislike.values())
    
    return user

In [88]:
user = makedataset(df, user)

100%|██████████| 6343/6343 [00:07<00:00, 854.52it/s]
100%|██████████| 6343/6343 [00:46<00:00, 137.21it/s]


In [96]:
df.columns = ['enc_wine_id', 'enc_vintage_id', 'enc_userID', 'rating']

In [101]:
df = pd.concat([df_copy, df], axis = 1).iloc[:, :-1]

In [102]:
df

Unnamed: 0,wine_id,vintage_id,userID,rating,enc_wine_id,enc_vintage_id,enc_userID
0,1141133,164942680,19484511,4.0,0,164942680,0
1,1141133,164942680,352674,4.0,0,164942680,1
2,1141133,164942680,2148498,1.5,0,164942680,2
3,1141133,164942680,3450270,4.5,0,164942680,3
4,1141133,164942680,17786617,4.0,0,164942680,4
...,...,...,...,...,...,...,...
944094,63654,2435472,11274168,4.0,12120,2435472,5425
944095,5602,2293611,11274168,4.5,359,2293611,5425
944096,1396664,8169599,11274168,3.0,39281,8169599,5425
944097,1218423,1590767,11274168,4.0,9125,1590767,5425


In [104]:
user = user[['userID', 'history', 'recent', 'like', 'dislike', 'label']]

In [110]:
user

Unnamed: 0,userID,history,recent,like,dislike,label
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[209, 210, 211, 212, 213, 214, 215, 216, 217, ...","[202, 199, 194, 151, 168, 127, 136, 101, 81, 78]","[211, 31, 7, 217, 207, 201, 192, 191, 187, 183]",95
1,1,"[0, 376, 844, 945, 1094, 1721, 1999, 2346, 139...","[9567, 9568, 5735, 9569, 9570, 4129, 9571, 955...","[9540, 9495, 2269, 9455, 9439, 8145, 3149, 314...","[2263, 9526, 9243, 2421, 9570, 5735, 9567, 956...",579
2,2,"[0, 410, 450, 677, 846, 945, 502, 1381, 2339, ...","[15582, 15563, 15583, 15584, 15585, 15586, 155...","[15499, 15545, 15586, 15563, 5289, 3357, 15563...","[15561, 15556, 3703, 1234, 15575, 15569, 15565...",15589
3,3,"[0, 164, 184, 677, 767, 1334, 1498, 1920, 1179...","[301, 15651, 21186, 21023, 15530, 5286, 9597, ...","[301, 5115, 14261, 15611, 16222, 21168, 18015,...","[21088, 5831, 5223, 12503, 15386, 21110, 11540...",21156
4,4,"[0, 163, 225, 1886, 2726, 2099, 4037, 4792, 48...","[21598, 11681, 21599, 18471, 15664, 14586, 216...","[1955, 263, 10077, 4679, 20631, 14038, 4926, 6...","[21600, 190, 31, 21577, 21560, 21545, 14816, 8...",9361
...,...,...,...,...,...,...
6338,6338,"[769, 1409, 8145, 46902, 6612, 4840, 11316, 40...","[9826, 1387, 1435, 18472, 18472, 40348, 7695, ...","[1324, 6112, 242, 7695, 6468, 6112, 18244, 182...","[18332, 8405, 18472, 18472, 18540, 2002, 2332,...",350
6339,6339,"[3337, 442, 1051, 717, 44543, 31922, 2173, 483...","[6532, 42282, 2743, 4725, 6534, 6608, 6611, 16...","[16955, 6608, 6612, 1848, 19652, 4418, 2573, 7...","[19106, 1337, 6637, 8283, 31902, 12745, 47533,...",953
6340,6340,"[2107, 2576, 46476, 2806, 25997, 44159, 5499, ...","[3703, 18113, 5528, 657, 5554, 18238, 1920, 16...","[336, 1920, 18238, 5554, 18113, 3703, 36293, 2...","[813, 38534, 3375, 29261, 17371, 26521, 15006,...",48838
6341,6341,"[8574, 19701, 11336, 14057, 1029, 442, 29577, ...","[18980, 36012, 475, 3605, 1677, 10070, 6674, 1...","[1433, 6674, 18980, 49, 6124, 9592, 1213, 1368...","[16610, 2412, 11690, 1163, 36012, 43465, 2301,...",28273


In [124]:
user_info = (pd.DataFrame(zip(
    user_meta['userID'].values(),
    user_meta['user_follower_count'].values(),
    user_meta['user_following_count'].values(),
    user_meta['user_rating_count'].values(),
    user_meta['user_rating_sum'].values(),
    user_meta['reviews_count'].values())
                         , columns = ['userID', 'follower', 'following', 'rating_count', 'rating_sum', 'review_count']))

In [105]:
import joblib
joblib.dump({
    'data' : df,
    'user' : user
}, 'dataset.pkl')

['dataset.pkl']

In [149]:
enc_user_info = (df.merge(user_info,
                          on = 'userID',
                          how = 'left')[['enc_userID', 'follower', 'following', 'rating_count', 'rating_sum', 'review_count']]
                .drop_duplicates('enc_userID')
                )

In [150]:
enc_user_info

Unnamed: 0,enc_userID,follower,following,rating_count,rating_sum,review_count
0,0,34.0,65.0,383.0,1453.0,323.0
1,1,425.0,915.0,382.0,1516.5,297.0
2,2,332.0,47.0,378.0,1168.0,347.0
3,3,2.0,2.0,412.0,1602.5,352.0
4,4,46.0,75.0,412.0,1589.0,170.0
...,...,...,...,...,...,...
939795,6338,17.0,16.0,260.0,1107.5,173.0
940171,6339,16.0,16.0,260.0,933.0,11.0
940752,6340,5.0,1.0,260.0,1115.0,175.0
941644,6341,19.0,34.0,259.0,865.5,169.0


In [152]:
user = user.merge(enc_user_info, on = 'enc_userID', how = 'left')

In [156]:
import joblib
joblib.dump({
    'data' : df,
    'user' : user
}, 'dataset.pkl')

['dataset.pkl']

# 2. wine 군집으로 label 만들기

In [213]:
wine.keys()

dict_keys(['name', 'wine_id', 'rating_count', 'rating_average', 'rating_distribution', 'label_count', 'review_count', 'type_id', 'body', 'acidity', 'alcohol', 'food', 'grapes', 'grapes_id', 'grapes_count', 'grape_composition', 'rank', 'region_id', 'region_name', 'country_code', 'country_most_used_grapes_id', 'country_most_used_grapes_name', 'country_most_used_grapes_wines_count', 'winery_id', 'winery_name', 'winery_ratings_count', 'winery_ratings_average', 'winery_labels_count', 'winery_wines_count'])

In [265]:
columns = ['wine_id', 'name', 'rating_count', 'rating_average', 'rating_distribution', 'label_count', 'review_count',
          'type_id', 'body', 'acidity', 'alcohol', 'food', 'grapes_id', 'grapes_count', 'grape_composition', 'region_id',
          'country_code', 'country_most_used_grapes_id', 'country_most_used_grapes_wines_count', 'winery_id', 'winery_ratings_count',
          'winery_ratings_average', 'winery_labels_count', 'winery_wines_count']

In [266]:
item = (pd.DataFrame(zip(
    wine['wine_id'].values(), # wine_id
    wine['name'].values(), # wine명
    wine['rating_count'].values(), # wine rating 개수 -> continuous
    wine['rating_average'].values(), # wine rating 평균 -> continuous
#     wine['rating_distribution'].values(), # wine rating
    wine['label_count'].values(), # wine label 개수 -> continuous
    wine['review_count'].values(), # wine review 개수 -> continuous
    wine['type_id'].values(), # wine type -> categorical
    wine['body'].values(), # wine taste - body -> continuous
    wine['acidity'].values(), # wine taste - acidity -> continuous
    wine['alcohol'].values(), # wine taste - alcohol -> continuous
    wine['food'].values(), # wine with food -> categorical
    wine['grapes_id'].values(), # wine 포도 원산지? -> categorical
    wine['grapes_count'].values(), # wine 포도 원산지 개수? -> continuous
#     wine['grape_composition'].values(),
#     wine['rank'].values(),
    wine['region_id'].values(), # wine 생산지역 -> categorical
#     wine['region_name'].values(),
    wine['country_code'].values(), # wine 생산국가 -> categorical
#     wine['country_most_used_grapes_id'].values(), wine 생산국가로 구별 가능
    wine['country_most_used_grapes_wines_count'].values(),
    wine['winery_id'].values(),
    wine['winery_ratings_count'].values(),
    wine['winery_ratings_average'].values(),
    wine['winery_labels_count'].values(),
    wine['winery_wines_count'].values())
                    ,columns = columns))

In [276]:
wine['grape_composition']['0']

{'2': 100}

In [278]:
pd.DataFrame(zip(wine['country_code'].values(), wine['country_most_used_grapes_id'].values()))

Unnamed: 0,0,1
0,us,"[2, 14, 5]"
1,fr,"[14, 10, 5]"
2,us,"[2, 14, 5]"
3,us,"[2, 14, 5]"
4,us,"[2, 14, 5]"
...,...,...
50855,it,"[16, 10, 5]"
50856,it,"[16, 10, 5]"
50857,it,"[16, 10, 5]"
50858,fr,"[14, 10, 5]"


In [274]:
for k in wine['grape_composition'].keys():
    for i in wine['grape_composition'][k]:
        

{'0': {'2': 100},
 '1': {'1': 100},
 '2': {},
 '3': {'10': 100, '2': 0},
 '4': {},
 '5': {},
 '6': {'14': 100},
 '7': {},
 '8': {'2': 100},
 '9': {},
 '10': {},
 '11': {},
 '12': {},
 '13': {'1': 100},
 '14': {'8': 100},
 '15': {'9': 100},
 '16': {'14': 100},
 '17': {},
 '18': {},
 '19': {'1': 0, '22': 0, '25': 0, '9': 0},
 '20': {'14': 100},
 '21': {'16': 100},
 '22': {'2': 100},
 '23': {'92': 100},
 '24': {},
 '25': {},
 '26': {'80': 100},
 '27': {'14': 100},
 '28': {},
 '29': {'19': 100},
 '30': {},
 '31': {'5': 100},
 '32': {'2': 100},
 '33': {},
 '34': {},
 '35': {'19': 0, '25': 0, '31': 0, '8': 0},
 '36': {},
 '37': {},
 '38': {'14': 100},
 '39': {'80': 100},
 '40': {},
 '41': {'22': 100},
 '42': {'14': 100},
 '43': {},
 '44': {},
 '45': {},
 '46': {},
 '47': {'14': 100},
 '48': {},
 '49': {'20': 100},
 '50': {'1': 100},
 '51': {'14': 100},
 '52': {},
 '53': {},
 '54': {'12': 100, '9': 100},
 '55': {},
 '56': {'97': 100},
 '57': {},
 '58': {},
 '59': {},
 '60': {},
 '61': {'14': 

In [272]:
pd.DataFrame(wine['grape_composition'].values())

Unnamed: 0,2,1,10,14,8,9,22,25,16,92,...,1153,289,122,1262,303,1327,568,552,346,232
0,100.0,,,,,,,,,,...,,,,,,,,,,
1,,100.0,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,0.0,,100.0,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50855,,,,,,,,,,,...,,,,,,,,,,
50856,,,,,,,,,,,...,,,,,,,,,,
50857,,,,,,,,,,,...,,,,,,,,,,
50858,,,,,,,,,,,...,,,,,,,,,,


In [268]:
item.head()

Unnamed: 0,wine_id,name,rating_count,rating_average,rating_distribution,label_count,review_count,type_id,body,acidity,...,grape_composition,region_id,country_code,country_most_used_grapes_id,country_most_used_grapes_wines_count,winery_id,winery_ratings_count,winery_ratings_average,winery_labels_count,winery_wines_count
0,1938520,1882 Cabernet Sauvignon,1697,4.1,"{'1': 11, '2': 26, '3': 315, '4': 1060, '5': 286}",14879,16,1,5.0,2.0,...,{'2': 100},105.0,us,"[2, 14, 5]","[687800, 483935, 510440]",2412.0,18888.0,4.3,121618.0,62.0
1,14604,Les Bessards Hermitage,1078,4.3,"{'1': 12, '2': 13, '3': 95, '4': 625, '5': 333}",5370,3,1,5.0,3.0,...,{'1': 100},535.0,fr,"[14, 10, 5]","[483935, 482768, 510440]",7636.0,72079.0,3.8,462021.0,57.0
2,1930757,Patriarch Estate Grown,1072,4.6,"{'1': 1, '2': 4, '3': 53, '4': 438, '5': 576}",6042,25,1,4.0,3.0,...,{},88.0,us,"[2, 14, 5]","[687800, 483935, 510440]",1905.0,7747.0,4.4,49362.0,21.0
3,1564280,Merlot,3577,4.3,"{'1': 15, '2': 28, '3': 351, '4': 2182, '5': 1...",18748,52,1,4.0,3.0,...,"{'10': 100, '2': 0}",24.0,us,"[2, 14, 5]","[687800, 483935, 510440]",1297.0,14091.0,4.4,83324.0,19.0
4,2576427,Cabernet Sauvignon F Block,115,4.4,"{'1': 0, '2': 3, '3': 7, '4': 58, '5': 47}",806,1,1,5.0,2.0,...,{},42.0,us,"[2, 14, 5]","[687800, 483935, 510440]",2232.0,1077.0,4.4,7749.0,18.0


In [269]:
item

Unnamed: 0,wine_id,name,rating_count,rating_average,rating_distribution,label_count,review_count,type_id,body,acidity,...,grape_composition,region_id,country_code,country_most_used_grapes_id,country_most_used_grapes_wines_count,winery_id,winery_ratings_count,winery_ratings_average,winery_labels_count,winery_wines_count
0,1938520,1882 Cabernet Sauvignon,1697,4.1,"{'1': 11, '2': 26, '3': 315, '4': 1060, '5': 286}",14879,16,1,5.0,2.0,...,{'2': 100},105.0,us,"[2, 14, 5]","[687800, 483935, 510440]",2412.0,18888.0,4.3,121618.0,62.0
1,14604,Les Bessards Hermitage,1078,4.3,"{'1': 12, '2': 13, '3': 95, '4': 625, '5': 333}",5370,3,1,5.0,3.0,...,{'1': 100},535.0,fr,"[14, 10, 5]","[483935, 482768, 510440]",7636.0,72079.0,3.8,462021.0,57.0
2,1930757,Patriarch Estate Grown,1072,4.6,"{'1': 1, '2': 4, '3': 53, '4': 438, '5': 576}",6042,25,1,4.0,3.0,...,{},88.0,us,"[2, 14, 5]","[687800, 483935, 510440]",1905.0,7747.0,4.4,49362.0,21.0
3,1564280,Merlot,3577,4.3,"{'1': 15, '2': 28, '3': 351, '4': 2182, '5': 1...",18748,52,1,4.0,3.0,...,"{'10': 100, '2': 0}",24.0,us,"[2, 14, 5]","[687800, 483935, 510440]",1297.0,14091.0,4.4,83324.0,19.0
4,2576427,Cabernet Sauvignon F Block,115,4.4,"{'1': 0, '2': 3, '3': 7, '4': 58, '5': 47}",806,1,1,5.0,2.0,...,{},42.0,us,"[2, 14, 5]","[687800, 483935, 510440]",2232.0,1077.0,4.4,7749.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50855,1669561,Garganega - Pinot Grigio,788,3.5,"{'1': 11, '2': 60, '3': 407, '4': 251, '5': 59}",6635,9,2,3.0,3.0,...,{},983.0,it,"[16, 10, 5]","[108135, 482768, 510440]",19962.0,64503.0,3.4,675950.0,517.0
50856,1861275,Dadà Langhe Chardonnay,231,3.8,"{'1': 3, '2': 11, '3': 85, '4': 109, '5': 23}",961,6,2,3.0,3.0,...,{'5': 100},613.0,it,"[16, 10, 5]","[108135, 482768, 510440]",17655.0,2573.0,3.9,12719.0,13.0
50857,2201892,Metodo Zero Prosecco Extra Dry,390,3.9,"{'1': 2, '2': 11, '3': 114, '4': 220, '5': 43}",1983,14,3,1.0,3.0,...,{},3232.0,it,"[16, 10, 5]","[108135, 482768, 510440]",13763.0,7521.0,4.0,42703.0,49.0
50858,2396179,Les Monts Damnés Sancerre,302,4.2,"{'1': 0, '2': 4, '3': 52, '4': 194, '5': 52}",730,4,2,4.0,3.0,...,{},635.0,fr,"[14, 10, 5]","[483935, 482768, 510440]",58349.0,2698.0,4.1,8772.0,10.0
