In [70]:
import os
import sys
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [242]:
dataset = pd.read_pickle('temp_dataset.pkl')

In [243]:
dataset = dataset[['userID', 'profile']]

In [4]:
with open('Final_wine_rating.json', 'r') as f:
    data = json.load(f)

In [5]:
data.keys()

dict_keys(['user_note', 'rating_per_user', 'vintage_id', 'user_like_count', 'userID', 'wine_id', 'wine_name', 'url'])

In [8]:
df = (pd.DataFrame(zip(
    data['wine_id'].values(),
    data['vintage_id'].values(),
    data['userID'].values(),
    data['rating_per_user'].values()),
                   columns = ['wine_id', 'vintage_id', 'userID', 'rating'])
     )

In [15]:
df.head()

Unnamed: 0,wine_id,vintage_id,userID,rating
0,1141133,164942680,19484511,4.0
1,1141133,164942680,352674,4.0
2,1141133,164942680,2148498,1.5
3,1141133,164942680,3450270,4.5
4,1141133,164942680,17786617,4.0


In [13]:
print('wine 개수:', df.wine_id.nunique())
print('vintage 개수:', df.vintage_id.nunique())
print('user 개수:', df.userID.nunique())

wine 개수: 50861
vintage 개수: 307405
user 개수: 37445


In [248]:
user = dataset.merge(df_copy[['userID']].drop_duplicates(), on = 'userID')

In [249]:
user

Unnamed: 0,userID,profile
0,140,"[39.0, 17.0, 173.0, 689.5]"
1,1201,"[128.0, 90.0, 267.0, 1185.5]"
2,2742,"[45.0, 47.0, 200.0, 704.0]"
3,2764,"[2848.0, 103.0, 582.0, 2209.5]"
4,2891,"[54.0, 33.0, 1233.0, 4173.0]"
...,...,...
35825,46828642,"[47.0, 53.0, 1001.0, 3742.5]"
35826,46874748,"[0.0, 0.0, 8.0, 29.0]"
35827,46886068,"[0.0, 0.0, 2.0, 5.5]"
35828,46895838,"[1.0, 2.0, 17.0, 73.5]"


In [40]:
df = df.loc[df['userID'].isin(user['userID'])]

In [41]:
print('wine 개수:', df.wine_id.nunique())
print('vintage 개수:', df.vintage_id.nunique())
print('user 개수:', df.userID.nunique())

wine 개수: 50858
vintage 개수: 303156
user 개수: 35830


# 1. dataset 만들기

In [207]:
user_ids = dataset['userID'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

wine_ids = df["wine_id"].unique().tolist()
wine2wine_encoded = {x: i for i, x in enumerate(wine_ids)}
wine_encoded2wine = {i: x for i, x in enumerate(wine_ids)}

In [208]:
df_copy = df.copy()

In [209]:
df['userID'] = df["userID"].map(user2user_encoded)
df["wine_id"] = df["wine_id"].map(wine2wine_encoded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [225]:
df_copy.loc[df_copy['userID'] == 140]

Unnamed: 0,wine_id,vintage_id,userID,rating
80781,14362,1492826,140,5.0
287743,1724358,6784672,140,5.0


In [232]:
df.loc[df['userID'] == user2user_encoded[140]]

Unnamed: 0,wine_id,vintage_id,userID,rating
80781,473,1492826,1,5.0
287743,3143,6784672,1,5.0


## 1-1. 가장 마지막에 마신 wine을 label로

In [251]:
user['userID'] = user['userID'].map(user2user_encoded)

In [252]:
user

Unnamed: 0,userID,profile
0,1,"[39.0, 17.0, 173.0, 689.5]"
1,3,"[128.0, 90.0, 267.0, 1185.5]"
2,4,"[45.0, 47.0, 200.0, 704.0]"
3,5,"[2848.0, 103.0, 582.0, 2209.5]"
4,7,"[54.0, 33.0, 1233.0, 4173.0]"
...,...,...
35825,52190,"[47.0, 53.0, 1001.0, 3742.5]"
35826,52191,"[0.0, 0.0, 8.0, 29.0]"
35827,52192,"[0.0, 0.0, 2.0, 5.5]"
35828,52193,"[1.0, 2.0, 17.0, 73.5]"


In [253]:
# 모든 user의 소비 wine_id 넣어주기
history = {}
for u in tqdm(user.userID.unique()):
    history[u] = list(df.loc[df['userID'] == u, 'wine_id'])

100%|██████████| 35830/35830 [00:41<00:00, 871.33it/s]


In [254]:
# 최근 소비한 10개 wine 뽑기
recent = []
label = []
for u in tqdm(user.userID.values):
    label.append(history[u][-1])
    recent.append(history[u][-11:-1])

100%|██████████| 35830/35830 [00:00<00:00, 188080.20it/s]


In [256]:
len(label), len(recent)

(35830, 35830)

In [257]:
user['recent'] = recent
user['label'] = label
user

Unnamed: 0,userID,profile,recent,label
0,1,"[39.0, 17.0, 173.0, 689.5]",[473],3143
1,3,"[128.0, 90.0, 267.0, 1185.5]","[28384, 43893, 49041, 26549, 50701, 6108, 3855...",39572
2,4,"[45.0, 47.0, 200.0, 704.0]",[],18012
3,5,"[2848.0, 103.0, 582.0, 2209.5]","[78, 6182, 1395, 821]",6689
4,7,"[54.0, 33.0, 1233.0, 4173.0]",[],11455
...,...,...,...,...
35825,52190,"[47.0, 53.0, 1001.0, 3742.5]",[13553],20448
35826,52191,"[0.0, 0.0, 8.0, 29.0]",[],23478
35827,52192,"[0.0, 0.0, 2.0, 5.5]",[],12967
35828,52193,"[1.0, 2.0, 17.0, 73.5]",[],2573


In [258]:
# 모든 user의 평점&최신 순 wine 뽑기
like = {}
for u in tqdm(user.userID.unique()):
    temp = df.loc[df['userID']== u].sort_index().iloc[:-1] # label 제외
    like[u] = list(temp.loc[temp['rating'] > 4].reset_index().sort_values(['rating', 'index'], ascending = [False, False])['wine_id'][:10])

100%|██████████| 35830/35830 [02:24<00:00, 247.14it/s]


In [259]:
len(like.keys())

35830

In [260]:
# 모든 user의 평점&최신 순 wine 뽑기
dislike = {}
for u in tqdm(user.userID.unique()):
    temp = df.loc[df['userID']== u].sort_index().iloc[:-1] # label 제외
    
    
    dislike[u] = list(temp.loc[temp['rating'] < 4].reset_index().sort_values(['rating', 'index'], ascending = [True, False])['wine_id'][:10])

100%|██████████| 35830/35830 [02:24<00:00, 248.15it/s]


In [261]:
user['like'] = list(like.values())
user['dislike'] = list(dislike.values())

In [262]:
user

Unnamed: 0,userID,profile,recent,label,like,dislike
0,1,"[39.0, 17.0, 173.0, 689.5]",[473],3143,[473],[]
1,3,"[128.0, 90.0, 267.0, 1185.5]","[28384, 43893, 49041, 26549, 50701, 6108, 3855...",39572,"[38554, 38554, 50701, 26549, 49041, 43893, 136...","[3487, 80, 6112, 1365, 1760, 9229, 45157, 9615..."
2,4,"[45.0, 47.0, 200.0, 704.0]",[],18012,[],[]
3,5,"[2848.0, 103.0, 582.0, 2209.5]","[78, 6182, 1395, 821]",6689,"[6182, 821, 1395, 78]",[]
4,7,"[54.0, 33.0, 1233.0, 4173.0]",[],11455,[],[]
...,...,...,...,...,...,...
35825,52190,"[47.0, 53.0, 1001.0, 3742.5]",[13553],20448,[13553],[]
35826,52191,"[0.0, 0.0, 8.0, 29.0]",[],23478,[],[]
35827,52192,"[0.0, 0.0, 2.0, 5.5]",[],12967,[],[]
35828,52193,"[1.0, 2.0, 17.0, 73.5]",[],2573,[],[]


In [283]:
user['profile_a'] = user['profile'].apply(lambda x : x[0])
user['profile_b'] = user['profile'].apply(lambda x : x[1])
user['profile_c'] = user['profile'].apply(lambda x : x[2])
user['profile_d'] = user['profile'].apply(lambda x : x[3])

In [285]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()

In [297]:
user['profile_scale'] = list(sc.fit_transform(user.iloc[:, -4:]))

In [299]:
user.to_pickle('dataset.pkl')

## 1-2. taste

In [16]:
taste = pd.read_csv('Final_wine_meta.csv')

In [19]:
taste = taste.rename(columns = {'Unnamed: 0' : 'wine_id'})
taste

Unnamed: 0,wine_id,red_fruit_count,red_fruit_score,red_fruit_mentions_count,citrus_fruit_count,citrus_fruit_score,citrus_fruit_mentions_count,non_oak_count,non_oak_score,non_oak_mentions_count,...,dried_fruit_count,dried_fruit_score,dried_fruit_mentions_count,acidity,fizziness,intensity,sweetness,tannin,user_structure_count,calculated_structure_count
0,1141133,387.0,61208.0,612.0,130.0,15600.0,156.0,123.0,8830.0,129.0,...,8.0,405.0,3.0,4.283229,4.365849,3.687198,,,505.0,37.0
1,2532733,145.0,17836.0,177.0,8.0,1200.0,12.0,87.0,1638.0,7.0,...,4.0,221.0,2.0,,,,,,,
2,1253802,88.0,10413.0,103.0,10.0,1100.0,11.0,51.0,894.0,3.0,...,9.0,637.0,6.0,,,,,,,
3,1123441,,,,5.0,600.0,6.0,,,,...,1.0,100.0,1.0,,,,,,,
4,1157656,25.0,2248.0,22.0,288.0,35400.0,354.0,71.0,4511.0,65.0,...,5.0,348.0,3.0,3.897176,3.910853,2.077880,,,165.0,711.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50856,5563017,,,,1.0,100.0,1.0,,,,...,,,,,,,,,,
50857,3849645,14.0,1816.0,18.0,1.0,100.0,1.0,2.0,28.0,0.0,...,1.0,16.0,0.0,3.976166,,3.439617,1.260703,1.937348,8.0,38.0
50858,97879,1.0,100.0,1.0,,,,,,,...,,,,3.172619,,5.000000,1.000000,3.869048,0.0,2.0
50859,1196584,,,,7.0,700.0,7.0,8.0,285.0,4.0,...,5.0,491.0,5.0,,,,,,,
