In [1]:
import os
import sys
import gc
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
with open('review_vintage_id_116562_v3.json', 'r') as f:
    raw1 = json.load(f)

In [3]:
with open('vintage_data_review_token_use_review_time.json', 'r') as f:
    raw2 = json.load(f)

In [4]:
raw3 = pd.read_csv('taste_v2_813.csv')

In [5]:
raw3 = raw3.rename(columns = {'Unnamed: 0' : 'wine_id'})
raw3.head()

Unnamed: 0,wine_id,oak_count,oak_score,oak_mentions_count,black_fruit_count,black_fruit_score,black_fruit_mentions_count,earth_count,earth_score,earth_mentions_count,...,acidity,fizziness,intensity,sweetness,tannin,user_structure_count,calculated_structure_count,tropical_fruit_count,tropical_fruit_score,tropical_fruit_mentions_count
0,1611255,102.0,13046.0,146.0,89.0,13291.0,135.0,64.0,5324.0,49.0,...,3.346686,,4.766704,1.719518,3.476704,138.0,170.0,,,
1,1194532,73.0,9216.0,100.0,55.0,7764.0,79.0,47.0,4467.0,46.0,...,2.263482,,4.088044,2.758416,2.958805,96.0,118.0,2.0,200.0,2.0
2,84351,50.0,7070.0,77.0,39.0,6338.0,65.0,32.0,3289.0,34.0,...,3.341118,,4.793787,1.741003,3.384199,70.0,110.0,,,
3,5926791,1.0,185.0,2.0,3.0,300.0,3.0,,,,...,2.125,,4.258334,2.091667,2.908333,11.0,2.0,,,
4,4382344,27.0,3362.0,38.0,13.0,2187.0,22.0,18.0,1256.0,11.0,...,3.314615,,4.863647,1.363623,3.286253,41.0,44.0,,,


# 1.

In [6]:
raw1.keys()

dict_keys(['wine_id', 'vintage_id', 'vintage_type', 'vintage_year', 'userID', 'rating_per_user', 'user_note', 'user_follower_count', 'user_following_count', 'user_rating_count', 'user_rating_sum', 'reviews_count', 'user_like_count', 'user_comments_count', 'review_time', 'review_language', 'review_year'])

In [7]:
raw2.keys()

dict_keys(['wine_id', 'vintage_id', 'vintage_type', 'vintage_year', 'userID', 'rating_per_user', 'user_note', 'user_follower_count', 'user_following_count', 'user_rating_count', 'user_rating_sum', 'reviews_count', 'user_like_count', 'user_comments_count', 'review_time', 'review_language', 'review_year', 'review_token'])

In [8]:
# wine_id 는 813개
pd.Series(list(raw1['wine_id'].values())).nunique(), pd.Series(list(raw2['wine_id'].values())).nunique(), 

(813, 813)

In [9]:
# vintage_id 는 2839, 3025개
pd.Series(list(raw1['vintage_id'].values())).nunique(), pd.Series(list(raw2['vintage_id'].values())).nunique(), 

(2839, 3025)

## 1-1. 변수 설명

  - wine_id : wine 고유 id
  - vintage_id : wine에 속한 vintage 고유 id
  - vintage_type : 와인 내에서 해당 빈티지의 특징 정도로 보임
  - vintage_year : wine에 속한 vintage 생산년도
  - userID : user 고유 id
  - rating_per_user : user가 vintage에 부여한 rating
  - user_note : user가 vintage에 남긴 review
  - user_follower_count : user의 follower 수
  - user_following_count : user의 following 수
  - user_rating_count : user가 남긴 rating 총 개수
  - user_rating_sum : user가 남긴 rating 총 합
  - reviews_count : user가 남긴 review 총 개수
  - user_like_count : 해당 리뷰에 대한 좋아요 수
  - user_comments_count : 해당 리뷰에 대한 댓글 수
  - review_time : user가 review를 남긴 시간(timestamp)
  - review_language : review 언어
  - review_year : vintage_year과 매칭해야함
  - review_token : review에 사용된 token

In [66]:
def makeDF(column1, column2):
    return pd.DataFrame(zip(raw2[column1].values(), raw2[column2].values()))

## 1-2. item / user context
- item(vintage) 프로필 (wine_id, vintage_year, review_token...)
- user 프로필 (user_follower_count, user_following_count, user_rating_count, user_rating_sum, user_comments_count, 이제까지 구매한 와인 개수, 이제까지 구매한 vintage 개수, 최근 구매한 5개 vintage_id vector의 평균...) 

## 2. item 프로필 만들기

## 2-1. review_token

In [145]:
# review token 사용하기

In [155]:
review = pd.DataFrame(zip(raw2['vintage_id'].values(), raw2['user_note'].values(), raw2['review_token'].values(), raw2['review_language'].values()), columns = ['vintage_id', 'review', 'token', 'lang'])

In [156]:
## review가 'en' (english)인 경우만 -> 약 2만개 del
review = review.loc[review['lang'] == 'en']

In [157]:
review

Unnamed: 0,vintage,review,token,lang
0,26195,"For my son's 20th birthday, I think this 19 ye...","[{'id': 320, 'match': 'pepper'}, {'id': 384, '...",en
1,26195,Celebrating Lunar New Year!\nI've debated the ...,"[{'id': 242, 'match': 'leather'}, {'id': 103, ...",en
2,26195,Delicious. Beautiful tannins interlaced with r...,"[{'id': 354, 'match': 'red fruit'}]",en
3,26195,#workwine\nThe Penfold style absolutely came t...,,en
4,26195,"Wow, what a Cab! Not a typical Aussie bomb. Re...",,en
...,...,...,...,...
116208,163411918,This is why I love Rhone varietals right here....,,en
116226,163653190,This is a Bomb Riserva needs some years or a l...,,en
116227,163653190,Benvenuto Brunello 2020: If a vineyard brings ...,"[{'id': 292, 'match': 'oak'}, {'id': 434, 'mat...",en
116228,163653190,What a treat to try this recently released 201...,"[{'id': 38, 'match': 'black cherry'}]",en


In [None]:
# 각 vintage에 나온 token 정리

In [170]:
review['token'] = review['token'].fillna('')

In [196]:
review['prep_token'] = review['token'].apply(lambda x : '|'.join([item['match'] for item in x]))

In [197]:
token_group = review.loc[review['prep_token'] != ''].groupby('vintage')['prep_token'].apply(lambda x : '|'.join(x).replace('|', '|')).reset_index()

In [198]:
token_group

Unnamed: 0,vintage,prep_token
0,2384,blackberry|cherry|cinnamon|earthy|licorice|cho...
1,3031,oak|leather|licorice|savory|tobacco|oak|black ...
2,4137,oak|tobacco|black fruits|blackcurrant|blueberr...
3,5855,leather|tobacco|cherry|coffee|blackberry|black...
4,7697,cigar box|leather|dark fruit|black cherry|leat...
...,...,...
2511,162997022,dark chocolate|vanilla|sour cherries
2512,163195536,cherry|cinnamon|clove|espresso|raspberry|miner...
2513,163411918,vanilla|licorice|pepper|violet|blackberry|blue...
2514,163422370,cured meat|oak|blackberries|strawberries|coffe...


In [219]:
# token별 term document matrix

In [224]:
piv = token_group.join(token_group.prep_token.str.get_dummies().astype(float))

In [268]:
# 초기화
piv.iloc[:, 2:] = 0

In [269]:
piv

Unnamed: 0,vintage,prep_token,5-spice powder,acacia,acai berry,allspice,almond,almonds,anise,aniseed,...,wild blueberries,wild blueberry,wild iris,wild strawberries,wild strawberry,wood smoke,yeast,yellow beets,yellow raisin,yogurt
0,2384,blackberry|cherry|cinnamon|earthy|licorice|cho...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3031,oak|leather|licorice|savory|tobacco|oak|black ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4137,oak|tobacco|black fruits|blackcurrant|blueberr...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5855,leather|tobacco|cherry|coffee|blackberry|black...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7697,cigar box|leather|dark fruit|black cherry|leat...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2511,162997022,dark chocolate|vanilla|sour cherries,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2512,163195536,cherry|cinnamon|clove|espresso|raspberry|miner...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2513,163411918,vanilla|licorice|pepper|violet|blackberry|blue...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2514,163422370,cured meat|oak|blackberries|strawberries|coffe...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [270]:
for i, tokens in tqdm(enumerate(piv['prep_token'])):
    temp_dic = {}
    for token in tokens.split('|'):
        temp_dic[token] = temp_dic.get(token, 0) + 1
        
    for k, v in temp_dic.items():
        piv.loc[i, k] = v

2516it [00:16, 155.67it/s]


In [275]:
piv

Unnamed: 0,vintage,prep_token,5-spice powder,acacia,acai berry,allspice,almond,almonds,anise,aniseed,...,wild blueberries,wild blueberry,wild iris,wild strawberries,wild strawberry,wood smoke,yeast,yellow beets,yellow raisin,yogurt
0,2384,blackberry|cherry|cinnamon|earthy|licorice|cho...,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3031,oak|leather|licorice|savory|tobacco|oak|black ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4137,oak|tobacco|black fruits|blackcurrant|blueberr...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5855,leather|tobacco|cherry|coffee|blackberry|black...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7697,cigar box|leather|dark fruit|black cherry|leat...,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2511,162997022,dark chocolate|vanilla|sour cherries,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2512,163195536,cherry|cinnamon|clove|espresso|raspberry|miner...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2513,163411918,vanilla|licorice|pepper|violet|blackberry|blue...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2514,163422370,cured meat|oak|blackberries|strawberries|coffe...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [294]:
meta = pd.DataFrame(zip(raw2['wine_id'].values(), raw2['vintage_id'].values()), columns = ['wine_id', 'vintage_id'])

In [296]:
meta = meta.drop_duplicates('vintage_id')

In [303]:
piv = meta.merge(piv, on = 'vintage_id')

In [371]:
# TF-IDF 구하기

In [384]:
from math import log
N = len(piv) # 총 문서의 수

def tf(t, d):
    return d.count(t)

def idf(t):
    df = 0
    for doc in piv['prep_token']:
        df += t in doc
    return log(N/(df + 1))

def tfidf(t, d):
    return tf(t,d)* idf(t)

In [385]:
vocab = list(piv.columns[3:])
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])
idf_

Unnamed: 0,IDF
5-spice powder,6.731813
acacia,4.785903
acai berry,6.731813
allspice,3.736081
almond,2.648642
...,...
wood smoke,4.785903
yeast,4.572329
yellow beets,7.137278
yellow raisin,7.137278


In [387]:
result = []
for i in range(N):
    result.append([])
    d = piv['prep_token'][i]
    for j in range(len(vocab)):
        t = vocab[j]

        result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,5-spice powder,acacia,acai berry,allspice,almond,almonds,anise,aniseed,apple,apple blossom,...,wild blueberries,wild blueberry,wild iris,wild strawberries,wild strawberry,wood smoke,yeast,yellow beets,yellow raisin,yogurt
0,0.0,0.0,0.0,0.0,2.648642,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,2.742829,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,1.716743,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2511,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2512,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2513,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2514,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,4.785903,0.0,0.0,0.0,0.0


In [393]:
piv_tfidf = pd.concat([piv[['wine_id', 'vintage_id']], tfidf_], axis = 1)

## 2-2. item_taste

In [290]:
# item의 taste는 wine별로밖에 특징안잡혀있음
taste = raw3.copy()

In [394]:
from sklearn.preprocessing import MinMaxScaler

In [406]:
taste = taste.fillna(0)

In [408]:
scaler = MinMaxScaler()
temp = pd.DataFrame(scaler.fit_transform(taste.iloc[:, 1:]), columns = taste.columns[1:])

In [411]:
taste_norm = pd.concat([taste['wine_id'], temp], axis = 1)

In [415]:
item = piv_tfidf.merge(taste_norm, on = 'wine_id')

In [416]:
item.to_pickle('item_vector.pkl')

In [417]:
item

Unnamed: 0,wine_id,vintage_id,5-spice powder,acacia,acai berry,allspice,almond,almonds,anise,aniseed,...,acidity,fizziness,intensity,sweetness,tannin,user_structure_count,calculated_structure_count,tropical_fruit_count,tropical_fruit_score,tropical_fruit_mentions_count
0,1264,26195,0.0,0.0,0.0,0.0,2.648642,0.0,0.000000,0.0,...,0.705283,0.0,0.796638,0.467290,0.762790,0.048539,0.058198,0.037037,0.032258,0.032258
1,1264,26196,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.705283,0.0,0.796638,0.467290,0.762790,0.048539,0.058198,0.037037,0.032258,0.032258
2,1264,26197,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.705283,0.0,0.796638,0.467290,0.762790,0.048539,0.058198,0.037037,0.032258,0.032258
3,1264,1840610,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.705283,0.0,0.796638,0.467290,0.762790,0.048539,0.058198,0.037037,0.032258,0.032258
4,1651,1166340,0.0,0.0,0.0,0.0,0.000000,0.0,1.716743,0.0,...,0.637695,0.0,0.680082,0.516048,0.638973,0.475495,0.134680,0.185185,0.074839,0.032258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2511,7774865,162598030,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.631792,0.0,0.955744,0.531688,0.715564,0.019321,0.005617,0.000000,0.000000,0.000000
2512,8014725,163195536,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.796637,0.0,0.646667,0.582092,0.801228,0.001414,0.000478,0.000000,0.000000,0.000000
2513,8075591,163422370,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.833625,0.0,0.940774,0.622199,0.834637,0.001414,0.000956,0.000000,0.000000,0.000000
2514,8075594,163411918,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.782943,0.0,1.000000,0.422859,0.792999,0.003770,0.000956,0.037037,0.032258,0.032258


# 3. User 프로필 만들기

In [21]:
user = (pd.DataFrame(zip(raw2['userID'].values(),
                         raw2['user_follower_count'].values(),
                         raw2['user_following_count'].values(),
                         raw2['user_rating_count'].values(),
                         raw2['user_rating_sum'].values(),
                        ),
                     columns = ['userID', 'follower', 'following', 'rating_count', 'rating_sum']))

In [29]:
# userID unique는 약 5만2천개
user.drop_duplicates().sort_values('userID')

Unnamed: 0,userID,follower,following,rating_count,rating_sum
75284,135,12,13,425,1771.5
16583,140,39,17,173,689.5
42298,159,78,114,57,215.5
41581,1201,128,90,267,1185.5
85665,2742,45,47,200,704.0
...,...,...,...,...,...
8008,46910813,0,0,8,34.0
109769,46937893,1,1,9,36.5
47553,46950004,0,0,2,8.5
52597,47062624,0,0,8,30.0


In [34]:
user = user.drop_duplicates()
user = user.sort_values(['userID', 'rating_count']).drop_duplicates('userID')

In [39]:
assert user.shape[0] == user['userID'].nunique()

In [90]:
meta = (pd.DataFrame(zip(raw2['userID'].values(),
                         raw2['rating_per_user'].values(),
                         raw2['review_time'].values(),
                         raw2['review_token'].values(),
                         raw2['user_like_count'].values(),
                         raw2['user_comments_count'].values(),
                         raw2['vintage_id'].values()
                        ),
                     columns = ['userID', 'rating', 'time', 'token', 'like', 'comments', 'vintage_id']))

In [91]:
meta

Unnamed: 0,userID,rating,time,token,like,comments,vintage_id
0,2426402,4.5,1487975093000,"[{'id': 320, 'match': 'pepper'}, {'id': 384, '...",142.0,35.0,26195
1,7062896,4.5,1485501730000,"[{'id': 242, 'match': 'leather'}, {'id': 103, ...",51.0,4.0,26195
2,8168534,4.5,1440898705000,"[{'id': 354, 'match': 'red fruit'}]",7.0,0.0,26195
3,4562576,4.5,1474055613000,,5.0,0.0,26195
4,8213806,4.5,1534863443000,,47.0,5.0,26195
...,...,...,...,...,...,...,...
116232,9311541,4.5,1595675745000,,4.0,0.0,163653190
116233,26702188,5.0,1598678573000,,1.0,0.0,163653190
116234,1552454,5.0,1601816446000,,0.0,0.0,163653190
116235,16307137,4.5,1591817870000,,51.0,13.0,163653190


In [59]:
# 해당 데이터는 일부데이터이기 때문에 사용하기 애매할 가능성이 높을 것 같음
user_add = meta.groupby('userID').agg({
    'rating' : [('rating_sum', np.sum),
               ('rating_count', 'count')],
    'time' : [('first_review', np.min),
             ('last_review', np.max)],
    'like' : [('max_like', np.max),
             ('avg_like', np.mean)],
    'comments' : [('max_comments', np.max),
             ('avg_comments', np.mean)]
})

In [62]:
meta['token'] = meta['token'].fillna('')
meta['prep_token'] = meta['token'].apply(lambda x : '|'.join([item['match'] for item in x]))

In [73]:
user_token = meta.groupby('userID')['prep_token'].apply(lambda x : '|'.join(x).replace('||', '')).reset_index()

In [69]:
meta.loc[meta['prep_token'] != '']

Unnamed: 0,userID,rating,time,token,like,comments,prep_token
0,2426402,4.5,1487975093000,"[{'id': 320, 'match': 'pepper'}, {'id': 384, '...",142.0,35.0,pepper|smoke|black currants|oak|strawberries|b...
1,7062896,4.5,1485501730000,"[{'id': 242, 'match': 'leather'}, {'id': 103, ...",51.0,4.0,leather|cigar|chocolate
2,8168534,4.5,1440898705000,"[{'id': 354, 'match': 'red fruit'}]",7.0,0.0,red fruit
5,1133314,4.5,1538880889000,"[{'id': 384, 'match': 'smoke'}, {'id': 422, 'm...",4.0,0.0,smoke|tobacco|truffle|black fruits|leather
6,11724528,4.5,1592832332000,"[{'id': 49, 'match': 'blackberries'}, {'id': 4...",3.0,0.0,blackberries|vanilla
...,...,...,...,...,...,...,...
116202,940318,4.0,1585819135000,"[{'id': 261, 'match': 'mango'}, {'id': 434, 'm...",14.0,1.0,mango|vanilla
116206,31891932,4.5,1600599260000,"[{'id': 226, 'match': 'iron'}, {'id': 425, 'ma...",1.0,0.0,iron|tomatoes
116227,10933359,5.0,1585002292000,"[{'id': 292, 'match': 'oak'}, {'id': 434, 'mat...",35.0,6.0,oak|vanilla|black cherry|red fruit
116228,2069826,4.5,1595358654000,"[{'id': 38, 'match': 'black cherry'}]",163.0,26.0,black cherry


In [84]:
user_token['prep_token'] = user_token['prep_token'].apply(lambda x : x.replace('|', '') if x == '|' else x)

In [87]:
# user_token 이 없는 경우가 너무 많음
user_token.loc[user_token['prep_token'] == '']

Unnamed: 0,userID,prep_token
0,135,
2,159,
3,1201,
4,2742,
6,2821,
...,...,...
52191,46874748,
52192,46886068,
52195,46937893,
52197,47062624,


In [93]:
meta.sort_values(['userID', 'time'])

Unnamed: 0,userID,rating,time,token,like,comments,vintage_id
75284,135,4.0,1446192586000,,2.0,0.0,1960068
75338,135,5.0,1446192634000,,1.0,0.0,2058188
16583,140,5.0,1405185984000,"[{'id': 55, 'match': 'blueberries'}]",1.0,0.0,1492826
86358,140,5.0,1485844814000,,4.0,0.0,6784672
42298,159,4.5,1546160399000,,3.0,0.0,2136952
...,...,...,...,...,...,...,...
8008,46910813,3.5,1605491024000,"[{'id': 49, 'match': 'blackberries'}, {'id': 5...",0.0,0.0,109608146
109769,46937893,5.0,1603987489000,,0.0,0.0,102293886
47553,46950004,5.0,1604009137000,"[{'id': 49, 'match': 'blackberry'}]",0.0,0.0,150394625
52597,47062624,4.0,1605066908000,,0.0,0.0,2149611


In [181]:
item['vintage_id'].nunique(), meta['vintage_id'].nunique()

(2516, 3025)

In [319]:
recent_item = meta.groupby('userID').agg({
    'vintage_id' : [('vintage_id', 'unique')]
}).reset_index()

In [347]:
meta = meta.sort_values(['userID', 'time']).reset_index(drop = True)

In [354]:
recent_dict = {}
for args in meta.values:
    u = args[0]
    v = args[-1]
    recent_dict[u] = recent_dict.get(u, list()).append(v)
    
    break

In [355]:
recent_dict.get(135)

In [356]:
recent_dict

{135: None}

In [343]:
meta.sort_values(['userID', 'time']).groupby('userID')['vintage_id']

AttributeError: 'SeriesGroupBy' object has no attribute 'to_list'

In [320]:
recent_item.columns = ['userID', 'vintage_id']

In [324]:
# usera가 구매한 마지막 vintage_id -> predict label
recent_item['label'] = recent_item['vintage_id'].fillna('').apply(lambda x : x[-1])

In [328]:
recent_item['vintage_id'].fillna('').apply(lambda x : x)

array([2436806, 2212631, 1459663, 2595468, 2689090, 4624910])

In [250]:
recent_item['vintage_id'] = recent_item['vintage_id'].fillna('').apply(lambda x : x[:-1])

In [114]:
item = pd.read_pickle('./item_vector.pkl')

In [117]:
item.head()

Unnamed: 0,wine_id,vintage_id,5-spice powder,acacia,acai berry,allspice,almond,almonds,anise,aniseed,...,acidity,fizziness,intensity,sweetness,tannin,user_structure_count,calculated_structure_count,tropical_fruit_count,tropical_fruit_score,tropical_fruit_mentions_count
0,1264,26195,0.0,0.0,0.0,0.0,2.648642,0.0,0.0,0.0,...,0.705283,0.0,0.796638,0.46729,0.76279,0.048539,0.058198,0.037037,0.032258,0.032258
1,1264,26196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705283,0.0,0.796638,0.46729,0.76279,0.048539,0.058198,0.037037,0.032258,0.032258
2,1264,26197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705283,0.0,0.796638,0.46729,0.76279,0.048539,0.058198,0.037037,0.032258,0.032258
3,1264,1840610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705283,0.0,0.796638,0.46729,0.76279,0.048539,0.058198,0.037037,0.032258,0.032258
4,1651,1166340,0.0,0.0,0.0,0.0,0.0,0.0,1.716743,0.0,...,0.637695,0.0,0.680082,0.516048,0.638973,0.475495,0.13468,0.185185,0.074839,0.032258


In [251]:
recent_item

Unnamed: 0,userID,vintage_id
0,135,"[1960068, 2058188]"
1,140,"[1492826, 6784672]"
2,159,[2136952]
3,1201,"[2436806, 2212631, 1459663, 2595468, 2689090]"
4,2742,[2732746]
...,...,...
52194,46910813,[109608146]
52195,46937893,[102293886]
52196,46950004,[150394625]
52197,47062624,[2149611]


In [263]:
user_recent_item = {}
for u, recent in tqdm(recent_item.values):
    temp = np.zeros(519)
    for idx in recent:
        try:
            temp += item.loc[item['vintage_id'] == idx].values[0][2:]
        except:
            pass
    
    user_recent_item[u] = temp

100%|██████████| 52199/52199 [01:09<00:00, 753.30it/s] 


In [273]:
recent_item['recent_vector'] = list(map(lambda x : [x], user_recent_item.values()))

In [274]:
recent_item

Unnamed: 0,userID,vintage_id,recent_vector
0,135,"[1960068, 2058188]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,140,"[1492826, 6784672]","[[0.0, 0.0, 0.0, 0.0, 5.297284135056492, 0.0, ..."
2,159,[2136952],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,1201,"[2436806, 2212631, 1459663, 2595468, 2689090]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,2742,[2732746],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...
52194,46910813,[109608146],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
52195,46937893,[102293886],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.716743437988..."
52196,46950004,[150394625],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
52197,47062624,[2149611],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [280]:
meta.loc[meta['rating'] < 4].groupby('userID')['vintage_id'].unique().reset_index()

Unnamed: 0,userID,vintage_id
0,2821,[2716040]
1,2997,"[2138796, 1209030, 86009, 104764, 2147917]"
2,6817,[1319190]
3,7084,[87291]
4,12420,[150347595]
...,...,...
3764,46184264,[150265304]
3765,46404017,"[1459663, 74535983, 162997022, 57646109, 14568..."
3766,46528036,[153406060]
3767,46564082,[2179179]


In [290]:
user['profile'] = list(map(lambda x : [x], user.values[:, 1:]))

In [295]:
user['profile'] = user['profile'].apply(lambda x : x[0])

In [304]:
user = user.reset_index()

In [309]:
dataset = pd.concat([user[['userID', 'profile']], recent_item[['vintage_id', 'recent_vector']]], axis = 1)

In [310]:
dataset.to_pickle('./temp_dataset.pkl')