In [1]:
import pandas as pd

## 載入資料

In [2]:
# TODO: 可能找個地方放 iCook 資料檔案，然後在這邊 wget 下來

In [3]:
recipes_df = pd.read_json('recipes.jsonl.gz', lines=True, compression='gzip')
# recipes_df

ingredients_df = pd.read_json('ingredients.jsonl.gz', lines=True, compression='gzip')
# ingredients_df

favorites_df = pd.read_json('favorites.json.gz', compression='gzip')
# favorites_df

test_favorites_df = pd.read_json('test_favorites.json.gz', compression='gzip')
# test_favorites_df

test_user_ids = pd.read_csv('test_user_ids.txt.gz', compression='gzip', names=['user_id'])
# test_user_ids

## EDA

### ingredients

In [4]:
ls = ingredients_df.group_name.unique()
print(ls)

[None '果醬' '磅蛋糕' ... '蟹餅材料 A' '蟹餅材料 B' '檸檬美乃滋沾醬']


看起來應該是可以自填的，有空值，也有各種相似，但不同的分組。

In [5]:
ls = ingredients_df.name.unique()
print(ls)

['蒲瓜' '排骨' '鹽' ... '白飯?' '鮭魚?' '斧頭牛扒']


目標是希望藉由食材來對各類食譜做出區分，來判斷哪些食譜之間較為相似。
應該使用name的欄位就足夠了，group name的資訊，有些有點重複。
而且，group的內容，應該是可以藉由name的欄位就推測出來。

# recipes

In [6]:
recipes_df.name.isna().any()

False

In [7]:
top_number = 10
top_recipes = recipes_df.sort_values('favorites_count', ascending=False).iloc[:top_number, :]
total_f_user_count = len(favorites_df.user_id.unique())
top_f_user = favorites_df[favorites_df.recipe_id.isin(top_recipes.id)]
top_f_user_count = len(top_f_user.user_id.unique())
print(top_f_user_count / total_f_user_count)

0.07204843391369722


前10名的收藏僅佔全user的7%。 不是大家都會收藏一樣的食譜。
只用收藏數高的食譜做推薦的效果會不太好。

## 計算食譜相似度

In [8]:
import jieba  # 用結巴來做中文斷詞分析

In [9]:
recipe_igd_name = ingredients_df.copy()
recipe_igd_name['name'] = recipe_igd_name['name'] + ' '
recipe_igd_name = recipe_igd_name[['recipe_id', 'name']].groupby('recipe_id', as_index=False).sum()
recipe_name_df = recipes_df[['id', 'name']].copy().rename(columns={'id': 'recipe_id'})
recipe_name_df = recipe_name_df.merge(recipe_igd_name, on='recipe_id', how='outer')
recipe_name_df = recipe_name_df.fillna('')
recipe_name_df['all_name'] = recipe_name_df['name_x'] + ' ' + recipe_name_df['name_y']
name_df = recipe_name_df[['recipe_id', 'all_name']].reset_index(drop=True)
name_df


Unnamed: 0,recipe_id,all_name
0,319968,鮮奶燉蛋 雞蛋 鮮奶 砂糖
1,319797,初次醃蘿蔔【鬼怪】 白蘿蔔（小） 白醋 水 砂糖 食鹽
2,311082,零基礎免揉-哈拉Challah辮子麵包\n 免揉哈拉基礎麵團 手粉(高筋麵粉) 蛋液 黑芝麻
3,286167,洋蔥豬扒 梅頭豬扒 洋蔥 蕃茄醬 糖 意大利黑醋 生抽 水
4,317195,蒜香肉絲小松菜 小松菜 肉絲 醬油 蒜頭 米酒 鹽 米酒 水
...,...,...
27353,361673,藥膳火鍋鍋底 大雞腿 雞胸肉 中藥材-黃精 高麗菜 中藥材-何首烏 米酒 中藥材-當歸 水 ...
27354,362481,在家自製斧頭扒～大廚級出馬 斧頭牛扒 無鹽牛油 黑椒粉 蒜頭 鹽 油 香草碎
27355,362407,皮蛋瘦肉粥（健康）（減肥）（零失敗） 隔夜飯或煮好的白飯 豬肉片，等等切條 醬油 白菜 雞蛋...
27356,362485,簡易櫻花蝦蘿蔔糕 白蘿蔔 再來米粉 水 櫻花蝦 紅蔥頭酥 白胡椒粉


In [10]:
import re

def jeiba_function(text):
    sentences = text.split(' ')
    words = [' '.join(jieba.cut(sent)) for sent in sentences if not re.match('\d|\s', sent)]
    words = ' '.join(words)
    return words

In [12]:
name_df['jieba_text'] = name_df.all_name.apply(lambda x: jeiba_function(x))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\0338\AppData\Local\Temp\jieba.cache
Loading model cost 0.487 seconds.
Prefix dict has been built successfully.


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(name_df['jieba_text'])

# 計算商品間的相似程度
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(name_df.index, index = name_df['recipe_id'])

# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (name_df['recipe_id'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    if len(items) == 0:
      # 如果沒有購買記錄就給topk
      return top_recipes.loc[:k, 'recipe_id'].tolist()
    res = []
    for d in items[0:1]:
        res.extend(recommend_item(d, k))
        # print(len(res))
    # res = pd.unique(res).tolist()
    return res


## 計算推薦

In [42]:
test_user_ids_part = test_user_ids[:5000]
test_favorites_df_part = test_favorites_df[test_favorites_df.user_id.isin(test_user_ids_part.user_id)]

In [43]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    content-based
    '''
    recipe_trainings = training_data
    recipe_trainings = recipe_trainings.sort_values('created_at', ascending=False)
    recommendations = {user: recommend_items(recipe_trainings.loc[recipe_trainings['user_id'] == user, 'recipe_id'].tolist(), k) for user in users}
    return recommendations

predictions = recommender(favorites_df, test_user_ids_part.user_id, k=3)

In [None]:
predictions

{100004: [322535, 352505, 357318],
 1000120: [355195, 335530, 359625],
 1000205: [334385, 359069, 344451],
 1000228: [356824, 361636, 356941],
 1000554: [361133, 355841, 355204],
 1000559: [337934, 297974, 330531],
 1000569: [340459, 357217, 348794],
 1000578: [343614, 338568, 317416],
 1000616: [343536, 342933, 343104],
 1000642: [328403, 329832, 341774],
 1000675: [321271, 331586, 330188],
 1000715: [360700, 345949, 354460],
 1000766: [345805, 324590, 323346],
 1001006: [323009, 326597, 328214],
 1001098: [354283, 330217, 309646],
 1001130: [336476, 345098, 354338],
 1001234: [329928, 357815, 332437],
 1001238: [346764, 325857, 350799],
 1001256: [358633, 339724, 324390],
 1001355: [341029, 353658, 341479]}

## 評估結果

In [44]:
def evaluate(real_results={}, predicted_results={}):
    '''
    * real_results: dict 真實被收藏的食譜資料
    * predicted_results: dict 利用訓練資料學習的推薦食譜
    * method: str
    * score: float
    '''
    total = 0
    for d in predicted_results:
          total += len(set(predicted_results[d]) & set(test_favorites_df[test_favorites_df['user_id'] == d]['recipe_id']))

    score = total / test_favorites_df.shape[0]
    return score

evaluate(test_favorites_df_part, predictions)

0.00018042775256415594