In [1]:
from tqdm.notebook import tqdm

import os
import re

import pandas as pd
import numpy as np
# Use the fully qualified option name: on newer pandas releases the bare
# "max_rows" pattern also matches "styler.render.max_rows" and raises OptionError.
pd.set_option("display.max_rows", 999)

### 다양한 명칭으로 표기된 식재료들을 하나의 이름으로 mapping

In [5]:
# This CSV is uploaded to the server and is also used here for pre-processing.
# It has no header row, so the two positional columns are named explicitly.
mapping_csv = pd.read_csv(
    'server/ingredients_mapping.csv',
    encoding='cp949',
    header=None,
)
mapping_csv = mapping_csv.rename(columns={0: 'original', 1: 'general'})

In [10]:
# Preview the mapping table ('original' spelling -> 'general' name).
mapping_csv

Unnamed: 0,original,general
0,가다랑어,가다랑어포
1,가쓰오,가다랑어포
2,가다랭이포,가다랑어포
3,가다랑어포,가다랑어포
4,가다랑이포,가다랑어포
...,...,...
3632,무김치,김치
3633,파김치,김치
3634,신김치,김치
3635,포기김치용,김치


In [11]:
# Small sample of raw ingredient names used to try out the mapping.
a = np.array(['가다랑어', '포기김치용', '무김치'])

# Lookup table: original ingredient spelling -> general ingredient name.
my_dict = {
    original: general
    for original, general in zip(mapping_csv['original'], mapping_csv['general'])
}

def to_general_name(x):
    """Map raw ingredient names to their general names via ``my_dict``.

    Parameters
    ----------
    x : array-like of str
        Raw ingredient names.

    Returns
    -------
    numpy.ndarray
        ``my_dict.get(name)`` for every element of ``x``. Names missing from
        ``my_dict`` come back as ``None`` or, when NumPy infers a string dtype
        from the first result, as the string ``'None'``. An empty input yields
        an empty string array of the same shape.
    """
    names = np.asarray(x)
    if names.size == 0:
        # np.vectorize cannot infer an output dtype from zero elements and
        # raises ValueError, so short-circuit with an empty result.
        return np.empty(names.shape, dtype=str)
    return np.vectorize(my_dict.get)(x)

In [12]:
# Sanity check: map the three sample names to their general names.
to_general_name(a)

array(['가다랑어포', '김치', '김치'], dtype='<U5')

### 크롤링 데이터 Concat

In [None]:
# List the working directory to see which files are available.
path = "./"
file_list = [entry for entry in os.listdir(path)]
file_list

In [None]:
# Crawled recipe CSV files to merge into a single table.
csv_list = [
    'tbRecipe_ilsang_1.csv',
    'tbRecipe_간식야식.csv',
    'tbRecipe_다이어트.csv',
    'tbRecipe_도시락영양식.csv',
    'tbRecipe_명절_이유식_기타.csv',
    'tbRecipe_손님접대.csv',
    'tbRecipe_일상2.csv',
    'tbRecipe_초스피드.csv',
    'tbRecipe_해장_푸드스타일링.csv',
]

In [None]:
# Read every crawled CSV once and concatenate in a single call. Growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on every
# iteration and starts from an empty frame, which pandas handles poorly.
recipe_frames = [pd.read_csv(csv_path) for csv_path in csv_list]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (the result the loop-based version gave for an empty csv_list).
df = pd.concat(recipe_frames, ignore_index=True) if recipe_frames else pd.DataFrame()

In [None]:
# Flag rows whose ingredient string equals that of the row directly above.
# These are crawl errors where the ingredient field was not scraped properly.
# Positions equal index labels here because df was built with ignore_index=True.
ingredient_values = df['재료'].tolist()
duplicated_row_indices = [
    position
    for position, (previous, current) in enumerate(
        zip(ingredient_values, ingredient_values[1:]), start=1
    )
    if previous == current
]

duplicated_row_indices

In [None]:
# Blank out the ingredient cell of every flagged row, then drop every row that
# contains any missing value and renumber the index from 0.
df_copy = df.copy(deep=True)
df_copy.loc[duplicated_row_indices, ['재료']] = np.nan
df_copy = df_copy.dropna(axis=0).reset_index(drop=True)

# Replace the working frame with the cleaned copy.
df = df_copy.copy(deep=True)

In [None]:
# Inspect the recipe table after the flagged rows were removed.
df

In [None]:
# df.to_csv('tbRecipe_all.csv', index = False, encoding = 'utf-8-sig')

In [None]:
# Count non-null ids per distinct ingredient string, most frequent first.
ingredient_count = (
    df.groupby('재료')[['id']]
    .count()
    .sort_values(by='id', ascending=False)
)
ingredient_count.head(150)

### 재료 column 데이터 타입 str -> list

In [None]:
def str_to_list(x):
    """Split a ``', '``-separated ingredient string into a list of entries."""
    separator = ', '
    parts = x.split(separator)
    return parts

# Number of ', '-separated ingredient entries in each recipe.
df['재료개수'] = df['재료'].apply(str_to_list).apply(len)

# Keep only recipes with more than one entry; this removes single-ingredient
# rows and placeholder values such as "varies by recipe".
df = df[df['재료개수'] > 1]

In [None]:
# Per-recipe list of raw ingredient entries, split on ', '.
ingredient_per_recipe_list = df['재료'].apply(str_to_list)

### 전처리 및 TF 벡터화

In [None]:
ingredient_per_recipe_list_converted = []

# Matches runs of Hangul syllables; brackets and any non-Korean text are skipped.
korean_run_pattern = re.compile("[가-힣]+")

for raw_ingredients in tqdm(df['재료']):
    processed_ingredients = []
    for raw_name in raw_ingredients.split(","):
        korean_runs = korean_run_pattern.findall(raw_name)
        if not korean_runs:
            continue
        # Keep only the first Hangul run of each entry.
        processed_ingredients.append(korean_runs[0])

    # Build the array once per recipe, after the inner loop. Creating it inside
    # the inner loop meant a recipe with no Hangul entry silently reused the
    # previous recipe's array (or raised NameError on the very first recipe).
    # Such a recipe now yields an empty string array.
    ingredient_per_recipe_list_converted.append(
        np.array(processed_ingredients, dtype=str)
    )

ingredient_per_recipe_list = ingredient_per_recipe_list_converted

In [None]:
# Inspect the per-recipe arrays of extracted Korean ingredient names.
ingredient_per_recipe_list

In [None]:
## Reduce the number of distinct ingredients by mapping to general names
def _drop_unmapped(names):
    """Remove entries that had no mapping (the string 'None' or a real None)."""
    names = names[names != np.array('None')]
    return names[names != np.array(None)]


df['재료_processed'] = ingredient_per_recipe_list
general_names_per_recipe = df['재료_processed'].apply(to_general_name)
ingredient_per_recipe_list = general_names_per_recipe.apply(_drop_unmapped)

In [None]:
# Every ingredient occurrence across all crawled recipes (with repeats).
all_ingredient_occurrences = [
    ingredient
    for recipe_ingredients in ingredient_per_recipe_list
    for ingredient in recipe_ingredients
]
before_len = len(all_ingredient_occurrences)

# Unique ingredient names, sorted. Iteration order of a bare set depends on
# string hash randomisation, so without sorting the vector column order would
# change from one kernel run to the next.
total_ingredient_list = sorted(set(all_ingredient_occurrences))
after_len = len(total_ingredient_list)

print(f'before:: {before_len}, after:: {after_len}')

In [None]:
# Renumber rows 0..n-1 after the earlier filtering so df rows line up
# positionally with ingredient_per_recipe_list.
df = df.reset_index(drop = True)

np_total_ingredient_list = np.array(total_ingredient_list)

# Column position of each ingredient, built once so each lookup is a dict hit
# rather than a full np.where scan. setdefault keeps the first position, which
# is what np.where(...)[0][0] returned.
ingredient_position = {}
for position, name in enumerate(total_ingredient_list):
    ingredient_position.setdefault(name, position)

# recipe id -> 0/1 vector with a 1 for every ingredient the recipe uses.
recipe_vector_dictionary = {}
for recipe_id, recipe_ingredients in zip(df['id'], ingredient_per_recipe_list):
    recipe_vector = np.zeros(len(np_total_ingredient_list))
    ingredients_location_list = [
        ingredient_position[ingredient] for ingredient in recipe_ingredients
    ]
    recipe_vector[ingredients_location_list] = 1
    recipe_vector_dictionary[recipe_id] = recipe_vector

In [None]:
# One row per recipe id and one 0/1 column per ingredient name.
df_vector = pd.DataFrame(recipe_vector_dictionary).transpose()
df_vector.columns = total_ingredient_list
df_vector

### Cosine Similarity 계산 및 추천

In [None]:
# Ingredients that every recommended recipe must contain.
musts = to_general_name(['두부', '김치', '스팸'])

# Ingredients available for cooking (presumably what the user has on hand).
my_ingredient = to_general_name(['대파', '마늘', '고추장', '콩', '라면스프', '햄', '계란'])

print(musts, my_ingredient)

In [None]:
# Keep only the recipes that use every must-have ingredient.
has_all_musts = (df_vector[list(musts)] != 0).all(axis=1)
df_vector_test = df_vector[has_all_musts].copy(deep=True)

filter_indices = df_vector_test.index
df_vector = df_vector.loc[filter_indices]

In [None]:
# Inspect the recipe vectors that remain after the must-have filter.
df_vector

In [None]:
# 0/1 vector over all known ingredients, with a 1 for each of my ingredients.
my_vector = np.zeros(len(np_total_ingredient_list))
ingredients_location_list = []
not_in_list = []  # my ingredients that never occur in the crawled recipes

# Position of each known ingredient (first occurrence wins). An explicit
# membership test replaces the bare `except:`, which also hid unrelated errors.
known_positions = {}
for position, name in enumerate(np_total_ingredient_list):
    known_positions.setdefault(name, position)

for ingredient in my_ingredient:
    if ingredient in known_positions:
        ingredients_location_list.append(known_positions[ingredient])
    else:
        not_in_list.append(ingredient)

# Set all matched positions in one assignment after the loop.
my_vector[ingredients_location_list] = 1

print(not_in_list)

In [None]:
# Inspect the 0/1 vector built from my ingredients.
my_vector

In [None]:
# Row labels (recipe ids) of the recipes that passed the must-have filter.
idx = df_vector.index
# scipy.spatial provides the cosine distance used for the similarity score.
from scipy import spatial

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(a, b)``, or ``0.0`` when either
        vector is all zeros.
    """
    # Cosine similarity is undefined for a zero vector and SciPy yields NaN,
    # which breaks the later ranking; treat such vectors as "no similarity".
    if not np.any(a) or not np.any(b):
        return 0.0
    return 1 - spatial.distance.cosine(a, b)

# Score every remaining recipe against my ingredient vector.
idx = df_vector.index
a = my_vector
index_dictionary = {
    recipe_id: cosine_similarity(a, np.array(df_vector.loc[recipe_id]))
    for recipe_id in tqdm(idx)
}

# Attach the scores as a 'similarity' column, keeping the recipe id as the index.
df_vector_copy = (
    df_vector.reset_index()
    .assign(similarity=lambda frame: frame['index'].map(index_dictionary))
    .set_index('index')
)
df_vector_copy

In [None]:
# Highest similarity score among the remaining recipes.
df_vector_copy['similarity'].max()

In [None]:
# The 12 highest similarity scores. Every recipe whose score equals one of them
# is kept, so ties can yield more than 12 recipes here.
top_similarities = df_vector_copy['similarity'].sort_values(ascending=False).head(12)
similar_recipe = df_vector_copy[df_vector_copy['similarity'].isin(top_similarities)]

# Of those recipes, keep the 12 with the largest '조회수' value.
matching_recipes = df[df['id'].isin(similar_recipe.index)]
df_similar_recipe = matching_recipes.sort_values(by='조회수', ascending=False).head(12)
df_similar_recipe.shape

In [None]:
# Ingredient vectors of the selected recipes, without the score column.
is_selected = df_vector_copy.index.isin(df_similar_recipe.id)
similar_recipe_vector = df_vector_copy[is_selected].drop(columns='similarity')
similar_recipe_vector

In [None]:
# Renumber the selected recipes 0..n-1, discarding the old index.
df_similar_recipe = df_similar_recipe.reset_index(drop=True)
df_similar_recipe

In [None]:
# Summary of each selected recipe, keyed by recipe id.
selected_recipe = {}

# Iterate over however many recipes were selected; a fixed range(12) raises
# IndexError when fewer than 12 recipes survive the filters.
for x in range(len(df_similar_recipe)):
    temp = df_similar_recipe.iloc[x, :]
    key = temp['id']
    # Fields are read by column position with .iloc; plain integer keys on a
    # label-indexed Series are deprecated positional access in recent pandas.
    # NOTE(review): positions 2/7/5/8/9 are assumed to be title, ingredients,
    # cooking time, image URL and recipe URL — confirm against the CSV layout.
    selected_recipe[int(key)] = {
        '제목': temp.iloc[2],
        '재료': temp.iloc[7],
        '소요시간': temp.iloc[5],
        '이미지url': temp.iloc[8],
        '레시피url': temp.iloc[9],
    }
selected_recipe

### 사용한 재료, 더 필요한 재료 확인

In [None]:
def check_ingredient(similar_recipe_vector, my_vector):
    """Compare each recipe's ingredient vector with the user's vector.

    Parameters
    ----------
    similar_recipe_vector : pandas.DataFrame
        One row per recipe and one 0/1 column per ingredient name.
    my_vector : array-like
        0/1 vector with one entry per column of ``similar_recipe_vector``.

    Returns
    -------
    tuple of (dict, dict)
        ``necessary_dictionary`` maps each row label to the ingredients the
        recipe uses that are absent from ``my_vector``; ``unused_dictionary``
        maps each row label to the ingredients set in ``my_vector`` that the
        recipe does not use.
    """
    names = np.array(similar_recipe_vector.columns)
    # +1: the recipe needs it but we lack it; -1: we have it but it goes unused.
    difference = similar_recipe_vector - my_vector

    necessary_dictionary = {}
    unused_dictionary = {}
    for recipe_label in difference.index:
        row = np.array(difference.loc[recipe_label])
        missing_positions = np.nonzero(row == 1)[0]
        leftover_positions = np.nonzero(row == -1)[0]
        necessary_dictionary[recipe_label] = list(names[missing_positions])
        unused_dictionary[recipe_label] = list(names[leftover_positions])

    return necessary_dictionary, unused_dictionary

In [None]:
# a: recipe id -> ingredients still needed; b: recipe id -> my ingredients the recipe does not use.
a, b= check_ingredient(similar_recipe_vector, my_vector)