# foodPoint

In [69]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class AlcoholRecommender:
    def __init__(self, data_path):
        # 데이터 불러오기
        self.df = pd.read_csv(data_path, header=0)
        self.df['foodsPoint'] = self.df['foodsPoint'].fillna('')

    def search_index(self, input_name):
        # 입력된 술 이름에 해당하는 인덱스 찾기
        try:
            target_index = self.df[self.df.iloc[:, 1] == input_name].index[0]
            return target_index
        except IndexError:
            return "술 이름을 찾을 수 없습니다."

    def calculate_cosine_similarity(self):
        # 텍스트 데이터를 벡터화 시키고 코사인 유사도 구하기
        count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
        data_mat = count_vect.fit_transform(self.df['foodsPoint'])
        similarity_matrix = cosine_similarity(data_mat, data_mat)
        return similarity_matrix

    def find_sim_alcohol(self, product_name, top_n=3):
        target_index = self.search_index(product_name)
        if isinstance(target_index, str):
            return target_index

        similarities = self.calculate_cosine_similarity()
        similar_indices = similarities[target_index].argsort()[::-1][:top_n]

        top_similar_drinks_data = self.df.iloc[similar_indices]
        top_similar_drinks_similarities = similarities[target_index, similar_indices]

        top_similar_drinks_data['유사도'] = top_similar_drinks_similarities
        top_similar_drinks_data = top_similar_drinks_data.drop(index=target_index)
        return top_similar_drinks_data


In [70]:
# 객체 생성
recommender = AlcoholRecommender('/content/drive/MyDrive/CJ올리브네트웍스/추천시스템/final.csv')

# 유사한 제품 검색 (입력 제품 하나만 제외)
similar_products = recommender.find_sim_alcohol('양지백주', top_n=3)
pd.DataFrame(similar_products)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_similar_drinks_data['유사도'] = top_similar_drinks_similarities


Unnamed: 0,category,name,url,imagePath,comment,score,reviewCount,kind,alcoholDegree,capacity,...,foodsPoint,specialPoint,foods,sweetFlavor,sourFlavor,bodyFlavor,carbonicFlavor,tanninFlavor,en_foods,유사도
26,chungju,모든날에,https://www.sooldamhwa.com/damhwaMarket/detail...,images/chungju_59.jpg,전통누룩을 사용하여 3번 빚은 약주,4.5,2,약주,15.00%,500ml,...,짭조름한 간장 베이스 음식과 함께 즐겨보세요,3번의 발효 과정을 거친 전통 방식으로 빚어진 술이에요,"간장 게장,고등어구이,찜닭",2,4,2,-1,4,"gejang,grilled_fish,jjimdak",0.250873
7,chungju,궁중술 왕주,https://www.sooldamhwa.com/damhwaMarket/detail...,images/chungju_7.jpg,왕실에서 비밀스럽게 전수된 궁중비법으로 빚은 술,4.9,147,살균약주,13.00%,375ml,...,달짝지근한 맛이 감도는 간장 베이스 음식과 함께 해보세요,"구기자, 국화, 솔잎이 들어간 살균 약주","바지락 술찜,불고기,찜닭",3,3,3,-1,2,"nagasaki_seafood_noodles,bulgogi,jjimdak",0.214834


# foodsPoint + foods

In [71]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class AlcoholRecommender:
    def __init__(self, data_path):
        # 데이터 불러오기
        self.df = pd.read_csv(data_path, header=0)
        self.df['combined'] = self.df['foodsPoint'] + ',' + self.df['foods']
        self.df['combined'] = self.df['combined'].fillna('')

    def search_index(self, input_name):
        # 입력된 술 이름에 해당하는 인덱스 찾기
        try:
            target_index = self.df[self.df.iloc[:, 1] == input_name].index[0]
            return target_index
        except IndexError:
            return "술 이름을 찾을 수 없습니다."

    def calculate_cosine_similarity(self):
        # 텍스트 데이터를 벡터화 시키고 코사인 유사도 구하기
        count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
        data_mat = count_vect.fit_transform(self.df['combined'])
        similarity_matrix = cosine_similarity(data_mat, data_mat)
        return similarity_matrix

    def find_sim_alcohol(self, product_name, top_n=3):
        target_index = self.search_index(product_name)
        if isinstance(target_index, str):
            return target_index

        similarities = self.calculate_cosine_similarity()
        similar_indices = similarities[target_index].argsort()[::-1][:top_n]

        top_similar_drinks_data = self.df.iloc[similar_indices]
        top_similar_drinks_similarities = similarities[target_index, similar_indices]

        top_similar_drinks_data['유사도'] = top_similar_drinks_similarities
        top_similar_drinks_data = top_similar_drinks_data.drop(index=target_index)
        return top_similar_drinks_data


In [72]:
# 객체 생성
recommender = AlcoholRecommender('/content/drive/MyDrive/CJ올리브네트웍스/추천시스템/final.csv')

# 유사한 제품 검색 (입력 제품 하나만 제외)
similar_products = recommender.find_sim_alcohol('양지백주', top_n=3)
pd.DataFrame(similar_products)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_similar_drinks_data['유사도'] = top_similar_drinks_similarities


Unnamed: 0,category,name,url,imagePath,comment,score,reviewCount,kind,alcoholDegree,capacity,...,specialPoint,foods,sweetFlavor,sourFlavor,bodyFlavor,carbonicFlavor,tanninFlavor,en_foods,combined,유사도
155,soju,여유 19%,https://www.sooldamhwa.com/damhwaMarket/detail...,images/soju_56.jpg,우렁이농법으로 재배한 무농약 쌀로 만든 소주,4.7,25,증류식소주,19.00%,375ml,...,양촌양조장의 송광소주를 시대에 맞게 복원한 술,"소 불고기,제육 볶음,콩나물 무침",-1,-1,2,-1,-1,"bulgogi,bulgogi,bean_sprout_salad","반주를 즐기고 싶은 날, 함께 해보세요!,소 불고기,제육 볶음,콩나물 무침",0.137649
26,chungju,모든날에,https://www.sooldamhwa.com/damhwaMarket/detail...,images/chungju_59.jpg,전통누룩을 사용하여 3번 빚은 약주,4.5,2,약주,15.00%,500ml,...,3번의 발효 과정을 거친 전통 방식으로 빚어진 술이에요,"간장 게장,고등어구이,찜닭",2,4,2,-1,4,"gejang,grilled_fish,jjimdak","짭조름한 간장 베이스 음식과 함께 즐겨보세요,간장 게장,고등어구이,찜닭",0.130931


# Flavor + foodsPoint

In [73]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class AlcoholRecommender:
    def __init__(self, data_path):
        # 데이터 불러오기
        self.df = pd.read_csv(data_path, header=0)
        self.df['foodsPoint'] = self.df['foodsPoint'].fillna('')

        # 열에 접두사 추가 및 'flavors', 'combined' 열 생성
        self.df['sweetFlavor'] = 'sweet_' + self.df['sweetFlavor'].astype(str)
        self.df['sourFlavor'] = 'sour_' + self.df['sourFlavor'].astype(str)
        self.df['bodyFlavor'] = 'body_' + self.df['bodyFlavor'].astype(str)
        self.df['carbonicFlavor'] = 'carbon_' + self.df['carbonicFlavor'].astype(str)
        self.df['tanninFlavor'] = 'tannin_' + self.df['tanninFlavor'].astype(str)
        self.df['flavors'] = self.df['sweetFlavor'] + ',' + self.df['sourFlavor'] + ',' + self.df['bodyFlavor'] + ',' + self.df['carbonicFlavor'] + ',' + self.df['tanninFlavor']
        self.df['combined'] = self.df['foodsPoint'] + ',' + self.df['flavors']

    def search_index(self, input_name):
        # 입력된 술 이름에 해당하는 인덱스 찾기
        try:
            target_index = self.df[self.df.iloc[:, 1] == input_name].index[0]
            return target_index
        except IndexError:
            return "술 이름을 찾을 수 없습니다."

    def calculate_cosine_similarity(self):
        # 텍스트 데이터를 벡터화 시키고 코사인 유사도 구하기
        count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
        data_mat = count_vect.fit_transform(self.df['combined'])
        similarity_matrix = cosine_similarity(data_mat, data_mat)
        return similarity_matrix

    def find_sim_alcohol(self, product_name, top_n=3):
        target_index = self.search_index(product_name)
        if isinstance(target_index, str):
            return target_index

        similarities = self.calculate_cosine_similarity()
        similar_indices = similarities[target_index].argsort()[::-1][:top_n]

        top_similar_drinks_data = self.df.iloc[similar_indices]
        top_similar_drinks_similarities = similarities[target_index, similar_indices]

        top_similar_drinks_data['유사도'] = top_similar_drinks_similarities
        top_similar_drinks_data = top_similar_drinks_data.drop(index=target_index)
        return top_similar_drinks_data


In [74]:
# 객체 생성
recommender = AlcoholRecommender('/content/drive/MyDrive/CJ올리브네트웍스/추천시스템/final.csv')

# 유사한 제품 검색 (입력 제품 하나만 제외)
similar_products = recommender.find_sim_alcohol('양지백주', top_n=3)
pd.DataFrame(similar_products)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_similar_drinks_data['유사도'] = top_similar_drinks_similarities


Unnamed: 0,category,name,url,imagePath,comment,score,reviewCount,kind,alcoholDegree,capacity,...,foods,sweetFlavor,sourFlavor,bodyFlavor,carbonicFlavor,tanninFlavor,en_foods,flavors,combined,유사도
273,takju,지란지교 탁주,https://www.sooldamhwa.com/damhwaMarket/detail...,images/takju_41.jpg,진짜 수제 막걸리의 품격,4.6,13,탁주,13.00%,500ml,...,"라자냐,가지볶음,버터 갈릭 새우",sweet_3,sour_4,body_4,carbon_0,tannin_2,"lasana,stir_fried_eggplant,croque_monsieur","sweet_3,sour_4,body_4,carbon_0,tannin_2","풍미가 짙은 음식과 함께해 보세요,sweet_3,sour_4,body_4,carbo...",0.478365
235,takju,배금도가 수제 막걸리,https://www.sooldamhwa.com/damhwaMarket/detail...,images/takju_31.jpg,수제 누룩으로 만든 술,4.9,19,탁주,12.00%,"500ml, 1000ml",...,"막창,돼지껍데기,통닭 구이",sweet_2,sour_4,body_4,carbon_0,tannin_2,"gopchanggui,grilled_pork,chicken","sweet_2,sour_4,body_4,carbon_0,tannin_2","기름진 음식과 즐겨보세요,sweet_2,sour_4,body_4,carbon_0,t...",0.430706


# Flavor + foodsPoint + foods

In [75]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class AlcoholRecommender:
    def __init__(self, data_path):
        # 데이터 불러오기
        self.df = pd.read_csv(data_path, header=0)
        self.df['foodsPoint'] = self.df['foodsPoint'].fillna('')

        # 열에 접두사 추가 및 'flavors', 'combined' 열 생성
        self.df['sweetFlavor'] = 'sweet_' + self.df['sweetFlavor'].astype(str)
        self.df['sourFlavor'] = 'sour_' + self.df['sourFlavor'].astype(str)
        self.df['bodyFlavor'] = 'body_' + self.df['bodyFlavor'].astype(str)
        self.df['carbonicFlavor'] = 'carbon_' + self.df['carbonicFlavor'].astype(str)
        self.df['tanninFlavor'] = 'tannin_' + self.df['tanninFlavor'].astype(str)
        self.df['flavors'] = self.df['sweetFlavor'] + ',' + self.df['sourFlavor'] + ',' + self.df['bodyFlavor'] + ',' + self.df['carbonicFlavor'] + ',' + self.df['tanninFlavor']
        self.df['combined'] = self.df['foods'] + ',' + self.df['foodsPoint'] + ',' + self.df['flavors']
        self.df['combined'] = self.df['combined'].fillna('')

    def search_index(self, input_name):
        # 입력된 술 이름에 해당하는 인덱스 찾기
        try:
            target_index = self.df[self.df.iloc[:, 1] == input_name].index[0]
            return target_index
        except IndexError:
            return "술 이름을 찾을 수 없습니다."

    def calculate_cosine_similarity(self):
        # 텍스트 데이터를 벡터화 시키고 코사인 유사도 구하기
        count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
        data_mat = count_vect.fit_transform(self.df['combined'])
        similarity_matrix = cosine_similarity(data_mat, data_mat)
        return similarity_matrix

    def find_sim_alcohol(self, product_name, top_n=3):
        target_index = self.search_index(product_name)
        if isinstance(target_index, str):
            return target_index

        similarities = self.calculate_cosine_similarity()
        similar_indices = similarities[target_index].argsort()[::-1][:top_n]

        top_similar_drinks_data = self.df.iloc[similar_indices]
        top_similar_drinks_similarities = similarities[target_index, similar_indices]

        top_similar_drinks_data['유사도'] = top_similar_drinks_similarities
        top_similar_drinks_data = top_similar_drinks_data.drop(index=target_index)
        return top_similar_drinks_data


In [77]:
# 객체 생성
recommender = AlcoholRecommender('/content/drive/MyDrive/CJ올리브네트웍스/추천시스템/final.csv')

# 유사한 제품 검색 (입력 제품 하나만 제외)
similar_products = recommender.find_sim_alcohol('양지백주', top_n=3)
pd.DataFrame(similar_products)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_similar_drinks_data['유사도'] = top_similar_drinks_similarities


Unnamed: 0,category,name,url,imagePath,comment,score,reviewCount,kind,alcoholDegree,capacity,...,foods,sweetFlavor,sourFlavor,bodyFlavor,carbonicFlavor,tanninFlavor,en_foods,flavors,combined,유사도
273,takju,지란지교 탁주,https://www.sooldamhwa.com/damhwaMarket/detail...,images/takju_41.jpg,진짜 수제 막걸리의 품격,4.6,13,탁주,13.00%,500ml,...,"라자냐,가지볶음,버터 갈릭 새우",sweet_3,sour_4,body_4,carbon_0,tannin_2,"lasana,stir_fried_eggplant,croque_monsieur","sweet_3,sour_4,body_4,carbon_0,tannin_2","라자냐,가지볶음,버터 갈릭 새우,풍미가 짙은 음식과 함께해 보세요,sweet_3,s...",0.313882
283,takju,해창 12%,https://www.sooldamhwa.com/damhwaMarket/detail...,images/takju_46.jpg,해창주조장의 대표 막걸리,4.2,10,탁주,12%,900ml,...,"떡갈비,육회,갈비 구이",sweet_3,sour_4,body_4,carbon_0,tannin_2,"galbi,beef_tartare,galbi","sweet_3,sour_4,body_4,carbon_0,tannin_2","떡갈비,육회,갈비 구이,술만큼이나 강한 풍미가 있는 기름진 음식과 함께해 보세요,s...",0.294245


# 검증

- 목표: 목표에 따라 임계값을 설정 -> 기준이 없음
- 높은 유사성 값 (크로스 임계값)을 가진 항목 쌍을 '유사한 항목'으로 정의
- 낮은 유사성 값 (크로스 임계값 아래)을 가진 항목 쌍을 '유사하지 않은 항목'으로 정의

- 다만, 인풋 안주에 대한 술 추천이 해당 추천시스템의 취지이므로 단순히 유사성이 높은 것으로 추천하는 것이 아니라 foodsPoint를 기준으로 하여 여러 컬럼을 함께 비교하였을 때, 유사성이 높은 것으로 한다.