In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [92]:
# Settings
location = "강남역"
keyword = "맛집"
low_rate = 70  # percentile cutoff: only similarities at or above this percentile are averaged

# Numeric columns whose missing values are replaced with 0.
columns_to_zero = [
    'instagram_post',
    'instagram_follower',
    'visitor_review_count',
    'blog_review_count',
    'distance_from_subway',
]

# Flag columns whose missing values are replaced with False.
columns_to_false = [
    'on_tv',
    'seoul_michelin',
    'on_blue_ribbon',
    "no_kids",
    "parking_available",
    "hot_spot",
    "age-2030",
    "gender-balance",
    "new_store",
]

def calculate_consine_similarity(df1, df2):
    """Compute pairwise cosine similarity between the rows of two tables.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity``.

    Args:
        df1: 2-D array-like / DataFrame of feature rows.
        df2: 2-D array-like / DataFrame with the same feature columns as ``df1``.

    Returns:
        The similarity matrix returned by scikit-learn, indexed as
        ``[row of df1][row of df2]``.
    """
    pairwise_scores = cosine_similarity(df1, df2)
    return pairwise_scores

# Data Loading

def _load_shop_table(csv_path):
    """Read one shop CSV and fill in missing feature values.

    The 'Unnamed: 0' column (a previously saved index) is dropped, missing
    values in ``columns_to_zero`` become 0 and missing values in
    ``columns_to_false`` become False.

    The filled columns are assigned back to the frame instead of calling
    ``frame[col].fillna(..., inplace=True)``: that chained in-place form is
    deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    table = pd.read_csv(csv_path, header=0, encoding='utf-8-sig').drop(columns='Unnamed: 0')
    table[columns_to_zero] = table[columns_to_zero].fillna(0)
    table[columns_to_false] = table[columns_to_false].fillna(False)
    return table.reset_index(drop=True)


crawled_data = _load_shop_table(f"data/crawl_result/{location}{keyword}.csv")
datepop_data = _load_shop_table("data/datepop/shop_food.csv")

# Type Conversion
str_to_int_features = ['instagram_post', 'instagram_follower', 'visitor_review_count', 'blog_review_count', 'distance_from_subway']

crawled_data[str_to_int_features] = crawled_data[str_to_int_features].astype(int)
datepop_data[str_to_int_features] = datepop_data[str_to_int_features].astype(int)

# Scaling is applied to a copy so crawled_data keeps the raw counts for display.
handled_data = crawled_data.copy()


# Feature Scaling
numeric_features = ['instagram_post', 'instagram_follower', 'visitor_review_count', 'blog_review_count', 'distance_from_subway']
scaler = MinMaxScaler()

# The scaler is fitted on the crawled shops only and then reused for the
# reference shops so both tables share one scale.
# NOTE(review): reference values outside the crawled min/max will land outside [0, 1] - confirm this is intended.
handled_data[numeric_features] = scaler.fit_transform(handled_data[numeric_features])
datepop_data[numeric_features] = scaler.transform(datepop_data[numeric_features])

# Similarity Calculation
similarity_features = ['instagram_post', 'instagram_follower', 'visitor_review_count', 'blog_review_count', 
                       'distance_from_subway', 'new_store', 'gender-balance', 'age-2030', 'hot_spot', 
                       'parking_available', 'no_kids', 'on_tv', 'seoul_michelin', 'on_blue_ribbon']

# One pairwise call replaces the per-row loop: entry [i, j] is the similarity
# between crawled shop i and reference shop j.
similarity_matrix = calculate_consine_similarity(
    handled_data[similarity_features], datepop_data[similarity_features])

# For every crawled shop, keep only the similarities at or above its own
# `low_rate`-th percentile and average them.
row_thresholds = np.percentile(similarity_matrix, low_rate, axis=1, keepdims=True)
top_mask = similarity_matrix >= row_thresholds
# Each row's maximum always reaches its threshold, so the divisor is never 0.
average_similarity = (
    np.where(top_mask, similarity_matrix, 0.0).sum(axis=1) / top_mask.sum(axis=1)
).tolist()

crawled_data.insert(3, 'similarity', average_similarity)
# pd.DataFrame(average_similarity).describe()

### 6. Total score

In [93]:
# Total score: 100 ** similarity, computed for the whole column at once.

# DataFrame.insert raises ValueError when the column already exists, so drop a
# previous result first; this keeps the cell safe to re-run.
if 'total_score' in crawled_data.columns:
    crawled_data = crawled_data.drop(columns='total_score')

total_scores = np.power(100.0, crawled_data["similarity"].to_numpy(dtype=float)).tolist()

crawled_data.insert(4, 'total_score', total_scores)
# Highest score first, with the index renumbered 0..n-1.
crawled_data = crawled_data.sort_values(by="total_score", ascending=False,
                                        ignore_index=True)



Unnamed: 0,store_id,name,category,similarity,total_score,is_food,new_store,instagram_link,instagram_post,instagram_follower,...,no_kids,pet_available,seoul_michelin,age-2030,gender-balance,on_blue_ribbon,image_urls,address,phone,gps
0,1317157725,레뽀드라라 강남점,카페,0.928796,72.04317,True,False,https://www.instagram.com/repos_de_lala,82,251,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 테헤란로7길 22 2관 101호,0507-1352-3177,"{'latitude': 37.5003173, 'longitude': 127.0306..."
1,1712856289,류센소 강남점,일본식라면,0.928435,71.923271,True,False,https://www.instagram.com/ryusenso.gangnam,9,88,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 강남대로102길 31 1층,0507-1342-8725,"{'latitude': 37.50309500000002, 'longitude': 1..."
2,19948255,진대감 역삼점,소고기구이,0.925823,71.063291,True,False,https://www.instagram.com/jindaegam_official,128,2176,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 봉은사로30길 75 1층,0507-1346-2472,"{'latitude': 37.5027032, 'longitude': 127.0371..."
3,1853785709,배드라핀 [Bad Lapin],와인,0.924342,70.580451,True,False,https://www.instagram.com/badlapin_,25,1593,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 논현로97길 23 1층 카페,0507-1310-7913,"{'latitude': 37.5028125549704, 'longitude': 12..."
4,1136933064,호보식당,한식,0.91482,67.55242,True,False,https://instagram.com/hobo__restaurant,173,661,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 논현로85길 43 1층,02-508-3992,"{'latitude': 37.4980897, 'longitude': 127.0346..."
5,1932467805,보글이 생태탕왕코다리 양재역점,"해물,생선요리",0.912736,66.907162,True,False,,0,0,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 서초구 강남대로37길 56-18 서초에메랄드빌딩 1층,0507-1388-7709,"{'latitude': 37.4854129, 'longitude': 127.0305..."
6,1912493531,데일리픽스,양식,0.906885,65.128288,True,False,https://www.instagram.com/daily.fix.seoul,108,1093,...,False,True,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 논현로85길 70 1층 102호,0507-1346-2468,"{'latitude': 37.4976844, 'longitude': 127.03257}"
7,1381311081,진짜해산물 역삼점,조개요리,0.90269,63.882104,True,False,https://www.instagram.com/jinhae_official,40,1108,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 언주로81길 8 1층 2호,0507-1351-3943,"{'latitude': 37.5003645, 'longitude': 127.0428..."
8,1081029406,쇼쿠지 양재점,일식당,0.901945,63.66331,True,False,,0,0,...,False,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 서초구 서운로6길 19 1층,0507-1349-8740,"{'latitude': 37.485876384532716, 'longitude': ..."
9,1882132325,비아살라리아,베이커리,0.901327,63.482575,True,False,https://www.instagram.com/via_salaria,12,312,...,False,True,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,서울 강남구 역삼로7길 10 융전빌딩 101호,0507-1431-2297,"{'latitude': 37.4948832, 'longitude': 127.0331..."


In [94]:
# crawled_data.head(20).to_csv(f'data/crawl_score/only_similarity/{location}{keyword}_top20_{low_rate}.csv', encoding='utf-8-sig')