### 점수 산출을 임의로 해볼 수 있는 파일입니다.

In [43]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, RobustScaler

from utils.calculate_score import calculate_score
from utils.load_datepop import load_datepop

In [44]:
# Crawl targets: one entry per location, each with the search keywords to
# crawl there. Every location shares the same five base keywords; some add
# location-specific extras on top.
crawling_dict_list = [
    {
        "location": place,
        "keyword": ["맛집", "공방", "만화카페", "커플 스튜디오", "동물카페"] + extra_keywords,
    }
    for place, extra_keywords in (
        ("강남역", []),
        ("가로수길", []),
        ("대학로", ["연극"]),
        ("홍대", ["연극"]),
        ("연남동", []),
    )
]

In [67]:
class CrawlingDataScorer:
    """Rank crawled stores by a rule-based score plus similarity to Datepop stores.

    The pipeline (see ``scoring``) loads the Datepop reference stores, cleans
    both data sets, scales the numeric features, and then adds three columns
    to ``crawled_data``: ``score`` (from ``calculate_score``), ``similarity``
    (cosine similarity to the Datepop stores, rescaled) and ``total_score``
    (their sum). ``crawled_data`` ends up sorted by ``total_score``,
    highest first.
    """

    # Count-like features: scaled, and used as the similarity vector.
    NUMERIC_FEATURES = ['instagram_post', 'instagram_follower',
                        'visitor_review_count', 'blog_review_count']
    DISTANCE_FEATURES = ['distance_from_subway']
    # Columns cast to int by ``str_to_int``.
    INT_FEATURES = NUMERIC_FEATURES + DISTANCE_FEATURES
    # Flag columns; a missing flag is treated as False.
    BOOLEAN_FEATURES = ['on_tv', 'seoul_michelin', 'on_blue_ribbon', "no_kids",
                        "parking_available", "hot_spot", "age-2030",
                        "gender-balance", "new_store"]

    # NOTE(review): the original code called these groups "missing_to_zero"
    # and "missing_to_inf" but filled them with 1 and 1000 respectively. The
    # values are kept as they were; confirm they are intentional.
    MISSING_COUNT_FILL = 1
    MISSING_DISTANCE_FILL = 1000

    # Only similarities at or above this percentile are averaged per store.
    SIMILARITY_TOP_PERCENTILE = 70
    # similarity score = SIMILARITY_SCALE * 10 ** mean_top_similarity
    SIMILARITY_SCALE = 5
    # Column position at which result columns are inserted.
    RESULT_COLUMN_POSITION = 2

    def __init__(self, crawled_data, location, keyword, is_food):
        """Store the crawl result and its metadata; no processing happens here.

        Args:
            crawled_data: DataFrame of crawled stores to be scored.
            location: Location label of the crawl (stored only).
            keyword: Search keyword of the crawl (stored only).
            is_food: Whether the keyword is a food keyword (stored only).
        """
        self.location = location
        self.keyword = keyword
        self.is_food = is_food

        self.datepop_data = pd.DataFrame()
        self.crawled_data = crawled_data
        self.scaled_crawled_data = pd.DataFrame()

    def fill_missing_value(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* with missing values filled and index reset.

        Missing count features become ``MISSING_COUNT_FILL``, missing
        distances become ``MISSING_DISTANCE_FILL``, and missing flags become
        ``False`` (flag columns are cast to ``bool``). The input frame is
        left untouched.
        """
        # Work on a copy so neither the caller's frame nor a filtered slice
        # of another frame is modified behind its back.
        data = data.copy()

        for column in self.NUMERIC_FEATURES:
            data[column] = data[column].fillna(self.MISSING_COUNT_FILL)

        for column in self.BOOLEAN_FEATURES:
            # Replace missing entries through an explicit mask on an object
            # array instead of ``fillna(False)``: filling an object column
            # and letting pandas downcast it is deprecated.
            missing = data[column].isna().to_numpy()
            flags = data[column].to_numpy(dtype=object, copy=True)
            flags[missing] = False
            data[column] = flags.astype(bool)

        for column in self.DISTANCE_FEATURES:
            data[column] = data[column].fillna(self.MISSING_DISTANCE_FILL)

        return data.reset_index(drop=True)

    def str_to_int(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* with the count and distance columns cast to int.

        The cast is a plain ``astype(int)``, so it fails on missing values;
        call ``fill_missing_value`` first.
        """
        return data.astype({feature: int for feature in self.INT_FEATURES})

    def load_and_preprocess_datepop_data(self):
        """Load the Datepop stores, filter incomplete ones, and clean them.

        Stores are dropped when they have no Instagram link (null or empty
        string) or when any count feature is null or 0. The result is stored
        in ``self.datepop_data``.
        """
        datepop_data = load_datepop()

        # Exclude stores without an Instagram link.
        instagram_link = datepop_data['instagram_link']
        datepop_data = datepop_data[instagram_link.notna() & (instagram_link != '')]

        # Exclude stores where any count feature is missing or zero.
        counts = datepop_data[self.NUMERIC_FEATURES]
        has_all_counts = counts.notna().all(axis=1) & (counts != 0).all(axis=1)
        datepop_data = datepop_data[has_all_counts]

        datepop_data = self.fill_missing_value(datepop_data)
        self.datepop_data = self.str_to_int(datepop_data)

    def preprocess_crawled_data(self):
        """Fill missing values in the crawled stores and cast numeric columns to int."""
        self.crawled_data = self.fill_missing_value(self.crawled_data)
        self.crawled_data = self.str_to_int(self.crawled_data)

    def scaling_numeric_features(self):
        """Scale the numeric features for the similarity computation.

        A ``RobustScaler`` followed by a ``MinMaxScaler`` is fitted on the
        crawled stores only; the Datepop stores are transformed with the same
        fitted scalers. Scaled crawled values go to
        ``self.scaled_crawled_data`` (the raw values in ``self.crawled_data``
        are kept); the Datepop columns are overwritten in place.
        """
        features = self.NUMERIC_FEATURES

        robust_scaler = RobustScaler()
        crawled_robust = robust_scaler.fit_transform(self.crawled_data[features])
        datepop_robust = robust_scaler.transform(self.datepop_data[features])

        min_max_scaler = MinMaxScaler()
        self.scaled_crawled_data = pd.DataFrame(
            min_max_scaler.fit_transform(crawled_robust),
            columns=features,
            index=self.crawled_data.index,
        )
        self.datepop_data[features] = min_max_scaler.transform(datepop_robust)

    def _insert_result_column(self, name, values):
        """Insert *values* as column *name* near the front of ``crawled_data``.

        An existing column of the same name is replaced, so the scoring
        steps can be run again without ``DataFrame.insert`` raising on a
        duplicate column.
        """
        if name in self.crawled_data.columns:
            self.crawled_data = self.crawled_data.drop(columns=name)
        position = min(self.RESULT_COLUMN_POSITION, self.crawled_data.shape[1])
        self.crawled_data.insert(position, name, values)

    def calculate_condition_score(self):
        """Add the ``score`` column: ``calculate_score`` applied to every crawled store."""
        scores = [calculate_score(store) for _, store in self.crawled_data.iterrows()]
        self._insert_result_column('score', scores)

    def calculate_similarity_score(self):
        """Add the ``similarity`` column based on cosine similarity to Datepop stores.

        For each crawled store, the cosine similarities to all Datepop stores
        are computed on the scaled numeric features; those at or above the
        ``SIMILARITY_TOP_PERCENTILE``-th percentile are averaged, and the
        average ``s`` is mapped to ``SIMILARITY_SCALE * 10 ** s``.
        """
        features = self.NUMERIC_FEATURES

        average_similarity = []
        if not self.scaled_crawled_data.empty:
            # One call yields the full (crawled x datepop) similarity matrix.
            similarity_matrix = cosine_similarity(
                self.scaled_crawled_data[features], self.datepop_data[features])
            for similarities in similarity_matrix:
                threshold = np.percentile(similarities, self.SIMILARITY_TOP_PERCENTILE)
                average_similarity.append(
                    np.mean(similarities[similarities >= threshold]))

        similarity_scores = [self.SIMILARITY_SCALE * math.pow(10, sim)
                             for sim in average_similarity]
        self._insert_result_column('similarity', similarity_scores)

    def calculate_total_score(self):
        """Add ``total_score`` (= ``similarity`` + ``score``) and sort by it, descending."""
        # Positional values (not an index-aligned Series), matching row order.
        total_scores = (self.crawled_data["similarity"]
                        + self.crawled_data["score"]).to_numpy()

        self._insert_result_column('total_score', total_scores)
        self.crawled_data = self.crawled_data.sort_values(
            by="total_score", ascending=False, ignore_index=True)

    def scoring(self):
        """Run the whole pipeline; the ranked result is left in ``self.crawled_data``."""
        self.load_and_preprocess_datepop_data()
        self.preprocess_crawled_data()
        self.scaling_numeric_features()
        self.calculate_condition_score()
        self.calculate_similarity_score()
        self.calculate_total_score()

    # Disabled in the original file: export of the top-20 result to CSV.
    # def save_result(self):
    #     self.crawled_data.head(20).to_csv(f'data/crawl_score/hybrid/{self.location}{self.keyword}_top20.csv', encoding='utf-8-sig')

In [68]:
# Score one crawl result (location + keyword) and preview the 20 best stores.
location = "성수동"
keyword = "맛집"

crawl_result_path = f'data/crawl_result/{location}{keyword}.csv'
crawled_data = pd.read_csv(crawl_result_path)

# Only the "맛집" keyword counts as a food search.
is_food = keyword == "맛집"

scorer = CrawlingDataScorer(
    crawled_data=crawled_data,
    location=location,
    keyword=keyword,
    is_food=is_food,
)
scorer.scoring()

scorer.crawled_data.sort_values(by="total_score", ascending=False).head(20)

  data[column] = data[column].fillna(False).astype('bool')
  data[column] = data[column].fillna(False).astype('bool')
  data[column] = data[column].fillna(False).astype('bool')
  data[column] = data[column].fillna(False).astype('bool')
  data[column] = data[column].fillna(False).astype('bool')


Unnamed: 0.1,Unnamed: 0,store_id,total_score,similarity,score,name,category,is_food,new_store,instagram_link,...,no_kids,seoul_michelin,age-2030,gender-balance,on_blue_ribbon,image_urls,running_well,address,phone,gps
0,366,1422706717,80.214055,44.214055,36,텐웰즈,바(BAR),True,True,https://www.instagram.com/10_wells,...,True,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,False,서울 강남구 강남대로156길 46 지하1층,0507-1437-0381,"{'latitude': 37.519009, 'longitude': 127.0226277}"
1,173,1144909027,79.847688,49.847688,30,옥스라이브파이어그릴 성수,양식,True,True,https://www.instagram.com/ox_seongsu,...,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,False,서울 성동구 둘레15길 11 1층,0507-1406-0600,"{'latitude': 37.5357736, 'longitude': 127.0555..."
2,383,1562260637,78.687841,48.687841,30,식물와인바,와인,True,True,https://www.instagram.com/sikmul_winebar,...,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,False,서울 광진구 동일로24길 64 1층 식물와인바,0507-1329-4552,"{'latitude': 37.5429525, 'longitude': 127.0685..."
3,5,1513343333,76.597834,49.597834,27,라바트리 서울숲,카페,True,True,https://www.instagram.com/lavatree_coffee,...,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,True,서울 성동구 아차산로 6 5층,0507-1326-9767,"{'latitude': 37.5476731, 'longitude': 127.0451..."
4,283,1736738437,75.837606,49.837606,26,에스메로,스페인음식,True,True,https://www.instagram.com/esmero.seoul,...,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,True,서울 성동구 뚝섬로13길 36 9층 & 루프탑,02-6951-2884,"{'latitude': 37.5412877, 'longitude': 127.0581..."
5,410,1232930088,75.837351,49.837351,26,오제이바,술집,True,True,https://www.instagram.com/ojbar0,...,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,False,서울 강남구 도산대로37길 13 1층,0507-1363-6317,"{'latitude': 37.5219023, 'longitude': 127.031184}"
6,408,1163328133,75.427618,48.427618,27,쑬로,술집,True,True,https://www.instagram.com/ssulo___norja,...,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,False,서울 강남구 선릉로161길 22 지하1층,0507-1393-1076,"{'latitude': 37.527549, 'longitude': 127.0386672}"
7,400,1497728699,75.326361,49.326361,26,드레자양,요리주점,True,True,https://www.instagram.com/d.re__jayang,...,False,False,False,False,False,[],False,서울 광진구 뚝섬로35길 23 1층 검은색 문 돌손잡이,0507-1413-3569,"{'latitude': 37.5358955, 'longitude': 127.0705..."
8,186,1937156788,75.022536,48.022536,27,헤이든 성수점,술집,True,True,https://www.instagram.com/heyden_sojubar,...,True,False,False,False,False,[],True,서울 성동구 왕십리로10길 24 지1층,0507-1321-6825,"{'latitude': 37.5463605, 'longitude': 127.047026}"
9,356,1580911013,74.706773,47.706773,27,각식당,한식,True,True,https://www.instagram.com/gak_sikdang,...,False,False,False,False,False,['https://search.pstatic.net/common/?autoRotat...,False,서울 동대문구 장한로6길 32 1층,0507-1442-1395,"{'latitude': 37.5629142, 'longitude': 127.0678..."
