# 라이브러리

In [28]:
# 데이터 불러오기
import psycopg2
import json

# 데이터 전처리
import pandas as pd
import numpy as np
import re
import datetime

# 데이터 시각화
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import font_manager, rc
import platform

if platform.system() == 'Windows':
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
else:    
    rc('font', family='AppleGothic')

# 기타
import warnings
warnings.filterwarnings("ignore")

# 데이터 불러오기

In [199]:
#Cluster된 데이터 불러오기
import pickle
clustered = pd.read_pickle("./data/clustered_apartment.pkl")
clustered.head(5)

Unnamed: 0,apartment_id,cluster
0,1,10
1,3,5
2,4,10
3,5,3
4,6,10


In [200]:
def call_df(table_name):
    with open('config.json', 'r') as f:
        config = json.load(f)
        
    conn = psycopg2.connect(user = config['USER'],
                              password = config['PASSWORD'],
                              host = config['HOST'],
                              port = config['PORT'],
                              database = config['DATABASE'])
    
    sql = f'SELECT * FROM {table_name}'
    df = pd.read_sql_query(sql, conn)
    conn.close()
    return df

In [201]:
price = call_df('crawling_db.price_table')
price['area'] = price['area'].apply(lambda x: int(re.split('\D',x)[0]))
apartment = call_df('crawling_db.apartment_table')
apartment = apartment[['apartment_id','apartment_name']]

In [202]:
df = (apartment.
      merge(clustered, how='left', on='apartment_id').
      merge(price, how='left', on='apartment_id'))

In [203]:
df['period'] = pd.to_datetime(df['period'], format='%Y.%m')

In [204]:
df = df[['period','apartment_name','area','amount','cluster']]
df.head(5)

Unnamed: 0,period,apartment_name,area,amount,cluster
0,2020-03-01,상수두산위브,104.0,1200000000.0,15
1,2020-02-01,상수두산위브,104.0,1200000000.0,15
2,2019-12-01,상수두산위브,104.0,1035000000.0,15
3,2019-11-01,상수두산위브,104.0,1025000000.0,15
4,2019-10-01,상수두산위브,104.0,960000000.0,15


# 데이터 전처리

In [205]:
#사용자 옵션
user_option = {
    'apartment_name':'당산반도유보라팰리스', # 검색하고 싶은 아파트의 이름
    'apartment_area':108, # 그 아파트의 면적
}

In [217]:
#날짜만 있는 빈 dataframe
empty = df['period'].sort_values().reset_index()
empty = empty.dropna(axis=0).drop_duplicates(['period'])

In [218]:
#사용자 옵션에 해당 하는 아파트와 면적을 지닌 dataframe
chosen = df[(df['apartment_name'] == user_option['apartment_name']) &
            (df['area'] <= user_option['apartment_area']+3) &
            (df['area'] >= user_option['apartment_area']-3)]

In [274]:
#빈 dataframe에 해당 옵션에 해당하는 dataframe을 합치면 값이 없는 곳에 na로 채워지게 된다.
temp = empty.merge(chosen,how='left',on='period')
temp.groupby(temp['period']).mean()
temp = temp.reset_index()
temp = temp[['period','amount']]

In [275]:
#처음 거래된 날짜로 부터 시작을 해본다
starting_point = temp[temp['amount'].notnull()].index[0]
temp = temp[starting_point:].reset_index()
temp = temp[['period','amount']]
temp

Unnamed: 0,period,amount
0,2010-03-01,7.614300e+08
1,2010-04-01,
2,2010-05-01,
3,2010-06-01,
4,2010-07-01,
...,...,...
122,2019-12-01,1.145000e+09
123,2020-01-01,1.185000e+09
124,2020-02-01,
125,2020-03-01,
