# Data Importing

In [1]:
import psycopg2
import pandas as pd
import json

In [2]:
def call_df(table_name):
    """Load every row of a database table into a DataFrame.

    Connection settings are read from ``config.json`` in the current working
    directory, using the keys ``USER``, ``PASSWORD``, ``HOST``, ``PORT`` and
    ``DATABASE``.

    Parameters
    ----------
    table_name : str
        Table to read, e.g. ``'crawling_db.district_table'``. The value is
        interpolated into the SQL text without escaping, so it must come from
        trusted code and never from user input.

    Returns
    -------
    pandas.DataFrame
        Result of ``SELECT * FROM <table_name>``.
    """
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

    conn = psycopg2.connect(user=config['USER'],
                            password=config['PASSWORD'],
                            host=config['HOST'],
                            port=config['PORT'],
                            database=config['DATABASE'])
    try:
        return pd.read_sql_query(f'SELECT * FROM {table_name}', conn)
    finally:
        # Release the connection even when the query fails; otherwise every
        # failed call leaves a server-side session open.
        conn.close()

In [3]:
# district and price are used as loaded.
district, price = (
    call_df(f'crawling_db.{name}_table') for name in ('district', 'price')
)
# apartment and school have their 'table_id' column dropped right after loading.
apartment, school = (
    call_df(f'crawling_db.{name}_table').drop(columns='table_id')
    for name in ('apartment', 'school')
)

In [4]:
# Start from the price rows and left-join the other tables onto them, so
# every price record is kept even when a lookup has no match.
df = (
    price
    .merge(apartment, on='apartment_id', how='left')
    .merge(district, on='district_id', how='left')
    .merge(school, on='apartment_id', how='left')
)

In [9]:
# Preview the first five rows of the merged frame.
# NOTE(review): this cell's execution count (9) is later than the
# preprocessing cells below, and the saved output already contains
# `time_period`; re-run the notebook top to bottom to refresh it.
df.head(n=5)

Unnamed: 0,price_id,apartment_id,area,period,year,month,amount,amount_original,district_id,apartment_addr_town,...,apartment_floor_min,apartment_floor_max,apartment_parking,district_name,school_name,school_dist,school_addr_district,school_addr_town,school_students,time_period
0,1,8928,160,2020.02,2020,2,2060000000,"20억 6,000",1,개포동,...,20,22,2.36,강남구,서울대진초등학교,1,1,개포로109길,316,2020-02-01
1,2,8928,160,2020.01,2020,1,2030000000,"20억 3,000",1,개포동,...,20,22,2.36,강남구,서울대진초등학교,1,1,개포로109길,316,2020-01-01
2,3,8928,160,2019.11,2019,11,1950000000,"19억 5,000",1,개포동,...,20,22,2.36,강남구,서울대진초등학교,1,1,개포로109길,316,2019-11-01
3,4,8928,160,2019.1,2019,10,1939999999,"19억 4,000",1,개포동,...,20,22,2.36,강남구,서울대진초등학교,1,1,개포로109길,316,2019-10-01
4,5,8928,160,2019.07,2019,7,1739999999,"17억 4,000",1,개포동,...,20,22,2.36,강남구,서울대진초등학교,1,1,개포로109길,316,2019-07-01


# Data Preprocessing

In [6]:
# area => keep the leading run of digits (everything before the first
# non-digit character) and store it as an integer.
import re
# r'\D' is a raw string: a plain '\D' is an invalid escape sequence and
# triggers a warning on current Python versions.
# str(x) keeps the cell re-runnable once the column already holds integers.
df['area'] = df['area'].apply(lambda x: int(re.split(r'\D', str(x))[0]))

In [7]:
# period => convert to datetime, which is convenient for plotting.
pd.plotting.register_matplotlib_converters()  # register pandas datetime converters with matplotlib
# Parse the whole column in one vectorised call instead of calling
# pd.to_datetime once per row through .apply; the result is the same.
df['time_period'] = pd.to_datetime(df['period'], format='%Y.%m')