In [1]:
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datetime
import re
import warnings
warnings.filterwarnings("ignore")

In [7]:
def call_df(table_name):
    """Load every row of a database table into a pandas DataFrame.

    Connection settings are read from ``config.json`` in the current working
    directory on each call (keys: USER, PASSWORD, HOST, PORT, DATABASE).

    Parameters
    ----------
    table_name : str
        Table to read. It is interpolated verbatim into the SQL text, so only
        pass trusted, hard-coded names (identifiers cannot be bound as query
        parameters).

    Returns
    -------
    pandas.DataFrame
        Result of ``SELECT * FROM <table_name>``.
    """
    with open('config.json', 'r') as f:
        config = json.load(f)

    conn = psycopg2.connect(user=config['USER'],
                            password=config['PASSWORD'],
                            host=config['HOST'],
                            port=config['PORT'],
                            database=config['DATABASE'])
    try:
        sql = f'SELECT * FROM {table_name}'
        return pd.read_sql_query(sql, conn)
    finally:
        # Always release the connection, including when the query raises.
        conn.close()

In [8]:
# Fetch the raw tables. district and price are kept as returned; the other
# three have their 'table_id' column removed.
district, price = (
    call_df(table)
    for table in ('crawling_db.district_table', 'crawling_db.price_table')
)
apartment, school, subway = (
    call_df(table).drop(columns='table_id')
    for table in ('crawling_db.apartment_table',
                  'crawling_db.school_table',
                  'crawling_db.subway_table')
)

In [28]:
# Left-join district, school and subway information onto every apartment row.
df = pd.merge(apartment, district, on='district_id', how='left')
df = pd.merge(df, school, on='apartment_id', how='left')
df = pd.merge(df, subway, on='apartment_id', how='left')

In [29]:
# Coerce the count-like columns to numeric dtypes (raises on unparseable values).
df['apartment_floor_min'] = pd.to_numeric(df['apartment_floor_min'])
df['apartment_floor_max'] = pd.to_numeric(df['apartment_floor_max'])
df['school_students'] = pd.to_numeric(df['school_students'])
# Keep only the part of the town string that precedes its first digit.
# A raw-string pattern avoids the invalid '\d' escape, and the vectorised
# .str accessor leaves missing values as NaN instead of raising.
df['school_addr_town'] = df['school_addr_town'].str.extract(r'^(\D*)', expand=False)

In [30]:
# Discard rows whose school name is the empty string and renumber the index.
# NOTE(review): rows left unmatched by the earlier left joins would hold NaN,
# not '', and are therefore kept by this filter - confirm that is intended.
has_school = df['school_name'].ne('')
df = df.loc[has_school].reset_index(drop=True).copy()

In [31]:
from sklearn.preprocessing import StandardScaler
def standard_scaler(data, var_name):
    """Standardise one column of ``data`` with a freshly fitted StandardScaler.

    The column is overwritten in place on ``data``, and the same frame is
    returned so calls can be written as ``df = standard_scaler(df, col)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to scale (modified in place).
    var_name : str
        Name of the column to standardise.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``var_name`` replaced by its scaled values.
    """
    # The scaler works on 2-D input, so present the column as (n_rows, 1).
    column_2d = data[var_name].values.reshape(-1, 1)
    data[var_name] = StandardScaler().fit_transform(column_2d)
    return data

In [32]:
# Numeric feature columns; each one is standardised independently below.
var_num = [
    'apartment_floor_min', 'apartment_floor_max', 'apartment_parking',
    'school_students', 'st_dist', 'st_volume',
]

for num_col in var_num:
    df = standard_scaler(df, num_col)

In [33]:
# Columns treated as categorical features.
var_cate = [
    'district_id',
    'apartment_addr_town', 'apartment_builder',
    'apartment_build_year', 'apartment_build_month',
    'school_name', 'school_addr_district', 'school_addr_town',
    'st_name',
]
# Categorical columns that go through label encoding: every categorical
# column except the id / year / month ones (order follows var_cate).
var_label = [
    name for name in var_cate
    if name not in ('district_id', 'apartment_build_year', 'apartment_build_month')
]

In [34]:
from sklearn.preprocessing import LabelEncoder
def label_encoding(data, var_name):
    """Replace one column of ``data`` with integer codes from a LabelEncoder.

    A new encoder is fitted on the column for every call. The column is
    overwritten in place and the same frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode (modified in place).
    var_name : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``var_name`` replaced by its encoded values.
    """
    codes = LabelEncoder().fit_transform(data[var_name])
    data[var_name] = codes
    return data

In [35]:
# Label-encode each column listed in var_label, one at a time.
for label_col in var_label:
    df = label_encoding(df, label_col)

In [37]:
# Feature matrix: categorical columns first, numeric columns after them.
# Take an explicit copy so that columns added to `train` later write to an
# independent frame rather than to a slice of `df` (SettingWithCopyWarning).
train = df[var_cate + var_num].copy()
print(len(var_cate))

9


In [38]:
# Preview the first five rows of the feature matrix (head() defaults to 5).
train.head()

Unnamed: 0,district_id,apartment_addr_town,apartment_builder,apartment_build_year,apartment_build_month,school_name,school_addr_district,school_addr_town,st_name,apartment_floor_min,apartment_floor_max,apartment_parking,school_students,st_dist,st_volume
0,13,142,1571,2004,12,234,12,67,135,0.140933,1.863569,0.331306,0.578211,-1.047446,-0.477447
1,1,8,1798,2020,9,55,0,9,95,-0.443879,3.319099,0.875301,-0.788767,0.908914,-0.477447
2,1,8,1886,2003,3,26,0,9,32,0.140933,-0.319726,1.698646,0.11817,-0.819653,-0.477447
3,1,8,1418,1982,11,25,0,145,32,-0.833754,-1.047491,-0.918413,-1.737766,0.529064,-0.477447
4,1,8,1418,1982,11,345,0,9,13,-0.833754,-1.047491,-1.050736,-0.886033,-0.36205,-0.477447


In [39]:
# Optimal cluster

from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes

kproto = KPrototypes(n_clusters=4, verbose=0, random_state=0)
# Fit on the feature columns only. Selecting them explicitly keeps the cell
# re-runnable (a 'cluster' column from a previous run is never fed back in as
# a feature) and guarantees that the first len(var_cate) positions are the
# categorical columns that `categorical` refers to.
features = train[var_cate + var_num]
# assign() builds a new frame, so nothing is written into a slice of df.
train = train.assign(
    cluster=kproto.fit_predict(features, categorical=list(range(len(var_cate))))
)

In [41]:
# Save the clustered features to disk for now
train.to_csv(path_or_buf='clustered_data.csv')

In [44]:
# Attach the cluster label (aligned on the row index), keep only the
# identifying columns, then left-join the price history per apartment.
df = (
    df.assign(cluster=train['cluster'])
      [['apartment_id', 'apartment_name', 'cluster']]
      .merge(price, how='left', on='apartment_id')
)

In [46]:
# period => converted to datetime, which is convenient for plotting.
pd.plotting.register_matplotlib_converters()  # let matplotlib handle datetime values
# Parse the whole column in one vectorised call instead of one
# pd.to_datetime call per row; the parsed values are the same.
df['time_period'] = pd.to_datetime(df['period'], format='%Y.%m')

In [48]:
# Work on a separate frame containing only the fully populated rows
# (dropna already returns a new object, so `df` itself is left untouched).
temp = df.dropna()
temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456582 entries, 0 to 458142
Data columns (total 11 columns):
apartment_id       456582 non-null int64
apartment_name     456582 non-null object
cluster            456582 non-null uint16
price_id           456582 non-null float64
area               456582 non-null object
period             456582 non-null object
year               456582 non-null float64
month              456582 non-null float64
amount             456582 non-null float64
amount_original    456582 non-null object
time_period        456582 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(1), object(4), uint16(1)
memory usage: 39.2+ MB


In [49]:
# Keep only the data that is needed
unused_cols = ['apartment_id', 'price_id', 'period', 'year', 'month', 'amount_original']
temp = temp.drop(columns=unused_cols)

In [50]:
# area => keep the leading run of digits, i.e. everything before the first
# non-digit character, as an integer.
# The raw-string pattern avoids the invalid '\D' escape, and the vectorised
# .str accessor replaces the per-row lambda. A value with no leading digits
# still fails loudly at the integer cast.
temp['area'] = temp['area'].str.extract(r'^(\d+)', expand=False).astype('int64')

In [53]:
# Ask the user for the name of the apartment they are interested in.
# The value is compared against `apartment_name` with exact equality later on.
your_apart = input('아파트 이름을 입력해주세요: ')

아파트 이름을 입력해주세요: 상수두산위브


In [54]:
# Ask the user for the desired floor area (the prompt asks for m2).
# int() raises ValueError if the entry is not a whole number.
your_area = int(input('면적을 입력해 주세요(m2 단위): '))

면적을 입력해 주세요(m2 단위): 104


In [90]:
# Find the cluster of the requested apartment / area, then collect the price
# rows of every apartment in that cluster into a new variable.
matches = temp.loc[
    (temp['apartment_name'] == your_apart) & (temp['area'] == your_area),
    'cluster',
]
if matches.empty:
    raise ValueError(
        f'No price records found for apartment {your_apart!r} with area {your_area}'
    )
# Take the first matching row by position. Indexing with [1] looked up the
# index *label* 1, which only exists for whichever apartment owns that row.
group_name = matches.iloc[0]
temp_for_var = temp[temp['cluster'] == group_name]

In [91]:
# Time-series analysis comes next, so everything except the timestamp and the
# price is discarded. Selecting the two columns directly (rather than dropping
# the others first) keeps the cell re-runnable: a second run no longer fails
# on columns that have already been removed.
temp_for_var = temp_for_var[['time_period', 'amount']]
# Displayed for inspection only - the sorted copy is not stored.
temp_for_var.sort_values(by='time_period', ascending=True)

Unnamed: 0,time_period,amount
26549,2006-01-01,7.450000e+08
325144,2006-01-01,3.800000e+08
16090,2006-01-01,1.150000e+09
36461,2006-01-01,4.480000e+08
305097,2006-01-01,2.640000e+08
...,...,...
256081,2020-04-01,7.400000e+08
333676,2020-04-01,2.300000e+08
333757,2020-04-01,4.300000e+08
142929,2020-04-01,7.200000e+08


In [92]:
# Group the prices by period. Grouping by column label (not by the Series
# object) means 'time_period' is always treated as the key and excluded from
# the aggregated columns.
grouped = temp_for_var.groupby('time_period')

In [187]:
# Per-period minimum, median and maximum, each as a one-column frame.
a, b, c = grouped.min(), grouped.median(), grouped.max()
a.columns = ['real_min']
b.columns = ['real_median']
c.columns = ['real_max']

In [188]:
# Combine the three aggregates side by side, aligned on the period index
# (left joins, so the periods of `a` define the rows).
var_data = a.join(b, how='left').join(c, how='left')

In [189]:
# Hold out the last four periods for evaluation; everything before is training data.
n_rows = len(var_data)
train = var_data.iloc[:n_rows - 4]
test = var_data.iloc[n_rows - 4:n_rows]

In [190]:
from statsmodels.tsa.vector_ar.var_model import VAR

# Fit a vector autoregression on the three training series, using the
# default arguments of fit().
model = VAR(train)
model_fit = model.fit()

In [191]:
# Forecast one step per held-out period. forecast() is seeded with the last
# `k_ar` training observations; `train.values` is used in place of
# `model_fit.y`, which newer statsmodels releases treat as a deprecated alias.
yhat = model_fit.forecast(train.values[-model_fit.k_ar:], steps=len(test))
# Label the forecasts and align them with the held-out periods.
var = pd.DataFrame(yhat,
                   index=test.index,
                   columns=['test_min', 'test_median', 'test_max'])
var

Unnamed: 0_level_0,test_min,test_median,test_max
time_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01,115097000.0,742295500.0,4901008000.0
2020-02-01,113779100.0,748312300.0,5027668000.0
2020-03-01,113143800.0,754205100.0,5071377000.0
2020-04-01,113104200.0,760111600.0,5100613000.0


In [192]:
# The result is a little disappointing
# Actual vs. forecast values side by side for the held-out periods.
result = pd.concat((test, var), axis='columns')
result

Unnamed: 0_level_0,real_min,real_median,real_max,test_min,test_median,test_max
time_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01,101000000.0,670000000.0,5450000000.0,115097000.0,742295500.0,4901008000.0
2020-02-01,90000000.0,700000000.0,5700000000.0,113779100.0,748312300.0,5027668000.0
2020-03-01,95000000.0,690000000.0,4800000000.0,113143800.0,754205100.0,5071377000.0
2020-04-01,100000000.0,551500000.0,5050000000.0,113104200.0,760111600.0,5100613000.0
