In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from copy import deepcopy

In [2]:
import os
# Data directory: the "ProcessedData" folder that sits next to the current
# working directory (i.e. inside the parent of the cwd).
path = os.path.join(os.path.dirname(os.getcwd()), "ProcessedData")

In [3]:
# Every input below is a CSV read from the `path` directory; the files are
# read in the same order as the names appear.

# Main train / test tables.
train, test = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("train_processed.csv", "test_processed.csv")
)
# Building tables (reduced to two columns in the next cell).
apt_train, apt_test = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("building_tr.csv", "building_tst.csv")
)
# Address tables; later cells take 위도 / 경도 / 도로명주소 from them.
adr_train, adr_test = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("train_apt.csv", "test_apt.csv")
)
# Subway stations; later cells read 위도, 경도, subway_name and '환승역 수'.
subway = pd.read_csv(os.path.join(path, "subway.csv"))
# Tables joined on 도로명주소 near the end of the notebook.
pop_train, pop_test = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("pop_tr.csv", "pop_tst.csv")
)

In [4]:
# Keep only the columns at positions 4 and 9 of each building table and rename
# 건물명 to 단지명.
# NOTE(review): presumably positions 4 and 9 are 도로명주소 and 건물명 (the merge
# that follows joins on 도로명주소) - confirm against the CSV header.
# This cell overwrites its own inputs, so it must not be re-run without
# re-loading the building tables first.
apt_train, apt_test = (
    building.iloc[:, [4, 9]].rename(columns={'건물명': '단지명'})
    for building in (apt_train, apt_test)
)

In [5]:
# Left-join the building table and then the coordinate columns of the address
# table onto train, both keyed on 도로명주소.
train = (
    train
    .merge(apt_train, how='left', on='도로명주소')
    .merge(adr_train[['위도', '경도', '도로명주소']], how='left', on='도로명주소')
)

In [6]:
# Same two left joins as for train: building table first, then the coordinate
# columns of the address table, both keyed on 도로명주소.
test = (
    test
    .merge(apt_test, how='left', on='도로명주소')
    .merge(adr_test[['위도', '경도', '도로명주소']], how='left', on='도로명주소')
)

In [7]:
# Hand-entered coordinates, one record per complex: (단지명, 위도, 경도).
# The following cells copy these values into train / test by matching 단지명.
# NOTE(review): the numbers stored under 위도 (126-130) look like longitudes and
# the ones under 경도 (33-37) look like latitudes. Confirm that the address and
# subway CSVs use the same convention before relying on the distance feature.
df_tr = pd.DataFrame(
    [
        ('대전둔산3', 127.37073298347131, 36.36376899104686),
        ('제주아라주공아파트', 126.54686726803892, 33.47376381443057),
        ('울산송정2', 129.36353989694038, 35.59673220404177),
    ],
    columns=['단지명', '위도', '경도'],
)
df_tst = pd.DataFrame(
    [
        ('정읍첨단1 주거행복복지센터', 126.848614, 35.493656),
    ],
    columns=['단지명', '위도', '경도'],
)

In [8]:
# Overwrite the coordinates of every train row whose 단지명 appears in df_tr.
# For each distinct name, the first matching df_tr row supplies both values.
for name in df_tr['단지명'].unique():
    source_rows = df_tr[df_tr['단지명'] == name]
    target_rows = train['단지명'] == name
    train.loc[target_rows, "위도"] = source_rows["위도"].iloc[0]
    train.loc[target_rows, "경도"] = source_rows["경도"].iloc[0]

In [9]:
# Overwrite the coordinates of every test row whose 단지명 appears in df_tst.
# For each distinct name, the first matching df_tst row supplies both values.
for name in df_tst['단지명'].unique():
    source_rows = df_tst[df_tst['단지명'] == name]
    target_rows = test['단지명'] == name
    test.loc[target_rows, "위도"] = source_rows["위도"].iloc[0]
    test.loc[target_rows, "경도"] = source_rows["경도"].iloc[0]

In [10]:
def loc(x, y, stations=None):
    """Find the station closest to the point ``(x, y)``.

    Distance is the plain Euclidean distance between ``(x, y)`` and each
    station's ``(위도, 경도)`` pair, in raw coordinate units (no geodesic
    correction is applied).

    Parameters
    ----------
    x, y : float
        Coordinates of the query point, compared against the station
        columns ``위도`` and ``경도`` respectively.
    stations : pandas.DataFrame, optional
        Table with columns ``위도``, ``경도`` and ``subway_name``. Defaults to
        the notebook-level ``subway`` frame.

    Returns
    -------
    tuple
        ``(subway_name, distance)`` of the nearest station. On equal
        distances the station that comes first in the table wins. When no
        station is closer than 1000000 (empty table, missing coordinates)
        the result is ``('', 1000000)``.
    """
    if stations is None:
        stations = subway

    no_match = ('', 1000000)
    if len(stations) == 0:
        return no_match

    # Work on positional arrays so the result does not depend on the frame
    # having a default 0..n-1 index, and so all distances come from one
    # vectorised computation instead of a Python-level loop.
    sub_x = stations['위도'].to_numpy(dtype=float)
    sub_y = stations['경도'].to_numpy(dtype=float)
    distances = np.sqrt((sub_x - x) ** 2 + (sub_y - y) ** 2)

    # Stations (or query points) with missing coordinates yield NaN and can
    # never be selected.
    if np.isnan(distances).all():
        return no_match

    nearest = int(np.nanargmin(distances))  # first index among equal minima
    best = distances[nearest]
    if not best < no_match[1]:
        return no_match
    return stations['subway_name'].iloc[nearest], best

In [11]:
# Row-wise nearest-station lookup: each element of tmp / tmp1 is the
# (subway_name, distance) tuple returned by loc() for that row's coordinates.
tmp = train[['위도', '경도']].apply(lambda coords: loc(coords['위도'], coords['경도']), axis=1)
tmp1 = test[['위도', '경도']].apply(lambda coords: loc(coords['위도'], coords['경도']), axis=1)

In [12]:
# Split the (name, distance) tuples held in tmp into two train columns.
train['subway_name'] = list(map(lambda pair: pair[0], tmp))
train['subway_dist'] = list(map(lambda pair: pair[1], tmp))

In [13]:
# Split the (name, distance) tuples held in tmp1 into two test columns.
test['subway_name'] = list(map(lambda pair: pair[0], tmp1))
test['subway_dist'] = list(map(lambda pair: pair[1], tmp1))

In [14]:
# Attach the '환승역 수' column of the matched station to both splits
# (left join on subway_name).
train, test = (
    split.merge(subway[['subway_name', '환승역 수']], on='subway_name', how='left')
    for split in (train, test)
)

In [15]:
def impute(col_pred, tmp, x_col = "단지내주차면수",col = "연면적"):
    """Predict ``col`` for one complex from a linear fit of ``col`` on ``x_col``.

    A single-feature linear regression is trained on every row of ``tmp``
    whose ``col`` value is present, then evaluated on all rows whose
    ``단지코드`` equals ``col_pred``.

    Parameters
    ----------
    col_pred : value compared against the ``단지코드`` column
        Code of the complex to predict for.
    tmp : pandas.DataFrame
        Frame containing ``단지코드``, ``x_col`` and ``col``.
    x_col : str
        Name of the predictor column.
    col : str
        Name of the target column being imputed.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``tmp`` carrying the code ``col_pred``.
    """
    # Training set: rows where the target is known.
    observed = tmp.loc[tmp[col].notna(), [x_col, col]]
    features = observed[[x_col]].to_numpy().reshape(-1, 1)
    targets = observed[col].to_numpy()

    model = LinearRegression()
    model.fit(features, targets)

    # Predict for every row of the requested complex.
    wanted = tmp['단지코드'] == col_pred
    query = tmp.loc[wanted, x_col].to_numpy().reshape(-1, 1)
    return model.predict(query)

In [16]:
# Fill 연면적 in train: for every complex code that has at least one missing
# 연면적, replace the 연면적 of all rows with that code by the regression
# prediction from impute(). tmp is built once before the loop and is not
# reassigned inside it.
tmp = train[['단지코드', '단지내주차면수', '연면적']]
code_ls_na = tmp['단지코드'][tmp['연면적'].isna()].unique()
for col_pred in code_ls_na:
    same_code = train['단지코드'] == col_pred
    train.loc[same_code, "연면적"] = impute(col_pred, tmp)

In [17]:
# Fill 연면적 in test the same way as for train; the regression inside
# impute() is fitted on the rows of this test-derived tmp only.
tmp = test[['단지코드', '단지내주차면수', '연면적']]
code_ls_na = tmp['단지코드'][tmp['연면적'].isna()].unique()
for col_pred in code_ls_na:
    same_code = test['단지코드'] == col_pred
    test.loc[same_code, "연면적"] = impute(col_pred, tmp)

In [18]:
# Left-join the pop_* tables onto their split, keyed on 도로명주소.
train = train.merge(pop_train, how='left', on='도로명주소')
test = test.merge(pop_test, how='left', on='도로명주소')

In [19]:
# Hard-coded corrections for complex C2085 (applied to every row of that code).
rows_c2085 = train['단지코드'].isin(['C2085'])
train.loc[rows_c2085, '총세대수'] = 1339
train.loc[rows_c2085, '공가수'] = 9

In [20]:
# Hard-coded corrections for complex C1649 (applied to every row of that code).
rows_c1649 = train['단지코드'].isin(['C1649'])
train.loc[rows_c1649, '총세대수'] = 1047
train.loc[rows_c1649, '공가수'] = 31
train.loc[rows_c1649, '등록차량수'] = 1214
train.loc[rows_c1649, '버스정류장'] = 4

In [21]:
# Drop every row belonging to the listed complex codes, printing the shapes
# before and after. The trailing comments give the shapes recorded in this
# notebook's saved output.
print(train.shape, test.shape)  # recorded run: (2952, 34) (1022, 33)
train = train[~train['단지코드'].isin(['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988'])]
test = test[~test['단지코드'].isin(['C2335', 'C1327', 'C2675'])]
print(train.shape, test.shape)  # recorded run: (2896, 34) (1008, 33)

(2952, 34) (1022, 33)
(2896, 34) (1008, 33)


In [22]:
# Number of distinct complex codes left in each split.
print(train['단지코드'].nunique())
print(test['단지코드'].nunique())

414
147


In [23]:
# Total number of missing cells across all columns of each split.
print(train.isna().to_numpy().sum())
print(test.isna().to_numpy().sum())

0
0


In [24]:
import os
# Write the merged splits, without the index, into the "ProcessedData" folder
# located in the parent of the current working directory.
path = os.path.join(os.path.dirname(os.getcwd()), "ProcessedData")
train.to_csv(os.path.join(path, "merged_train.csv"), index=False)
test.to_csv(os.path.join(path, "merged_test.csv"), index=False)