In [1]:
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager, rc
# Choose a Korean-capable font family according to the operating system.
# `rc('font', family=...)` overwrites `font.family`, so the family is decided
# once here instead of being set and then immediately replaced.
import platform

_system_name = platform.system()
if _system_name == 'Windows':
    # Resolve the family name from the Malgun Gothic font file shipped with Windows.
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
elif _system_name == 'Darwin':
    font_name = 'AppleGothic'
else:
    # Other platforms (e.g. Linux) have no AppleGothic; fall back to NanumGothic.
    # NOTE(review): assumes NanumGothic is installed on such machines — confirm.
    font_name = 'NanumGothic'
rc('font', family=font_name)

# Render the minus sign with an ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False

# Warnings
import warnings
# Silence every warning for the rest of the session.
# (A preceding 'always' filter would be shadowed by this one, because the most
# recently added filter is consulted first, so only the effective filter is kept.)
warnings.filterwarnings('ignore')

import os
# Directory holding the preprocessed CSV files.
# The location defaults to the original absolute path; set the
# DAESAMANLAP_DATA_DIR environment variable to point somewhere else without
# editing the notebook. (Joining an absolute path onto another path discards
# the left-hand side, so no cwd-relative prefix is computed here.)
path = os.path.abspath(
    os.environ.get(
        "DAESAMANLAP_DATA_DIR",
        "/Users/seungji/Desktop/Dacon/Daesamanlap/ProcessedData",
    )
)
train = pd.read_csv(os.path.join(path, "merged_train.csv"))
test = pd.read_csv(os.path.join(path, "merged_test.csv"))


In [2]:
def _add_store_ratio(frame):
    """Return `frame` with a per-complex '상가비율' column merged in.

    For every 단지코드 that has at least one '아파트' group, the ratio is
        (# of '상가' entries) / (# of '아파트' entries + # of '상가' entries)
    where entries are counted as non-null '총세대수' values. Complexes without
    any '상가' group get a ratio of 0; complexes without an '아파트' group are
    left as NaN by the final left merge.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns '단지코드', '임대건물구분' and '총세대수'.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh RangeIndex) with the extra '상가비율' column.
    """
    # Count explicitly on '총세대수' rather than relying on column position.
    counts = (
        frame.groupby(['단지코드', '임대건물구분'])['총세대수']
        .count()
        .reset_index(name='n')
    )
    apt_counts = (
        counts.loc[counts['임대건물구분'] == '아파트', ['단지코드', 'n']]
        .rename(columns={'n': '아파트수'})
    )
    store_counts = (
        counts.loc[counts['임대건물구분'] == '상가', ['단지코드', 'n']]
        .rename(columns={'n': '상가수'})
    )
    # Left merge from the apartment side; a missing shop count means zero shops.
    per_complex = pd.merge(apt_counts, store_counts, on='단지코드', how='left').fillna(0)
    per_complex['상가비율'] = per_complex['상가수'] / (
        per_complex['아파트수'] + per_complex['상가수']
    )
    return pd.merge(frame, per_complex[['단지코드', '상가비율']], on='단지코드', how='left')


train = _add_store_ratio(train)
test = _add_store_ratio(test)

In [3]:
# Keep apartment rows only. The index is rebuilt afterwards so that it is a
# gap-free 0..n-1 range; later code that addresses rows by integer label would
# otherwise hit labels that were filtered out.
train = train[train['임대건물구분'] == '아파트'].reset_index(drop=True)
test = test[test['임대건물구분'] == '아파트'].reset_index(drop=True)

In [5]:
# Regions whose '환승역 수' (number of transfer stations) is forced to 0.
# NOTE(review): presumably these are regions without transfer stations — confirm.
area = ['경상남도', '전라북도', '강원도', '광주광역시', '충청남도', '제주특별자치도', '울산광역시', '충청북도', '전라남도', '경상북도', '세종특별자치시']

# Boolean-mask assignment works on whatever index the frame currently has.
# Looping over range(n) with .loc[ind] looks rows up by *label*, which raises
# KeyError (or misses rows) once filtering has left gaps in the index.
for frame in (train, test):
    frame.loc[frame['지역'].isin(area), '환승역 수'] = 0

# Vacancy ratio = vacant units / total households, computed column-wise.
# Vectorised division avoids a Python-level call per row and yields inf/NaN
# for a zero denominator instead of aborting the cell.
train['공가비율'] = train['공가수'] / train['총세대수']
test['공가비율'] = test['공가수'] / test['총세대수']

# Scale each age-band ratio by the complex's total household count ('총세대수')
# to obtain a head-count style feature per band. The mapping is
# ratio column -> derived count column, applied to train first, then test.
age_ratio_to_count = {
    '0~19세_비율': '0~19 인구수',
    '20~39세_비율': '20~39 인구수',
    '40~69세_비율': '40~69 인구수',
    '70세이상_비율': '70세이상 인구수',
}
for frame in (train, test):
    for ratio_name, count_name in age_ratio_to_count.items():
        frame[count_name] = frame[ratio_name] * frame['총세대수']



In [6]:
def func(x):
    """Map an area value to the midpoint of its 10-wide bin.

    The value is truncated to an integer and assigned to one of the bins
    [10, 20), [20, 30), ..., [80, 90); the bin midpoint (15, 25, ..., 85) is
    returned. Values below 10 are clamped into the first bin and values of 90
    or more into the last bin, so the function always returns a number
    (never None), and no row is silently turned into a missing value.

    Parameters
    ----------
    x : int or float
        Area value; must be convertible with ``int()`` (NaN/inf still raise).

    Returns
    -------
    int
        One of 15, 25, 35, 45, 55, 65, 75, 85.
    """
    bin_start = (int(x) // 10) * 10
    # Clamp to the supported range of bin starts: 10 .. 80.
    bin_start = min(max(bin_start, 10), 80)
    return bin_start + 5

# Replace every '전용면적' value with the value returned by func(), for the
# training frame first and then the test frame.
for frame in (train, test):
    frame['전용면적'] = frame['전용면적'].apply(func)

In [7]:
# One-hot encode '공급유형' for both frames and make sure the test frame has
# every indicator column the training frame has.
train_dummies = pd.get_dummies(train['공급유형'])
test_dummies = pd.get_dummies(test['공급유형'])

# Columns the test frame must end up with: everything seen in train, plus the
# three supply types that were always added explicitly.
required_columns = list(train_dummies.columns)
for supply_type in ['공공분양', '공공임대(5년)', '장기전세']:
    if supply_type not in required_columns:
        required_columns.append(supply_type)

# Add only the indicators that are genuinely absent from test; an indicator
# that test already has is kept as-is instead of being overwritten with zeros.
for supply_type in required_columns:
    if supply_type not in test_dummies.columns:
        test_dummies[supply_type] = 0

train = train.join(train_dummies)
test = test.join(test_dummies)

In [8]:
# Columns dropped before de-duplicating rows down to complex-level attributes
# in the aggregation cells that follow.
not_unique = [
    '자격유형',
    '임대보증금',
    '임대료',
    '전용면적별세대수',
    '전용면적',
    '공급유형',
]

In [9]:
# Rebuild `train` at (단지코드, 전용면적, 공급유형) granularity.
#
# Steps:
#   1. sum '전용면적별세대수' per (complex, area bin, supply type);
#   2. attach the remaining (non-`not_unique`) attributes of the complex;
#   3. compute each row's share of the complex's units ('ratio') and use it to
#      apportion '등록차량수' (-> 'y1') and '단지내주차면수' (-> '단지내주차면수_new');
#   4. attach the group means of '임대료' and '임대보증금';
#   5. 'y2' = y1 / 단지내주차면수_new.
#
# Each aggregation selects its column *before* reducing, so non-numeric
# columns are never fed to sum()/mean().
group_keys = ['단지코드', '전용면적', '공급유형']

# NOTE(review): the one-hot 공급유형 columns joined earlier are not listed in
# `not_unique`, so a complex with several supply types can keep several rows
# here and fan out the merge below — confirm that this is intended.
tmp1 = train.drop(not_unique, axis=1).drop_duplicates()

tmp = train.groupby(group_keys)['전용면적별세대수'].sum().reset_index()
tmp = pd.merge(tmp, tmp1, on='단지코드', how='left')

# Per-complex total of the (merged) unit counts, used as the denominator of 'ratio'.
total = (
    tmp.groupby('단지코드')['전용면적별세대수']
    .sum()
    .reset_index()
    .rename(columns={'전용면적별세대수': 'total'})
)
tmp = pd.merge(tmp, total, on='단지코드', how='left')
tmp['ratio'] = tmp['전용면적별세대수'] / tmp['total']
tmp['y1'] = tmp['ratio'] * tmp['등록차량수']

# Mean rent and mean deposit per group, merged in this order.
for price_col in ['임대료', '임대보증금']:
    total = train.groupby(group_keys)[price_col].mean().reset_index()
    tmp = pd.merge(tmp, total, on=group_keys, how='left')

tmp['단지내주차면수_new'] = tmp['단지내주차면수'] * tmp['ratio']
tmp = tmp.drop(['ratio'], axis=1)

train = tmp
train['y2'] = train['y1'] / train['단지내주차면수_new']


In [10]:
# Rebuild `test` at (단지코드, 전용면적, 공급유형) granularity, mirroring the
# training aggregation but without the target-derived columns.
#
# Steps:
#   1. sum '전용면적별세대수' per (complex, area bin, supply type);
#   2. attach the remaining (non-`not_unique`) attributes of the complex;
#   3. compute each row's share of the complex's units ('ratio');
#   4. attach the group means of '임대료' and '임대보증금';
#   5. apportion '단지내주차면수' by 'ratio' (-> '단지내주차면수_new').
#
# Each aggregation selects its column *before* reducing, so non-numeric
# columns are never fed to sum()/mean().
group_keys = ['단지코드', '전용면적', '공급유형']

# NOTE(review): the one-hot 공급유형 columns joined earlier are not listed in
# `not_unique`, so a complex with several supply types can keep several rows
# here and fan out the merge below — confirm that this is intended.
tmp1 = test.drop(not_unique, axis=1).drop_duplicates()

tmp = test.groupby(group_keys)['전용면적별세대수'].sum().reset_index()
tmp = pd.merge(tmp, tmp1, on='단지코드', how='left')

# Per-complex total of the (merged) unit counts, used as the denominator of 'ratio'.
total = (
    tmp.groupby('단지코드')['전용면적별세대수']
    .sum()
    .reset_index()
    .rename(columns={'전용면적별세대수': 'total'})
)
tmp = pd.merge(tmp, total, on='단지코드', how='left')
tmp['ratio'] = tmp['전용면적별세대수'] / tmp['total']

# Mean rent and mean deposit per group, merged in this order.
for price_col in ['임대료', '임대보증금']:
    total = test.groupby(group_keys)[price_col].mean().reset_index()
    tmp = pd.merge(tmp, total, on=group_keys, how='left')

tmp['단지내주차면수_new'] = tmp['단지내주차면수'] * tmp['ratio']
tmp = tmp.drop(['ratio'], axis=1)

test = tmp

In [11]:
# Recompute the age-band count features, this time scaling each ratio by the
# row's '전용면적별세대수', then discard the ratio columns.
band_ratio_to_count = {
    '0~19세_비율': '0~19 인구수',
    '20~39세_비율': '20~39 인구수',
    '40~69세_비율': '40~69 인구수',
    '70세이상_비율': '70세이상 인구수',
}
for frame in (train, test):
    for ratio_name, count_name in band_ratio_to_count.items():
        frame[count_name] = frame[ratio_name] * frame['전용면적별세대수']

# The raw ratio columns are no longer needed once the counts exist.
col = ['0~19세_비율', '20~39세_비율', '40~69세_비율', '70세이상_비율']
train = train.drop(columns=col)
test = test.drop(columns=col)

In [13]:
## 1. EDA and preprocessing (baseline: MSE 98)
# NOTE(review): `tr_up` is not defined in any cell visible in this notebook, and
# the recorded output of this cell is "NameError: name 'tr_up' is not defined".
# Presumably it is a training frame prepared elsewhere — confirm its origin or
# remove this cell so the notebook survives Restart & Run All.
# NOTE(review): `test` already received '상가비율' and '공가비율' in earlier cells;
# re-running these steps on it looks redundant — verify before keeping them.

### 1) Shop ratio (상가비율)
# Per 단지코드: count of '상가' entries divided by the count of '아파트' plus '상가'
# entries; complexes without a '상가' group get 0 via fillna(0).

tmp = tr_up.groupby(['단지코드','임대건물구분']).count().iloc[:,:1].reset_index()
store = tmp[tmp['임대건물구분']=='상가'].reset_index(drop=True)[['단지코드','총세대수']].rename(columns={'총세대수':'상가수'})
apt = tmp[tmp['임대건물구분']=='아파트'].reset_index(drop=True)[['단지코드','총세대수']].rename(columns={'총세대수':'아파트수'})
total = pd.merge(apt,store,on='단지코드',how='left').fillna(0)
total['상가비율'] = total.apply(lambda x : x['상가수']/(x['아파트수']+x['상가수']),axis=1)
tr_up = pd.merge(tr_up,total[['단지코드','상가비율']],on='단지코드',how='left')

# Same computation for the test frame.
tmp = test.groupby(['단지코드','임대건물구분']).count().iloc[:,:1].reset_index()
store = tmp[tmp['임대건물구분']=='상가'].reset_index(drop=True)[['단지코드','총세대수']].rename(columns={'총세대수':'상가수'})
apt = tmp[tmp['임대건물구분']=='아파트'].reset_index(drop=True)[['단지코드','총세대수']].rename(columns={'총세대수':'아파트수'})
total = pd.merge(apt,store,on='단지코드',how='left').fillna(0)
total['상가비율'] = total.apply(lambda x : x['상가수']/(x['아파트수']+x['상가수']),axis=1)
test = pd.merge(test,total[['단지코드','상가비율']],on='단지코드',how='left')

### 2) Parking spaces per household (세대당_가능주차면수)
# 단지내주차면수 divided by 총세대수, row by row.

tr_up['세대당_가능주차면수'] = tr_up.apply(lambda x : x['단지내주차면수']/x['총세대수'],axis=1)
test['세대당_가능주차면수'] = test.apply(lambda x : x['단지내주차면수']/x['총세대수'],axis=1)

### 3) Vacancy ratio (공가비율)
# 공가수 divided by 총세대수, row by row.

tr_up['공가비율'] = tr_up.apply(lambda x : x['공가수']/x['총세대수'],axis=1)
test['공가비율'] = test.apply(lambda x : x['공가수']/x['총세대수'],axis=1)


NameError: name 'tr_up' is not defined