In [17]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
pd.options.display.max_rows = 100
pd.options.display.max_columns = 40
import numpy as np
import os,random, math
from tqdm import tqdm
from copy import deepcopy
from collections import Counter

# Visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager, rc
import platform

# Choose a font that can render Korean labels, depending on the host OS.
# Previously 'NanumGothic' was set and then unconditionally overwritten, and
# every non-Windows system (including Linux) was forced to the macOS-only
# 'AppleGothic'. The final font on Windows and macOS is unchanged.
system_name = platform.system()
if system_name == 'Windows':
    # Resolve the family name from the font file shipped with Windows.
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
elif system_name == 'Darwin':
    rc('font', family='AppleGothic')
else:
    # Fall back to the family the notebook originally named as its default.
    rc('font', family='NanumGothic')

# Use the ASCII hyphen for negative ticks instead of the Unicode minus sign.
matplotlib.rcParams['axes.unicode_minus'] = False

# from dataprep.eda import plot, plot_correlation, plot_missing

import plotly 
import plotly.express as px
# from plotly import tools, subplots
# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# from plotly.offline import init_notebook_mode, iplot
# import plotly.graph_objs as go
# import plotly.express as px

# Warnings
import warnings
# Suppress all warnings for the rest of the notebook. The earlier
# filterwarnings('always') call was removed: the 'ignore' filter is inserted
# in front of it and matches every warning, so it never had any effect.
warnings.filterwarnings('ignore')
# Show every row without truncation (disabled)
# pd.set_option('display.max_rows', None)
# Show every column without truncation
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_absolute_error

In [3]:
# Raw copies of the merged CSVs, kept separate from the working `train`/`test`
# frames that the following cells modify.
# NOTE(review): absolute, user-specific location; change DATA_DIR to run elsewhere.
DATA_DIR = "/Users/seungji/Desktop/Dacon/Daesamanlap/ProcessedData"
rawtest = pd.read_csv(f"{DATA_DIR}/merged_test.csv")
rawtrain = pd.read_csv(f"{DATA_DIR}/merged_train.csv")

In [4]:
# Working frames that the preprocessing cells below modify.
# They start as independent copies of the raw frames loaded in the previous
# cell, which avoids parsing the same two CSV files a second time and
# repeating the hard-coded paths.
test = rawtest.copy()
train = rawtrain.copy()

In [5]:
# Complex codes (단지코드) that have to be removed from the merged data.
# Shapes are printed before and after so the effect of the filter is visible.
print(train.shape, test.shape)
train = train[~train['단지코드'].isin(['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988'])]
test = test[~test['단지코드'].isin(['C2335', 'C1327', 'C2675'])]
print(train.shape, test.shape)

(2896, 34) (1008, 33)
(2896, 34) (1008, 33)


In [6]:
# Hand-made corrections: overwrite selected columns for fixed blocks of rows.
# Rows are addressed by index *label*, so this cell depends on `train` still
# carrying the index it had when the CSV was loaded.
# `.loc` is used instead of `.at`: `.at` is the scalar accessor and only
# accepted a range of labels through an internal fallback to `.loc`.
# NOTE(review): presumably each row group belongs to a single complex whose
# complex-level values were inconsistent -- confirm against the raw data.
manual_fixes = [
    (
        list(range(1481, 1487)) + list(range(1624, 1632)),
        {"총세대수": 969, "공가수": 9},
    ),
    (
        list(range(1753, 1757)) + list(range(1810, 1812)),
        {"총세대수": 1047, "공가수": 31, "등록차량수": 1214, "버스정류장": 4},
    ),
]
for fix_rows, fix_values in manual_fixes:
    for fix_column, fix_value in fix_values.items():
        train.loc[fix_rows, fix_column] = fix_value

In [7]:
# Remove fully duplicated rows, keeping the first occurrence of each
# (keep='first' is the pandas default).
train = train.drop_duplicates()
test = test.drop_duplicates()

In [8]:
print("\n...Before preprocessing")
print(train.shape, test.shape)


def shop_ratio_by_complex(df):
    """Count rows per 임대건물구분 value for each 단지코드 and add the shop share.

    Returns a frame indexed by 단지코드 with the dummy-count columns and a
    '상가비율' column = 상가 count / (상가 count + 아파트 count).
    A split that contains no 상가 (or no 아파트) rows gets a zero count for the
    missing category instead of raising KeyError.
    """
    counts = (
        pd.concat([df[['단지코드']], pd.get_dummies(df[['임대건물구분']])], axis=1)
        .groupby('단지코드')
        .sum()
    )
    for dummy_col in ('임대건물구분_상가', '임대건물구분_아파트'):
        if dummy_col not in counts.columns:
            counts[dummy_col] = 0
    total = counts['임대건물구분_상가'] + counts['임대건물구분_아파트']
    counts['상가비율'] = counts['임대건물구분_상가'] / total
    return counts


def add_derived_features(df, shop_ratio, level=85):
    """Return `df` merged with the shop ratio plus three derived columns.

    Added columns, in order: 상가비율, 세대당_가능주차면수, 공가수비율, 대형전용면적.
    `shop_ratio` is the frame produced by `shop_ratio_by_complex`; `level` is
    the 전용면적 threshold above which 대형전용면적 is 1.
    """
    # Inner merge on the complex code, as before; the result has a fresh index.
    out = pd.merge(df, shop_ratio[['상가비율']].reset_index(), on='단지코드')
    # Parking spaces available per household.
    out['세대당_가능주차면수'] = out['단지내주차면수'] / out['총세대수']
    # Vacancy ratio.
    out['공가수비율'] = out['공가수'] / out['총세대수']
    # Large-unit flag; rows with missing 전용면적 compare False and stay 0.
    out['대형전용면적'] = (out['전용면적'] > level).astype('int64')
    return out


# Threshold for the large-unit flag.
level = 85

# Add the shop-ratio column and the other derived columns to both splits.
train_cls = shop_ratio_by_complex(train)
train = add_derived_features(train, train_cls, level)

test_cls = shop_ratio_by_complex(test)
test = add_derived_features(test, test_cls, level)

# check
print("\n...After preprocessing")
print(train.shape, test.shape)


...Before preprocessing
(2577, 34) (936, 33)

...After preprocessing
(2577, 38) (936, 37)


In [9]:
# Grouping of 자격유형 letter codes into numbered categories: key = category
# number (as a string), value = list of letter codes in that category.
# Every value is a list (previously '1' and '7' were bare strings), so a
# membership test is always an exact match and never a substring test, and
# a missing value (NaN) no longer raises TypeError.
dic = {'1': ['A'], '2': ['C', 'F', 'G'], '3': ['B', 'H', 'I'], '4': ['J'],
       '5': ['L', "M", "N", "O"], '6': ["E", "K"], '7': ['D']}

In [10]:
def mapping_by_key(dic, x):
    """Return the integer form of the first key in `dic` whose value contains `x`.

    Parameters
    ----------
    dic : dict
        Maps a key convertible with ``int()`` to either a single label (str)
        or a collection of labels.
    x : object
        The label to look up.

    Returns
    -------
    int or None
        ``int(key)`` of the first matching group, or ``None`` when no group
        contains `x`.

    A bare string value is treated as one label rather than as a sequence of
    characters, so an empty string no longer matches and a non-string `x`
    (e.g. NaN) yields ``None`` instead of raising TypeError.
    """
    for key, members in dic.items():
        if isinstance(members, str):
            members = [members]
        if x in members:
            return int(key)
    return None

In [11]:
# Translate each 자격유형 code into its numbered category via `dic`, store it
# as an object-dtype column, then discard the raw 자격유형 column.
train['자격유형_카테고리'] = (
    train['자격유형'].apply(lambda code: mapping_by_key(dic, code)).astype(object)
)
test['자격유형_카테고리'] = (
    test['자격유형'].apply(lambda code: mapping_by_key(dic, code)).astype(object)
)
train = train.drop("자격유형", axis=1)
test = test.drop("자격유형", axis=1)

In [12]:
# Display (rows, columns) of the training frame after the category mapping.
train.shape

(2577, 38)

## 공급유형

In [13]:
# Collapse the 5-year and 10-year public-rental supply types into a single
# '공공임대(단기)' label in both splits.
train['공급유형'] = train['공급유형'].mask(
    train['공급유형'].isin(['공공임대(5년)', '공공임대(10년)']), '공공임대(단기)'
)
test['공급유형'] = test['공급유형'].mask(
    test['공급유형'].isin(['공공임대(5년)', '공공임대(10년)']), '공공임대(단기)'
)

In [15]:
# Print the shape of each split on its own line.
print(train.shape, test.shape, sep="\n")


(2577, 38)
(936, 37)


In [16]:
# Display the preprocessed training frame.
train

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철역,버스정류장,단지내주차면수,등록차량수,단지명,도로명주소,연면적,위도,경도,subway_name,subway_dist,환승역 수,총인구수,세대당_인구,남/여비율,남/여_0~19세,남/여_20~39세,남/여_40~69세,남/여_70세이상,0~19세_비율,20~39세_비율,40~69세_비율,70세이상_비율,상가비율,세대당_가능주차면수,공가수비율,대형전용면적,자격유형_카테고리
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17.0,9216000.0,82940.0,0.0,3.0,624.0,205.0,미수휴먼시아,경상남도 통영시 미우지해안로 107,1213.87,128.3965,34.8242,다대포해수욕장역,0.6117,1,11129.0,2.3,0.97,1.7612,1.0366,0.984,0.5888,0.1956,0.185,0.5221,0.0972,0.0,1.145,0.0312,0,1
1,C2515,545,아파트,경상남도,국민임대,39.6,60,17.0,12672000.0,107130.0,0.0,3.0,624.0,205.0,미수휴먼시아,경상남도 통영시 미우지해안로 107,1213.87,128.3965,34.8242,다대포해수욕장역,0.6117,1,11129.0,2.3,0.97,1.7612,1.0366,0.984,0.5888,0.1956,0.185,0.5221,0.0972,0.0,1.145,0.0312,0,1
2,C2515,545,아파트,경상남도,국민임대,39.6,20,17.0,12672000.0,107130.0,0.0,3.0,624.0,205.0,미수휴먼시아,경상남도 통영시 미우지해안로 107,1213.87,128.3965,34.8242,다대포해수욕장역,0.6117,1,11129.0,2.3,0.97,1.7612,1.0366,0.984,0.5888,0.1956,0.185,0.5221,0.0972,0.0,1.145,0.0312,0,1
3,C2515,545,아파트,경상남도,국민임대,46.9,38,17.0,18433000.0,149760.0,0.0,3.0,624.0,205.0,미수휴먼시아,경상남도 통영시 미우지해안로 107,1213.87,128.3965,34.8242,다대포해수욕장역,0.6117,1,11129.0,2.3,0.97,1.7612,1.0366,0.984,0.5888,0.1956,0.185,0.5221,0.0972,0.0,1.145,0.0312,0,1
4,C2515,545,아파트,경상남도,국민임대,46.9,19,17.0,18433000.0,149760.0,0.0,3.0,624.0,205.0,미수휴먼시아,경상남도 통영시 미우지해안로 107,1213.87,128.3965,34.8242,다대포해수욕장역,0.6117,1,11129.0,2.3,0.97,1.7612,1.0366,0.984,0.5888,0.1956,0.185,0.5221,0.0972,0.0,1.145,0.0312,0,1
5,C2515,545,아파트,경상남도,국민임대,51.97,106,17.0,23042000.0,190090.0,0.0,3.0,624.0,205.0,미수휴먼시아,경상남도 통영시 미우지해안로 107,1213.87,128.3965,34.8242,다대포해수욕장역,0.6117,1,11129.0,2.3,0.97,1.7612,1.0366,0.984,0.5888,0.1956,0.185,0.5221,0.0972,0.0,1.145,0.0312,0,1
6,C2515,545,아파트,경상남도,국민임대,51.97,26,17.0,23042000.0,190090.0,0.0,3.0,624.0,205.0,미수휴먼시아,경상남도 통영시 미우지해안로 107,1213.87,128.3965,34.8242,다대포해수욕장역,0.6117,1,11129.0,2.3,0.97,1.7612,1.0366,0.984,0.5888,0.1956,0.185,0.5221,0.0972,0.0,1.145,0.0312,0,1
7,C1407,1216,아파트,대전광역시,국민임대,30.95,288,13.0,15620000.0,127350.0,1.0,1.0,1285.0,1064.0,도안 휴먼시아 4단지,대전광역시 유성구 상대로 40,4636.41,127.3374,36.3466,유성온천역,0.0082,1,45840.0,2.42,0.92,2.1671,0.9294,0.8716,0.6995,0.2288,0.2761,0.4425,0.0527,0.0,1.0567,0.0107,0,1
8,C1407,1216,아파트,대전광역시,국민임대,30.99,68,13.0,15620000.0,127350.0,1.0,1.0,1285.0,1064.0,도안 휴먼시아 4단지,대전광역시 유성구 상대로 40,4636.41,127.3374,36.3466,유성온천역,0.0082,1,45840.0,2.42,0.92,2.1671,0.9294,0.8716,0.6995,0.2288,0.2761,0.4425,0.0527,0.0,1.0567,0.0107,0,1
9,C1407,1216,아파트,대전광역시,국민임대,30.99,34,13.0,15620000.0,127350.0,1.0,1.0,1285.0,1064.0,도안 휴먼시아 4단지,대전광역시 유성구 상대로 40,4636.41,127.3374,36.3466,유성온천역,0.0082,1,45840.0,2.42,0.92,2.1671,0.9294,0.8716,0.6995,0.2288,0.2761,0.4425,0.0527,0.0,1.0567,0.0107,0,1


In [None]:
# Drop the listed columns and de-duplicate the remaining rows.
# ('자격유형_카테고리' was listed twice in the original drop list; once is enough.)
# NOTE(review): presumably this leaves one row per complex -- confirm.
data = train.drop(columns=['공급유형', '자격유형_카테고리', '전용면적', '임대건물구분',
                           '전용면적별세대수', '대형전용면적',
                           '임대보증금', '임대료']).drop_duplicates()

In [None]:
# Display (rows, columns) of the de-duplicated frame.
data.shape

## 1차원

### train

In [None]:
# Mean 전용면적별세대수 for every (단지코드, 공급유형) pair.
# The column is selected *before* aggregating: averaging the whole frame also
# tries to average the text columns, which raises TypeError in pandas >= 2.0
# and is wasted work in older versions.
sample = train.groupby(['단지코드', '공급유형'])['전용면적별세대수'].mean().reset_index()

## 은영이랑 달라지는 부분

### 전용면적, 전용면적별세대수 

In [None]:
# Attach per-complex means of the area / rent columns from the raw frames.
# Only the needed columns are aggregated; calling mean() on the whole frame
# also tries to average the text columns, which raises in pandas >= 2.0.
# NOTE(review): `tr` and `tst` are not defined in the cells visible here --
# confirm they exist before this cell runs on a fresh kernel.
mean_cols = ["전용면적", '전용면적별세대수', '임대료', "임대보증금"]
tr_gr = tr.merge(rawtrain.groupby("단지코드")[mean_cols].mean().reset_index(),
                 on="단지코드", how="left")
tst_gr = tst.merge(rawtest.groupby("단지코드")[mean_cols].mean().reset_index(),
                   on="단지코드", how="left")

### 전용면적, 전용면적별세대수 std

In [None]:
# Per-complex standard deviation of the area / rent columns.
# The values are looked up by 단지코드. The previous version assigned the
# group-by result (sorted by 단지코드, re-indexed 0..m-1) straight into
# tr_gr / tst_gr, which aligns on the positional index and attaches a std to
# the wrong complex whenever the row order or the set of complexes differs.
# Each std table is now computed once instead of four times, and only over
# the needed columns.
std_cols = ["전용면적", "전용면적별세대수", "임대보증금", "임대료"]
train_std = rawtrain.groupby("단지코드")[std_cols].std()
test_std = rawtest.groupby("단지코드")[std_cols].std()

for std_col in std_cols:
    # Train side: NaN (e.g. a complex with a single row) becomes 0, as before.
    tr_gr[std_col + "_std"] = tr_gr["단지코드"].map(train_std[std_col]).fillna(0)
    # Test side: NaN is left in place, matching the statements visible here.
    tst_gr[std_col + "_std"] = tst_gr["단지코드"].map(test_std[std_col])