In [1]:
#Package
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
plt.rcParams['font.family'] = 'NanumGothic'
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
train=pd.read_csv("merged_train.csv")
test=pd.read_csv("merged_test.csv")

In [3]:
# merged_ data에서 drop해야 할 단지코드
print(train.shape, test.shape)
train = train.loc[~train.단지코드.isin(['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']),]
test = test.loc[~test.단지코드.isin(['C2335', 'C1327', 'C2675'])]
print(train.shape, test.shape)

(2952, 34) (1022, 33)
(2896, 34) (1008, 33)


In [4]:
print(len(set(train.단지코드)), len(set(test.단지코드)))

414 147


In [5]:
train=train.drop_duplicates(keep='first')
test=test.drop_duplicates(keep='first')

In [6]:
print(train.shape, test.shape)

(2577, 34) (936, 33)


In [7]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '지하철역', '버스정류장', '단지내주차면수', '등록차량수', '단지명',
       '도로명주소', '연면적', '위도', '경도', 'subway_name', 'subway_dist', '환승역 수',
       '총인구수', '세대당_인구', '남/여비율', '남/여_0~19세', '남/여_20~39세', '남/여_40~69세',
       '남/여_70세이상', '0~19세_비율', '20~39세_비율', '40~69세_비율', '70세이상_비율'],
      dtype='object')

## 상가비율, 대형전용면적, 총세대수:주차면수, 공가수

In [8]:
print("\n...Before preprocessing")
print(train.shape, test.shape)

# 상가비율 column 추가
train_cls = pd.concat([train[['단지코드']], pd.get_dummies(train[['임대건물구분']])], axis=1).groupby('단지코드').sum()
tot = train_cls['임대건물구분_상가']+train_cls['임대건물구분_아파트']
train_cls['상가비율'] = train_cls['임대건물구분_상가']/tot
train = pd.merge(train,train_cls[['상가비율']].reset_index(),on='단지코드')

test_cls = pd.concat([test[['단지코드']], pd.get_dummies(test[['임대건물구분']])], axis=1).groupby('단지코드').sum()
tot = test_cls['임대건물구분_상가']+test_cls['임대건물구분_아파트']
test_cls['상가비율'] = test_cls['임대건물구분_상가']/tot
test = pd.merge(test,test_cls[['상가비율']].reset_index(),on='단지코드')

# 분양세대수 column 추가
train_sale = pd.DataFrame(train.groupby(['단지코드']).first()['총세대수'] - train.groupby(['단지코드']).sum()['전용면적별세대수'])
train_sale.columns = ["분양세대수"]
train_sale = train_sale.astype('int16')
train_sale.reset_index(inplace=True)
train = pd.merge(train, train_sale,on="단지코드")

test_sale = pd.DataFrame(test.groupby(['단지코드']).first()['총세대수'] - test.groupby(['단지코드']).sum()['전용면적별세대수'])
test_sale.columns = ["분양세대수"]
test_sale = test_sale.astype('int16')
test_sale.reset_index(inplace=True)
test = pd.merge(test, test_sale,on="단지코드")

# 세대당_가능주차면수 column 추기
train['세대당_가능주차면수'] = train['단지내주차면수']/train['총세대수']
test['세대당_가능주차면수'] = test['단지내주차면수']/test['총세대수']

# 공가수비율 column 추가
train['공가수비율']= train.공가수/train.총세대수 
test['공가수비율']= test.공가수/test.총세대수 

# 대형전용면적 column 추가
level = 85
train['대형전용면적'] = 0
train.loc[train.전용면적>level, '대형전용면적'] = 1
train.loc[train.전용면적<=level, '대형전용면적'] = 0

test['대형전용면적'] = 0
test.loc[test.전용면적>level, '대형전용면적'] = 1
test.loc[test.전용면적<=level, '대형전용면적'] = 0

# check
print("\n...After preprocessing")
print(train.shape, test.shape)


...Before preprocessing
(2577, 34) (936, 33)

...After preprocessing
(2577, 39) (936, 38)


In [9]:
dic = {'1': 'A' ,'2':['C','F','G'],'3':['B','H','I'],'4':['J'],'5':['L',"M","N","O"],'6':["E","K"],'7':'D'}

In [10]:
def mapping_by_key(dic, x):
    for i in dic.keys():
        if x in dic[i]:
            return int(i)

In [11]:
train['자격유형_카테고리'] = train['자격유형'].apply(lambda x : mapping_by_key(dic, x))
test['자격유형_카테고리'] = test['자격유형'].apply(lambda x : mapping_by_key(dic, x))
train['자격유형_카테고리'] = train['자격유형_카테고리'].astype(object)
test['자격유형_카테고리'] = test['자격유형_카테고리'].astype(object)

- 공공임대(5년), 공공임대(10년)만 묶으면 될 듯
- 그 외 장기전세 등도 확인해봤는데 너무 분포가 다름. 

In [13]:
train.loc[train.공급유형.isin(['공공임대(5년)', '공공임대(10년)']), '공급유형'] = '공공임대(단기)'
test.loc[test.공급유형.isin(['공공임대(5년)', '공공임대(10년)']), '공급유형'] = '공공임대(단기)'

In [14]:
train["지역_1"]=train.지역
test["지역_1"]=test.지역

In [15]:
train.loc[train.지역.isin(['경상북도', '광주광역시', '대구광역시', '세종특별자치시']), '지역_1'] = '지역1'
train.loc[train.지역.isin(['강원도', '경기도', '서울특별시','전라남도']), '지역_1'] = '지역2'
train.loc[train.지역.isin(['울산광역시', '전라북도', '제주특별자치도','충청남도',"충청북도"]), '지역_1'] = '지역3'
train.loc[train.지역.isin(['경상남도', '대전광역시', '부산광역시']), '지역_1'] = '지역4'

test.loc[test.지역.isin(['경상북도', '광주광역시', '대구광역시', '세종특별자치시']), '지역_1'] = '지역1'
test.loc[test.지역.isin(['강원도', '경기도', '서울특별시','전라남도']), '지역_1'] = '지역2'
test.loc[test.지역.isin(['울산광역시', '전라북도', '제주특별자치도','충청남도',"충청북도"]), '지역_1'] = '지역3'
test.loc[test.지역.isin(['경상남도', '대전광역시', '부산광역시']), '지역_1'] = '지역4'

In [16]:
train["지역_2"]=train.지역
test["지역_2"]=test.지역

In [17]:
train.loc[train.지역.isin(['강원도', '대전광역시', '부산광역시','울산광역시', '제주특별자치도','충청남도']), '지역_2'] = '지역1'
train.loc[train.지역.isin(['경상남도', '경상북도', '전라남도','전라북도','충청북도']), '지역_2'] = '지역2'
train.loc[train.지역.isin(['경기도', '광주광역시', '대구광역시','서울특별시',"세종특별자치시"]), '지역_2'] = '지역3'

test.loc[test.지역.isin(['강원도', '대전광역시', '부산광역시','울산광역시', '제주특별자치도','충청남도']), '지역_2'] = '지역1'
test.loc[test.지역.isin(['경상남도', '경상북도', '전라남도','전라북도','충청북도']), '지역_2'] = '지역2'
test.loc[test.지역.isin(['경기도', '광주광역시', '대구광역시','서울특별시',"세종특별자치시"]), '지역_2'] = '지역3'

In [18]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
robustScaler = RobustScaler()
standardScaler = StandardScaler()
minMaxScaler = MinMaxScaler()

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 

In [20]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '지하철역', '버스정류장', '단지내주차면수', '등록차량수', '단지명',
       '도로명주소', '연면적', '위도', '경도', 'subway_name', 'subway_dist', '환승역 수',
       '총인구수', '세대당_인구', '남/여비율', '남/여_0~19세', '남/여_20~39세', '남/여_40~69세',
       '남/여_70세이상', '0~19세_비율', '20~39세_비율', '40~69세_비율', '70세이상_비율', '상가비율',
       '분양세대수', '세대당_가능주차면수', '공가수비율', '대형전용면적', '자격유형_카테고리', '지역_1', '지역_2'],
      dtype='object')

## 기본으로 돌려보기

In [21]:
X = train.drop(columns=['단지명','도로명주소','subway_name','등록차량수', '전용면적','지역_1','지역_2'])
y = train[['등록차량수']]

In [22]:
X = pd.get_dummies(X, drop_first=True)

In [23]:
len(X.columns)

486

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True,random_state=2021)

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
lr= LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
mean_absolute_error(y_test, lr_pred)

3.764373643335392

In [27]:
ri = Ridge().fit(X_train, y_train)
ri_pred = ri.predict(X_test)
mean_absolute_error(y_test, ri_pred)

25.663647779357138

In [28]:
la = Lasso().fit(X_train, y_train)
la_pred = la.predict(X_test)
mean_absolute_error(y_test, la_pred)

106.64656893629216

In [29]:
rf = RandomForestRegressor(random_state=2021, criterion="mae").fit(X_train, y_train)
rf_pred = rf.predict(X_test)
mean_absolute_error(y_test, rf_pred)

13.43512919896641

In [30]:
dc = DecisionTreeRegressor(random_state=2021, criterion="mae").fit(X_train, y_train)
dc_pred = dc.predict(X_test)
mean_absolute_error(y_test, dc_pred)

9.833333333333334

## 지역별로 나눠서 modeling

In [31]:
train1=train[train.지역_1=="지역1"]
train2=train[train.지역_1=="지역2"]
train3=train[train.지역_1=="지역3"]
train4=train[train.지역_1=="지역4"]

In [34]:
test1=test[test.지역_1=="지역1"]
test2=test[test.지역_1=="지역2"]
test3=test[test.지역_1=="지역3"]
test4=test[test.지역_1=="지역4"]

In [35]:
print(train1.shape, train2.shape, train3.shape, train4.shape)
print(test1.shape, test2.shape, test3.shape, test4.shape)

(372, 42) (964, 42) (492, 42) (749, 42)
(158, 41) (374, 41) (204, 41) (200, 41)


In [61]:
X = train3.drop(columns=['단지명','도로명주소','subway_name','등록차량수', '전용면적','지역_1','지역_2'])
y = train3[['등록차량수']]
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True,random_state=2021)

In [62]:
#minMaxScaler.fit(X_train)
#X_train = minMaxScaler.transform(X_train)
#minMaxScaler.fit(X_test)
#X_test = minMaxScaler.transform(X_test)
# 왜인지 스케일러 쓰면 안좋아짐;;

In [63]:
lr= LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
mean_absolute_error(y_test, lr_pred)

0.2838453720145161

In [64]:
ri = Ridge().fit(X_train, y_train)
ri_pred = ri.predict(X_test)
mean_absolute_error(y_test, ri_pred)

19.689898065095846

In [65]:
la = Lasso().fit(X_train, y_train)
la_pred = la.predict(X_test)
mean_absolute_error(y_test, la_pred)

51.280122060440966

# 첫번째 기준으로 예측 해보겠음

In [66]:
sample=pd.read_csv('sample_submission.csv')
sample.head()

Unnamed: 0,code,num
0,C1072,0
1,C1128,0
2,C1456,0
3,C1840,0
4,C1332,0


In [67]:
# sample = sample.loc[~sample.code.isin(['C2335', 'C1327', 'C2675'])]

In [68]:
sample.shape

(150, 2)

In [69]:
train1=train[train.지역_1=="지역1"]
train2=train[train.지역_1=="지역2"]
train3=train[train.지역_1=="지역3"]
train4=train[train.지역_1=="지역4"]
test1=test[test.지역_1=="지역1"]
test2=test[test.지역_1=="지역2"]
test3=test[test.지역_1=="지역3"]
test4=test[test.지역_1=="지역4"]

In [70]:
print(train1.shape, train2.shape, train3.shape, train4.shape)
print(test1.shape, test2.shape, test3.shape, test4.shape)

(372, 42) (964, 42) (492, 42) (749, 42)
(158, 41) (374, 41) (204, 41) (200, 41)


In [71]:
df= pd.concat([train1, test1])

In [72]:
X = df.drop(columns=['단지코드','단지명','도로명주소','subway_name','등록차량수', '전용면적','지역_1','지역_2'])
X = pd.get_dummies(X, drop_first=True)

In [73]:
X.train=X.iloc[0:372, :]
X.test=X.iloc[372:, :]
y_train = train1[['등록차량수']]

In [74]:
model1= LinearRegression().fit(X.train, y_train)
model1_pred = model1.predict(X.test)

In [75]:
model1=pd.DataFrame(model1_pred)
code1=test1.단지코드
model1=model1.reset_index(drop=True)
code1=code1.reset_index(drop=True)

In [76]:
pred1=pd.concat([code1, model1], axis=1, ignore_index=True)
pred1.columns=["code", "num"]
pred1=pred1.num.groupby(pred1.code).mean()
pred1.head()

code
C1071     298.832348
C1229     418.864358
C1253    1072.764653
C1297     439.894044
C1349     145.573329
Name: num, dtype: float64

In [77]:
df= pd.concat([train2, test2])

In [78]:
X = df.drop(columns=['단지코드','단지명','도로명주소','subway_name','등록차량수', '전용면적','지역_1','지역_2'])
X = pd.get_dummies(X, drop_first=True)

In [79]:
X.train=X.iloc[0:964, :]
X.test=X.iloc[964:, :]
y_train = train2[['등록차량수']]

In [80]:
model2= LinearRegression().fit(X.train, y_train)
model2_pred = model2.predict(X.test)

In [81]:
model2=pd.DataFrame(model2_pred)
code2=test2.단지코드
model2=model2.reset_index(drop=True)
code2=code2.reset_index(drop=True)

In [82]:
pred2=pd.concat([code2, model2], axis=1, ignore_index=True)
pred2.columns=["code", "num"]
pred2=pred2.num.groupby(pred2.code).mean()
pred2.head()

code
C1016    689.518517
C1030    766.138363
C1060    730.701811
C1064    779.086985
C1072    656.046798
Name: num, dtype: float64

In [83]:
df= pd.concat([train3, test3])

In [84]:
X = df.drop(columns=['단지코드','단지명','도로명주소','subway_name','등록차량수', '전용면적','지역_1','지역_2'])
X = pd.get_dummies(X, drop_first=True)

In [85]:
X.train=X.iloc[0:492, :]
X.test=X.iloc[492:, :]
y_train = train3[['등록차량수']]

In [86]:
model3= LinearRegression().fit(X.train, y_train)
model3_pred = model3.predict(X.test)

In [87]:
model3=pd.DataFrame(model3_pred)
code3=test3.단지코드
model3=model3.reset_index(drop=True)
code3=code3.reset_index(drop=True)

In [88]:
pred3=pd.concat([code3, model3], axis=1, ignore_index=True)
pred3.columns=["code", "num"]
pred3=pred3.num.groupby(pred3.code).mean()
pred3.head()

code
C1019    300.406453
C1038    208.642422
C1040    332.793749
C1166    283.577946
C1216    289.037362
Name: num, dtype: float64

In [89]:
df= pd.concat([train4, test4])

In [90]:
X = df.drop(columns=['단지코드','단지명','도로명주소','subway_name','등록차량수', '전용면적','지역_1','지역_2'])
X = pd.get_dummies(X, drop_first=True)

In [91]:
X.train=X.iloc[0:749, :]
X.test=X.iloc[749:, :]
y_train = train4[['등록차량수']]

In [92]:
model4= LinearRegression().fit(X.train, y_train)
model4_pred = model4.predict(X.test)

In [93]:
model4=pd.DataFrame(model4_pred)
code4=test4.단지코드
model4=model4.reset_index(drop=True)
code4=code4.reset_index(drop=True)

In [94]:
pred4=pd.concat([code4, model4], axis=1, ignore_index=True)
pred4.columns=["code", "num"]
pred4=pred4.num.groupby(pred4.code).mean()
pred4.head()

code
C1003    236.863759
C1006    309.851772
C1083    377.344051
C1147    727.449882
C1152    826.683873
Name: num, dtype: float64

In [95]:
df_merge=pd.DataFrame(pd.concat([pred1, pred2, pred3, pred4], axis=0))
df_merge=df_merge.reset_index()

In [96]:
df_merge.head()

Unnamed: 0,code,num
0,C1071,298.832348
1,C1229,418.864358
2,C1253,1072.764653
3,C1297,439.894044
4,C1349,145.573329


In [97]:
test_split=pd.merge(sample, df_merge, on='code', how='outer')

In [98]:
test_split.head()

Unnamed: 0,code,num_x,num_y
0,C1072,0,656.046798
1,C1128,0,1321.254176
2,C1456,0,560.54978
3,C1840,0,533.81161
4,C1332,0,1222.577013


In [99]:
test_split= test_split.drop(columns='num_x')
test_split.columns=["code","num"]

In [100]:
min(train.등록차량수)

13.0

In [101]:
test_split[test_split.num<0].index

Int64Index([116], dtype='int64')

In [102]:
test_split.at[116,"num"]=13

In [103]:
test_split.to_csv('0712_LR2.csv', index=False)