In [1]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
pd.options.display.max_rows = 100
pd.options.display.max_columns = 40
import numpy as np
import os,random, math
from tqdm import tqdm
from copy import deepcopy
from collections import Counter

# Visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager, rc
plt.rcParams['font.family'] = 'NanumGothic'
import platform
if platform.system() == 'Windows':
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
else:    
    rc('font', family='AppleGothic')

matplotlib.rcParams['axes.unicode_minus'] = False

# from dataprep.eda import plot, plot_correlation, plot_missing

import plotly 
import plotly.express as px
# from plotly import tools, subplots
# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# from plotly.offline import init_notebook_mode, iplot
# import plotly.graph_objs as go
# import plotly.express as px

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fitting
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as LGB
from catboost import CatBoostRegressor

# Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold

# Fine-Tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Evaluation
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Pipeline
from sklearn.pipeline import Pipeline, make_pipeline


# Warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# cf.) function

In [11]:
def read_file(filename):
    df = pd.read_csv(f'{filename}')
    if sum(df.duplicated(df.columns)) != 0:
        print(f"...There are {sum(df.duplicated(df.columns))} duplicates\n...Remove them")
        df = df.drop_duplicates()
    else:
        print("...There are no duplicates")
    df.name = filename
    print(f"shape: {df.shape}")
    return df

def check_NA(df):
    print(f"NA check")
    NA_res = df.isna().sum()
    if NA_res.sum() != 0:
        for idx, i in enumerate(NA_res):
            if i != 0:
                print(df.columns[idx],i,"->",round(i/df.shape[0]*100,3),"%")
    else:
        print("...There's no NA")

def dual_countplot(col, train, test, figsize=(14,8), pallete="Set2", flip=True):
    plt.rcParams["figure.figsize"] = figsize
    fig, ax = plt.subplots(ncols=2)
    if flip:
        sns.countplot(y=train[col],palette=pallete,order=sorted(train[col].unique()),ax=ax[0]).set_title("train")
        sns.countplot(y=test[col],palette=pallete,order=sorted(test[col].unique()),ax=ax[1]).set_title("test")
    else:
        sns.countplot(x=train[col],palette=pallete,order=sorted(train[col].unique()),ax=ax[0]).set_title("train")
        sns.countplot(x=test[col],palette=pallete,order=sorted(test[col].unique()),ax=ax[1]).set_title("test")
    plt.tight_layout()
    plt.show()
    
def reg_group(train,group1,col_drop,target):
    tr = train.loc[train.지역.isin(group1)].drop(col_drop, axis=1)
    tr_drop = tr.drop(target, axis=1)
    scaler = StandardScaler() 
    tr_num = tr_drop.select_dtypes(include="number")
    X = scaler.fit_transform(tr_num)
    y = np.array(tr[target])
    result = OLS(y, X).fit()
    return tr_num.columns, result.summary(), result.params
        

def to_one(data, target, drop, option):
    x_col = drop
    data = data.drop(x_col, axis=1)
    if option==True:
        y = data[[target,'단지코드']].groupby('단지코드').mean()[target]
    else:
        target = []
        y = None
    X = data.drop(target,axis=1).set_index('단지코드')
    data_one_num = X.select_dtypes(include="number").groupby('단지코드').mean()
    data_one_obj = pd.get_dummies(X.select_dtypes(exclude="number")).groupby('단지코드').sum()
    data_one = data_one_num.join(data_one_obj, how="inner")
    return data_one, y

def to_one_young(train):
    data = train.drop(columns=['공급유형','자격유형_카테고리','전용면적','임대건물구분',
                           '전용면적별세대수','대형전용면적','자격유형_카테고리',
                           '임대보증금','임대료'])
    data = data.drop_duplicates()
    sample = pd.DataFrame(train.groupby(['단지코드','공급유형']).sum()['전용면적별세대수']).reset_index()

    for i in sample.공급유형.unique():
        data[str(i)]=0
    sample = train.groupby('단지코드').mean()[['임대료','임대보증금']].reset_index()
    for i in sample.index:
        code = sample.loc[i,'단지코드']
        val1 = sample.loc[i,'임대료']
        val2 = sample.loc[i,'임대보증금']
        ind = data[data['단지코드']==code].index
        data.loc[ind,'임대료'] = val1
        data.loc[ind,'임대보증금'] = val2
    sample = pd.DataFrame(train.groupby(['단지코드','자격유형_카테고리']).sum()['전용면적별세대수']).reset_index()
    for i in sample.index:
        code = sample.loc[i,'단지코드']
        col = sample.loc[i,'자격유형_카테고리']
        val = sample.loc[i,'전용면적별세대수']
        ind = data[data['단지코드']==code].index
        data.loc[ind,col] = val
    data=data.fillna(0)
    sample = pd.DataFrame(train.groupby('단지코드')['대형전용면적'].sum()).reset_index()
    data=pd.merge(data, sample, on="단지코드", how="inner")
    return data

def drop_sangga(train, test, train_one, test_one):
    # train.loc[train.공급유형=="영구임대", "임대료"] = train.loc[train.공급유형=="영구임대", "임대료"]*2
    train_a = train[train.임대건물구분=="아파트"]
    sample = train_a.groupby('단지코드').mean()[['전용면적','전용면적별세대수','임대료','임대보증금']].reset_index()
    sample2 = train_a.groupby('단지코드').std()[['전용면적','전용면적별세대수','임대료','임대보증금']].reset_index()
    sample2.columns = ['단지코드','전용면적_std', '전용면적별세대수_std', '임대료_std', '임대보증금_std']
    new_ = pd.concat([sample, sample2], axis=1).set_index(train_one.index).drop('단지코드', axis=1)
    train_one = pd.concat([train_one.drop(new_.columns, axis=1), new_], axis=1).fillna(0)
    
    # test.loc[test.공급유형=="영구임대", "임대료"] = test.loc[test.공급유형=="영구임대", "임대료"]*2
    test_a = test[test.임대건물구분=="아파트"]
    sample = test_a.groupby('단지코드').mean()[['전용면적','전용면적별세대수','임대료','임대보증금']].reset_index()
    sample2 = test_a.groupby('단지코드').std()[['전용면적','전용면적별세대수','임대료','임대보증금']].reset_index()
    sample2.columns = ['단지코드','전용면적_std', '전용면적별세대수_std', '임대료_std', '임대보증금_std']
    new_ = pd.concat([sample, sample2], axis=1).set_index(test_one.index).drop('단지코드', axis=1)
    test_one = pd.concat([test_one.drop(new_.columns, axis=1), new_], axis=1).fillna(0)
    test_one['공급유형_공공임대(50년)'] = 0
    test_one['공급유형_공공임대(분납)'] = 0
    test_one['공급유형_장기전세'] = 0
    
    return train_one, test_one

def add_pop(data):
    data['0~19 인구수'] = data['0~19세_비율']*data['총세대수']
    data['20~39 인구수'] = data['20~39세_비율']*data['총세대수']
    data['40~69 인구수'] = data['40~69세_비율']*data['총세대수']
    data['70세이상 인구수'] = data['70세이상_비율']*data['총세대수']
    col = ['0~19 인구수', '20~39 인구수', '40~69 인구수', '70세이상 인구수']
    data = data.drop(col, axis=1)
    return data

def mapping_by_key(dic, x):
    for i in dic.keys():
        if x in dic[i]:
            return int(i)



In [3]:
filepath = "../EDA/data"
sample = pd.read_csv(os.path.join(filepath,"sample_submission.csv"))
sample.shape

(150, 2)

In [4]:
filepath = "../../../../../문서/카카오톡 받은 파일"
comp = pd.read_csv(os.path.join(filepath,"0713_cb.csv"))
comp.shape

(150, 2)

In [5]:
filepath = '../ProcessedData'
train = read_file(os.path.join(filepath,'merged_train.csv'))
test = read_file(os.path.join(filepath,'merged_test.csv'))
check_NA(train)
check_NA(test)
train.shape, test.shape

...There are 319 duplicates
...Remove them
shape: (2577, 34)
...There are 72 duplicates
...Remove them
shape: (936, 33)
NA check
...There's no NA
NA check
...There's no NA


((2577, 34), (936, 33))

In [6]:
## train
# 상가비율 column 추가 & 임대건물구분 drop
train_cls = pd.concat([train[['단지코드']], pd.get_dummies(train[['임대건물구분']])], axis=1).groupby('단지코드').sum()
tot = train_cls['임대건물구분_상가']+train_cls['임대건물구분_아파트']
train_cls['상가비율'] = train_cls['임대건물구분_상가']/tot
train = pd.merge(train,train_cls[['상가비율']].reset_index(),on='단지코드')
train.drop("임대건물구분", axis=1)

# # 분양세대수 column 추가
# train_sale = pd.DataFrame(train.groupby(['단지코드']).first()['총세대수'] - train.groupby(['단지코드']).sum()['전용면적별세대수'])
# train_sale.columns = ["분양세대수"]
# train_sale = train_sale.astype('int16')
# train_sale.reset_index(inplace=True)
# train = pd.merge(train, train_sale,on="단지코드")

# 세대당_가능주차면수 추가
train['세대당_가능주차면수'] = train['단지내주차면수']/train['총세대수']

# 공가수비율 추가
train['공가수비율']= train.공가수/train.총세대수 

#대형전용면적 추가
level = 85
train['대형전용면적'] = 0
train.loc[train.전용면적>level, '대형전용면적'] = 1
train.loc[train.전용면적<=level, '대형전용면적'] = 0

# # std 추가
# train["전용면적_std"] = train.groupby("단지코드").std().reset_index()["전용면적"]
# train["전용면적별세대수_std"] = train.groupby("단지코드").std().reset_index()["전용면적별세대수"]
# train["임대보증금_std"] = train.groupby("단지코드").std().reset_index()["임대보증금"]
# train["임대료_std"] = train.groupby("단지코드").std().reset_index()["임대료"]
# train.loc[train.전용면적_std.isna(),"전용면적_std"] = 0
# train.loc[train.전용면적별세대수_std.isna(),"전용면적별세대수_std"] = 0
# train.loc[train.임대보증금_std.isna(),"임대보증금_std"] = 0
# train.loc[train.임대료_std.isna(),"임대료_std"] = 0

# 자격유형_카테고리 추가 & 자격유형 drop
dic = {'1': 'A' ,'2':['C','F','G'],'3':['B','H','I'],'4':['J'],
       '5':['L',"M","N","O"],'6':["E","K"],'7':'D'}
train['자격유형_카테고리'] = train['자격유형'].apply(lambda x : mapping_by_key(dic, x))
train['자격유형_카테고리'] = train['자격유형_카테고리'].astype(object)
train= train.drop(columns="자격유형")

# 공급유형 재범주화
train.loc[train.공급유형.isin(['공공임대(5년)', '공공임대(10년)']), '공급유형'] = '공공임대(단기)'



## test
# 상가비율 column 추가 & 임대건물구분 drop
test_cls = pd.concat([test[['단지코드']], pd.get_dummies(test[['임대건물구분']])], axis=1).groupby('단지코드').sum()
tot = test_cls['임대건물구분_상가']+test_cls['임대건물구분_아파트']
test_cls['상가비율'] = test_cls['임대건물구분_상가']/tot
test = pd.merge(test,test_cls[['상가비율']].reset_index(),on='단지코드')
test.drop("임대건물구분", axis=1)

# # 분양세대수 column 추가
# test_sale = pd.DataFrame(test.groupby(['단지코드']).first()['총세대수'] - test.groupby(['단지코드']).sum()['전용면적별세대수'])
# test_sale.columns = ["분양세대수"]
# test_sale = test_sale.astype('int16')
# test_sale.reset_index(inplace=True)
# test = pd.merge(test, test_sale,on="단지코드")

# 세대당_가능주차면수 추가
test['세대당_가능주차면수'] = test['단지내주차면수']/test['총세대수']

# 공가수비율 추가
test['공가수비율']= test.공가수/test.총세대수 

#대형전용면적 추가
level = 85
test['대형전용면적'] = 0
test.loc[test.전용면적>level, '대형전용면적'] = 1
test.loc[test.전용면적<=level, '대형전용면적'] = 0

# # std 추가
# test["전용면적_std"] = test.groupby("단지코드").std().reset_index()["전용면적"]
# test["전용면적별세대수_std"] = test.groupby("단지코드").std().reset_index()["전용면적별세대수"]
# test["임대보증금_std"] = test.groupby("단지코드").std().reset_index()["임대보증금"]
# test["임대료_std"] = test.groupby("단지코드").std().reset_index()["임대료"]
# test.loc[test.전용면적_std.isna(),"전용면적_std"] = 0
# test.loc[test.전용면적별세대수_std.isna(),"전용면적별세대수_std"] = 0
# test.loc[test.임대보증금_std.isna(),"임대보증금_std"] = 0
# test.loc[test.임대료_std.isna(),"임대료_std"] = 0

# 자격유형_카테고리 추가 & 자격유형 drop
test['자격유형_카테고리'] = test['자격유형'].apply(lambda x : mapping_by_key(dic, x))
test['자격유형_카테고리'] = test['자격유형_카테고리'].astype(object)
test= test.drop(columns="자격유형")

# 공급유형 재범주화
test.loc[test.공급유형.isin(['공공임대(5년)', '공공임대(10년)']), '공급유형'] = '공공임대(단기)'

# 0713 EDA_은영

In [7]:
train_one = to_one_young(train)
test_one = to_one_young(test)
print(train_one.shape, test_one.shape)
pre_train = deepcopy(train_one)
pre_test = deepcopy(train_one)

(414, 49) (147, 46)


In [8]:
train = pre_train
test = pre_test
lev = 785
train1 = train.loc[train.단지내주차면수>=lev,]
train2 = train.loc[train.단지내주차면수<lev,]
test1 = test.loc[test.단지내주차면수>=lev,]
test2 = test.loc[test.단지내주차면수<lev,]

In [15]:
models = [LinearRegression(), 
#           PLSRegression(), 
#           Ridge(random_state=2021),
#           Lasso(random_state=2021), 
#           ElasticNet(random_state=2021),
#           CatBoostRegressor(random_state=2021, loss_function = 'MAE'),
#           RandomForestRegressor(criterion="mae",random_state=2021),
#           GradientBoostingRegressor(criterion="mae",random_state=2021),
          XGBRegressor(random_state=2021)]

In [31]:
def fit(X_train, X_test, y_train, y_test, model):
    try:
        model.fit(X_train, y_train, verbose=False)
    except:
        model.fit(X_train, y_train)
    tst_pred = model.predict(X_test)
    error = mean_absolute_error(y_test,tst_pred)
    return model, tst_pred, error


def fit_models(X, y, models, cv=False):
    X = np.array(X.reset_index(drop=True))
    y = np.array(y.reset_index(drop=True))
    best_error = 10000
    best_tst_pred = 0
    best_model = ""
    if cv==False:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True,random_state=2021)
        for m in models:
            model, tst_pred, error = fit(X_train, X_test, y_train, y_test, m)
            if best_error > error:
                best_error = error
                best_tst_pred = tst_pred
                best_model = model
    else:
        best_tst_idx = []
        kf = KFold(n_splits = 5, shuffle=True,random_state = 2021)
        for m in models:
            mae = []
            pred= []
            tst_idx = []
            for train_index, test_index in kf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model, tst_pred, error = fit(X_train, X_test, y_train, y_test, m)
                mae.append(error)
                pred.append(tst_pred.tolist())
                tst_idx.append(test_index.tolist())
            cv_error = np.mean(mae)
            print(f"model:{str(m).split('(')[0]}\nMAE:{cv_error}")
            if best_error > cv_error:
                best_error = cv_error
                best_tst_pred = pred
                best_tst_idx = sum(tst_idx, [])
                best_model = model
        best_tst_pred = sum(best_tst_pred,[])
        best_tst_pred = dict(zip(best_tst_idx,best_tst_pred))
    try:
        best_model = best_model.fit(np.concatenate((X_train, X_test),axis=0), 
                                                np.concatenate((y_train, y_test),axis=0), 
                                                verbose=False)
    except:
        best_model = best_model.fit(np.concatenate((X_train, X_test),axis=0), 
                                        np.concatenate((y_train, y_test),axis=0))
    return best_model, best_tst_pred, best_error

In [32]:
X = train1.drop(columns=['단지명','단지코드','도로명주소','subway_name',
                         '등록차량수', "임대상가", 2])
y = train1[['등록차량수']]
X = pd.get_dummies(X, drop_first=True)
best_model1, best_tst_pred1, best_error1 = fit_models(X, y, models, cv=True)
print(best_model1, best_error1)

model:LinearRegression
MAE:299.8988224391402
model:XGBRegressor
MAE:264.11674450102305
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=2021,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None) 264.11674450102305


In [27]:
X = train1.drop(columns=['단지명','단지코드','도로명주소','subway_name',
                         '등록차량수', "임대상가", 2])
y = train1[['등록차량수']]
X = pd.get_dummies(X, drop_first=True)
best_model1, best_tst_pred1, best_error1 = fit_models(X, y, models)
# print(best_model1, best_error1)

In [121]:
models = [LinearRegression(), 
          PLSRegression(), 
          Ridge(random_state=2021),
          Lasso(random_state=2021), 
          ElasticNet(random_state=2021),
#           CatBoostRegressor(random_state=2021, loss_function = 'MAE'),
          RandomForestRegressor(criterion="mae",random_state=2021),
          GradientBoostingRegressor(criterion="mae",random_state=2021),
          XGBRegressor(random_state=2021)]

In [122]:
X = train1.drop(columns=['단지명','단지코드','도로명주소','subway_name',
                         '등록차량수', "임대상가", 2])
y = train1[['등록차량수']]
X = pd.get_dummies(X, drop_first=True)
best_model1, best_tst_pred1, best_error1 = fit_models(X, y, models, cv=True)

model:LinearRegression
MAE:299.8988224391402
model:PLSRegression
MAE:249.32922235754205
model:Ridge
MAE:241.56773151215785
model:Lasso
MAE:245.4606285914716
model:ElasticNet
MAE:237.252915092595
model:RandomForestRegressor
MAE:243.94664285714288
model:GradientBoostingRegressor
MAE:272.05223874094077
model:XGBRegressor
MAE:264.11674450102305
[2, 4, 6, 8, 13, 22, 30, 32, 35, 46, 47, 58, 61, 64, 67, 76, 77, 78, 80, 92, 95, 3, 20, 23, 28, 31, 34, 37, 43, 45, 55, 56, 59, 60, 69, 74, 75, 82, 83, 97, 99, 101, 9, 11, 15, 17, 27, 39, 40, 42, 51, 53, 65, 68, 72, 73, 79, 84, 87, 88, 96, 98, 103, 5, 10, 14, 16, 18, 19, 25, 36, 38, 41, 48, 49, 50, 52, 54, 63, 71, 81, 89, 100, 102, 0, 1, 7, 12, 21, 24, 26, 29, 33, 44, 57, 62, 66, 70, 85, 86, 90, 91, 93, 94]


In [140]:
pd.merge(pd.DataFrame(best_tst_pred1,index=[0]).T.reset_index(),
         pd.DataFrame(train1.단지코드.reset_index(drop=True)).reset_index(),
         on="index")

Unnamed: 0,index,0,단지코드
0,2,992.5761,C1032
1,4,1140.0600,C1672
2,6,1013.3932,C1005
3,8,774.0983,C1885
4,13,1009.7700,C2000
...,...,...,...
99,86,1454.8911,C1326
100,90,1804.9864,C1802
101,91,1820.0794,C1939
102,93,1401.6079,C2506


In [73]:
X = train2.drop(columns=['단지명','단지코드','도로명주소','subway_name',
                         '등록차량수', "임대상가", 2])
y = train2[['등록차량수']]
X = pd.get_dummies(X, drop_first=True)
best_model2, best_tst_pred2, best_error2 = fit_models(X, y, models, cv=True)

model:LinearRegression
MAE:104.75048262232858
model:PLSRegression
MAE:104.90065848744257
model:Ridge
MAE:98.23119577120761
model:Lasso
MAE:96.47595591316039
model:ElasticNet
MAE:94.97057189194189
model:SVR
MAE:187.8577654036335
model:<catboost.core.CatBoostRegressor object at 0x000001B31D1DC5C8>
MAE:96.61555234704181
model:RandomForestRegressor
MAE:100.00964516129031
model:GradientBoostingRegressor
MAE:99.68399876709145
model:XGBRegressor
MAE:107.0929132153911


In [69]:
print(best_model1, best_error1)
print(best_model2, best_error2)

<catboost.core.CatBoostRegressor object at 0x000001B31D1DC5C8> 235.79542873699498
ElasticNet(random_state=2021) 94.97057189194189


In [80]:
best_tst_pred1

[1025.2714230227089,
 1119.405781544296,
 994.959189230805,
 743.5661916995698,
 929.0905159918186,
 810.3023273153187,
 936.0172449473656,
 714.1216000260695,
 1052.0199889411504,
 1271.375480171077,
 855.980602428514,
 1177.1270211391907,
 993.8754846954885,
 1015.4360775823654,
 919.2232022032692,
 1313.930150543984,
 898.7677467886464,
 1184.270037311271,
 1256.177805248765,
 1161.8208057328388,
 1026.105420160216,
 806.9508378426615,
 821.9189630541794,
 744.6578133258386,
 900.0297699056582,
 1011.0519187515496,
 1103.9105103588956,
 908.5873535135631,
 1002.1944067178447,
 1122.5178220397042,
 1045.7554892571013,
 905.9693806128755,
 1181.159377888605,
 1173.6080845943093,
 1423.7047854167006,
 1282.6907918790196,
 1119.7986850334144,
 836.3644515690406,
 1584.2480823118538,
 1215.4571713934167,
 814.916861496909,
 1305.5131317402897,
 1318.908398316973,
 789.6619331694078,
 1091.5200885423121,
 877.0117489648335,
 833.177541853789,
 805.2735785682996,
 810.341208698556,
 1067.7

In [464]:
best_model1, best_tst_pred1, best_error1 = fit_models(X_train1, X_test1, y_train1, y_test1, models)
tst_predict1 = best_model1.predict(test1_one)
best_model2, best_tst_pred2, best_error2 = fit_models(X_train2, X_test2, y_train2, y_test2, models)
tst_predict2 = best_model2.predict(test2_one)

0:	learn: 296.7690376	total: 3ms	remaining: 3s
500:	learn: 13.9549968	total: 1.12s	remaining: 1.11s
999:	learn: 5.0867827	total: 2.15s	remaining: 0us
0:	learn: 231.1418788	total: 2.39ms	remaining: 2.38s
500:	learn: 14.8339321	total: 1.06s	remaining: 1.05s
999:	learn: 5.6632659	total: 2.13s	remaining: 0us


In [465]:
tst_predict1.shape, tst_predict2.shape

((76,), (71,))

In [442]:
filepath = "../EDA/data"
sample = pd.read_csv(os.path.join(filepath,"sample_submission.csv"))
sample.shape
result1 = pd.concat([pd.DataFrame(test1_one.index).reset_index(drop=True), 
                    pd.DataFrame(tst_predict1)],axis=1)
result2 = pd.concat([pd.DataFrame(test2_one.index).reset_index(drop=True), 
                    pd.DataFrame(tst_predict2)],axis=1)
result = pd.concat([result1, result2], axis=0)
result.columns = ["code","num"]
sub = sample.merge(result, on="code",how = "left")
sub = sub[["code","num_y"]]
sub.columns = ["code","num"]
sub.loc[sub.num<0,"num"] = np.min(sub.loc[sub.num>=0,"num"])
sub.loc[sub.num.isna(),]
sub.to_csv("sub0717.csv",index=False)

In [441]:
mean_absolute_error(comp['num'].dropna(),
                    np.concatenate((tst_predict1, tst_predict2),axis=0))

443.0700703854426

In [None]:
1065
2362

In [91]:
print(train.loc[train.단지코드=="C1065",]['등록차량수'])
train.loc[train.단지코드=="C1065",] # true는 큰데 작게 예측

344   1,255.0000
Name: 등록차량수, dtype: float64


Unnamed: 0,단지코드,총세대수,지역,공가수,지하철역,버스정류장,단지내주차면수,등록차량수,단지명,도로명주소,연면적,위도,경도,subway_name,subway_dist,환승역 수,총인구수,세대당_인구,남/여비율,남/여_0~19세,...,공가수비율,국민임대,영구임대,임대상가,공공임대(단기),장기전세,행복주택,공공임대(분납),공공분양,공공임대(50년),임대료,임대보증금,1,2,7,3,6,4,5,대형전용면적
344,C1065,938,경상북도,37.0,0.0,3.0,1117.0,1255.0,경북김천혁신 Aa-1BL 국민임대주택,경상북도 김천시 해오름1로 17,20.22,128.1733,36.1212,문양역,0.3682,1,22662.0,2.54,1.01,2.6377,...,0.0394,0,0,0,0,0,0,0,0,0,187376.6667,13507333.3333,0.0,0.0,0.0,938.0,0.0,0.0,0.0,0


In [92]:
print(train.loc[train.단지코드=="C2362",]['등록차량수'])
train.loc[train.단지코드=="C2362",] # true는 작은데 크게 예측

389   196.0000
Name: 등록차량수, dtype: float64


Unnamed: 0,단지코드,총세대수,지역,공가수,지하철역,버스정류장,단지내주차면수,등록차량수,단지명,도로명주소,연면적,위도,경도,subway_name,subway_dist,환승역 수,총인구수,세대당_인구,남/여비율,남/여_0~19세,...,공가수비율,국민임대,영구임대,임대상가,공공임대(단기),장기전세,행복주택,공공임대(분납),공공분양,공공임대(50년),임대료,임대보증금,1,2,7,3,6,4,5,대형전용면적
389,C2362,1509,경기도,46.0,0.0,2.0,1055.0,196.0,양주옥정3단지,경기도 양주시 옥정서로 254,359.31,127.0885,37.8354,덕정역,0.028,1,54252.0,2.47,1.01,2.2829,...,0.0305,0,0,0,0,0,0,0,0,0,102500.0,24290000.0,0.0,0.0,0.0,0.0,0.0,1308.0,0.0,0


In [93]:
print(train.loc[train.단지코드=="C1979",]['등록차량수'])
train.loc[train.단지코드=="C1979",] # true가 작고 작게 예측

378   133.0000
Name: 등록차량수, dtype: float64


Unnamed: 0,단지코드,총세대수,지역,공가수,지하철역,버스정류장,단지내주차면수,등록차량수,단지명,도로명주소,연면적,위도,경도,subway_name,subway_dist,환승역 수,총인구수,세대당_인구,남/여비율,남/여_0~19세,...,공가수비율,국민임대,영구임대,임대상가,공공임대(단기),장기전세,행복주택,공공임대(분납),공공분양,공공임대(50년),임대료,임대보증금,1,2,7,3,6,4,5,대형전용면적
378,C1979,270,경기도,17.0,0.0,4.0,183.0,133.0,부천옥길 행복주택(A4BL),경기도 부천시 양지로 151,2134.1958,126.8196,37.4709,역곡역,0.0163,1,95543.0,2.56,0.96,2.0291,...,0.063,0,0,0,0,0,0,0,0,0,185836.6667,42216333.3333,0.0,0.0,0.0,0.0,0.0,0.0,270.0,0


In [101]:
train.loc[(train.공가수>40)&(train.등록차량수>1000),]

Unnamed: 0,단지코드,총세대수,지역,공가수,지하철역,버스정류장,단지내주차면수,등록차량수,단지명,도로명주소,연면적,위도,경도,subway_name,subway_dist,환승역 수,총인구수,세대당_인구,남/여비율,남/여_0~19세,...,공가수비율,국민임대,영구임대,임대상가,공공임대(단기),장기전세,행복주택,공공임대(분납),공공분양,공공임대(50년),임대료,임대보증금,1,2,7,3,6,4,5,대형전용면적
351,C1326,1934,부산광역시,43.0,0.0,4.0,1670.0,1153.0,부산정관 7단지(A-1BL),부산광역시 기장군 정관읍 모전로 41,4852.56,129.1684,35.3386,좌천역,0.0812,1,81775.0,2.6,0.98,2.2406,...,0.0222,0,0,0,0,0,0,0,0,0,173800.0,11900000.0,0.0,0.0,0.0,1934.0,0.0,0.0,0.0,0


In [103]:
train.loc[(train.공가수>40)&(train.총세대수>train.단지내주차면수),]

Unnamed: 0,단지코드,총세대수,지역,공가수,지하철역,버스정류장,단지내주차면수,등록차량수,단지명,도로명주소,연면적,위도,경도,subway_name,subway_dist,환승역 수,총인구수,세대당_인구,남/여비율,남/여_0~19세,...,공가수비율,국민임대,영구임대,임대상가,공공임대(단기),장기전세,행복주택,공공임대(분납),공공분양,공공임대(50년),임대료,임대보증금,1,2,7,3,6,4,5,대형전용면적
351,C1326,1934,부산광역시,43.0,0.0,4.0,1670.0,1153.0,부산정관 7단지(A-1BL),부산광역시 기장군 정관읍 모전로 41,4852.56,129.1684,35.3386,좌천역,0.0812,1,81775.0,2.6,0.98,2.2406,...,0.0222,0,0,0,0,0,0,0,0,0,173800.0,11900000.0,0.0,0.0,0.0,1934.0,0.0,0.0,0.0,0
374,C1565,1020,대구광역시,42.0,0.0,3.0,719.0,588.0,대구테크노폴리스 LH천년나무3단지,대구광역시 달성군 현풍읍 테크노북로4길 27,13242.1,128.4557,35.6948,설화명곡역,0.1092,1,23186.0,1.96,1.11,2.5906,...,0.0412,0,0,0,0,0,0,0,0,0,91500.0,18300000.0,0.0,0.0,0.0,0.0,0.0,1020.0,0.0,0
389,C2362,1509,경기도,46.0,0.0,2.0,1055.0,196.0,양주옥정3단지,경기도 양주시 옥정서로 254,359.31,127.0885,37.8354,덕정역,0.028,1,54252.0,2.47,1.01,2.2829,...,0.0305,0,0,0,0,0,0,0,0,0,102500.0,24290000.0,0.0,0.0,0.0,0.0,0.0,1308.0,0.0,0
396,C2225,1206,경상남도,49.0,0.0,3.0,842.0,448.0,김해율하2 A3블록,경상남도 김해시 율하5로 14,40.24,128.8251,35.1641,봉황역,0.0801,1,72820.0,2.77,0.98,2.0451,...,0.0406,0,0,0,0,0,0,0,0,0,27387.7778,52588777.7778,0.0,0.0,0.0,0.0,1200.0,0.0,0.0,0
403,C1156,1004,충청북도,47.0,0.0,3.0,700.0,505.0,청주동남 LH행복주택,충청북도 청주시 상당구 월운로 146,4158.5045,127.522,36.6064,대전역,0.2878,1,47196.0,2.485,0.985,2.0854,...,0.0468,0,0,0,0,0,0,0,0,0,35202.5,67596375.0,0.0,0.0,0.0,0.0,998.0,0.0,0.0,0
404,C2142,954,울산광역시,41.0,0.0,5.0,772.0,396.0,울산송정2,울산광역시 북구 박상진2로 82,23.1,129.3635,35.5967,좌천역,0.3084,1,36060.0,2.66,1.04,2.354,...,0.043,0,0,0,0,0,0,0,0,0,182250.0,15663500.0,0.0,0.0,0.0,946.0,0.0,0.0,0.0,0
