In [None]:
import pandas as pd
# Display configuration: 4-decimal floats with thousands separators,
# and at most 100 rows / 40 columns rendered per frame.
pd.set_option('display.float_format', '{:,.4f}'.format)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

#modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error as mae

from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
import os
# Data directory: the "ProcessedData" folder that sits beside the current
# working directory (i.e. inside the parent of the cwd).
path = os.path.join(os.path.dirname(os.getcwd()), "ProcessedData")

In [None]:
# Load the merged train/test tables from the processed-data directory.
# NOTE(review): `sub` is read from the same file as `test` -- confirm whether
# a separate submission template was intended.
train, test, sub = (
    pd.read_csv(os.path.join(path, file_name))
    for file_name in ("merged_train.csv", "merged_test.csv", "merged_test.csv")
)

In [None]:
class preprocess:
    """Per-complex feature engineering, CatBoost training and test prediction.

    The input frames hold several rows per housing complex (``단지코드``). The
    pipeline collapses them to one row per complex, adds aggregated rent /
    deposit / area statistics and indicator columns, fits a
    ``CatBoostRegressor`` on a 70/30 split of the training complexes and
    predicts the target for the test complexes.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level training data, including the target column ``등록차량수``.
    test : pandas.DataFrame
        Row-level test data with the same feature columns.
    drop_col_name : list of str
        Columns removed from the per-complex frame before modelling.
    y_name : str
        Target to fit: ``'등록차량수'`` or one of the ratio targets
        ``'등록차량수/총세대수'`` / ``'등록차량수/단지내주차면수'``.

    Notes
    -----
    * Both frames are copied on construction. The pipeline adds and overwrites
      columns while it runs; working on copies keeps the caller's frames intact
      so several instances can be built from the same ``train`` / ``test``.
    * The instance attribute ``test`` (the test frame) shadows the ``test``
      method on instances, so that method must be invoked through the class:
      ``preprocess.test(instance)``.
    """

    # Columns that are (or are derived from) the target and must not be features.
    _TARGETS = ['등록차량수', '등록차량수/단지내주차면수', '등록차량수/총세대수']
    # Ratio target -> column that converts a predicted ratio back into a count.
    _DENOMINATORS = {
        '등록차량수/총세대수': '총세대수',
        '등록차량수/단지내주차면수': '단지내주차면수',
    }

    def __init__(self,df,test,drop_col_name,y_name):
        # Private copies: see the class notes for why the inputs are not mutated.
        self.df = df.copy()
        # Columns that vary between rows of one complex; removed before de-duplicating.
        self.non_unique = np.array(['자격유형', '공급유형', '임대건물구분','임대보증금', '임대료', '전용면적', '전용면적별세대수'])
        self.test = test.copy()
        self.drop_col_name = drop_col_name
        self.y_name = y_name

    def final(self):
        """Run the whole pipeline.

        Returns
        -------
        tuple
            ``(result, f, pred)``: validation MAE, CatBoost feature
            importances, and predictions for the test frame.
        """
        self.new_y(self.df)
        self.no_duplicates(self.df)
        self.process(self.df)
        print('====train preprocess====')
        self.drop_col(self.drop_col_name)
        self.train()
        # Call on this instance (not on a notebook-level variable) so the
        # method does not depend on which globals happen to exist.
        result, f = self.catboost()

        print('====train done====')

        # From here on self.newDF holds the test features.
        self.no_duplicates(self.test)
        self.process(self.test)
        print('====test preprocess====')
        self.drop_col(self.drop_col_name)
        # `self.test` is the DataFrame attribute, so go through the class.
        pred = preprocess.test(self)

        return result,f,pred

    def new_y(self,df):
        """Add the two ratio targets to ``df`` in place and record target names in ``self.y``.

        Frames lacking any of the required columns are left untouched
        (best-effort, e.g. a frame without the target); other errors propagate.
        """
        required = ['등록차량수', '총세대수', '단지내주차면수']
        if any(name not in df.columns for name in required):
            return
        df['등록차량수/총세대수'] = df['등록차량수'] / df['총세대수']
        df['등록차량수/단지내주차면수'] = df['등록차량수'] / df['단지내주차면수']
        self.y = np.array(['등록차량수','등록차량수/단지내주차면수','등록차량수/총세대수'])

    def no_duplicates(self,df):
        """Store in ``self.newDF`` the de-duplicated rows of ``df`` without the ``non_unique`` columns."""
        self.newDF = (
            df.drop(columns=list(self.non_unique))
            .drop_duplicates()
            .reset_index(drop=True)
        )

    def merge(self,col,standard,oldDF,newDF,kind):
        """Attach per-complex ``describe()`` statistics of ``oldDF[col]`` to ``newDF``.

        For every statistic name in ``standard`` (e.g. ``'min'``, ``'25%'``,
        ``'mean'``) a column ``kind + col + stat`` is written to ``newDF``,
        matched on ``단지코드``. Complexes absent from ``oldDF`` get NaN; the
        columns are created even when ``oldDF`` is empty so that train and test
        frames end up with the same schema. ``newDF`` is modified in place and
        returned.
        """
        names = [kind + col + stat for stat in standard]
        if oldDF.empty:
            for name in names:
                newDF[name] = np.nan
            return newDF
        # Describe only the requested column instead of every numeric column.
        stats = oldDF.groupby('단지코드')[col].describe()
        for stat, name in zip(standard, names):
            newDF[name] = newDF['단지코드'].map(stats[stat])
        return newDF

    def category(self,x,dic):
        """Return ``int(key)`` of the first ``dic`` entry whose value contains ``x``, else ``None``.

        A bare string value is treated as a single member (equality), not as a
        character sequence, so missing values simply fail to match.
        """
        for key, members in dic.items():
            if isinstance(members, str):
                matched = x == members
            else:
                matched = x in members
            if matched:
                return int(key)
        return None

    def grouping(self,col,dic,oldDF,newDF):
        """Add 0/1 indicator columns ``col + key`` to ``newDF`` for each category in ``dic``.

        ``oldDF`` receives a helper column ``col + '카테고리'`` holding the
        category of every row. A complex gets 1 for a category when any of its
        rows falls into it; rows matching no category contribute nothing.
        The indicator for category 1 is dropped (baseline category).
        """
        cat_col = col + '카테고리'
        oldDF[cat_col] = oldDF[col].apply(lambda value: self.category(value, dic))
        for key in dic.keys():
            codes = oldDF.loc[oldDF[cat_col] == int(key), '단지코드']
            newDF[col + str(key)] = newDF['단지코드'].isin(codes).astype('int64')
        newDF = newDF.drop(col + str(1), axis=1)
        return newDF

    def process(self,df):
        """Build all engineered features for ``df`` on top of ``self.newDF`` and return it.

        ``df`` is modified: the 5- and 10-year public-rental supply types are
        merged into one label and category helper columns are added.
        """
        self.apt = df[df['임대건물구분']=='아파트']
        self.store= df[df['임대건물구분']=='상가']

        quantiles = ['min','25%','50%','75%','max']

        # Apartments: rent and deposit distribution per complex.
        for money_col in ('임대료', '임대보증금'):
            self.newDF = self.merge(money_col, quantiles, self.apt, self.newDF, 'apt')

        # Shops: mean / std of rent and deposit per complex.
        for money_col in ('임대료', '임대보증금'):
            self.newDF = self.merge(money_col, ['mean','std'], self.store, self.newDF, 'store')

        # Complexes without shops (and any other missing value so far) are filled with 0.
        self.newDF = self.newDF.fillna(0)

        # Qualification type -> 0/1 indicator variables.
        dic = {'1':['A'],'2':['E','K'],'3':['D'],'4':['L','M','N','O'],'5':['B','H','I','G'],'6':['J'],'7':['C','F']}
        self.newDF = self.grouping('자격유형',dic,df,self.newDF)

        # Region -> two groups.
        dic = {'1':['경상남도', '전라북도', '강원도', '광주광역시', '충청남도', '제주특별자치도', '울산광역시', '충청북도', '전라남도', '경상북도', '세종특별자치시'],'2':['경기도' ,'서울특별시', '대구광역시' ,'부산광역시', '대전광역시']}
        self.newDF = self.grouping('지역',dic,df,self.newDF)

        # Supply type: number of households per supply type and complex.
        df.loc[df['공급유형'].isin(['공공임대(10년)','공공임대(5년)']),'공급유형'] = '공공임대(5,10년)'
        sample = df.groupby(['단지코드','공급유형'])['전용면적별세대수'].sum().reset_index()
        for supply in sample['공급유형'].unique():
            per_code = (
                sample.loc[sample['공급유형'] == supply]
                .set_index('단지코드')['전용면적별세대수']
            )
            # Complexes without this supply type get 0 households.
            self.newDF[str(supply)] = (
                self.newDF['단지코드'].map(per_code).fillna(0).astype(per_code.dtype)
            )

        # Exclusive area distribution per complex (all building types).
        self.newDF = self.merge('전용면적', quantiles, df, self.newDF, 'area')

        # Vacancy count is replaced by vacancy count x persons per household.
        self.newDF['공가수'] = self.newDF['공가수'] * self.newDF['세대당_인구']

        return self.newDF

    def drop_col(self,col):
        """Remove the column(s) ``col`` from ``self.newDF``."""
        self.newDF = self.newDF.drop(col,axis=1)

    def train(self):
        """Scale the features and create the 70/30 train/validation split.

        Sets ``X_train``, ``X_test``, ``y_train``, ``y_test`` and ``ind`` (index
        labels of the validation rows in ``self.newDF``). The fitted scaler and
        the feature column order are kept in ``scaler`` / ``feature_columns``
        so the test frame can be transformed identically.
        """
        X = self.newDF.drop(self._TARGETS, axis=1)
        # For a ratio target, the denominator is not used as a feature.
        denominator = self._DENOMINATORS.get(self.y_name)
        if denominator is not None:
            X = X.drop([denominator],axis=1)
        self.feature_columns = list(X.columns)
        self.scaler = StandardScaler().fit(X)
        X = pd.DataFrame(self.scaler.transform(X), columns=X.columns, index=X.index)
        y = self.newDF[self.y_name]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.3,random_state=2021,shuffle=True)
        self.ind = self.X_test.index

    def catboost(self):
        """Fit CatBoost on the training split; return ``(validation MAE, feature importances)``."""
        self.cb = CatBoostRegressor(random_state=2021,verbose=False).fit(self.X_train, self.y_train)
        pred = self.cb.predict(self.X_test)
        error = self.error(self.y_name,pred)
        feature = self.cb.feature_importances_
        return error,feature

    def error(self,y_name,pred):
        """MAE on the validation rows, expressed in vehicle counts.

        For ratio targets the prediction is multiplied by the matching
        denominator column before comparing with ``등록차량수``.
        """
        denominator = self._DENOMINATORS.get(y_name)
        if denominator is None:
            return mae(self.y_test, pred)
        real_y = np.array(self.newDF.loc[self.ind,'등록차량수'].values)
        real_pred = np.array(self.newDF.loc[self.ind,denominator].values) * np.array(pred)
        return mae(real_y,real_pred)

    def test(self):
        """Predict vehicle counts for the processed test frame in ``self.newDF``.

        The test features are aligned to the training feature order (columns
        missing from the test frame are filled with 0, extra ones are ignored)
        and transformed with the scaler fitted in ``train``. Ratio predictions
        are converted back to counts with the *unscaled* denominator column.
        """
        features = self.newDF.reindex(columns=self.feature_columns, fill_value=0)
        X = pd.DataFrame(
            self.scaler.transform(features),
            columns=self.feature_columns,
            index=features.index,
        )
        pred = self.cb.predict(np.array(X))
        denominator = self._DENOMINATORS.get(self.y_name)
        if denominator is None:
            return pred
        real_pred = np.array(self.newDF[denominator].values) * np.array(pred)
        return real_pred

In [None]:
# Ratio target: registered vehicles per household.
# final() returns three values: validation MAE, feature importances, test predictions.
pr = preprocess(train,test,['단지코드','지역','단지명','도로명주소','위도','경도','subway_name'],'등록차량수/총세대수')
result, feature_importance, pred = pr.final()

In [None]:
# Direct target: registered vehicle count.
# final() returns three values: validation MAE, feature importances, test predictions.
pr2 = preprocess(train,test,['단지코드','지역','단지명','도로명주소','위도','경도','subway_name'],'등록차량수')
result2, feature_importance2, pred2 = pr2.final()