In [1]:
# Mount Google Drive at /content/drive; the input workbook and saved artifacts are read from paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (569 kB)
[K     |████████████████████████████████| 569 kB 8.5 MB/s 
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [3]:
# Install the Nanum font package non-interactively (-y).
!sudo apt-get install -y fonts-nanum
# Rebuild the system font cache (-f: force, -v: verbose).
!sudo fc-cache -fv
# Remove matplotlib's cache directory — presumably so it rescans fonts and picks up the new one.
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 12 not upgraded.
Need to get 9,604 kB of archives.
After this operation, 29.5 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 fonts-nanum all 20170925-1 [9,604 kB]
Fetched 9,604 kB in 1s (9,277 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletyp

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from imblearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
import shap
from datetime import datetime
import json
import joblib

In [5]:
# Set matplotlib's default font family to NanumBarunGothic.
plt.rcParams['font.family'] = 'NanumBarunGothic'

In [6]:
# Display floats with six decimal places in pandas output.
pd.set_option('display.float_format', '{:.6f}'.format)

In [7]:
# Name of the Excel sheet to read from the inference workbook.
sheetname = '물류센터'

In [8]:
# Load the rows to run inference on from the selected sheet of the input workbook.
pre_data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/전인CM/input/전인CM_Inference.xlsx', sheet_name=sheetname, engine='openpyxl')

In [9]:
# Show column names, non-null counts and dtypes of the loaded input.
pre_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   착공년도        1 non-null      int64  
 1   프로젝트명       1 non-null      object 
 2   연면적(평)      1 non-null      int64  
 3   지하층         1 non-null      int64  
 4   지상층         1 non-null      int64  
 5   층           1 non-null      int64  
 6   지형          1 non-null      object 
 7   공법          1 non-null      object 
 8   상온          1 non-null      float64
 9   저온          1 non-null      float64
 10  기타          1 non-null      float64
 11  시공사 등급      1 non-null      int64  
 12  공사기간(개월)    1 non-null      int64  
 13  공사비지수       1 non-null      float64
 14  지역          1 non-null      object 
 15  건물외형        1 non-null      object 
 16  철거공사 포함 여부  1 non-null      object 
dtypes: float64(4), int64(7), object(6)
memory usage: 264.0+ bytes


In [10]:
# Keep an untouched copy of the raw input; its original 지역 / 건물외형 / 철거공사 포함 여부
# labels are written back into the output frames after they have been numerically encoded.
label = pre_data.copy()

In [11]:
# Load the saved artifacts used in the rest of the notebook.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted location.
# Fitted regressor; used via end_model.predict(...).
end_model = joblib.load('/content/drive/My Drive/Colab Notebooks/전인CM/output/물류센터_model.pkl')
# Fitted scaler; used via scaler.transform(...) on the new rows before distances are computed.
scaler = joblib.load('/content/drive/My Drive/Colab Notebooks/전인CM/output/물류센터_scaler.pkl')
# Reference data frame; source of imputation statistics and of the values copied from the nearest project.
con_data = joblib.load('/content/drive/My Drive/Colab Notebooks/전인CM/output/물류센터_data.pkl')
# Reference feature frame that the scaled new rows are appended to for distance computation.
# NOTE(review): presumably the scaled counterpart of con_data, row-aligned with it — confirm.
en_data = joblib.load('/content/drive/My Drive/Colab Notebooks/전인CM/output/물류센터_en_data.pkl')
# SHAP explainer; used via explainer.shap_values(...).
explainer = joblib.load('/content/drive/My Drive/Colab Notebooks/전인CM/output/물류센터_explainer.pkl')



In [12]:
# Missing-value handling: fill gaps in the input from statistics of the reference data (con_data).
nalist = ['공법', '상온', '저온', '기타','시공사 등급'] # 시공사 등급 (contractor grade) must be filled with the mode

for col in nalist :
  missing = pre_data[col].isna()
  if not missing.any() :
    continue
  # Decide mode vs. mean from the reference column as well as the input column:
  # a categorical input column whose values are all missing is read as float NaN,
  # so checking only the input dtype would wrongly take the mean of a string column.
  use_mode = (
    col == '시공사 등급'
    or pre_data[col].dtype == object
    or con_data[col].dtype == object
  )
  fill_value = con_data[col].mode()[0] if use_mode else con_data[col].mean()
  # where() keeps existing values and upcasts the column if the fill value needs it.
  pre_data[col] = pre_data[col].where(~missing, fill_value)

In [13]:
# label encoding
# Binary labels -> 0/1; any other value is left unchanged, as before.
pre_data['건물외형'] = pre_data['건물외형'].replace({'정형': 0, '비정형': 1})
pre_data['철거공사 포함 여부'] = pre_data['철거공사 포함 여부'].replace({'미포함': 0, '포함': 1})

# Region name -> integer code.
region_codes = {'서울': 1, '인천': 2, '경기': 3, '충청': 4, '강원': 5, '영남': 6, '호남': 7}
sector = pre_data['지역'].map(region_codes)

# An unrecognized region used to be skipped silently and then fail with a
# length-mismatch error on assignment; report the offending values instead.
unknown_regions = pre_data.loc[sector.isna(), '지역']
if not unknown_regions.empty :
  raise ValueError(
    f"Unknown 지역 value(s): {sorted(set(unknown_regions.astype(str)))}; "
    f"expected one of {list(region_codes)}"
  )

pre_data['지역'] = sector.astype(int)

In [14]:
# Treat 'RC / PC' as the same construction method as 'PC / RC'.
pre_data['공법'] = pre_data['공법'].replace({'RC / PC': 'PC / RC'})

In [15]:
# Build the feature frame used for the similarity search: the encoded input plus
# one-hot indicator columns for 지형 and 공법.
sec = pre_data.copy()

one_hot = ['공법_PC / PEB', '공법_PC / 철골조', '공법_RC', '공법_SC / RC / PEB', '공법_SRC', '공법_SRC / PEB', '지형_평지', '공법_PC / RC', '공법_PC / RC / 철골조', '공법_PC / SRC']
# Each indicator is named '<source column>_<category>'. Encode every row from its
# own value; previously only row 0 was inspected and its flag was applied to all rows.
for col in one_hot  :
  source_col, _, category = col.partition('_')
  sec[col] = (sec[source_col] == category).astype(int)

# Keep the columns in this fixed order and drop the raw 지형 / 공법 columns.
sec = sec[['착공년도', '프로젝트명', '연면적(평)', '지하층', '지상층', '층', '상온', '저온', '기타',
       '시공사 등급', '공사기간(개월)', '공사비지수', '지역', '건물외형', '철거공사 포함 여부', '지형_평지',
       '공법_PC / PEB', '공법_PC / RC', '공법_PC / RC / 철골조', '공법_PC / 철골조', '공법_RC',
       '공법_SC / RC / PEB', '공법_SRC', '공법_SRC / PEB', '공법_PC / SRC']]

In [16]:
# Reference rows followed by the new rows, unscaled, re-indexed 0..n-1.
main = pd.concat([con_data, sec], axis=0).reset_index(drop=True)

# Scale the new rows (without the project name) using the already-fitted scaler,
# then append them below the reference feature rows.
sec_features = sec.drop(columns=['프로젝트명'])
sec_data = pd.DataFrame(
    scaler.transform(sec_features),
    index=sec_features.index,
    columns=sec_features.columns,
)
conc = pd.concat([en_data, sec_data], axis=0).reset_index(drop=True)

In [17]:
# Names of the projects to predict; also used as the column labels of the distance table.
pre_list = sec['프로젝트명']
# Empty frame with one column per input project; the similarity loop fills each column with distances.
pred_sec = pd.DataFrame(columns=pre_list)

In [18]:
# Revised similarity logic: for every input project, compute the Euclidean distance
# from its feature row to every row of conc.
# Rows whose 착공년도 (construction start year) is later than the input project's get
# distance 0, which the next step treats as "excluded" — unless the input project
# has the earliest year of all, in which case every row is kept.
conc_values = conc.values
conc_years = conc['착공년도'].values
earliest_year = conc['착공년도'].min()
# The input rows were appended after the reference rows, so they occupy the last
# len(pre_list) positions of conc. Locating them by position (instead of by
# project name) stays correct when an input name also exists in the reference data.
first_new_row = len(conc) - len(pre_list)

for offset, name in enumerate(pre_list) :
  query_row = first_new_row + offset
  main_year = conc_years[query_row]
  uclid_dist = np.sqrt(np.square(conc_values - conc_values[query_row]).sum(axis=1))
  if main_year != earliest_year :
    uclid_dist[conc_years > main_year] = 0
  pred_sec[name] = uclid_dist.tolist()

In [19]:
# Distances to the reference rows only (the appended input rows are cut off).
train_sim = pred_sec.iloc[:len(con_data)]

# For each input project: the smallest distance that is not 0 (0 marks excluded rows).
pred_sim = [
    np.setdiff1d(train_sim[name].unique(), [0]).min()
    for name in pre_data['프로젝트명']
]

In [20]:
# Work-type cost columns that are copied from the nearest reference project onto each input row.
similar_cons = ['토목', '건축', '설비', '전기', '통신', '소방', '조경', '추가공종', '간접비 및 이윤']

In [21]:
# Columns of the nearest reference project that are exported in the similarity report.
similar_fe = ['착공년도', '프로젝트명', '연면적(평)', '지하층', '지상층', '층', '지형', '공법', '상온', '저온',
       '기타', '시공사 등급', '공사기간(개월)', '공사비', '공사비지수', '지역', '건물외형', '철거공사 포함 여부']

In [22]:
# '2015 기준공사비' of the nearest reference project, one value per input project.
# If several reference rows tie on the distance, the first one is used.
pred_result_1 = [
    con_data.loc[pred_sec[name] == nearest]['2015 기준공사비'].values[0]
    for name, nearest in zip(pre_list, pred_sim)
]

In [23]:
# Fetch the work-type cost values of the most similar reference project,
# one list (ordered like similar_cons) per input project.
pred_result_2 = []
for name, nearest in zip(pre_data['프로젝트명'], pred_sim) :
  nearest_rows = con_data.loc[pred_sec[name] == nearest]
  pred_result_2.append([nearest_rows[work_type].values[0] for work_type in similar_cons])

In [24]:
# Attributes (ordered like similar_fe) of the most similar reference project,
# one list per input project; these become the rows of the similarity report.
pred_result_3 = []
for name, nearest in zip(pre_data['프로젝트명'], pred_sim) :
  nearest_rows = con_data.loc[pred_sec[name] == nearest]
  pred_result_3.append([nearest_rows[feature].values[0] for feature in similar_fe])

In [25]:
# Attach the nearest project's reference cost and its work-type costs to the input rows.
pre_data['유사도 기준공사비'] = pred_result_1
# similar_cons holds exactly the nine work-type column names, in the same order.
pre_data[similar_cons] = pred_result_2

In [26]:
# Similarity report: one row per input project, holding the attributes of its nearest reference project.
similar_data = pd.DataFrame(pred_result_3, columns = similar_fe)
# NOTE(review): these three columns are overwritten with the *input* project's raw labels,
# not the matched project's values — confirm this is intended.
similar_data['지역'] = label['지역']
similar_data['건물외형'] = label['건물외형']
similar_data['철거공사 포함 여부'] = label['철거공사 포함 여부']
# Convert each project's own nearest distance into a percentage score
# (previously the first project's distance was used for every row).
# NOTE(review): the divisor 10 is presumably an assumed maximum distance — confirm.
similar_data['유사도'] = (1 - np.asarray(pred_sim, dtype=float) / 10) * 100

In [27]:
# Write the similarity report to Drive; utf-8-sig adds a BOM to the UTF-8 output.
similar_data.to_csv('/content/drive/My Drive/Colab Notebooks/전인CM/output/similarity_물류센터.csv', encoding='utf-8-sig', index=False)

inference 모델링

In [28]:
# Build the model input: the encoded input rows (now including the similarity-derived
# columns) plus one-hot indicator columns for 지형 and 공법.
fe_data = pre_data.copy()

one_hot = ['공법_PC / PEB', '공법_PC / 철골조', '공법_RC', '공법_SC / RC / PEB', '공법_SRC', '공법_SRC / PEB', '지형_평지', '공법_PC / RC', '공법_PC / RC / 철골조', '공법_PC / SRC']
# Each indicator is named '<source column>_<category>'. Encode every row from its
# own value; previously only row 0 was inspected and its flag was applied to all rows.
for col in one_hot  :
  source_col, _, category = col.partition('_')
  fe_data[col] = (fe_data[source_col] == category).astype(int)

# Select the feature columns in this fixed order; the project name and raw 지형 / 공법 are dropped.
fe_data = fe_data[['착공년도', '연면적(평)', '지하층', '지상층', '층', '상온', '저온', '기타', '시공사 등급',
       '공사기간(개월)', '공사비지수', '토목', '건축', '설비', '전기', '통신', '소방', '조경', '추가공종',
       '간접비 및 이윤', '지역', '건물외형', '철거공사 포함 여부', '유사도 기준공사비', '지형_평지',
       '공법_PC / PEB', '공법_PC / RC', '공법_PC / RC / 철골조', '공법_PC / 철골조', '공법_RC',
       '공법_SC / RC / PEB', '공법_SRC', '공법_SRC / PEB', '공법_PC / SRC']]

In [29]:
# Run the model on the prepared features (one prediction per input row).
pred_data = end_model.predict(fe_data)
# Rescale each prediction by that row's 공사비지수 / 100.
# NOTE(review): presumably this converts a 2015-basis cost to the project's price level — confirm.
# Each row uses its own prediction, index and floor area; previously row 0's values
# were applied to every row.
predicted_total = pred_data * pre_data['공사비지수'].values / 100
pre_data['예측 총공사비'] = predicted_total
# Predicted cost per unit of 연면적(평).
pre_data['예측 평당가'] = predicted_total / pre_data['연면적(평)'].values

In [30]:
# Put the original text labels back in place of their numeric codes before exporting.
for encoded_col in ('지역', '건물외형', '철거공사 포함 여부') :
  pre_data[encoded_col] = label[encoded_col]

In [31]:
# Write the input rows together with the similarity-derived columns and predictions to Drive.
pre_data.to_csv('/content/drive/My Drive/Colab Notebooks/전인CM/output/output_물류센터.csv', encoding='utf-8-sig', index=False)

In [32]:
# SHAP values of the model input.
# NOTE(review): assumes shap_values returns an array of shape (rows, features) — confirm.
shap_values_inf = explainer.shap_values(fe_data)
# Signed share of each feature in the first row's total absolute attribution.
# The denominator uses the first row only; summing over all rows would shrink the
# shares whenever more than one row is passed in.
first_row_shap = shap_values_inf[0]
dd = first_row_shap / np.absolute(first_row_shap).sum()

In [33]:
# One-row table of normalized SHAP weights, with one column per model input feature.
feature_weight = pd.DataFrame(columns = fe_data.columns)
feature_weight.loc[0] = dd.tolist()

In [34]:
# Collapse the one-hot indicator weights back into one weight per original categorical feature.
categorical_feature = ['지형', '공법']

# For every row of feature_weight and every categorical feature, sum the weights of
# all columns whose name contains the feature name (e.g. '공법' matches each '공법_...' column).
cate_main = [
    [
        np.sum([feature_weight[col_name][i] for col_name in feature_weight.columns.tolist() if name in col_name])
        for name in categorical_feature
    ]
    for i in range(len(feature_weight))
]

In [35]:
# Aggregated weights for the categorical features, one column per feature.
ttt_f = pd.DataFrame(cate_main, columns=categorical_feature)

# Replace the individual one-hot weight columns with the aggregated ones.
feature_weight = pd.concat([feature_weight.drop(columns=one_hot), ttt_f], axis=1)

In [36]:
# Put the project name in the first column so each weight row can be identified.
feature_weight.insert(0, '프로젝트명', pre_data['프로젝트명'].tolist())

In [37]:
# Write the per-feature weight table to Drive.
feature_weight.to_csv('/content/drive/My Drive/Colab Notebooks/전인CM/output/feature_weight_물류센터.csv', encoding='utf-8-sig', index=False)