# ABBA test - 전력 사용량 예측

## import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import platform
import warnings

from matplotlib import font_manager, rc

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib.
warnings.filterwarnings(action='ignore')

# Pick a font that has Hangul glyphs for the current OS instead of hard-coding
# the macOS-only 'AppleGothic', so Korean labels also render elsewhere.
# NOTE(review): the Windows/Linux names assume those fonts are installed.
korean_font_by_os = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
rc('font', family=korean_font_by_os.get(platform.system(), 'NanumGothic'))
# Draw negative numbers with an ASCII hyphen; CJK fonts often lack the
# Unicode minus glyph, which would otherwise show up as an empty box.
rc('axes', unicode_minus=False)

## 데이터

### building_info

#### building_info 데이터 불러오기

In [2]:
# Load the building metadata table, preview its first rows and dump its schema.
building_info = pd.read_csv(filepath_or_buffer='input/building_info.csv')
display(building_info.head(n=5))
# DataFrame.info() prints its report itself and returns None, so this print
# also emits a trailing "None" line after the report.
print(building_info.info())

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.0,39570.0,-,-,-
1,2,건물기타,122233.47,99000.0,-,-,-
2,3,건물기타,171243.0,113950.0,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.0,150000.0,-,2557,1000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   건물번호          100 non-null    int64  
 1   건물유형          100 non-null    object 
 2   연면적(m2)       100 non-null    float64
 3   냉방면적(m2)      100 non-null    float64
 4   태양광용량(kW)     100 non-null    object 
 5   ESS저장용량(kWh)  100 non-null    object 
 6   PCS용량(kW)     100 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 5.6+ KB
None


#### column명 변환
- building_info의 column명을 편의를 위해 영어로 대체

In [None]:
# Replace the Korean column headers of building_info with English names for
# convenience in the following cells.
building_info_column_names = {
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity',
}
building_info = building_info.rename(columns=building_info_column_names)

####  building_type 변환
- building_info의 building_type column을 편의를 위해 영어로 대체

In [3]:
# Translate the Korean labels in building_info['building_type'] to English
# for convenience.
# Mapping: Korean building-type label -> English label.
translation_dict = dict(
    [
        ('건물기타', 'Other_Buildings'),
        ('공공', 'Public'),
        ('대학교', 'University'),
        ('데이터센터', 'Data_Center'),
        ('백화점및아울렛', 'Department_Store_and_Outlet'),
        ('병원', 'Hospital'),
        ('상용', 'Commercial'),
        ('아파트', 'Apartment'),
        ('연구소', 'Research_Institute'),
        ('지식산업센터', 'Knowledge_Industry_Center'),
        ('할인마트', 'Discount_Mart'),
        ('호텔및리조트', 'Hotel_and_Resort'),
    ]
)
# Labels that are not keys of the mapping are left as they are by replace().
translated_types = building_info['building_type'].replace(to_replace=translation_dict)
building_info['building_type'] = translated_types
building_info

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,Other_Buildings,110634.00,39570.00,-,-,-
1,2,Other_Buildings,122233.47,99000.00,-,-,-
2,3,Other_Buildings,171243.00,113950.00,40,-,-
3,4,Other_Buildings,74312.98,34419.62,60,-,-
4,5,Other_Buildings,205884.00,150000.00,-,2557,1000
...,...,...,...,...,...,...,...
95,96,Hotel_and_Resort,93314.00,60500.00,-,-,-
96,97,Hotel_and_Resort,55144.67,25880.00,-,-,-
97,98,Hotel_and_Resort,53578.62,17373.75,-,-,-
98,99,Hotel_and_Resort,53499.00,40636.00,-,-,-


#### 결측치 처리 
- solar_power_capacity, ess_capacity, pcs_capacity 컬럼의 '-' 로 되어있는 결측치를 0으로 바꿔주고 object로 되어있는 type을 float64로 변환

In [4]:
# solar_power_capacity, ess_capacity and pcs_capacity use the string '-' as a
# missing-value placeholder: turn it into 0 and cast the columns from object
# to float64.
# The replacement is restricted to these three columns so that a literal '-'
# in any other column is never rewritten by accident.
capacity_columns = ['solar_power_capacity', 'ess_capacity', 'pcs_capacity']
building_info[capacity_columns] = (
    building_info[capacity_columns].replace('-', 0).astype('float64')
)
display(building_info)
# DataFrame.info() prints its report itself and returns None, so this print
# also emits a trailing "None" line after the report.
print(building_info.info())

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,Other_Buildings,110634.00,39570.00,0.0,0.0,0.0
1,2,Other_Buildings,122233.47,99000.00,0.0,0.0,0.0
2,3,Other_Buildings,171243.00,113950.00,40.0,0.0,0.0
3,4,Other_Buildings,74312.98,34419.62,60.0,0.0,0.0
4,5,Other_Buildings,205884.00,150000.00,0.0,2557.0,1000.0
...,...,...,...,...,...,...,...
95,96,Hotel_and_Resort,93314.00,60500.00,0.0,0.0,0.0
96,97,Hotel_and_Resort,55144.67,25880.00,0.0,0.0,0.0
97,98,Hotel_and_Resort,53578.62,17373.75,0.0,0.0,0.0
98,99,Hotel_and_Resort,53499.00,40636.00,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   building_number       100 non-null    int64  
 1   building_type         100 non-null    object 
 2   total_area            100 non-null    float64
 3   cooling_area          100 non-null    float64
 4   solar_power_capacity  100 non-null    float64
 5   ess_capacity          100 non-null    float64
 6   pcs_capacity          100 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 5.6+ KB
None


### train_df

#### train_df 데이터 불러오기

In [7]:
# Load the training table, preview its first rows and dump its schema.
train_df = pd.read_csv(filepath_or_buffer='input/train.csv')
display(train_df.head(n=5))
# DataFrame.info() prints its report itself and returns None, so this print
# also emits a trailing "None" line after the report.
print(train_df.info())

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  204000 non-null  object 
 1   건물번호           204000 non-null  int64  
 2   일시             204000 non-null  object 
 3   기온(C)          204000 non-null  float64
 4   강수량(mm)        43931 non-null   float64
 5   풍속(m/s)        203981 non-null  float64
 6   습도(%)          203991 non-null  float64
 7   일조(hr)         128818 non-null  float64
 8   일사(MJ/m2)      116087 non-null  float64
 9   전력소비량(kWh)     204000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 15.6+ MB
None


#### train_df의 column명을 편의를 위해 영어로 대체

In [None]:
# Replace the Korean column headers of train_df with English names for
# convenience in the following cells.
train_column_names = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption',
}
train_df = train_df.rename(columns=train_column_names)

#### num_date_time 컬럼은 building_number와 date_time을 단순히 합쳐 놓은 데이터이므로 drop

In [9]:
# num_date_time is simply building_number and date_time joined together, so
# it adds no information of its own; remove the column.
train_df = train_df.drop(columns=['num_date_time'])
train_df

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption
0,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1,20220601 04,18.4,,2.8,43.0,,,986.40
...,...,...,...,...,...,...,...,...,...
203995,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04
203996,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96
203997,100,20220824 21,21.3,,1.0,92.0,,,825.12
203998,100,20220824 22,21.0,,0.3,94.0,,,640.08


In [10]:
# Count the missing values in each column of train_df.
train_df.isna().sum()

building_number           0
date_time                 0
temperature               0
rainfall             160069
windspeed                19
humidity                  9
sunshine              75182
solar_radiation       87913
power_consumption         0
dtype: int64

### test_df

#### test_df 데이터 불러오기

In [11]:
# Load the test table, preview its first rows and dump its schema.
test_df = pd.read_csv(filepath_or_buffer='input/test.csv')
display(test_df.head(n=5))
# DataFrame.info() prints its report itself and returns None, so this print
# also emits a trailing "None" line after the report.
print(test_df.info())

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16800 entries, 0 to 16799
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   num_date_time  16800 non-null  object 
 1   건물번호           16800 non-null  int64  
 2   일시             16800 non-null  object 
 3   기온(C)          16800 non-null  float64
 4   강수량(mm)        16800 non-null  float64
 5   풍속(m/s)        16800 non-null  float64
 6   습도(%)          16800 non-null  int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 918.9+ KB
None


#### column명 변환
- test_df의 column명을 편의를 위해 영어로 대체

In [None]:
# Replace the Korean column headers of test_df with English names for
# convenience, using the same mapping as for train_df.
# The info() output for test_df lists no sunshine, solar-radiation or
# power-consumption column, so the last three entries match nothing there;
# rename() skips keys that are not present by default.
test_column_names = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption',
}
test_df = test_df.rename(columns=test_column_names)

In [None]:
# num_date_time is simply building_number and date_time joined together, so
# it adds no information of its own; remove the column.
test_df = test_df.drop(columns=['num_date_time'])
test_df