In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math
from sklearn.cluster import KMeans
import warnings

warnings.filterwarnings(action='ignore')




plt.rcParams['font.family'] = 'NanumBarunGothic'
%config inlinebackend.figure_format = 'retina'
plt.rcParams['figure.dpi'] = 100 

## 0. 데이터 전처리

In [None]:
# Read the three CSV inputs; every file is decoded as CP949.
train = pd.read_csv('train.csv', encoding='cp949')
test = pd.read_csv('test.csv', encoding='cp949')
sample_submission = pd.read_csv('sample_submission.csv', encoding='cp949')

# Rename the two test columns that carry an aggregation-window suffix.
# Presumably this aligns them with the train column names — verify against train.csv.
test = test.rename(
    columns={
        '강수량(mm, 6시간)': '강수량(mm)',
        '일조(hr, 3시간)': '일조(hr)',
    }
)

# Parse the timestamp column so the `.dt` accessor can be used on it later.
train['date_time'] = pd.to_datetime(train['date_time'])
test['date_time'] = pd.to_datetime(test['date_time'])

# Report the (rows, columns) shape of each frame, one per line.
print(
    f'Train: {train.shape}',
    f'Test: {test.shape}',
    f'Submission: {sample_submission.shape}',
    sep='\n',
)

#### 날짜 변수 전처리

In [None]:
# Add calendar features (hour, month, day, weekday, date) derived from the
# parsed `date_time` column, applying the same steps to train and then test.
for frame in (train, test):
    stamp = frame['date_time'].dt
    frame['hour'] = stamp.hour
    frame['month'] = stamp.month
    frame['day'] = stamp.day
    frame['weekday'] = stamp.weekday  # 0 = Monday ... 6 = Sunday
    frame['date'] = stamp.date

### 1. 건물의 시간별, 요일별 히트맵

In [None]:
def building_heatmap(num):
    """Draw a weekday-by-hour heatmap of median power usage for one building.

    Rows of the global ``train`` frame whose ``num`` equals *num* are grouped
    by (weekday, hour); the median of '전력사용량(kWh)' in each group becomes
    one heatmap cell (rows = weekday, columns = hour).

    Parameters
    ----------
    num
        Value matched against ``train.num`` to select the building.
    """
    building = train[train.num == num]
    usage = (
        building.groupby(['weekday', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='weekday', columns='hour', values='전력사용량(kWh)')
    )

    plt.figure(figsize=(2, 3))
    sns.heatmap(usage)
    plt.title(f'building {num}')
    # Axis labels and weekday ticks are removed to keep the small figure compact.
    plt.xlabel('')
    plt.ylabel('')
    plt.yticks([])


building_heatmap(4)

### 2. 건물의 요일별 및 시간별 전력소모량 파악

In [None]:
def hour_and_weekday(num):
    """Plot one building's mean power usage per weekday and per hour.

    A weekday-by-hour table of median '전력사용량(kWh)' is built from the rows
    of the global ``train`` frame whose ``num`` equals *num*. The table is then
    averaged across hours (left panel, one bar per weekday) and across
    weekdays (right panel, one bar per hour). Each panel overlays a line plot
    of the same values on the bars.

    Parameters
    ----------
    num
        Value matched against ``train.num`` to select the building.
    """
    building = train[train.num == num]
    usage = (
        building.groupby(['weekday', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='weekday', columns='hour', values='전력사용량(kWh)')
    )

    by_weekday = usage.mean(axis=1)
    by_hour = usage.mean(axis=0)

    plt.figure(figsize=(12, 4))
    for panel, profile in enumerate((by_weekday, by_hour), start=1):
        plt.subplot(1, 2, panel)
        plt.plot(profile.values)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=profile.index, y=profile.values)

    plt.show()


hour_and_weekday(4)

### 3. 요일 내에서 시각화

In [None]:
def each_weekday(num):
    """Plot one building's hourly median power usage, one panel per weekday.

    A weekday-by-hour table of median '전력사용량(kWh)' is built from the rows
    of the global ``train`` frame whose ``num`` equals *num*. Every weekday
    present in that table gets its own bar-chart panel (titled with the
    weekday number), and all panels share the same y-range so they can be
    compared directly.

    Parameters
    ----------
    num
        Value matched against ``train.num`` to select the building.
    """
    building = train[train.num == num]
    usage = (
        building.groupby(['weekday', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='weekday', columns='hour', values='전력사용량(kWh)')
    )

    plt.figure(figsize=(16, 4))

    # The shared y-range does not depend on the panel, so compute it once.
    y_low, y_high = usage.min().min(), usage.max().max()

    # Iterate over the weekdays actually present instead of a fixed range(7),
    # so a building with a missing weekday does not raise KeyError.
    for panel, (weekday, profile) in enumerate(usage.iterrows(), start=1):
        plt.subplot(1, len(usage), panel)
        plt.title(weekday)
        plt.ylim(y_low, y_high)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=profile.index, y=profile.values)


each_weekday(59)

### 4. 시간 내에서 시각화

In [None]:
def each_hour(num):
    """Plot one building's median power usage across weekdays, one line per hour.

    A weekday-by-hour table of median '전력사용량(kWh)' is built from the rows
    of the global ``train`` frame whose ``num`` equals *num*. Each hour column
    is drawn as a line over the weekday index on a single shared axes.

    Parameters
    ----------
    num
        Value matched against ``train.num`` to select the building.
    """
    building = train[train.num == num]
    usage = (
        building.groupby(['weekday', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='weekday', columns='hour', values='전력사용량(kWh)')
    )

    plt.figure(figsize=(16, 4))

    # Draw every hour exactly once. The previous version plotted the whole
    # table and then re-plotted hours 0-22 on top of it (range(23) skipped
    # hour 23), so each line was drawn twice.
    for hour in usage.columns:
        plt.plot(usage[hour])


each_hour(59)

### 5. 요일만 같으면 전력소비현황이 시간별로 비슷할까?

In [None]:
weekday_ = 0  # Monday

# One figure per building number 1..60; each figure has one bar-chart panel
# per calendar date that falls on the selected weekday.
for num in range(1, 61):
    temp = train[(train.num == num) & (train['weekday'] == weekday_)].reset_index(drop=True)
    # Date label "<month>/<zero-padded day>", e.g. "6/01".
    # NOTE: labels sort as strings, so the order is chronological only while
    # all months have the same number of digits.
    temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
    temp = (
        temp.groupby(['date', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='date', columns='hour', values='전력사용량(kWh)')
        .sort_index()
    )

    fig = plt.figure(figsize=(16, 4))

    # Every panel of a building shares the same y-range; compute it once.
    y_low, y_high = temp.min().min(), temp.max().max()

    for idx, day in enumerate(temp.index):
        plt.subplot(1, temp.shape[0], idx + 1)
        plt.title(f'B{num}_{day}')
        plt.ylim(y_low, y_high)

        # Hide both axes so the narrow panels stay readable.
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)

In [None]:
weekday_ = 1  # Tuesday

# One figure per building number 1..60; each figure has one bar-chart panel
# per calendar date that falls on the selected weekday.
for num in range(1, 61):
    temp = train[(train.num == num) & (train['weekday'] == weekday_)].reset_index(drop=True)
    # Date label "<month>/<zero-padded day>", e.g. "6/01".
    # NOTE: labels sort as strings, so the order is chronological only while
    # all months have the same number of digits.
    temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
    temp = (
        temp.groupby(['date', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='date', columns='hour', values='전력사용량(kWh)')
        .sort_index()
    )

    fig = plt.figure(figsize=(16, 4))

    # Every panel of a building shares the same y-range; compute it once.
    y_low, y_high = temp.min().min(), temp.max().max()

    for idx, day in enumerate(temp.index):
        plt.subplot(1, temp.shape[0], idx + 1)
        plt.title(f'B{num}_{day}')
        plt.ylim(y_low, y_high)

        # Hide both axes so the narrow panels stay readable.
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)

In [None]:
weekday_ = 2  # Wednesday

# One figure per building number 1..60; each figure has one bar-chart panel
# per calendar date that falls on the selected weekday.
for num in range(1, 61):
    temp = train[(train.num == num) & (train['weekday'] == weekday_)].reset_index(drop=True)
    # Date label "<month>/<zero-padded day>", e.g. "6/01".
    # NOTE: labels sort as strings, so the order is chronological only while
    # all months have the same number of digits.
    temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
    temp = (
        temp.groupby(['date', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='date', columns='hour', values='전력사용량(kWh)')
        .sort_index()
    )

    fig = plt.figure(figsize=(16, 4))

    # Every panel of a building shares the same y-range; compute it once.
    y_low, y_high = temp.min().min(), temp.max().max()

    for idx, day in enumerate(temp.index):
        plt.subplot(1, temp.shape[0], idx + 1)
        plt.title(f'B{num}_{day}')
        plt.ylim(y_low, y_high)

        # Hide both axes so the narrow panels stay readable.
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)

In [None]:
weekday_ = 3  # Thursday

# One figure per building number 1..60; each figure has one bar-chart panel
# per calendar date that falls on the selected weekday.
for num in range(1, 61):
    temp = train[(train.num == num) & (train['weekday'] == weekday_)].reset_index(drop=True)
    # Date label "<month>/<zero-padded day>", e.g. "6/01".
    # NOTE: labels sort as strings, so the order is chronological only while
    # all months have the same number of digits.
    temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
    temp = (
        temp.groupby(['date', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='date', columns='hour', values='전력사용량(kWh)')
        .sort_index()
    )

    fig = plt.figure(figsize=(16, 4))

    # Every panel of a building shares the same y-range; compute it once.
    y_low, y_high = temp.min().min(), temp.max().max()

    for idx, day in enumerate(temp.index):
        plt.subplot(1, temp.shape[0], idx + 1)
        plt.title(f'B{num}_{day}')
        plt.ylim(y_low, y_high)

        # Hide both axes so the narrow panels stay readable.
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)

In [None]:
weekday_ = 4  # Friday

# One figure per building number 1..60; each figure has one bar-chart panel
# per calendar date that falls on the selected weekday.
for num in range(1, 61):
    temp = train[(train.num == num) & (train['weekday'] == weekday_)].reset_index(drop=True)
    # Date label "<month>/<zero-padded day>", e.g. "6/01".
    # NOTE: labels sort as strings, so the order is chronological only while
    # all months have the same number of digits.
    temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
    temp = (
        temp.groupby(['date', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='date', columns='hour', values='전력사용량(kWh)')
        .sort_index()
    )

    fig = plt.figure(figsize=(16, 4))

    # Every panel of a building shares the same y-range; compute it once.
    y_low, y_high = temp.min().min(), temp.max().max()

    for idx, day in enumerate(temp.index):
        plt.subplot(1, temp.shape[0], idx + 1)
        plt.title(f'B{num}_{day}')
        plt.ylim(y_low, y_high)

        # Hide both axes so the narrow panels stay readable.
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)

In [None]:
weekday_ = 5  # Saturday

# One figure per building number 1..60; each figure has one bar-chart panel
# per calendar date that falls on the selected weekday.
for num in range(1, 61):
    temp = train[(train.num == num) & (train['weekday'] == weekday_)].reset_index(drop=True)
    # Date label "<month>/<zero-padded day>", e.g. "6/01".
    # NOTE: labels sort as strings, so the order is chronological only while
    # all months have the same number of digits.
    temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
    temp = (
        temp.groupby(['date', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='date', columns='hour', values='전력사용량(kWh)')
        .sort_index()
    )

    fig = plt.figure(figsize=(16, 4))

    # Every panel of a building shares the same y-range; compute it once.
    y_low, y_high = temp.min().min(), temp.max().max()

    for idx, day in enumerate(temp.index):
        plt.subplot(1, temp.shape[0], idx + 1)
        plt.title(f'B{num}_{day}')
        plt.ylim(y_low, y_high)

        # Hide both axes so the narrow panels stay readable.
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)

In [None]:
weekday_ = 6  # Sunday

# One figure per building number 1..60; each figure has one bar-chart panel
# per calendar date that falls on the selected weekday.
for num in range(1, 61):
    temp = train[(train.num == num) & (train['weekday'] == weekday_)].reset_index(drop=True)
    # Date label "<month>/<zero-padded day>", e.g. "6/01".
    # NOTE: labels sort as strings, so the order is chronological only while
    # all months have the same number of digits.
    temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
    temp = (
        temp.groupby(['date', 'hour'])['전력사용량(kWh)']
        .median()
        .reset_index()
        # Keyword arguments: pandas 2.0+ no longer accepts positional
        # index/columns/values in DataFrame.pivot.
        .pivot(index='date', columns='hour', values='전력사용량(kWh)')
        .sort_index()
    )

    fig = plt.figure(figsize=(16, 4))

    # Every panel of a building shares the same y-range; compute it once.
    y_low, y_high = temp.min().min(), temp.max().max()

    for idx, day in enumerate(temp.index):
        plt.subplot(1, temp.shape[0], idx + 1)
        plt.title(f'B{num}_{day}')
        plt.ylim(y_low, y_high)

        # Hide both axes so the narrow panels stay readable.
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
        sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)

### 6. 건물별 일별 시각화

In [None]:
# Grid of hourly bar charts for a single building, one panel per calendar date.
num = 27
temp = train[train.num == num].reset_index(drop=True)
# Date label "<month>/<zero-padded day>", e.g. "6/01".
temp['date'] = temp['month'].astype(str) + '/' + temp['day'].astype(str).str.zfill(2)
temp = (
    temp.groupby(['date', 'hour'])['전력사용량(kWh)']
    .median()
    .reset_index()
    # Keyword arguments: pandas 2.0+ no longer accepts positional
    # index/columns/values in DataFrame.pivot.
    .pivot(index='date', columns='hour', values='전력사용량(kWh)')
)

# Five panels per row. Keep at least the original 17 rows so the layout is
# unchanged for up to 85 dates, and grow the grid instead of raising when
# there are more dates than 17 x 5 panels.
n_cols = 5
n_rows = max(17, math.ceil(temp.shape[0] / n_cols))

fig = plt.figure(figsize=(16, 40))

# All panels share the same y-range; compute it once.
y_low, y_high = temp.min().min(), temp.max().max()

for idx, day in enumerate(temp.index):
    plt.subplot(n_rows, n_cols, idx + 1)
    plt.title(f'B{num}_{day}')
    plt.ylim(y_low, y_high)

    # Hide both axes so the small panels stay readable.
    ax = plt.gca()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    # x/y must be passed by keyword: seaborn 0.12+ rejects positional data vectors.
    sns.barplot(x=temp.columns, y=temp.iloc[idx, :].values)