In [12]:
# Core numerical / tabular stack.
import numpy as np
import pandas as pd

# NOTE(review): the filter below silences every warning for the whole session,
# including pandas' SettingWithCopyWarning and library deprecation notices.
# Consider narrowing it to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# EDA libraries
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats
from scipy.stats import skew
from scipy.stats import spearmanr

# Learning algorithms
import sklearn
# NOTE(review): the wildcard import pulls every public name of
# sklearn.linear_model into the notebook namespace, so it is not visible which
# estimators are actually used; prefer explicit names once they are known.
from sklearn.linear_model import *
from sklearn.svm import SVR
from sklearn.cluster import KMeans


In [13]:
# Read the training set, the test set and the sample submission (in that order)
# from the local data/ directory.
train_df, test_df, ss = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
]

In [14]:
# 컬럼명 영문으로 변경하기
train_df.columns = ['num','datetime','target','temperature','windspeed','humidity','precipitation','insolation','nelec_cool_flag','solar_flag']
test_df.columns = ['num','datetime','temperature','windspeed','humidity','precipitation','insolation','nelec_cool_flag','solar_flag']

In [15]:
# Show (n_rows, n_columns) of the training frame.
train_df.shape

(122400, 10)

In [16]:
# Preview the first five rows to check the renamed columns and value formats.
train_df.head()

Unnamed: 0,num,datetime,target,temperature,windspeed,humidity,precipitation,insolation,nelec_cool_flag,solar_flag
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


비가 오는 날과 맑은 날의 전력 사용량에 차이가 날 것이라고 생각했고, 날씨와 전력 사용량의 상관관계를 알아보면 좋겠다고 판단했습니다.

그래서 weather라는 컬럼을 새로 만들고, 비 오는 날은 0, 흐린 날은 0.5, 맑은 날은 1의 값을 주었습니다.

- 비 오는 날
    - 강수량 > 0
- 흐린 날
    - 강수량 = 0, 일조량 (기준값 미정)
- 맑은 날
    - 강수량 = 0, 일조량 = 1
    - 강수량 = 0, 일조량 = 0, 습도 (기준값 미정)
    
흐린 날과, 맑은 날의 데이터를 뽑기 위해 일조량과 습도를 알아내고 싶었습니다.

그래서 비오는 날의 dataframe과 비가 오지 않는 날의 dataframe을 뽑아내고, 습도, 강수량, 일조에 대한 데이터를 뽑아 보았습니다.

In [17]:
# Rows with measurable rainfall (precipitation > 0).
# .copy() makes this an independent frame: a later cell adds columns to
# rainy_df, and writing into a boolean-mask slice of train_df is chained
# assignment (SettingWithCopyWarning, currently hidden by the global
# warnings filter).
rainy_df = train_df[train_df['precipitation'] > 0].copy()

print(rainy_df.shape)

(18927, 10)


In [18]:
# Rows without rainfall (precipitation == 0).
# .copy() makes this an independent frame: a later cell adds columns to
# not_rainy_df, and writing into a boolean-mask slice of train_df is chained
# assignment (SettingWithCopyWarning, currently hidden by the global
# warnings filter).
not_rainy_df = train_df[train_df['precipitation'] == 0].copy()

print(not_rainy_df.shape)

(103473, 10)


122400개의 데이터 중, 비가 오는 data는 18927개, 비가 오지 않는 data는 103473개가 있습니다.

비 오는 날의 일조량과 습도를 살펴볼 때, 해가 떠 있는 시간과 해가 진 시간에 따라 값이 다를 것 같아 06\~18시는 day time, 00\~06시와 18\~23시는 night time으로 나누어 일조량과 습도를 뽑아 보았습니다.

시간으로 나누어 주기 위해, rainy dataframe과 not rainy dataframe에 hour column을 추가해주었습니다.

In [20]:
# Parse the timestamp and derive the hour of day (0-23).
# .assign() builds a new frame and rebinds the name, so nothing is written into
# the filtered slice of train_df that rainy_df originated from (the original
# column-by-column assignment was chained assignment on a slice).
# The explicit format matches the raw strings ('2020-06-01 00') and avoids
# per-call format inference; re-running the cell on an already-parsed column
# is harmless.
rainy_df = rainy_df.assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], format='%Y-%m-%d %H'),
    hour=lambda frame: frame['datetime'].dt.hour,
)

rainy_df.head()

Unnamed: 0,num,datetime,target,temperature,windspeed,humidity,precipitation,insolation,nelec_cool_flag,solar_flag,hour
0,1,2020-06-01 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0
1,1,2020-06-01 01:00:00,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1
6,1,2020-06-01 06:00:00,7978.176,16.7,3.4,90.0,0.1,0.0,0.0,0.0,6
36,1,2020-06-02 12:00:00,7935.408,16.3,2.9,80.0,1.9,0.0,0.0,0.0,12
48,1,2020-06-03 00:00:00,8114.904,18.4,1.7,84.0,0.1,0.0,0.0,0.0,0


In [21]:
# Parse the timestamp and derive the hour of day (0-23).
# .assign() builds a new frame and rebinds the name, so nothing is written into
# the filtered slice of train_df that not_rainy_df originated from (the
# original column-by-column assignment was chained assignment on a slice).
# The explicit format matches the raw strings ('2020-06-01 00') and avoids
# per-call format inference; re-running the cell on an already-parsed column
# is harmless.
not_rainy_df = not_rainy_df.assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], format='%Y-%m-%d %H'),
    hour=lambda frame: frame['datetime'].dt.hour,
)

not_rainy_df.head()

Unnamed: 0,num,datetime,target,temperature,windspeed,humidity,precipitation,insolation,nelec_cool_flag,solar_flag,hour
2,1,2020-06-01 02:00:00,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2
3,1,2020-06-01 03:00:00,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3
4,1,2020-06-01 04:00:00,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4
5,1,2020-06-01 05:00:00,8010.576,16.9,3.4,93.0,0.0,0.0,0.0,0.0,5
7,1,2020-06-01 07:00:00,8019.0,16.9,2.3,86.0,0.0,0.1,0.0,0.0,7


# daytime에서 RainyDay 일조량, 습도 데이터



In [22]:
# Dead code kept for reference only: the line is commented out, so this cell
# executes nothing as written.
# NOTE(review): rainy_df is built with precipitation > 0, so the predicate
# below (precipitation == 0) would select no rows. Presumably a daytime subset
# based on the 'hour' column was intended — TODO confirm and rewrite.
# NOTE(review): the saved output under this cell (a frame preview followed by
# a TypeError) does not appear to come from this cell as written — TODO re-run
# on a fresh kernel and clear the stale output.
# daytime_rainy_df = rainy_df[rainy_df['precipitation']==0]


Unnamed: 0,num,datetime,target,temperature,windspeed,humidity,precipitation,insolation,nelec_cool_flag,solar_flag,hour
0,1,2020-06-01 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0
1,1,2020-06-01 01:00:00,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1
6,1,2020-06-01 06:00:00,7978.176,16.7,3.4,90.0,0.1,0.0,0.0,0.0,6
36,1,2020-06-02 12:00:00,7935.408,16.3,2.9,80.0,1.9,0.0,0.0,0.0,12
48,1,2020-06-03 00:00:00,8114.904,18.4,1.7,84.0,0.1,0.0,0.0,0.0,0


TypeError: 'numpy.ndarray' object is not callable