In [1]:
import pandas as pd 

# Load the weather observations; the CSV is CP949-encoded.
data = pd.read_csv('./날씨.csv', encoding='CP949')

# Fill blank precipitation readings with 0 (presumably a blank means no rain
# was recorded -- confirm against the data source).
# Only this column is filled: a frame-wide fillna(0) would also turn a missing
# temperature into a genuine-looking 0 °C reading and bias the per-timestamp
# station averages. Missing temperatures stay NaN so that mean() skips them.
data['강수량(mm)'] = data['강수량(mm)'].fillna(0)

# Drop the station code column, which is not needed downstream.
data = data.drop(['지점'], axis=1)

data.head()

Unnamed: 0,지점명,일시,기온(°C),강수량(mm)
0,동두천,2019-01-01 00:00,-9.6,0.0
1,동두천,2019-01-01 01:00,-10.5,0.0
2,동두천,2019-01-01 02:00,-10.8,0.0
3,동두천,2019-01-01 03:00,-11.0,0.0
4,동두천,2019-01-01 04:00,-11.6,0.0


In [2]:
# Print the frame's index range, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78768 entries, 0 to 78767
Data columns (total 4 columns):
지점명        78768 non-null object
일시         78768 non-null object
기온(°C)     78768 non-null float64
강수량(mm)    78768 non-null float64
dtypes: float64(2), object(2)
memory usage: 2.4+ MB


In [3]:
# Average precipitation and temperature over the capital-area stations:
# one row per timestamp ('일시'), each value the mean of that column.
value_columns = ['기온(°C)', '강수량(mm)']
data = pd.pivot_table(
    data,
    index='일시',
    values=value_columns,
    aggfunc='mean',
)

data.head()

Unnamed: 0_level_0,강수량(mm),기온(°C)
일시,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00,0.0,-7.044444
2019-01-01 01:00,0.0,-7.755556
2019-01-01 02:00,0.0,-8.133333
2019-01-01 03:00,0.0,-8.655556
2019-01-01 04:00,0.0,-9.088889


In [6]:
# Limit the averaged temperature to one decimal place.
기온 = data['기온(°C)'].round(1)
data['기온(°C)'] = 기온

data.head()

Unnamed: 0_level_0,강수량(mm),기온(°C)
일시,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00,0.0,-7.0
2019-01-01 01:00,0.0,-7.8
2019-01-01 02:00,0.0,-8.1
2019-01-01 03:00,0.0,-8.7
2019-01-01 04:00,0.0,-9.1


In [7]:
# Upsample the hourly series to 20-minute steps.
#
# The timestamps already stored in the '일시' index are parsed, instead of being
# overwritten with a hard-coded date_range: positional assignment of a fixed
# range only works when the frame has exactly that many rows with no missing
# hour, and it makes the cell fail when it is run a second time.
data.index = pd.to_datetime(data.index)
data.index.name = '일시'

# '20min' is the non-deprecated spelling of the '20T' alias. ffill repeats each
# hourly value into the 20- and 40-minute slots that follow it.
data = data.sort_index().resample('20min').ffill()

data.head()

Unnamed: 0_level_0,강수량(mm),기온(°C)
일시,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:00,0.0,-7.0
2019-01-01 00:20:00,0.0,-7.0
2019-01-01 00:40:00,0.0,-7.0
2019-01-01 01:00:00,0.0,-7.8
2019-01-01 01:20:00,0.0,-7.8


In [8]:
# Move the '일시' timestamps out of the index and derive separate
# date ('방송날짜', YYYYMMDD) and time ('방송시간', HH:MM:SS) string columns.
data = data.reset_index()

# Format directly from the datetime values. The earlier approach cast the
# timestamp to str and ran a frame-wide regex replace of '-', which would also
# strip the minus sign from any string-typed column in the frame.
timestamps = pd.to_datetime(data['일시'])
data['방송날짜'] = timestamps.dt.strftime('%Y%m%d')
data['방송시간'] = timestamps.dt.strftime('%H:%M:%S')

# Keep '일시' as the same dash-free string the cell produced before.
data['일시'] = timestamps.dt.strftime('%Y%m%d %H:%M:%S')

data.head()

Unnamed: 0,일시,강수량(mm),기온(°C),방송날짜,방송시간
0,20190101 00:00:00,0.0,-7.0,20190101,00:00:00
1,20190101 00:20:00,0.0,-7.0,20190101,00:20:00
2,20190101 00:40:00,0.0,-7.0,20190101,00:40:00
3,20190101 01:00:00,0.0,-7.8,20190101,01:00:00
4,20190101 01:20:00,0.0,-7.8,20190101,01:20:00


In [9]:
# Drop the combined timestamp column and put the remaining columns in output
# order: date, time, temperature, precipitation.
new_columns = ['방송날짜', '방송시간', '기온(°C)', '강수량(mm)']
data = data.drop(['일시'], axis=1)[new_columns]

data.head()

Unnamed: 0,방송날짜,방송시간,기온(°C),강수량(mm)
0,20190101,00:00:00,-7.0,0.0
1,20190101,00:20:00,-7.0,0.0
2,20190101,00:40:00,-7.0,0.0
3,20190101,01:00:00,-7.8,0.0
4,20190101,01:20:00,-7.8,0.0


In [10]:
# Save the preprocessed data as an Excel workbook, without the row index.
output_path = './날씨 전처리.xlsx'
data.to_excel(output_path, index=False)