In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc # macOS: fix broken Korean (Hangul) text rendering
rc('font', family='AppleGothic')# macOS: fix broken Korean (Hangul) text rendering

# 날씨 데이터
- 데이터 출처 : [기상청 기상자료개방포털](https://data.kma.go.kr/data/grnd/selectAsosRltmList.do?pgmNo=36)
- 제주도의 일별 비(일강수량, mm) 및 눈(일 최심적설, cm) 데이터


In [3]:
# Load the daily rain and snow CSV files.
# The paths are relative to the notebook's working directory -- the same
# `data/` folder that the later `to_csv` cells in this notebook write to --
# instead of an absolute path inside one user's home directory, so the
# notebook can run on another machine.
# Both files are decoded as CP949.
DATA_DIR = "data"
df_rain = pd.read_csv(f"{DATA_DIR}/rain.csv", encoding='cp949')
df_snow = pd.read_csv(f"{DATA_DIR}/snow.csv", encoding='cp949')

In [3]:
# Preview the first five rows of the rain data.
df_rain.head(n=5)

Unnamed: 0,지점,지점명,일시,일강수량(mm)
0,184,제주,2021-09-01,14.5
1,184,제주,2021-09-02,37.8
2,184,제주,2021-09-03,21.8
3,184,제주,2021-09-05,7.6
4,184,제주,2021-09-06,5.3


In [4]:
# Preview the first five rows of the snow data.
df_snow.head(n=5)

Unnamed: 0,지점,지점명,일시,일 최심적설(cm)
0,184,제주,2021-12-26,4.4
1,184,제주,2021-12-27,1.2
2,184,제주,2022-01-11,0.4
3,184,제주,2022-02-20,0.3
4,189,서귀포,2021-12-17,0.4


## 결측값 

In [5]:
# Count missing values per column for each frame and print both reports,
# separated by a blank line.
rain_missing = df_rain.isna().sum()
snow_missing = df_snow.isna().sum()

print('비 데이터 결측값')
print(rain_missing, '\n')

print('눈 데이터 결측값')
print(snow_missing)

비 데이터 결측값
지점          0
지점명         0
일시          0
일강수량(mm)    0
dtype: int64 

눈 데이터 결측값
지점            0
지점명           0
일시            0
일 최심적설(cm)    0
dtype: int64


결측값 존재하지 않음.

## 데이터 정보

In [6]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() appended a stray "None" line after each summary.
# Call it directly and print an empty line between the two reports.
df_rain.info()
print()
df_snow.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   지점        568 non-null    int64  
 1   지점명       568 non-null    object 
 2   일시        568 non-null    object 
 3   일강수량(mm)  568 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 17.9+ KB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   지점          7 non-null      int64  
 1   지점명         7 non-null      object 
 2   일시          7 non-null      object 
 3   일 최심적설(cm)  7 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 352.0+ bytes
None


## 컬럼 변경

In [7]:
# Keep only the date and snow-depth columns and rename them to the
# (base_date, rain_snow) names that the rain frame also uses.
snow_df = df_snow[['일시', '일 최심적설(cm)']].rename(
    columns={'일시': 'base_date', '일 최심적설(cm)': 'rain_snow'}
)
snow_df

Unnamed: 0,base_date,rain_snow
0,2021-12-26,4.4
1,2021-12-27,1.2
2,2022-01-11,0.4
3,2022-02-20,0.3
4,2021-12-17,0.4
5,2021-12-26,1.0
6,2021-12-27,1.0


In [8]:
# Keep only the date and daily-precipitation columns and rename them to the
# (base_date, rain_snow) names that the snow frame also uses.
rain_df = df_rain[['일시', '일강수량(mm)']].rename(
    columns={'일시': 'base_date', '일강수량(mm)': 'rain_snow'}
)
rain_df

Unnamed: 0,base_date,rain_snow
0,2021-09-01,14.5
1,2021-09-02,37.8
2,2021-09-03,21.8
3,2021-09-05,7.6
4,2021-09-06,5.3
...,...,...
563,2022-08-19,0.0
564,2022-08-24,7.2
565,2022-08-26,0.0
566,2022-08-30,0.4


눈, 비 데이터 합치기

In [9]:
# Stack the rain rows and the snow rows into one frame (row-wise concat).
frames = [rain_df, snow_df]
rain_snow = pd.concat(frames, axis=0)
rain_snow.head()

Unnamed: 0,base_date,rain_snow
0,2021-09-01,14.5
1,2021-09-02,37.8
2,2021-09-03,21.8
3,2021-09-05,7.6
4,2021-09-06,5.3


날짜별 그룹화 

In [10]:
# Average the amounts of all rows that share a date; `as_index=False` keeps
# base_date as a regular column on a fresh integer index.
rain_snow = rain_snow.groupby('base_date', as_index=False).mean()
rain_snow

Unnamed: 0,base_date,rain_snow
0,2021-09-01,11.625
1,2021-09-02,37.500
2,2021-09-03,37.600
3,2021-09-05,9.300
4,2021-09-06,2.725
...,...,...
191,2022-08-24,7.200
192,2022-08-26,0.000
193,2022-08-27,0.000
194,2022-08-30,5.800


날짜 데이터 수정

In [11]:

# Remove every non-word character from the date strings, then cast the
# remaining digits to int (e.g. '2021-09-01' -> 20210901).
date_digits = rain_snow["base_date"].str.replace(pat=r'[^\w]', repl=r'', regex=True)
rain_snow["base_date"] = date_digits.astype(int)
rain_snow.head()

Unnamed: 0,base_date,rain_snow
0,20210901,11.625
1,20210902,37.5
2,20210903,37.6
3,20210905,9.3
4,20210906,2.725


- `rain_snow` 값이 `0.0` 보다 크면 `눈비`, `0.0`이면 `없음`으로 변경

In [12]:
# Replace the numeric amount with a label: '없음' when the value is exactly
# 0.0, otherwise '눈비'.
#
# This is done in one vectorised step instead of a row-by-row loop. The loop
# read values through chained indexing (`df[col][i]`) and wrote strings one
# cell at a time into a float column, which pandas flags as an
# incompatible-dtype assignment.
#
# The numeric-dtype guard keeps the cell safe to re-run: once the column
# holds labels, comparing them with 0.0 would be False for every row and
# everything would be relabelled '눈비'.
if pd.api.types.is_numeric_dtype(rain_snow['rain_snow']):
    rain_snow['rain_snow'] = np.where(rain_snow['rain_snow'] == 0.0, '없음', '눈비')


rain_snow

Unnamed: 0,base_date,rain_snow
0,20210901,눈비
1,20210902,눈비
2,20210903,눈비
3,20210905,눈비
4,20210906,눈비
...,...,...
191,20220824,눈비
192,20220826,없음
193,20220827,없음
194,20220830,눈비


In [17]:
# Count how many dates carry each label.
label_counts = rain_snow['rain_snow'].value_counts()
label_counts

rain_snow
눈비    150
없음     46
Name: count, dtype: int64

위 결과는 범주형 값의 빈도이므로, 히스토그램보다는 막대그래프(예: `sns.countplot`)로 표현하면 더 좋을 듯!

In [19]:
# Save the labelled frame as CSV without writing the index column.
rain_snow_csv = 'data/rain_snow.csv'
rain_snow.to_csv(rain_snow_csv, index=False)

# 공휴일 데이터

In [4]:
# Holiday dates as YYYYMMDD integers, in the same format as the `base_date`
# column of the weather data.
holiday_dates = [
    20210920, 20210921, 20210922, 20211003, 20211004, 20211009, 20211011, 20211225,
    20220101, 20220131, 20220201, 20220202, 20220301, 20220309, 20220505, 20220508,
    20220601, 20220606, 20220815,
]

# Every holiday row gets week = 0.
# NOTE(review): the meaning of the 0 flag is not shown in this notebook -- confirm with the consumer.
holiday = pd.DataFrame({'base_date': holiday_dates}).assign(week=0)

In [5]:
# Display the holiday frame built in the previous cell.
holiday

Unnamed: 0,base_date,week
0,20210920,0
1,20210921,0
2,20210922,0
3,20211003,0
4,20211004,0
5,20211009,0
6,20211011,0
7,20211225,0
8,20220101,0
9,20220131,0


In [6]:
# Save the holiday frame as CSV without writing the index column.
holiday_csv = 'data/holiday.csv'
holiday.to_csv(holiday_csv, index=False)