In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
from dateutil.parser import parse

In [27]:
train = pd.read_csv('surface_cp_train.csv', encoding='cp949')
train.shape

(438240, 15)

In [3]:
train[train['surface_tp_train.rn']==-90]['surface_tp_train.rn']

Series([], Name: surface_tp_train.rn, dtype: float64)

In [11]:
train[train['surface_tp_train.re']<=-90]['surface_tp_train.re'].unique()

array([-99], dtype=int64)

In [None]:
train_copy = train.copy()

train_copy['month'] = train_copy['surface_tp_train.mmddhh']//10000

train_int = train_copy['surface_tp_train.mmddhh'].astype(str)

train_copy['day'] = np.nan
train_copy['hour'] = np.nan

# 슬라이싱
# 역슬라이싱으로 day부분 추출
train_copy['day'] = train_int.apply(lambda x: int(x[-4:-2])) 
# 역슬라이싱으로 hour추출
train_copy['hour'] = train_int.apply(lambda x: int(x[-2:]))

print(train_copy.month.value_counts().sort_index())
print('\n')
print(train_copy.day.value_counts().sort_index())
print('\n')
print(train_copy.hour.value_counts().sort_index())


In [14]:
train_copy.replace(-99.9, pd.NA, inplace=True)
print(train.info())
train_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438240 entries, 0 to 438239
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Unnamed: 0               438240 non-null  int64  
 1   surface_tp_train.stn     438240 non-null  int64  
 2   surface_tp_train.year    438240 non-null  object 
 3   surface_tp_train.mmddhh  438240 non-null  int64  
 4   surface_tp_train.ta      438240 non-null  float64
 5   surface_tp_train.td      438240 non-null  float64
 6   surface_tp_train.hm      438240 non-null  float64
 7   surface_tp_train.ws      438240 non-null  float64
 8   surface_tp_train.rn      438240 non-null  float64
 9   surface_tp_train.re      438240 non-null  int64  
 10  surface_tp_train.ww      438240 non-null  object 
 11  surface_tp_train.ts      438240 non-null  float64
 12  surface_tp_train.si      438240 non-null  float64
 13  surface_tp_train.ss      438240 non-null  float64
 14  surf

In [15]:
train_copy['surface_tp_train.mmddhh'] = pd.to_datetime(train_copy['surface_tp_train.mmddhh'].astype(str).str[-8:], format="%m%d%H", errors='coerce')

In [16]:
train_copy['surface_tp_train.mmddhh']

0        1900-02-01 00:00:00
1        1900-02-01 01:00:00
2        1900-02-01 02:00:00
3        1900-02-01 03:00:00
4        1900-02-01 04:00:00
                 ...        
438235   1900-01-31 19:00:00
438236   1900-01-31 20:00:00
438237   1900-01-31 21:00:00
438238   1900-01-31 22:00:00
438239   1900-01-31 23:00:00
Name: surface_tp_train.mmddhh, Length: 438240, dtype: datetime64[ns]

In [17]:
train_copy['surface_tp_train.mmddhh'].unique()
year_a = train_copy[train_copy['surface_tp_train.year']=='A']


year_a.set_index('surface_tp_train.mmddhh',inplace=True)

In [18]:
train_copy['surface_tp_train.rn']

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
438235    0.0
438236    0.0
438237    0.0
438238    0.0
438239    0.0
Name: surface_tp_train.rn, Length: 438240, dtype: object

In [19]:
test = pd.read_csv('cp_test.csv', encoding='cp949')
test.head()

Unnamed: 0.1,Unnamed: 0,surface_tp_test.stn,surface_tp_test.year,surface_tp_test.mmddhh,surface_tp_test.ta,surface_tp_test.td,surface_tp_test.hm,surface_tp_test.ws,surface_tp_test.rn,surface_tp_test.re,surface_tp_test.ww,surface_tp_test.si,surface_tp_test.ss,surface_tp_test.sn
0,1,a,F,20100,0.6,-2.0,82.5,2.7,0.0,0,G,-99.9,-99.9,3.1
1,2,a,F,20101,0.0,-5.2,68.3,3.2,0.0,0,R,-99.9,-99.9,3.1
2,3,a,F,20102,-0.3,-6.4,63.7,2.7,0.0,0,C,-99.9,-99.9,3.1
3,4,a,F,20103,-1.0,-4.5,77.2,2.1,0.2,7,R,-99.9,-99.9,4.1
4,5,a,F,20104,-1.4,-3.1,88.3,2.9,0.6,3,R,-99.9,-99.9,4.7


In [20]:
test['mmdd'] = test['surface_tp_test.mmddhh']//100
test.mmdd

0        201
1        201
2        201
3        201
4        201
        ... 
26275    131
26276    131
26277    131
26278    131
26279    131
Name: mmdd, Length: 26280, dtype: int64

In [21]:
test[test['mmdd']==229]

Unnamed: 0.1,Unnamed: 0,surface_tp_test.stn,surface_tp_test.year,surface_tp_test.mmddhh,surface_tp_test.ta,surface_tp_test.td,surface_tp_test.hm,surface_tp_test.ws,surface_tp_test.rn,surface_tp_test.re,surface_tp_test.ww,surface_tp_test.si,surface_tp_test.ss,surface_tp_test.sn,mmdd
