In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## 가중치 평균

In [4]:
train = pd.read_csv('rainfall_train.csv')
test = pd.read_csv('rainfall_test.csv')

In [8]:
train = train.drop('Unnamed: 0', axis=1)
train.columns = ['fc_year','fc_month','fc_day','fc_hour','stn4contest','dh','ef_year','ef_month','ef_day','ef_hour',
                 'v01','v02','v03','v04','v05','v06','v07','v08','v09','vv','class_interval']

test = test.drop('Unnamed: 0', axis=1)
test.columns = ['fc_year','fc_month','fc_day','fc_hour','stn4contest','dh','ef_year','ef_month','ef_day','ef_hour',
                 'v01','v02','v03','v04','v05','v06','v07','v08','v09','class_interval']

In [9]:
train_drop = train[train['vv'] != -999]

In [10]:
df = train_drop.iloc[:,:19]

In [11]:
df['ef_year_temp'] = df['ef_year'].replace({'A': '2020', 'B': '2021', 'C': '2022'})
df['ef_datetime'] = pd.to_datetime(df[['ef_year_temp', 'ef_month', 'ef_day', 'ef_hour']].astype(str).agg('-'.join, axis=1), 
                                   format='%Y-%m-%d-%H')
df['weight'] = 1 / df['dh']

for var in ['v01', 'v02', 'v03','v04','v05','v06','v07','v08','v09']:
    df[f'{var}_weighted'] = df[var] * df['weight']

weighted_df = df.groupby(['stn4contest','ef_datetime']).apply(
    lambda x: pd.Series({
        'v01': np.average(x['v01'], weights=x['weight']),
        'v02': np.average(x['v02'], weights=x['weight']),
        'v03': np.average(x['v03'], weights=x['weight']),
        'v04': np.average(x['v04'], weights=x['weight']),
        'v05': np.average(x['v05'], weights=x['weight']),
        'v06': np.average(x['v06'], weights=x['weight']),
        'v07': np.average(x['v07'], weights=x['weight']),
        'v08': np.average(x['v08'], weights=x['weight']),
        'v09': np.average(x['v09'], weights=x['weight'])
    })
).reset_index()

In [12]:
train_y = train_drop.groupby(['stn4contest','ef_year','ef_month','ef_day','ef_hour'])[['vv','class_interval']].mean().reset_index(drop=True)

In [13]:
weighted_df['vv'] = train_y['vv']
weighted_df['class_interval'] = train_y['class_interval']
weighted_df = weighted_df.astype({'class_interval':'int'})

In [14]:
weighted_df['ef_datetime'] = pd.to_datetime(weighted_df.ef_datetime)

In [15]:
weighted_df['year'] = weighted_df['ef_datetime'].dt.year
weighted_df['month'] = weighted_df['ef_datetime'].dt.month
weighted_df['day'] = weighted_df['ef_datetime'].dt.day
weighted_df['hour'] = weighted_df['ef_datetime'].dt.hour

In [16]:
weighted_df = weighted_df.drop(columns=['ef_datetime'])

In [17]:
## 가중치평균한 데이터 저장
weighted_df.to_csv('data/train_stn+ef.csv', index=False, encoding='utf-8')

OSError: Cannot save file into a non-existent directory: 'data'

## 라벨인코딩, 원핫인코딩

In [18]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
stn_label = encoder.fit_transform(weighted_df[['stn4contest']])
weighted_df['stn4contest'] = stn_label

  y = column_or_1d(y, warn=True)


In [19]:
encoder.classes_

array(['STN001', 'STN002', 'STN003', 'STN004', 'STN005', 'STN006',
       'STN007', 'STN008', 'STN009', 'STN010', 'STN011', 'STN012',
       'STN013', 'STN014', 'STN015', 'STN016', 'STN017', 'STN018',
       'STN019', 'STN020'], dtype=object)

In [20]:
weighted_df

Unnamed: 0,stn4contest,v01,v02,v03,v04,v05,v06,v07,v08,v09,vv,class_interval,year,month,day,hour
0,0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,12
1,0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,15
2,0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,18
3,0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,21
4,0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77518,19,10.948718,7.974359,3.948718,2.0,0.487179,0.0,0.0,0.0,0.0,0.5,3,2022,10,10,9
77519,19,13.000000,9.000000,5.000000,3.0,1.000000,0.0,0.0,0.0,0.0,0.0,0,2022,10,10,12
77520,19,13.000000,9.000000,5.000000,3.0,1.000000,0.0,0.0,0.0,0.0,1.9,4,2022,10,10,15
77521,19,8.000000,6.000000,4.000000,2.0,2.000000,0.0,0.0,0.0,0.0,1.9,4,2022,10,10,18


In [39]:
weighted_df = weighted_df.drop(columns=['ef_datetime'])

TypeError: drop() got an unexpected keyword argument 'dtype'

In [25]:
## label encoding
weighted_df.to_csv('data/train_stn+ef_label.csv', index=False, encoding='utf-8')

In [37]:
weighted_df = pd.read_csv('data/train_stn+ef.csv')

In [38]:
stn_onehot = pd.get_dummies(weighted_df['stn4contest'], dtype=int)

In [41]:
weighted_df = pd.concat([weighted_df.drop(columns=['stn4contest']), stn_onehot], axis=1)

In [51]:
weighted_df.head()

Unnamed: 0,v01,v02,v03,v04,v05,v06,v07,v08,v09,vv,...,STN011,STN012,STN013,STN014,STN015,STN016,STN017,STN018,STN019,STN020
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
## onehot encoding
weighted_df.to_csv('data/train_stn+ef_onehot.csv', index=False, encoding='utf-8')

## 결측치 보간

### 선형보간

In [44]:
nan = train[train.vv == -999]

In [45]:
train_replace = train.replace(-999, np.nan)
train_replace.isnull().sum()

fc_year              0
fc_month             0
fc_day               0
fc_hour              0
stn4contest          0
dh                   0
ef_year              0
ef_month             0
ef_day               0
ef_hour              0
v01                  0
v02                  0
v03                  0
v04                  0
v05                  0
v06                  0
v07                  0
v08                  0
v09                  0
vv                8490
class_interval    8490
dtype: int64

In [46]:
train_replace['vv'].interpolate(method='linear', inplace=True)
train_replace.isnull().sum()

fc_year              0
fc_month             0
fc_day               0
fc_hour              0
stn4contest          0
dh                   0
ef_year              0
ef_month             0
ef_day               0
ef_hour              0
v01                  0
v02                  0
v03                  0
v04                  0
v05                  0
v06                  0
v07                  0
v08                  0
v09                  0
vv                   0
class_interval    8490
dtype: int64

In [47]:
def classify_class(value):
    if value < 0.1:
        return 0
    elif 0.1 <= value < 0.2:
        return 1
    elif 0.2 <= value < 0.5:
        return 2
    elif 0.5 <= value < 1.0:
        return 3
    elif 1.0 <= value < 2.0:
        return 4
    elif 2.0 <= value < 4.9:
        return 5
    elif 5.0 <= value < 10.0:
        return 6
    elif 10.0 <= value < 20.0:
        return 7
    elif 20.0 <= value < 30.0:
        return 8
    elif 30.0 <= value:
        return 9

In [48]:
class_interpolate = train_replace.iloc[nan.index]['vv'].apply(classify_class)
for idx, v in zip(nan.index, class_interpolate):
    train_replace.loc[idx, 'class_interval'] = v

In [50]:
train_replace.head()

Unnamed: 0,fc_year,fc_month,fc_day,fc_hour,stn4contest,dh,ef_year,ef_month,ef_day,ef_hour,...,v02,v03,v04,v05,v06,v07,v08,v09,vv,class_interval
0,A,5,1,9,STN001,3,A,5,1,12,...,0,0,0,0,0,0,0,0,0.0,0.0
1,A,5,1,9,STN001,6,A,5,1,15,...,0,0,0,0,0,0,0,0,0.0,0.0
2,A,5,1,9,STN001,9,A,5,1,18,...,0,0,0,0,0,0,0,0,0.0,0.0
3,A,5,1,9,STN001,12,A,5,1,21,...,0,0,0,0,0,0,0,0,0.0,0.0
4,A,5,1,9,STN001,15,A,5,2,0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [53]:
len(train_replace) == len(train)

True

In [9]:
train_replace.to_csv('data/train_linear_interpolate.csv', index=False, encoding='utf-8')

### 선형보간 + 가중치평균

In [10]:
df = train_replace.iloc[:,:19]

In [11]:
df['ef_year_temp'] = df['ef_year'].replace({'A': '2020', 'B': '2021', 'C': '2022'})
df['ef_datetime'] = pd.to_datetime(df[['ef_year_temp', 'ef_month', 'ef_day', 'ef_hour']].astype(str).agg('-'.join, axis=1), 
                                   format='%Y-%m-%d-%H')
df['weight'] = 1 / df['dh']

for var in ['v01', 'v02', 'v03','v04','v05','v06','v07','v08','v09']:
    df[f'{var}_weighted'] = df[var] * df['weight']

weighted_df = df.groupby(['stn4contest','ef_datetime']).apply(
    lambda x: pd.Series({
        'v01': np.average(x['v01'], weights=x['weight']),
        'v02': np.average(x['v02'], weights=x['weight']),
        'v03': np.average(x['v03'], weights=x['weight']),
        'v04': np.average(x['v04'], weights=x['weight']),
        'v05': np.average(x['v05'], weights=x['weight']),
        'v06': np.average(x['v06'], weights=x['weight']),
        'v07': np.average(x['v07'], weights=x['weight']),
        'v08': np.average(x['v08'], weights=x['weight']),
        'v09': np.average(x['v09'], weights=x['weight'])
    })
).reset_index()

In [14]:
train_y = train_replace.groupby(['stn4contest','ef_year','ef_month','ef_day','ef_hour'])[['vv','class_interval']].mean().reset_index(drop=True)

In [15]:
weighted_df['vv'] = train_y['vv']
weighted_df['class_interval'] = train_y['class_interval']
weighted_df = weighted_df.astype({'class_interval':'int'})

In [16]:
weighted_df['year'] = weighted_df['ef_datetime'].dt.year
weighted_df['month'] = weighted_df['ef_datetime'].dt.month
weighted_df['day'] = weighted_df['ef_datetime'].dt.day
weighted_df['hour'] = weighted_df['ef_datetime'].dt.hour

In [17]:
weighted_df

Unnamed: 0,stn4contest,ef_datetime,v01,v02,v03,v04,v05,v06,v07,v08,v09,vv,class_interval,year,month,day,hour
0,STN001,2020-05-01 12:00:00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,12
1,STN001,2020-05-01 15:00:00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,15
2,STN001,2020-05-01 18:00:00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,18
3,STN001,2020-05-01 21:00:00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,21
4,STN001,2020-05-02 00:00:00,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0,2020,5,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77995,STN020,2022-10-10 09:00:00,10.948718,7.974359,3.948718,2.0,0.487179,0.0,0.0,0.0,0.0,0.5,3,2022,10,10,9
77996,STN020,2022-10-10 12:00:00,13.000000,9.000000,5.000000,3.0,1.000000,0.0,0.0,0.0,0.0,0.0,0,2022,10,10,12
77997,STN020,2022-10-10 15:00:00,13.000000,9.000000,5.000000,3.0,1.000000,0.0,0.0,0.0,0.0,1.9,4,2022,10,10,15
77998,STN020,2022-10-10 18:00:00,8.000000,6.000000,4.000000,2.0,2.000000,0.0,0.0,0.0,0.0,1.9,4,2022,10,10,18


In [18]:
weighted_df.to_csv('data/train_linear_interpolate_stn+ef.csv', index=False, encoding='utf-8')