In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import matplotlib.dates as mdates
import xgboost
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import *

In [2]:
from matplotlib import font_manager, rc

# Font used for plot text. The path below is Windows-specific, so the font is
# only switched when the file is actually present; on any other machine
# matplotlib keeps its default font instead of raising on the missing file.
font_path = "C:/Windows/Fonts/NGULIM.TTF"
if os.path.exists(font_path):
    # Resolve the family name stored inside the font file and make it the default.
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    font = None
    print(f"Font file not found: {font_path} - keeping matplotlib's default font.")

In [3]:
# Load the training table, then report per-column missing counts and the shape.
train = pd.read_csv('data/train_0704f.csv')
for train_summary in (train.isnull().sum(), train.shape):
    print(train_summary)

Unnamed: 0      0
stn             0
year            0
mmddhh          0
ta              0
td              0
hm              0
ws              0
rn             82
re             82
ww              0
ts              0
si              0
ss              0
sn              0
month           0
day             0
hour            0
spring          0
summer          0
autumn          0
winter          0
season          0
year_num        0
ymd             0
ymdh            0
day_split       0
re_interval     0
inflection      0
dtype: int64
(437373, 29)


In [4]:
# Load the test table, then report per-column missing counts and the shape.
test = pd.read_csv('data/test_0704f.csv')
for test_summary in (test.isnull().sum(), test.shape):
    print(test_summary)

Unnamed: 0     0
stn            0
year           0
mmddhh         0
ta             0
td             0
hm             0
ws             0
rn             4
re             4
ww             0
si             0
ss             0
sn             0
year_num       0
month          0
day            0
hour           0
ymd            0
ymdh           0
day_split      0
re_interval    0
inflection     0
dtype: int64
(26280, 23)


##### Imputation
1. Label-encode the non-numeric columns with `LabelEncoder`
2. Fill the remaining missing values with `KNNImputer`

In [5]:
# Column inventory of the training frame.
train.columns

Index(['Unnamed: 0', 'stn', 'year', 'mmddhh', 'ta', 'td', 'hm', 'ws', 'rn',
       're', 'ww', 'ts', 'si', 'ss', 'sn', 'month', 'day', 'hour', 'spring',
       'summer', 'autumn', 'winter', 'season', 'year_num', 'ymd', 'ymdh',
       'day_split', 're_interval', 'inflection'],
      dtype='object')

In [6]:
# Column inventory of the test frame.
test.columns

Index(['Unnamed: 0', 'stn', 'year', 'mmddhh', 'ta', 'td', 'hm', 'ws', 'rn',
       're', 'ww', 'si', 'ss', 'sn', 'year_num', 'month', 'day', 'hour', 'ymd',
       'ymdh', 'day_split', 're_interval', 'inflection'],
      dtype='object')

In [7]:
# Columns carried into modelling. The test subset is the train subset minus
# the target `ts`, in the same order.
train_cols = [
    'stn', 'mmddhh', 'ta', 'hm', 'ws', 'rn', 're', 'ts', 'si', 'ss', 'sn',
    'month', 'day', 'hour', 'year_num', 'day_split', 'inflection',
]
test_cols = [name for name in train_cols if name != 'ts']

# Independent copies so later edits never touch the raw frames.
train_copy = train.loc[:, train_cols].copy()
test_copy = test.loc[:, test_cols].copy()

for frame_dtypes in (train_copy.dtypes, test_copy.dtypes):
    print(frame_dtypes)

stn             int64
mmddhh          int64
ta            float64
hm            float64
ws            float64
rn            float64
re            float64
ts            float64
si            float64
ss            float64
sn            float64
month           int64
day             int64
hour            int64
year_num        int64
day_split      object
inflection     object
dtype: object
stn            object
mmddhh          int64
ta            float64
hm            float64
ws            float64
rn            float64
re            float64
si            float64
ss            float64
sn            float64
month           int64
day             int64
hour            int64
year_num        int64
day_split      object
inflection     object
dtype: object


In [8]:
# Recode the test station letters to integer ids; any other value is passed
# through unchanged.
# NOTE(review): presumably 'a'/'b'/'c' correspond to train stations 1/2/3 - confirm.
stn_codes = {'a': 1, 'b': 2, 'c': 3}
test_copy['stn'] = test_copy['stn'].map(lambda label: stn_codes.get(label, label))

In [9]:
# Integer-encode every column of train_copy that is neither float nor int,
# applying the train-fitted mapping to the same column of test_copy.
non_numeric_cols = train_copy.select_dtypes(exclude=['float', 'int']).columns
print(non_numeric_cols)

# One fitted encoder per column, so every label <-> code mapping stays
# available afterwards (e.g. encoders['day_split'].inverse_transform(...)).
# Refitting a single shared encoder would keep only the last column's mapping.
encoders = {}
le = LabelEncoder()  # stays bound even when there is nothing to encode
for col in non_numeric_cols:
    le = LabelEncoder()
    train_copy[col] = le.fit_transform(train_copy[col])
    # Reuse the mapping learned on train; labels unseen in train raise ValueError.
    test_copy[col] = le.transform(test_copy[col])
    encoders[col] = le

print(train_copy.head(3))
print(test_copy.head(3))

Index(['day_split', 'inflection'], dtype='object')
   stn  mmddhh    ta    hm   ws   rn   re   ts   si   ss   sn  month  day  \
0    1   20100  -9.9  93.9  0.6  0.0  0.0 -1.3  0.0  0.0  0.0      2    1   
1    1   20101 -10.8  93.8  0.6  0.0  0.0 -1.5  0.0  0.0  0.0      2    1   
2    1   20102 -11.4  94.6  0.7  0.0  0.0 -1.7  0.0  0.0  0.0      2    1   

   hour  year_num  day_split  inflection  
0     0      2016          2           1  
1     1      2016          2           1  
2     2      2016          2           1  
   stn  mmddhh   ta    hm   ws   rn   re   si   ss   sn  month  day  hour  \
0    1   20100  0.6  82.5  2.7  0.0  0.0  0.0  0.0  3.1      2    1     0   
1    1   20101  0.0  68.3  3.2  0.0  0.0  0.0  0.0  3.1      2    1     1   
2    1   20102 -0.3  63.7  2.7  0.0  0.0  0.0  0.0  3.1      2    1     2   

   year_num  day_split  inflection  
0      2021          2           1  
1      2021          2           1  
2      2021          2           1  


In [10]:
# Shift the encoded day_split codes up by one in both frames.
# NOTE: not idempotent - every re-run of this cell adds another 1.
for split_frame in (train_copy, test_copy):
    split_frame['day_split'] += 1

In [11]:
# Separate the target column `ts` from the feature columns.
Y = train_copy['ts']
X = train_copy.drop(columns='ts')

In [12]:
# Show the feature columns of train and test one after the other for comparison.
for feature_columns in (X.columns, test_copy.columns):
    print(feature_columns)

Index(['stn', 'mmddhh', 'ta', 'hm', 'ws', 'rn', 're', 'si', 'ss', 'sn',
       'month', 'day', 'hour', 'year_num', 'day_split', 'inflection'],
      dtype='object')
Index(['stn', 'mmddhh', 'ta', 'hm', 'ws', 'rn', 're', 'si', 'ss', 'sn',
       'month', 'day', 'hour', 'year_num', 'day_split', 'inflection'],
      dtype='object')


In [13]:
# Fill missing feature values with a distance-weighted 5-nearest-neighbour
# imputer learned on the training features only, then apply it to both frames.
# NOTE(review): X is still on its raw scales at this point, so large-magnitude
# columns (e.g. mmddhh) likely dominate the neighbour distances - consider
# scaling before imputing.
imp = KNNImputer(n_neighbors=5, weights='distance')
imp.fit(X)

imp_train = imp.transform(X)
imp_test = imp.transform(test_copy)

# The imputer returns bare arrays; restore the column labels.
train_imp = pd.DataFrame(imp_train, columns=X.columns)
test_imp = pd.DataFrame(imp_test, columns=test_copy.columns)

for imputed_frame in (train_imp, test_imp):
    print(imputed_frame.isnull().sum())

stn           0
mmddhh        0
ta            0
hm            0
ws            0
rn            0
re            0
si            0
ss            0
sn            0
month         0
day           0
hour          0
year_num      0
day_split     0
inflection    0
dtype: int64
stn           0
mmddhh        0
ta            0
hm            0
ws            0
rn            0
re            0
si            0
ss            0
sn            0
month         0
day           0
hour          0
year_num      0
day_split     0
inflection    0
dtype: int64


##### 스케일링

In [14]:
# Standard scaling: statistics are learned from the imputed training features
# and then applied to both train and test.
from sklearn.preprocessing import StandardScaler

st = StandardScaler().fit(train_imp)

scaled_train = st.transform(train_imp)
scaled_test = st.transform(test_imp)

# Rebuild labelled frames. `inflection` is replaced by the original values from
# the raw frames, and the target `ts` is appended to the training frame.
s_train = pd.DataFrame(scaled_train, columns=train_imp.columns).assign(
    inflection=train['inflection'],
    ts=train['ts'],
)
s_test = pd.DataFrame(scaled_test, columns=test_imp.columns).assign(
    inflection=test['inflection'],
)

print(s_train.head())
print(s_test.head())

        stn    mmddhh        ta        hm        ws        rn        re  \
0 -1.566476 -1.354646 -2.330543  1.306805 -0.987598 -0.120021 -0.240619   
1 -1.566476 -1.354617 -2.418987  1.302180 -0.987598 -0.120021 -0.240619   
2 -1.566476 -1.354588 -2.477949  1.339174 -0.926544 -0.120021 -0.240619   
3 -1.566476 -1.354559 -2.497603  1.283683 -0.987598 -0.120021 -0.240619   
4 -1.566476 -1.354530 -2.517258  1.265186 -0.987598 -0.120021 -0.240619   

        si       ss        sn     month       day      hour  year_num  \
0 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.660738 -1.446405   
1 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.516309 -1.446405   
2 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.371879 -1.446405   
3 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.227449 -1.446405   
4 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.083019 -1.446405   

   day_split inflection   ts  
0  -0.030786         up -1.3  
1  -0.030786         up -1.5  
2  -0.030786     

In [15]:
# Min-max scaling: feature ranges are learned from the imputed training
# features and then applied to both train and test.
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler().fit(train_imp)

scaled_train = mms.transform(train_imp)
scaled_test = mms.transform(test_imp)

# Rebuild labelled frames. `inflection` is replaced by the original values from
# the raw frames, and the target `ts` is appended to the training frame.
m_train = pd.DataFrame(scaled_train, columns=train_imp.columns).assign(
    inflection=train['inflection'],
    ts=train['ts'],
)
m_test = pd.DataFrame(scaled_test, columns=test_imp.columns).assign(
    inflection=test['inflection'],
)

print(m_train.head())
print(m_test.head())

   stn    mmddhh        ta        hm        ws   rn   re   si   ss   sn  \
0  0.0  0.088478  0.191803  0.937113  0.024390  0.0  0.0  0.0  0.0  0.0   
1  0.0  0.088486  0.177049  0.936082  0.024390  0.0  0.0  0.0  0.0  0.0   
2  0.0  0.088495  0.167213  0.944330  0.028455  0.0  0.0  0.0  0.0  0.0   
3  0.0  0.088504  0.163934  0.931959  0.024390  0.0  0.0  0.0  0.0  0.0   
4  0.0  0.088513  0.160656  0.927835  0.024390  0.0  0.0  0.0  0.0  0.0   

      month  day      hour  year_num  day_split inflection   ts  
0  0.090909  0.0  0.000000       0.0        0.5         up -1.3  
1  0.090909  0.0  0.043478       0.0        0.5         up -1.5  
2  0.090909  0.0  0.086957       0.0        0.5         up -1.7  
3  0.090909  0.0  0.130435       0.0        0.5         up -1.8  
4  0.090909  0.0  0.173913       0.0        0.5         up -2.0  
   stn    mmddhh        ta        hm        ws        rn        re   si   ss  \
0  0.0  0.088478  0.363934  0.819588  0.109756  0.000000  0.000000  0.0  

In [16]:
# Robust scaling: statistics are learned from the imputed training features
# and then applied to both train and test. This variant feeds the model below.
from sklearn.preprocessing import RobustScaler

rs = RobustScaler().fit(train_imp)

scaled_train = rs.transform(train_imp)
scaled_test = rs.transform(test_imp)

# Rebuild labelled frames. `inflection` is replaced by the original values from
# the raw frames, and the target `ts` is appended to the training frame.
r_train = pd.DataFrame(scaled_train, columns=train_imp.columns).assign(
    inflection=train['inflection'],
    ts=train['ts'],
)
r_test = pd.DataFrame(scaled_test, columns=test_imp.columns).assign(
    inflection=test['inflection'],
)

print(r_train.head())
print(r_test.head())

   stn    mmddhh        ta        hm    ws   rn   re        si   ss   sn  \
0 -1.0 -0.836396 -1.500000  0.789017 -0.60  0.0  0.0 -0.010101  0.0  0.0   
1 -1.0 -0.836379 -1.554878  0.786127 -0.60  0.0  0.0 -0.010101  0.0  0.0   
2 -1.0 -0.836363 -1.591463  0.809249 -0.55  0.0  0.0 -0.010101  0.0  0.0   
3 -1.0 -0.836346 -1.603659  0.774566 -0.60  0.0  0.0 -0.010101  0.0  0.0   
4 -1.0 -0.836329 -1.615854  0.763006 -0.60  0.0  0.0 -0.010101  0.0  0.0   

      month  day      hour  year_num  day_split inflection   ts  
0 -0.833333 -1.0 -0.846154      -1.0        0.0         up -1.3  
1 -0.833333 -1.0 -0.769231      -1.0        0.0         up -1.5  
2 -0.833333 -1.0 -0.692308      -1.0        0.0         up -1.7  
3 -0.833333 -1.0 -0.615385      -1.0        0.0         up -1.8  
4 -0.833333 -1.0 -0.538462      -1.0        0.0         up -2.0  
   stn    mmddhh        ta        hm    ws   rn   re        si   ss   sn  \
0 -1.0 -0.836396 -0.859756  0.459538  0.45  0.0  0.0 -0.010101  0.0  3.

##### pycaret

In [17]:
# Configure the PyCaret regression experiment on the robust-scaled frame
# (70 % train split, fixed session id) and rank the two candidates by RMSE.
# NOTE(review): the rows are hourly station records, so a random split may put
# neighbouring hours on both sides and give optimistic scores - confirm the
# split strategy.
tr1 = setup(data=r_train, target='ts', train_size=0.7, session_id=2023)
model1 = compare_models(include=['et', 'catboost'], sort='RMSE', n_select=2)
# Wider candidate list tried earlier:
# model1 = compare_models(n_select=6, sort='RMSE', include=['lr','ridge','et','xgboost','lightgbm','catboost'])
model1

Unnamed: 0,Description,Value
0,Session id,2023
1,Target,ts
2,Target type,Regression
3,Original data shape,"(437373, 17)"
4,Transformed data shape,"(437373, 17)"
5,Transformed train set shape,"(306161, 17)"
6,Transformed test set shape,"(131212, 17)"
7,Ordinal features,1
8,Numeric features,15
9,Categorical features,1


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.0213,2.5099,1.5842,0.9843,0.1777,0.2638,22.316
catboost,CatBoost Regressor,1.1375,2.8151,1.6778,0.9824,0.2103,0.3483,9.918


Processing:   0%|          | 0/14 [00:00<?, ?it/s]

[ExtraTreesRegressor(n_jobs=-1, random_state=2023),
 <catboost.core.CatBoostRegressor at 0x25ec75b1f70>]

In [None]:
# Disabled experiment: the same model comparison on the min-max-scaled frame (m_train).
# tr2 = setup(m_train, target = 'ts', train_size = 0.7, session_id=2023)
# model2 = compare_models(n_select=7, sort='RMSE', include=['lr','lasso','ridge','et','xgboost','lightgbm','catboost'])
# model2

In [None]:
# Disabled experiment: the same model comparison on the standard-scaled frame (s_train).
# tr3 = setup(s_train, target = 'ts', train_size = 0.7, session_id=2023)
# model3 = compare_models(n_select=7, sort='RMSE', include=['lr','lasso','ridge','et','xgboost','lightgbm','catboost'])
# model3

In [18]:
# Create the Extra Trees regressor ('et') within the active PyCaret experiment;
# the per-fold metrics are displayed below. The CatBoost and XGBoost
# alternatives are left disabled.
et = create_model('et')
# ct = create_model('catboost')
# xb = create_model('xgboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.0259,2.5656,1.6018,0.9839,0.1786,0.2673
1,1.0149,2.439,1.5617,0.9848,0.1773,0.267
2,1.0147,2.5161,1.5862,0.9842,0.1777,0.2762
3,1.0228,2.4997,1.581,0.9843,0.177,0.2553
4,1.0266,2.4667,1.5706,0.9844,0.1797,0.264
5,1.0256,2.5422,1.5944,0.984,0.178,0.2557
6,1.0227,2.5294,1.5904,0.9842,0.1766,0.2523
7,1.0223,2.5575,1.5992,0.9842,0.1759,0.2636
8,1.0152,2.4821,1.5755,0.9846,0.177,0.2672
9,1.0224,2.5007,1.5813,0.9844,0.179,0.2692


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
# Summary interpretation plot for the Extra Trees model. The result is not
# assigned, so nothing later in the notebook depends on it.
# NOTE(review): this cell's execution count is higher than that of the cells
# after it, i.e. it was run out of order - re-run top to bottom before sharing.
interpret_model(et, plot='summary')

In [19]:
# Score the robust-scaled test features with the Extra Trees model; the
# predictions are read from the 'prediction_label' column further down.
pred_et = predict_model(et, data = r_test)
# pred_ct = predict_model(ct, data = r_test)
# pred_xb = predict_model(xb, data = r_test)

In [20]:
# Sanity check: number of prediction rows (the test file has 26280 rows).
len(pred_et)

26280

In [23]:
# Distribution of the predicted values, as a quick plausibility check.
pred_et['prediction_label'].describe()

count    26280.000000
mean        15.122655
std         12.056468
min        -15.132000
25%          5.017750
50%         15.660000
75%         24.498250
max         50.352000
Name: prediction_label, dtype: float64

In [24]:
# Attach the predictions to the raw test frame as column `ts`.
# The assignment aligns on index labels, not on position.
# NOTE(review): presumably pred_et keeps r_test's default 0..n-1 index, which
# matches `test` - confirm, otherwise rows would silently become NaN.
test['ts'] = pred_et['prediction_label']

In [25]:
# Keep only the identifying columns plus the predicted `ts` and write the
# result to disk as cp949-encoded CSV without the index.
submission_cols = ['stn', 'year', 'mmddhh', 'ts']
final1 = test.loc[:, submission_cols]
final1.to_csv('final1_0705.csv', encoding='cp949', index=False)