In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import matplotlib.dates as mdates
import xgboost
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import *

In [3]:
from matplotlib import font_manager, rc
font_path = "C:/Windows/Fonts/NGULIM.TTF"
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

In [4]:
train = pd.read_csv('data/train_0704f.csv')
print(train.isnull().sum())
print(train.shape)

Unnamed: 0      0
stn             0
year            0
mmddhh          0
ta              0
td              0
hm              0
ws              0
rn             82
re             82
ww              0
ts              0
si              0
ss              0
sn              0
month           0
day             0
hour            0
spring          0
summer          0
autumn          0
winter          0
season          0
year_num        0
ymd             0
ymdh            0
day_split       0
re_interval     0
inflection      0
dtype: int64
(437373, 29)


In [5]:
test = pd.read_csv('data/test_0704f.csv')
print(test.isnull().sum())
print(test.shape)

Unnamed: 0     0
stn            0
year           0
mmddhh         0
ta             0
td             0
hm             0
ws             0
rn             4
re             4
ww             0
si             0
ss             0
sn             0
year_num       0
month          0
day            0
hour           0
ymd            0
ymdh           0
day_split      0
re_interval    0
inflection     0
dtype: int64
(26280, 23)


##### Imputation
1. labelencoder
2. knnimputer

In [6]:
train.columns

Index(['Unnamed: 0', 'stn', 'year', 'mmddhh', 'ta', 'td', 'hm', 'ws', 'rn',
       're', 'ww', 'ts', 'si', 'ss', 'sn', 'month', 'day', 'hour', 'spring',
       'summer', 'autumn', 'winter', 'season', 'year_num', 'ymd', 'ymdh',
       'day_split', 're_interval', 'inflection'],
      dtype='object')

In [7]:
test.columns

Index(['Unnamed: 0', 'stn', 'year', 'mmddhh', 'ta', 'td', 'hm', 'ws', 'rn',
       're', 'ww', 'si', 'ss', 'sn', 'year_num', 'month', 'day', 'hour', 'ymd',
       'ymdh', 'day_split', 're_interval', 'inflection'],
      dtype='object')

In [8]:
# Create 'ymd' column
train['ymd'] = (train['year_num'].astype(str) + '-' + train['month'].astype(str).str.zfill(2)
                     + '-' + train['day'].astype(str).str.zfill(2))
print(train.ymd)

# Create 'ymdh' column
train['ymdh'] = (train['year_num'].astype(str) + '-' + train['month'].astype(str).str.zfill(2)
                      + '-' + train['day'].astype(str).str.zfill(2)
                      + '-' + train['hour'].astype(str).str.zfill(2))
print(train.ymdh)

# Convert 'ymdh' column to datetime
train['ymdh'] = pd.to_datetime(train['ymdh'], errors='coerce')
print(train.ymdh)

0         2016-02-01
1         2016-02-01
2         2016-02-01
3         2016-02-01
4         2016-02-01
             ...    
437368    2021-01-31
437369    2021-01-31
437370    2021-01-31
437371    2021-01-31
437372    2021-01-31
Name: ymd, Length: 437373, dtype: object
0         2016-02-01-00
1         2016-02-01-01
2         2016-02-01-02
3         2016-02-01-03
4         2016-02-01-04
              ...      
437368    2021-01-31-19
437369    2021-01-31-20
437370    2021-01-31-21
437371    2021-01-31-22
437372    2021-01-31-23
Name: ymdh, Length: 437373, dtype: object
0        2016-02-01 00:00:00
1        2016-02-01 01:00:00
2        2016-02-01 02:00:00
3        2016-02-01 03:00:00
4        2016-02-01 04:00:00
                 ...        
437368   2021-01-31 19:00:00
437369   2021-01-31 20:00:00
437370   2021-01-31 21:00:00
437371   2021-01-31 22:00:00
437372   2021-01-31 23:00:00
Name: ymdh, Length: 437373, dtype: datetime64[ns]


In [9]:
daily_stats = train.groupby(train['ymdh'].dt.date)['ta'].agg(['mean', 'min', 'max'])
print(daily_stats)
daily_stats.info()

                mean   min   max
ymdh                            
2016-02-01 -1.207917 -12.0   7.8
2016-02-02 -0.721250 -14.2   9.8
2016-02-03  2.480000  -9.1  12.2
2016-02-04  3.807531  -7.4  14.8
2016-02-05  4.317917  -0.6  11.7
...              ...   ...   ...
2021-01-27  1.555000  -7.8  11.2
2021-01-28 -0.019583 -10.6   9.3
2021-01-29 -0.899167 -11.5   7.3
2021-01-30 -0.985417 -12.5   8.7
2021-01-31  0.165833 -12.5   9.6

[1825 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>
Index: 1825 entries, 2016-02-01 to 2021-01-31
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mean    1825 non-null   float64
 1   min     1825 non-null   float64
 2   max     1825 non-null   float64
dtypes: float64(3)
memory usage: 57.0+ KB


In [10]:
train['temp_split'] = np.nan

In [11]:
ds = daily_stats.copy()
ds['yy'] = ds.index
print('평균 기온 Q1 ~ Q3 결과')
a = ds[(ds['mean']>=5.910833)&(ds['mean']<=22.239167)]
a.loc[:, 'yy'] = a['yy'].astype(str)
print(len(a))
print(train[train['ymd'].isin(a['yy'])].shape)
print(train[train['ymd'].isin(a['yy'])].index[:5])

print('\t')
print('평균 기온 > Q3 결과')
b = ds[(ds['mean']>22.239167)]
b.loc[:, 'yy'] = b['yy'].astype(str)
print(len(b))
print(train[train['ymd'].isin(b['yy'])].shape)
print(train[train['ymd'].isin(b['yy'])].index[:5])

print('\t')
print('평균 기온 < Q1 결과')
d = ds[(ds['mean']<5.910833)]
d.loc[:, 'yy'] = d['yy'].astype(str)
print(len(d))
print(train[train['ymd'].isin(d['yy'])].shape)
print(train[train['ymd'].isin(d['yy'])].index[:5])

평균 기온 Q1 ~ Q3 결과
913
(218769, 30)
Int64Index([360, 361, 362, 363, 364], dtype='int64')
	
평균 기온 > Q3 결과
456
(109256, 30)
Int64Index([2806, 2807, 2808, 2809, 2810], dtype='int64')
	
평균 기온 < Q1 결과
456
(109348, 30)
Int64Index([0, 1, 2, 3, 4], dtype='int64')


In [12]:
middle = train[train['ymd'].isin(a['yy'])].copy()
print(middle.month.unique())

high = train[train['ymd'].isin(b['yy'])].copy()
print(high.month.unique())

low = train[train['ymd'].isin(d['yy'])].copy()
print(low.month.unique())

[ 2  3  4  5  6  8  9 10 11  7 12  1]
[ 5  6  7  8  9 10]
[ 2  3 11 12  1  4]


In [13]:
test['temp_split']= np.nan

In [14]:
middle_1 = test[test['mmddhh'].isin(middle['mmddhh'])].copy()
print(middle_1.month.unique())

high_1 = test[test['mmddhh'].isin(high['mmddhh'])].copy()
print(high_1.month.unique())

low_1 = test[test['mmddhh'].isin(low['mmddhh'])].copy()
print(low_1.month.unique())

[ 2  3  4  5  6  7  8  9 10 11 12  1]
[ 5  6  7  8  9 10]
[ 2  3  4 11 12  1]


In [15]:
middle_1['temp_split'] = middle_1['temp_split'].fillna('middle')
low_1['temp_split'] = low_1['temp_split'].fillna('low')
high_1['temp_split'] = high_1['temp_split'].fillna('high')

In [16]:
middle.isnull().sum()

Unnamed: 0          0
stn                 0
year                0
mmddhh              0
ta                  0
td                  0
hm                  0
ws                  0
rn                 61
re                 61
ww                  0
ts                  0
si                  0
ss                  0
sn                  0
month               0
day                 0
hour                0
spring              0
summer              0
autumn              0
winter              0
season              0
year_num            0
ymd                 0
ymdh                0
day_split           0
re_interval         0
inflection          0
temp_split     218769
dtype: int64

In [17]:
middle['temp_split'] = middle['temp_split'].fillna('middle')
low['temp_split'] = low['temp_split'].fillna('low')
high['temp_split'] = high['temp_split'].fillna('high')

In [18]:
train_1 = pd.concat([middle,low,high], axis=0)
train_1.sort_index(inplace=True)

print(train_1.isnull().sum(), train_1['temp_split'].value_counts())

test_1 = pd.concat([middle_1,low_1,high_1], axis=0)
test_1.sort_index(inplace=True)

print(test_1.isnull().sum(), test_1['temp_split'].value_counts())

Unnamed: 0      0
stn             0
year            0
mmddhh          0
ta              0
td              0
hm              0
ws              0
rn             82
re             82
ww              0
ts              0
si              0
ss              0
sn              0
month           0
day             0
hour            0
spring          0
summer          0
autumn          0
winter          0
season          0
year_num        0
ymd             0
ymdh            0
day_split       0
re_interval     0
inflection      0
temp_split      0
dtype: int64 middle    218769
low       109348
high      109256
Name: temp_split, dtype: int64
Unnamed: 0     0
stn            0
year           0
mmddhh         0
ta             0
td             0
hm             0
ws             0
rn             7
re             7
ww             0
si             0
ss             0
sn             0
year_num       0
month          0
day            0
hour           0
ymd            0
ymdh           0
day_split      0
re_inter

In [24]:
train_copy = train[['stn', 'mmddhh', 'ta', 'hm', 'ws', 'rn',
       're', 'ts', 'si', 'ss', 'sn','month', 'day', 'hour','year_num',
       'day_split']].copy()
test_copy = test[['stn', 'mmddhh', 'ta', 'hm', 'ws',
       'rn', 're', 'si', 'ss', 'sn', 'month', 'day', 'hour','year_num',
       'day_split']].copy()

print(train_copy.dtypes)
print(test_copy.dtypes)

stn            int64
mmddhh         int64
ta           float64
hm           float64
ws           float64
rn           float64
re           float64
ts           float64
si           float64
ss           float64
sn           float64
month          int64
day            int64
hour           int64
year_num       int64
day_split     object
dtype: object
stn           object
mmddhh         int64
ta           float64
hm           float64
ws           float64
rn           float64
re           float64
si           float64
ss           float64
sn           float64
month          int64
day            int64
hour           int64
year_num       int64
day_split     object
dtype: object


In [25]:
test_copy['stn'] = test_copy.stn.apply(lambda x: 1 if x == 'a'
                                               else 2 if x== 'b'
                                               else 3 if x=='c'
                                            else x)

In [26]:
train_copy.day_split.unique()

array(['새벽', '아침', '낮', '저녁', '밤'], dtype=object)

In [27]:
test_copy.isnull().sum()

stn          0
mmddhh       0
ta           0
hm           0
ws           0
rn           4
re           4
si           0
ss           0
sn           0
month        0
day          0
hour         0
year_num     0
day_split    0
dtype: int64

In [28]:
non_numeric_cols = train_copy.select_dtypes(exclude=['float', 'int']).columns
print(non_numeric_cols)
le = LabelEncoder()
for col in non_numeric_cols:
    train_copy[col] = le.fit_transform(train_copy[col])
    test_copy[col] = le.transform(test_copy[col])
    
print(train_copy.head(3))
print(test_copy.head(3))

Index(['day_split'], dtype='object')
   stn  mmddhh    ta    hm   ws   rn   re   ts   si   ss   sn  month  day  \
0    1   20100  -9.9  93.9  0.6  0.0  0.0 -1.3  0.0  0.0  0.0      2    1   
1    1   20101 -10.8  93.8  0.6  0.0  0.0 -1.5  0.0  0.0  0.0      2    1   
2    1   20102 -11.4  94.6  0.7  0.0  0.0 -1.7  0.0  0.0  0.0      2    1   

   hour  year_num  day_split  
0     0      2016          2  
1     1      2016          2  
2     2      2016          2  
   stn  mmddhh   ta    hm   ws   rn   re   si   ss   sn  month  day  hour  \
0    1   20100  0.6  82.5  2.7  0.0  0.0  0.0  0.0  3.1      2    1     0   
1    1   20101  0.0  68.3  3.2  0.0  0.0  0.0  0.0  3.1      2    1     1   
2    1   20102 -0.3  63.7  2.7  0.0  0.0  0.0  0.0  3.1      2    1     2   

   year_num  day_split  
0      2021          2  
1      2021          2  
2      2021          2  


In [29]:
train_copy['day_split'] = train_copy['day_split'] + 1

test_copy['day_split'] = test_copy['day_split'] + 1 

In [30]:
X = train_copy.drop(['ts'], axis=1)
Y = train_copy['ts']

In [31]:
print(X.columns)
print(test_copy.columns)

Index(['stn', 'mmddhh', 'ta', 'hm', 'ws', 'rn', 're', 'si', 'ss', 'sn',
       'month', 'day', 'hour', 'year_num', 'day_split'],
      dtype='object')
Index(['stn', 'mmddhh', 'ta', 'hm', 'ws', 'rn', 're', 'si', 'ss', 'sn',
       'month', 'day', 'hour', 'year_num', 'day_split'],
      dtype='object')


In [32]:
imp=KNNImputer(n_neighbors=5, weights='distance')

imp_train = imp.fit_transform(X)
imp_test = imp.transform(test_copy)

train_imp=pd.DataFrame(imp_train, columns=X.columns)
test_imp=pd.DataFrame(imp_test, columns=test_copy.columns)

print(train_imp.isnull().sum())
print(test_imp.isnull().sum())

stn          0
mmddhh       0
ta           0
hm           0
ws           0
rn           0
re           0
si           0
ss           0
sn           0
month        0
day          0
hour         0
year_num     0
day_split    0
dtype: int64
stn          0
mmddhh       0
ta           0
hm           0
ws           0
rn           0
re           0
si           0
ss           0
sn           0
month        0
day          0
hour         0
year_num     0
day_split    0
dtype: int64


##### 스케일링

In [33]:
# standard
from sklearn.preprocessing import StandardScaler

st = StandardScaler()

scaled_train = st.fit_transform(train_imp)
scaled_test = st.transform(test_imp)

s_train=pd.DataFrame(scaled_train, columns=train_imp.columns)
s_test=pd.DataFrame(scaled_test, columns=test_imp.columns)

s_train['ts'] = train['ts']
print(s_train.head())
print(s_test.head())

        stn    mmddhh        ta        hm        ws        rn        re  \
0 -1.566476 -1.354646 -2.330543  1.306805 -0.987598 -0.120021 -0.240619   
1 -1.566476 -1.354617 -2.418987  1.302180 -0.987598 -0.120021 -0.240619   
2 -1.566476 -1.354588 -2.477949  1.339174 -0.926544 -0.120021 -0.240619   
3 -1.566476 -1.354559 -2.497603  1.283683 -0.987598 -0.120021 -0.240619   
4 -1.566476 -1.354530 -2.517258  1.265186 -0.987598 -0.120021 -0.240619   

        si       ss        sn     month       day      hour  year_num  \
0 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.660738 -1.446405   
1 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.516309 -1.446405   
2 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.371879 -1.446405   
3 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.227449 -1.446405   
4 -0.65999 -0.67377 -0.073284 -1.312481 -1.673144 -1.083019 -1.446405   

   day_split   ts  
0  -0.030786 -1.3  
1  -0.030786 -1.5  
2  -0.030786 -1.7  
3  -0.030786 -1.8  
4  -0.0307

In [34]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()

scaled_train = mms.fit_transform(train_imp)
scaled_test = mms.transform(test_imp)

m_train=pd.DataFrame(scaled_train, columns=train_imp.columns)
m_test=pd.DataFrame(scaled_test, columns=test_imp.columns)

m_train['ts'] = train['ts']
print(m_train.head())
print(m_test.head())


   stn    mmddhh        ta        hm        ws   rn   re   si   ss   sn  \
0  0.0  0.088478  0.191803  0.937113  0.024390  0.0  0.0  0.0  0.0  0.0   
1  0.0  0.088486  0.177049  0.936082  0.024390  0.0  0.0  0.0  0.0  0.0   
2  0.0  0.088495  0.167213  0.944330  0.028455  0.0  0.0  0.0  0.0  0.0   
3  0.0  0.088504  0.163934  0.931959  0.024390  0.0  0.0  0.0  0.0  0.0   
4  0.0  0.088513  0.160656  0.927835  0.024390  0.0  0.0  0.0  0.0  0.0   

      month  day      hour  year_num  day_split   ts  
0  0.090909  0.0  0.000000       0.0        0.5 -1.3  
1  0.090909  0.0  0.043478       0.0        0.5 -1.5  
2  0.090909  0.0  0.086957       0.0        0.5 -1.7  
3  0.090909  0.0  0.130435       0.0        0.5 -1.8  
4  0.090909  0.0  0.173913       0.0        0.5 -2.0  
   stn    mmddhh        ta        hm        ws        rn        re   si   ss  \
0  0.0  0.088478  0.363934  0.819588  0.109756  0.000000  0.000000  0.0  0.0   
1  0.0  0.088486  0.354098  0.673196  0.130081  0.000000  0

In [35]:
from sklearn.preprocessing import RobustScaler

rs = RobustScaler()

scaled_train = rs.fit_transform(train_imp)
scaled_test = rs.transform(test_imp)

r_train=pd.DataFrame(scaled_train, columns=train_imp.columns)
r_test=pd.DataFrame(scaled_test, columns=test_imp.columns)

r_train['ts'] = train['ts']
print(r_train.head())
print(r_test.head())

   stn    mmddhh        ta        hm    ws   rn   re        si   ss   sn  \
0 -1.0 -0.836396 -1.500000  0.789017 -0.60  0.0  0.0 -0.010101  0.0  0.0   
1 -1.0 -0.836379 -1.554878  0.786127 -0.60  0.0  0.0 -0.010101  0.0  0.0   
2 -1.0 -0.836363 -1.591463  0.809249 -0.55  0.0  0.0 -0.010101  0.0  0.0   
3 -1.0 -0.836346 -1.603659  0.774566 -0.60  0.0  0.0 -0.010101  0.0  0.0   
4 -1.0 -0.836329 -1.615854  0.763006 -0.60  0.0  0.0 -0.010101  0.0  0.0   

      month  day      hour  year_num  day_split   ts  
0 -0.833333 -1.0 -0.846154      -1.0        0.0 -1.3  
1 -0.833333 -1.0 -0.769231      -1.0        0.0 -1.5  
2 -0.833333 -1.0 -0.692308      -1.0        0.0 -1.7  
3 -0.833333 -1.0 -0.615385      -1.0        0.0 -1.8  
4 -0.833333 -1.0 -0.538462      -1.0        0.0 -2.0  
   stn    mmddhh        ta        hm    ws   rn   re        si   ss   sn  \
0 -1.0 -0.836396 -0.859756  0.459538  0.45  0.0  0.0 -0.010101  0.0  3.1   
1 -1.0 -0.836379 -0.896341  0.049133  0.70  0.0  0.0 -0.01010

In [None]:
r_train['temp_split'] = train_1.temp_split
r_test['temp_split'] = test_1.temp_split

##### pycaret

In [39]:
tr1 = setup(r_train, target = 'ts', train_size = 0.7, session_id=2023)
# model1 = compare_models(n_select=4, sort='MAE', include=['lr','et','xgboost','catboost'])
# model1

Unnamed: 0,Description,Value
0,Session id,2023
1,Target,ts
2,Target type,Regression
3,Original data shape,"(437373, 17)"
4,Transformed data shape,"(437373, 19)"
5,Transformed train set shape,"(306161, 19)"
6,Transformed test set shape,"(131212, 19)"
7,Numeric features,15
8,Categorical features,1
9,Preprocess,True


In [None]:
# tr2 = setup(m_train, target = 'ts', train_size = 0.7, session_id=2023)
# model2 = compare_models(n_select=7, sort='RMSE', include=['lr','lasso','ridge','et','xgboost','lightgbm','catboost'])
# model2

In [None]:
# tr3 = setup(s_train, target = 'ts', train_size = 0.7, session_id=2023)
# model3 = compare_models(n_select=7, sort='RMSE', include=['lr','lasso','ridge','et','xgboost','lightgbm','catboost'])
# model3

In [40]:
et = create_model('et')
# bct = create_model('catboost')
# xb = create_model('xgboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.0555,2.6834,1.6381,0.9832,0.1848,0.2821
1,1.0364,2.5355,1.5923,0.9842,0.1828,0.2761
2,1.0386,2.6067,1.6145,0.9837,0.1831,0.2899
3,1.0471,2.6019,1.613,0.9837,0.1825,0.2671
4,1.0495,2.5529,1.5978,0.9839,0.1861,0.2771
5,1.0451,2.6185,1.6182,0.9836,0.1825,0.2663
6,1.0428,2.6028,1.6133,0.9837,0.1824,0.2653
7,1.0467,2.6478,1.6272,0.9836,0.1819,0.2753
8,1.0457,2.6076,1.6148,0.9839,0.1837,0.2794
9,1.0473,2.5996,1.6123,0.9838,0.1835,0.279


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [46]:
test_1

Unnamed: 0.1,Unnamed: 0,stn,year,mmddhh,ta,td,hm,ws,rn,re,ww,si,ss,sn,year_num,month,day,hour,ymd,ymdh,day_split,re_interval,inflection,temp_split
0,1,a,F,20100,0.6,-2.0,82.5,2.7,0.0,0.0,G,0.0,0.0,3.1,2021,2,1,0,2021-02-01,2021-02-01 00:00:00,새벽,0,up,middle
0,1,a,F,20100,0.6,-2.0,82.5,2.7,0.0,0.0,G,0.0,0.0,3.1,2021,2,1,0,2021-02-01,2021-02-01 00:00:00,새벽,0,up,low
1,2,a,F,20101,0.0,-5.2,68.3,3.2,0.0,0.0,R,0.0,0.0,3.1,2021,2,1,1,2021-02-01,2021-02-01 01:00:00,새벽,0,up,low
1,2,a,F,20101,0.0,-5.2,68.3,3.2,0.0,0.0,R,0.0,0.0,3.1,2021,2,1,1,2021-02-01,2021-02-01 01:00:00,새벽,0,up,middle
2,3,a,F,20102,-0.3,-6.4,63.7,2.7,0.0,0.0,C,0.0,0.0,3.1,2021,2,1,2,2021-02-01,2021-02-01 02:00:00,새벽,0,up,middle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26275,26276,c,G,13119,7.1,-1.9,53.0,5.7,0.0,0.0,C,0.0,0.0,0.0,2022,1,31,19,2022-01-31,2022-01-31 19:00:00,저녁,0,down,low
26276,26277,c,G,13120,6.7,-0.5,60.1,4.7,0.0,0.0,C,0.0,0.0,0.0,2022,1,31,20,2022-01-31,2022-01-31 20:00:00,저녁,0,down,low
26277,26278,c,G,13121,6.2,-0.1,63.9,3.7,0.0,0.0,C,0.0,0.0,0.0,2022,1,31,21,2022-01-31,2022-01-31 21:00:00,밤,0,down,low
26278,26279,c,G,13122,6.5,0.8,67.1,4.8,0.0,0.0,C,0.0,0.0,0.0,2022,1,31,22,2022-01-31,2022-01-31 22:00:00,밤,0,down,low


In [47]:
r_test['split'] = test_1['temp_split']

ValueError: cannot reindex on an axis with duplicate labels

In [44]:
pred_et = predict_model(et, data = r_test)
# pred_ct = predict_model(ct, data = r_test)
# pred_xb = predict_model(xb, data = r_test)

KeyError: "['temp_split'] not in index"

In [None]:
pred_et['prediction_label'].describe()

In [None]:
test['ts'] = pred_et['prediction_label']

In [None]:
final1 = test[['stn','year','mmddhh','ts']]
final1.to_csv('final2_0705.csv',index=False, encoding='cp949')