In [1]:
# basic lib
import FinanceDataReader as fdr
import pandas_datareader.data as web
from datetime import datetime
from tqdm.notebook import tqdm as tqdm

import time
import random
import pandas as pd 
import glob
pd.options.display.max_rows=100

import numpy as np 
import matplotlib.pyplot as plt

%matplotlib inline
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%autosave 360
from matplotlib.gridspec import GridSpec
import seaborn as sns 

sns.set_style(style="darkgrid")

import re
import platform    
import warnings
warnings.filterwarnings(action='ignore')

from matplotlib import font_manager, rc

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system.... sorry.....')
plt.rcParams['axes.unicode_minus'] = False
    
        
# seed 설정
np.random.seed(42)

#model 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

Autosaving every 360 seconds


In [2]:
# csv파일 한번에 가져오기
path = '/Users/tk/Documents/inflation' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=1)
df = frame.drop('DATE', axis=1)
df['monthly'] = pd.to_datetime(df['monthly'])
data = df.set_index('monthly')
data = data.rename(columns={'CPALTT01USM657N':'CPI', 'NPPTTL':'Employment', 'GOLDAMGBD228NLBM':'Gold',
             'INDPRO':'Production', 'UNRATE':'Unemployment', 'W825RC1':'Unemployment_benefit', 'PIORECRUSDM':'Iron',
             'PCOPPUSDM':'Copper', 'POILWTIUSDM': 'WTI_Crude', 'DGS10':'US10Y'})
data = data.dropna()
data

Unnamed: 0_level_0,US10Y,CPI,Employment,Gold,Production,Unemployment,Unemployment_benefit,Iron,Copper,WTI_Crude
monthly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01,2.58,0.544775,125346.437,1332.81,101.3561,4.0,29.8,75.782609,7065.852273,63.556957
2018-02-01,2.86,0.453469,125674.267,1333.775,101.6495,4.1,29.1,77.65,7006.525,62.15
2018-03-01,2.84,0.226113,125841.0,1326.56,102.298,4.0,28.8,71.318182,6799.178571,62.861364
2018-04-01,2.87,0.397509,126102.265,1334.37,103.4095,4.0,28.5,66.333333,6851.5125,66.320476
2018-05-01,2.98,0.415892,126306.269,1303.6119,102.5408,3.8,27.6,66.630435,6825.27381,69.892609
2018-06-01,2.91,0.159388,126443.638,1282.12,103.3045,4.0,27.6,66.857143,6965.857143,67.522857
2018-07-02,2.89,0.006746,126634.615,1238.06,103.5474,3.8,27.8,67.045455,6250.75,70.991364
2018-08-01,2.89,0.055554,126794.662,1201.86,104.1659,3.8,27.5,68.021739,6051.045455,67.988696
2018-09-01,3.0,0.116203,126893.385,1199.2,104.1315,3.7,26.9,68.8,6050.7625,70.1855
2018-10-01,3.15,0.176676,127154.379,1214.72,103.9874,3.8,26.6,72.021739,6219.586957,70.751304


### Dataset 구조 결측치 확인

In [3]:
print(f'기본 Dataset 구조 : {data.shape}')
print('='*80)
print(data.info())

기본 Dataset 구조 : (39, 10)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39 entries, 2018-01-01 to 2021-03-01
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   US10Y                 39 non-null     float64
 1   CPI                   39 non-null     float64
 2   Employment            39 non-null     float64
 3   Gold                  39 non-null     float64
 4   Production            39 non-null     float64
 5   Unemployment          39 non-null     float64
 6   Unemployment_benefit  39 non-null     float64
 7   Iron                  39 non-null     float64
 8   Copper                39 non-null     float64
 9   WTI_Crude             39 non-null     float64
dtypes: float64(10)
memory usage: 3.4 KB
None


In [11]:
train = data.loc[:'2020-12-01']
test = data.loc['2021-01-01':]
train.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

In [12]:
num_cols=[['CPI','Employment','Gold','Production','Unemployment',
         'Unemployment_benefit','Iron','Copper','WTI_Crude']]
mms = MinMaxScaler()

for col in tqdm(num_cols):
    data_df=pd.concat([train, test])
    mms.fit(data_df[col])
    train[col]=mms.transform(train[col])
    test[col]=mms.transform(test[col])

  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
drop_cols=['US10Y', 'Unemployment', 'Unemployment_benefit']
train_X = train.drop(drop_cols, axis=1)
train_y = train[['US10Y']]

test_X = test.drop(drop_cols, axis=1)
test_y = test[['US10Y']]

In [14]:
# (1) light gbm
hyper_params = {
}

lgbr = lgbm.LGBMRegressor(n_estimators=100,
                         max_bin=2000) 
lgbr.fit(train_X, train_y)
y_pred2 = lgbr.predict(test_X)

MSE = mean_squared_error(test_y, y_pred2)
MAE = mean_absolute_error(test_y, y_pred2)
print('MSE :', MSE)
print('MAE :', MAE)

MSE : 0.49101674250537214
MAE : 0.6652777767843668
