# Prepare steel data for model

In [5]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import MonthEnd

In [6]:
# Load the historical steel series; the 'Economy' column becomes the row index.
df_prod = pd.read_csv(
    '../data/raw/Industry/SteelHistorical.csv',
    index_col='Economy',
)

In [7]:
# Print column dtypes and non-null counts as a quick sanity check on the load.
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
Index: 567 entries, AUS to VN
Data columns (total 2 columns):
Year                567 non-null int64
SteelConsumption    567 non-null int64
dtypes: int64(2)
memory usage: 13.3+ KB


### Make year column YYYY-MM-DD format for Prophet (historical data)

In [8]:
# Parse the integer year (yields 1 January of that year), then roll forward
# twelve month-ends so each row is stamped 31 December of its own year.
steel_year_start = pd.to_datetime(df_prod['Year'], format="%Y")
df_prod['ds'] = steel_year_start + MonthEnd(12)
df_prod.head()

Unnamed: 0_level_0,Year,SteelConsumption,ds
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AUS,1990,5511000,1990-12-31
AUS,1991,5294000,1991-12-31
AUS,1992,5120000,1992-12-31
AUS,1993,5757000,1993-12-31
AUS,1994,6326000,1994-12-31


### read in historical macro data

In [9]:
# Load historical GDP and population; the 'Economy' column becomes the row index.
df_macro = pd.read_csv(
    '../data/raw/Industry/MacroHistorical.csv',
    index_col='Economy',
)
df_macro.head()

Unnamed: 0_level_0,Year,GDP,Population
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AUS,1990,515.317626,17.042
AUS,1991,513.320531,17.272
AUS,1992,515.592569,17.486
AUS,1993,536.417759,17.688
AUS,1994,557.759322,17.883


In [10]:
# Stamp each annual macro row with 31 December of its year: parsing the integer
# year gives 1 January, and adding twelve month-ends rolls it to year end.
# This is the same single-step construction used for df_prod['ds']; a second
# to_datetime pass over an already-parsed datetime column is a no-op, so it
# is not repeated here.
df_macro['ds'] = pd.to_datetime(df_macro['Year'], format="%Y") + MonthEnd(12)
df_macro.head()

Unnamed: 0_level_0,Year,GDP,Population,ds
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUS,1990,515.317626,17.042,1990-12-31
AUS,1991,513.320531,17.272,1991-12-31
AUS,1992,515.592569,17.486,1992-12-31
AUS,1993,536.417759,17.688,1993-12-31
AUS,1994,557.759322,17.883,1994-12-31


In [11]:
# GDP divided by population, row by row.
df_macro['GDP_per_capita'] = df_macro['GDP'] / df_macro['Population']

# Left join: every steel row is kept and matched to the macro row that shares
# its economy, date stamp and year.
df = df_prod.merge(df_macro, how='left', on=['Economy', 'ds', 'Year'])
df.head()

Unnamed: 0_level_0,Year,SteelConsumption,ds,GDP,Population,GDP_per_capita
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AUS,1990,5511000,1990-12-31,515.317626,17.042,30.238096
AUS,1991,5294000,1991-12-31,513.320531,17.272,29.719808
AUS,1992,5120000,1992-12-31,515.592569,17.486,29.486021
AUS,1993,5757000,1993-12-31,536.417759,17.688,30.326649
AUS,1994,6326000,1994-12-31,557.759322,17.883,31.18936


### create features

In [12]:
# Natural log of steel consumption per unit of population.
# NOTE(review): the column is named "prod" but is built from SteelConsumption;
# confirm the intended quantity.
df['ln_prod_per_cap'] = np.log(df['SteelConsumption'] / df['Population'])

# Natural log of GDP per capita.
df['ln_GDP_per_cap'] = np.log(df['GDP_per_capita'])

In [13]:
# Rename the log per-capita series to 'y' (presumably the target column name
# Prophet expects; the headings name Prophet as the consumer of this data).
df = df.rename({"ln_prod_per_cap": "y"}, axis="columns")
df.head()

Unnamed: 0_level_0,Year,SteelConsumption,ds,GDP,Population,GDP_per_capita,y,ln_GDP_per_cap
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AUS,1990,5511000,1990-12-31,515.317626,17.042,30.238096,12.686576,3.409103
AUS,1991,5294000,1991-12-31,513.320531,17.272,29.719808,12.632998,3.391814
AUS,1992,5120000,1992-12-31,515.592569,17.486,29.486021,12.587264,3.383916
AUS,1993,5757000,1993-12-31,536.417759,17.688,30.326649,12.693041,3.412027
AUS,1994,6326000,1994-12-31,557.759322,17.883,31.18936,12.776328,3.440077


In [14]:
# Distinct economy codes taken from df's index, in order of first appearance;
# the combine step further down iterates over this to build per-economy frames.
economies = df.index.unique()
economies

Index(['AUS', 'BD', 'CDA', 'CHL', 'PRC', 'HKC', 'INA', 'JPN', 'KOR', 'MAS',
       'MEX', 'NZ', 'PNG', 'PE', 'RP', 'RUS', 'SIN', 'CT', 'THA', 'USA', 'VN'],
      dtype='object', name='Economy')

### Read in future GDP and population assumptions

In [15]:
# Load the forward-looking GDP and population assumptions; the 'Economy'
# column becomes the row index.
df_future_macro = pd.read_csv(
    '../data/raw/Industry/MacroAssumptions.csv',
    index_col='Economy',
)

### create features (future)

In [16]:
# Derive the same per-capita GDP features as on the historical frame:
# first the ratio, then its natural log (the second step reads the first).
df_future_macro = df_future_macro.assign(
    GDP_per_capita=lambda frame: frame['GDP'] / frame['Population'],
    ln_GDP_per_cap=lambda frame: np.log(frame['GDP_per_capita']),
)
df_future_macro.head()

Unnamed: 0_level_0,Year,GDP,Population,GDP_per_capita,ln_GDP_per_cap
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AUS,2017,1157.283926,24.451,47.33074,3.85716
AUS,2018,1201.142573,24.772,48.487913,3.881315
AUS,2019,1245.039044,25.089,49.624897,3.904493
AUS,2020,1289.224526,25.398,50.760868,3.927126
AUS,2021,1333.639686,25.7,51.892595,3.949176


### Make year column YYYY-MM-DD format for Prophet (future data)

In [17]:
# Parse the integer year (yields 1 January of that year), then roll forward
# twelve month-ends so each future row is stamped 31 December of its year.
future_year_start = pd.to_datetime(df_future_macro['Year'], format="%Y")
df_future_macro['ds'] = future_year_start + MonthEnd(12)
df_future_macro.head()

Unnamed: 0_level_0,Year,GDP,Population,GDP_per_capita,ln_GDP_per_cap,ds
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AUS,2017,1157.283926,24.451,47.33074,3.85716,2017-12-31
AUS,2018,1201.142573,24.772,48.487913,3.881315,2018-12-31
AUS,2019,1245.039044,25.089,49.624897,3.904493,2019-12-31
AUS,2020,1289.224526,25.398,50.760868,3.927126,2020-12-31
AUS,2021,1333.639686,25.7,51.892595,3.949176,2021-12-31


### Combine historical and future data for each economy

In [18]:
# Build one frame in which each economy's historical rows are followed
# directly by its future-assumption rows. Columns that exist only in the
# historical frame (e.g. SteelConsumption, y) are left as NaN on future rows.
regressors_hist = df
regressors_fut = df_future_macro

_regressors_list = []

for economy in economies:
    # Index with a one-element list so .loc always yields a DataFrame. A bare
    # label returns a Series when an economy has exactly one row, and
    # concatenating that Series would produce a malformed frame.
    _regressors = pd.concat(
        [regressors_hist.loc[[economy]], regressors_fut.loc[[economy]]],
        ignore_index=False, sort=False)
    _regressors_list.append(_regressors)

# Stack the per-economy frames in the order the economies were listed.
regressors = pd.concat(_regressors_list)

In [19]:
# Preview the first rows of the combined historical + future frame.
regressors.head()

Unnamed: 0_level_0,Year,SteelConsumption,ds,GDP,Population,GDP_per_capita,y,ln_GDP_per_cap
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AUS,1990,5511000.0,1990-12-31,515.317626,17.042,30.238096,12.686576,3.409103
AUS,1991,5294000.0,1991-12-31,513.320531,17.272,29.719808,12.632998,3.391814
AUS,1992,5120000.0,1992-12-31,515.592569,17.486,29.486021,12.587264,3.383916
AUS,1993,5757000.0,1993-12-31,536.417759,17.688,30.326649,12.693041,3.412027
AUS,1994,6326000.0,1994-12-31,557.759322,17.883,31.18936,12.776328,3.440077
