# Industry Demand Model

This is the Industry energy demand model. The inputs are historical data files for each sub-sector plus data from the macro model. The model uses Prophet to generate predictions for each economy and sub-sector.

It predicts demands for all sub-sectors.

You can adjust the sub-sectors you are predicting.

You can also add or remove 'features', e.g., GDP, population, GDP per capita, ln(GDP per capita), etc.

In [None]:
import numpy as np
import pandas as pd
from pandas.tseries.offsets import MonthEnd
from fbprophet import Prophet

## 1. Import data and process the data

In [None]:
# Load the historical production data for each sub-sector.

# Steel: CSV file, indexed by (Economy, Year).
df_steel = pd.read_csv(
    '../data/raw/Industry/SteelHistorical.csv',
    index_col=['Economy', 'Year'],
)

# Cement: Excel workbook, keeping only column A and columns L through AL.
df_cement = pd.read_excel(
    '../data/raw/Industry/02_Cement.xlsx',
    usecols='A,L:AL',
)

In [None]:
# name the data column after its sub-sector; assigning a one-item list only
# succeeds when the frame has exactly one non-index column
df_steel.columns= ['Steel']

In [None]:
# preview the first rows of the cement table as read from Excel
df_cement.head()

In [None]:
# Reshape the wide cement table into tidy (long) form: every column other than
# 'Economy' becomes a 'Year' label with its value in 'Cement'. Indexing by
# (Economy, Year) lets it be merged with the steel data.

df = (
    df_cement
    .melt(id_vars=['Economy'], var_name='Year', value_name='Cement')
    .set_index(['Economy', 'Year'])
)
df.head()

In [None]:
# Outer join keeps every (Economy, Year) that appears in either the steel or
# the cement table; gaps on either side become NaN.
df_historical = df_steel.merge(df, how='outer', on=['Economy', 'Year'])

In [None]:
# summarise dtypes and non-null counts of the merged production table
df_historical.info()

In [None]:
# preview the merged steel/cement table
df_historical.head()

In [None]:
# get list of sub-sectors: each column label of the merged production table
# is treated as one sub-sector

subsectors = df_historical.columns.unique()
# tuple copy of the same labels, iterated when the target columns are built
subsectors_tuple = tuple(subsectors)

In [None]:
# check that it worked: display the sub-sector labels

subsectors

In [None]:
# this is the historical macro data, indexed by (Economy, Year) -- the same
# keys as the production table -- so the two can be merged

df_macro = pd.read_csv('../data/raw/Industry/MacroHistorical.csv',
                       index_col=['Economy','Year'])
df_macro

In [None]:
# Attach the macro data to the Industry production table. The left join keeps
# every (Economy, Year) row of the production data, even where no macro row matches.

df_hist = df_historical.merge(df_macro, how='left', on=['Economy', 'Year'])

In [None]:
# preview the combined production + macro table
df_hist.head()

In [None]:
# check dtypes and non-null counts to spot missing values
df_hist.info()

## BIG ASSUMPTIONS in the next cell:

In [None]:
# There are some missing values (NaN) in the Cement series.
# Check the data to see if they should be zero, or replaced with a better number.
# For now they are filled from neighbouring rows: first backwards (each gap takes
# the next valid value), then forwards (remaining trailing gaps take the previous
# valid value).
# NOTE(review): the fill runs over the whole column in row order, so a value can
# be carried across the boundary between two economies -- confirm this is intended.
# bfill()/ffill() are used because fillna(method=...) is deprecated in newer pandas.

df_hist['Cement'] = df_hist['Cement'].bfill().ffill()

In [None]:
# check that it worked: compare the non-null count of 'Cement' with the row count
# NOTE(review): the original note here said "Brunei:", but the call below
# summarises the whole table rather than a single economy

df_hist.info()

In [None]:
# add the 'ds' date column used by Prophet: each year is parsed as 1 January of
# that year and then rolled forward 12 month-ends, i.e. to 31 December (YYYY-12-31)

# move 'Year' out of the index so it is available as a regular column
df_hist = df_hist.reset_index(level='Year')
df_hist['ds'] = pd.to_datetime(df_hist['Year'], format="%Y") + MonthEnd(12)

In [None]:
# preview the first rows, including the new 'ds' column
df_hist.head()

In [None]:
# preview the last rows, including the new 'ds' column
df_hist.tail()

In [None]:
# get a list of economies: with 'Year' moved out of the index, the remaining
# index labels identify the economy of each row

economies = df_hist.index.unique()
economies

#### create features

In [None]:
# Model feature: natural log of GDP per capita (GDP divided by Population).
gdp_per_capita = df_hist['GDP'] / df_hist['Population']
df_hist['GDP per capita'] = gdp_per_capita
df_hist['feature_ln_GDP_per_cap'] = np.log(df_hist['GDP per capita'])

#### create targets

In [None]:
# For every sub-sector, add a target column holding the natural log of the
# sub-sector value per capita, and remember the new column names.
list_of_targets = []
for sub in subsectors_tuple:
    target_name = 'target_ln_percap_' + sub
    df_hist[target_name] = np.log(df_hist[sub] / df_hist['Population'])
    list_of_targets.append(target_name)

#### make list of targets

In [None]:
# The target columns (one per sub-sector) that will be predicted, frozen as a
# tuple for use in the loops.
targets = tuple(list_of_targets)

# Maps each target column name back to its sub-sector label; used to relabel
# the results at the end.
subsector_dict = dict(zip(targets, subsectors))

In [None]:
# check that it worked: display the target-name -> sub-sector mapping

subsector_dict

In [None]:
# check that 'GDP per capita', 'feature_ln_GDP_per_cap', and one
# 'target_ln_percap_<sub-sector>' column per sub-sector are in the big dataframe
# called df_hist (the temporary 'target_percap' column is dropped again inside
# the loop that builds the targets, so it is not expected here)

df_hist.head()

#### Prepare future data

In [None]:
# read the future macro assumptions, indexed by Economy only ('Year' stays a column)
df_future_macro = pd.read_csv('../data/raw/Industry/MacroAssumptions.csv',
                              index_col=['Economy'])
# year-end 'ds' date for each year: 1 January rolled forward 12 month-ends
df_future_macro['ds'] = pd.to_datetime(df_future_macro['Year'], format="%Y") + MonthEnd(12)
df_future_macro.head()

In [None]:
# preview the last rows of the future macro assumptions
df_future_macro.tail()

In [None]:
# Build the same feature as in the historical dataset so both share the
# column name: natural log of GDP per capita.

future_gdp_per_capita = df_future_macro['GDP'] / df_future_macro['Population']
df_future_macro['GDP per capita'] = future_gdp_per_capita
df_future_macro['feature_ln_GDP_per_cap'] = np.log(df_future_macro['GDP per capita'])

In [None]:
# check that it worked: the first rows should now include 'GDP per capita'
# and 'feature_ln_GDP_per_cap'

df_future_macro.head()

In [None]:
# same check on the last rows
df_future_macro.tail()

In [None]:
# combine the historical and future data by stacking their rows;
# a column present in only one of the two frames is NaN for the other's rows

df_big = pd.concat([df_hist,df_future_macro], sort=False)

In [None]:
# check that it worked: the first rows come from the historical data

df_big.head()

In [None]:
# the last rows come from the future macro assumptions
df_big.tail()

In [None]:
# summarise dtypes and non-null counts of the combined table
df_big.info()

#### Split data into training, testing, and prediction sets:

In [None]:
# Training set: every row of the combined table whose Year is 2016 or earlier.
# Change the cut-off year here to train on a different period.

is_training_year = df_big['Year'] <= 2016
df_train = df_big.loc[is_training_year]

In [None]:
# check that the last year is correct (should be 2016),
# using the economy labelled '01_AUS' as the example

df_train.loc['01_AUS'].tail()

In [None]:
# Prediction set: all rows of the combined table (historical and future years),
# without the raw sub-sector columns and without 'Year'.

df_predict = df_big.drop(columns=[*subsectors, 'Year'])

In [None]:
# check that it worked: first rows of the prediction set for economy '01_AUS'

df_predict.loc['01_AUS'].head()

In [None]:
# last rows of the prediction set for economy '01_AUS'
df_predict.loc['01_AUS'].tail()

## 2. Construct the models

#### Construct models by economy and subsector:

In [None]:

# models
# One independent Prophet model per (economy, target), stored as
# models[economy][target]. All seasonality is switched off because the data
# is annual; the trend is linear.
models = {}

for economy in economies:
    nested_dict = {}
    for target in targets:
        m = Prophet(daily_seasonality=False,
                       weekly_seasonality=False,
                       yearly_seasonality=False,
                       seasonality_mode='additive',
                       growth='linear')

        # Register the prepared feature as an extra regressor. Without this call
        # Prophet ignores the 'feature_ln_GDP_per_cap' column that is passed to
        # fit() and predict(), and the forecast is a pure time trend.
        m.add_regressor('feature_ln_GDP_per_cap')

        # you can add more regressors (the column must then also be included in
        # the frames used for fitting and predicting):
        #m.add_regressor('GDP')
        #m.add_regressor('Population')
        nested_dict[target] = m
    models[economy] = nested_dict

In [None]:
# check that individual models are stored in memory
# each economy/sub-sector model should have its own unique address (e.g., 0x25c9b5261c8)
models

#### Make a dataframe for each economy-subsector

In [None]:
# Training frames for the models, stored as economy_subsector_dfs[economy][target].
# Each frame holds the target renamed to 'y' (the column name Prophet fits on),
# the ln(GDP per capita) feature, and the 'ds' date column.

economy_subsector_dfs = {}

for economy in economies:
    economy_train = df_train.loc[economy]
    economy_subsector_dfs[economy] = {
        target: economy_train[[target, 'feature_ln_GDP_per_cap', 'ds']].rename(
            columns={target: "y"}
        )
        for target in targets
    }

## 3. Fit the models

In [None]:
# Fit every (economy, target) model on its own training frame, printing
# progress along the way.
for economy, economy_models in models.items():
    print('- The economy is %s' %economy)
    training_frames = economy_subsector_dfs[economy]
    for target, prophet_model in economy_models.items():
        training_frame = training_frames[target]
        print('-- Fitting model for %s' %target)
        prophet_model.fit(training_frame)

print('\n Finished fitting models')

## 4. Make predictions

In [None]:
# This part will take a while to finish.
# Run every fitted model over its economy's prediction frame and keep the
# Prophet forecasts as economy_predictions[economy][target].

economy_predictions = {}

for economy, economy_models in models.items():
    print('\n - Making prediction for %s' %economy)
    future_frame = df_predict.loc[economy]
    forecasts = {}
    for target, prophet_model in economy_models.items():
        forecasts[target] = prophet_model.predict(future_frame)
        print('-- Predicting demand for %s' %target)
    economy_predictions[economy] = forecasts

print('\n Finished making predictions.')

## 5. Extract results

In [None]:
# Combine the results into one dataframe indexed by (Economy, Year).
# Note that Prophet produces many outputs such as the errors.
# Only the point prediction ('yhat') is included right now; other Prophet outputs
# can be added as further columns of the per-forecast frame built below.

results_list = []

for economy in economies:
    economy_forecasts = economy_predictions[economy]
    for target in targets:
        forecast = economy_forecasts[target]
        # Build a small new frame instead of adding columns to (and running a
        # whole-frame replace on) the stored Prophet forecast, which stays untouched.
        tidy = pd.DataFrame({
            'Economy': economy,
            'Year': forecast['ds'].dt.year,
            # relabel the target column name with its sub-sector name
            'Subsector': subsector_dict[target],
            'Demand': forecast['yhat'],
        }).set_index(['Economy', 'Year'])
        results_list.append(tidy)

_results = pd.concat(results_list)

# 'Demand' is on the log scale (the targets are ln of per-capita values), so
# exponentiate to return to per-capita levels; a side effect is that the
# converted values can never be negative.
# NOTE(review): the division by 1000 is a unit conversion whose source units are
# not visible here -- confirm it matches the input data.
_results['Demand - thousand tons per capita'] = (np.exp(_results['Demand'])).div(1000)

In [None]:
# preview the combined results (the frame built above is named `_results`;
# the bare name `results` is never defined)
_results.head()

In [None]:
# Scale the per-capita predictions back to totals using population.

# Population for every (Economy, Year) in the combined historical + future table.
population = (
    df_big[['Year', 'Population']]
    .reset_index()
    .set_index(['Economy', 'Year'])['Population']
)

df_list = []

for sub in subsectors:
    # Take an explicit copy so that adding a column below does not write into a
    # filtered slice of _results.
    sub_results = _results[_results['Subsector'] == sub].copy()
    # .mul aligns both series on their (Economy, Year) labels before multiplying
    sub_results['Demand - thousand tons'] = (
        sub_results['Demand - thousand tons per capita'].mul(population)
    )
    df_list.append(sub_results)

final_results = pd.concat(df_list)

In [None]:
# check to see if it worked: the last rows should include 'Demand - thousand tons'

final_results.tail()

In [None]:
# write the results to a CSV file (this is in tidy format)

# turn the (Economy, Year) index back into ordinary columns
final_results = final_results.reset_index()
# NOTE(review): to_csv writes the index by default, so the file gets a leading
# unnamed column of row numbers; pass index=False if that column is not wanted
# (confirm that nothing downstream reads it first)
final_results.to_csv('../data/final/Industry_results.csv', header=True)