In [21]:

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import fbprophet
import pickle
import scipy.optimize as optim
import logging

# Notebook-wide configuration: logging, plotting style, working directory,
# file locations and the modelling parameters used by the cells below.

# Raise the root logger threshold to ERROR.
# NOTE(review): the recorded output of the training cell still shows
# "INFO:fbprophet" lines -- presumably that logger is configured separately
# or the output predates this setting; confirm.
logging.getLogger().setLevel(logging.ERROR)
    

%matplotlib inline

plt.style.use('bmh')

# Try to restore WORKDIR from IPython's %store database (set by another notebook).
%store -r WORKDIR

# Fallback used when nothing was restored.
# NOTE(review): this is a machine-specific absolute path; it also ends with '/',
# so the concatenations below produce a doubled '//' separator.
if 'WORKDIR' not in dir():
    WORKDIR = 'C:/Users/BZ241WX/Documents/InfNet/CursoPosDataScience/mit_data_science/'



# Rows with a date on or after max_date are dropped when the data is loaded.
max_date = '2020-08-01'
# Minimum number of training rows a country needs in order to be modelled.
min_samples = 30


# Input dataset, score table output and trained-model output.
# NOTE(review): model_file carries a '.jbl' extension but the export cell
# writes it with pickle.dump.
data_proc_file = WORKDIR + '/Data/Processed/covid19_data_modeling.parquet'
model_score_file = WORKDIR + '/Data/Modeling/model_scores.parquet'
model_file = WORKDIR + '/Data/Modeling/trained_models.jbl'

# Column roles: timestamp, series identifier (one series per country), forecast target.
time_col = 'date'
grain_col = 'countrycode'
target_col = 'cases'
# Countries to model.
# NOTE(review): in the recorded output 'UK' matches 0 training rows and is
# skipped; the ISO 3166-1 alpha-2 code for the United Kingdom is 'GB' --
# confirm which code the dataset uses.
countrycode_list = ['US','CN','BR','IT','FR','UK']

# Number of most recent rows per country held out as the test set.
ntest = 15

# Register pandas' date converters with matplotlib.
pd.plotting.register_matplotlib_converters()

# Carga dos Dados 

In [6]:
# Load the processed dataset and keep only the rows dated strictly before
# the configured cutoff.
df = (
    pd.read_parquet(data_proc_file)
    .loc[lambda frame: frame[time_col] < max_date]
)

# Quick sanity report on the filtered frame.
for label, value in (('shape:', df.shape), ('columns:', df.columns)):
    print(label, value)

shape: (10751, 5)
columns: Index(['cases', 'countrycode', 'date', 'deaths', 'recovered'], dtype='object')


# Treino / Teste 

In [7]:
def split_last_n_by_grain(df, ntest, time_column_name, grain_column_names):
    """Split every grain's time-ordered rows into a head part and a tail part.

    The frame is sorted by ``time_column_name`` (ascending) and grouped by
    ``grain_column_names``; within each group the last ``ntest`` rows go to
    the tail and all earlier rows go to the head.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    ntest : int
        Number of trailing rows per group to place in the tail. ``0`` puts
        every row in the head. Groups with ``ntest`` rows or fewer end up
        entirely in the tail.
    time_column_name : str
        Column used to order the rows.
    grain_column_names : str or list of str
        Column(s) identifying each group.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_head, df_tail)``.

    Raises
    ------
    ValueError
        If ``ntest`` is negative.
    """
    if ntest < 0:
        raise ValueError('ntest must be non-negative, got %r' % (ntest,))

    df_grouped = (df.sort_values(time_column_name)  # Sort by ascending time
                  .groupby(grain_column_names, group_keys=False))

    def _cut(dfg):
        # Explicit split position. Negative-index slicing breaks for ntest == 0
        # because iloc[:-0] is empty and iloc[-0:] is the whole group.
        return max(len(dfg) - ntest, 0)

    df_head = df_grouped.apply(lambda dfg: dfg.iloc[:_cut(dfg)])
    df_tail = df_grouped.apply(lambda dfg: dfg.iloc[_cut(dfg):])
    return df_head, df_tail


# Hold out the last `ntest` rows of every country as the test set; everything
# before them is used for training.
df_train, df_test = split_last_n_by_grain(
    df=df,
    ntest=ntest,
    time_column_name=time_col,
    grain_column_names=grain_col,
)

for label, part in (('Train:', df_train), ('Test :', df_test)):
    print(label, part.shape)

Train: (7976, 5)
Test : (2775, 5)


# Construção do Pipeline 

In [8]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error of a forecast, in percent.

    Parameters
    ----------
    y_true : pandas.Series
        Observed values.
    y_pred : pandas.Series
        Predicted values; the subtraction aligns them with ``y_true`` on the
        index.

    Returns
    -------
    float
        ``mean(|y_pred - y_true| / |y_true|) * 100``. Where ``y_true`` is zero
        the ratio follows pandas' floating-point division (inf or NaN).
    """
    absolute_error = (y_pred - y_true).abs()
    # Divide by the magnitude of the actuals so that negative actuals cannot
    # yield negative percentage errors that cancel out positive ones.
    return (absolute_error / y_true.abs()).mean() * 100
    
# For each configured country: fit Prophet on the training rows, score the
# forecast against the held-out rows with MAPE, then refit on the full history
# and keep that refit model for export.
trained_models = dict()
df_model_result = pd.DataFrame()
result_list = list()
for countrycode in countrycode_list:
    print('Processing ', countrycode, end='')

    # Per-country slices: training history, held-out actuals (summed per
    # date) and the complete history used for the final refit.
    Xtrn = df_train.loc[df_train[grain_col] == countrycode, [time_col, target_col]].copy()
    Xtst = (df_test.loc[df_test[grain_col] == countrycode]
            .groupby(time_col)[target_col]
            .sum())
    Xrefit = df.loc[df[grain_col] == countrycode, [time_col, target_col]].copy()

    # Prophet expects the columns to be named 'ds' (time) and 'y' (target).
    Xtrn.columns = ['ds', 'y']
    n_train = len(Xtrn)
    if n_train < min_samples:
        print('=> less than', min_samples, '. Ignored.(', n_train, ')')
        continue

    model = fbprophet.Prophet(growth='linear')
    model.fit(Xtrn)

    # Evaluate: predict history plus as many future periods as there are
    # held-out dates, then keep only the trailing (future) part.
    n_periods = Xtst.shape[0]
    forecast_data = model.make_future_dataframe(periods=n_periods,
                                                include_history=True)
    df_forecast = model.predict(forecast_data).set_index('ds')
    horizon = slice(-n_periods, None)
    forecast = df_forecast['yhat'][horizon].rename('forecast')
    forecast_low = df_forecast['yhat_lower'][horizon].rename('forecast_lo')
    forecast_up = df_forecast['yhat_upper'][horizon].rename('forecast_up')

    # Score the held-out window.
    score = mape(Xtst, forecast)
    print(' mape %.2f %%'%score)

    # Refit on the complete history; this is the model that gets stored.
    Xrefit.columns = ['ds', 'y']
    refit_model = fbprophet.Prophet(growth='linear')
    trained_models[countrycode] = refit_model.fit(Xrefit)

    # One summary row per modelled country.
    summary_row = {
        'countrycode': countrycode,
        'model_name': 'prophet',
        'date_begin': Xrefit.ds.min(),
        'date_end': Xrefit.ds.max(),
        'score': score,
    }
    result_list.append(summary_row)

df_results = pd.DataFrame(result_list)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Processing  US

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


 mape 0.93 %


INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Processing  CN

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


 mape 0.56 %
Processing  BR

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


 mape 21.18 %


INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Processing  IT

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


 mape 5.58 %


INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Processing  FR

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


 mape 23.06 %
Processing  UK=> less than 30 . Ignored.( 0 )


# Exportar os resultados e os modelos treinados 

In [16]:
# Export the score table under its canonical name.
df_results.to_parquet(model_score_file)

# Suffix taken from the latest date covered by the refit data; it is used for
# the date-stamped copies of both the score table and the trained models.
date_suffix = df_results.date_end.max().date().strftime('%Y-%m-%d')

filename = model_score_file.replace(".parquet", "_" + date_suffix + ".parquet")
df_results.to_parquet(filename)

# Persist the refit models under the canonical name.
with open(model_file, 'wb') as fid:
    pickle.dump(trained_models, fid)

# Write the date-stamped copy of the models. The original re-opened
# `model_file` here, so the dated file was never created.
filename = model_file.replace(".jbl", "_" + date_suffix + ".jbl")
with open(filename, 'wb') as fid:
    pickle.dump(trained_models, fid)

df_results.head()

../../Data/Modeling/model_scores_2020-04-30.parquet


Unnamed: 0,countrycode,date_begin,date_end,model_name,score
0,US,2020-01-22,2020-04-30,prophet,0.92712
1,CN,2020-01-22,2020-04-30,prophet,0.564901
2,BR,2020-02-26,2020-04-30,prophet,21.179157
3,IT,2020-01-31,2020-04-30,prophet,5.583808
4,FR,2020-01-24,2020-04-30,prophet,23.059688
