In [50]:

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import fbprophet
import pickle
import scipy.optimize as optim
import logging

logging.getLogger().setLevel(logging.ERROR)
    

%matplotlib inline

plt.style.use('bmh')

%store -r WORKDIR

if 'WORKDIR' not in dir():
    WORKDIR = 'C:/Users/thewr/git/mit_data_science.git/'



max_date = '2020-06-01'
min_samples = 30


data_proc_file = WORKDIR + '/Data/Processed/covid19_data_modeling.parquet'
model_score_file = WORKDIR + '/Data/Modeling/model_scores.parquet'
model_file = WORKDIR + '/Data/Modeling/trained_models.jbl'

time_col = 'date'
grain_col = 'countrycode'
target_col = 'cases'
countrycode_list = ['US','CN','BR','IT','FR','UK']

ntest = 15

pd.plotting.register_matplotlib_converters()

# Carga dos Dados 

In [51]:
df = pd.read_parquet(data_proc_file)

df = df[df[time_col] < max_date]

print('shape:', df.shape)
print('columns:', df.columns)

shape: (16572, 5)
columns: Index(['countrycode', 'date', 'cases', 'deaths', 'recovered'], dtype='object')


In [52]:
df.head()

Unnamed: 0,countrycode,date,cases,deaths,recovered
6,AD,2020-05-31,764.0,51.0,1.0
7,AD,2020-05-30,764.0,51.0,1.0
8,AD,2020-05-29,764.0,51.0,1.0
9,AD,2020-05-28,763.0,51.0,1.0
10,AD,2020-05-27,763.0,51.0,1.0


In [35]:
df.shape

(28099, 5)

# Treino / Teste 

In [53]:
def split_last_n_by_grain(df, ntest, time_column_name, grain_column_names):
    """Group df by grain and split on last n rows for each group."""
    df_grouped = (df.sort_values(time_column_name) # Sort by ascending time
                  .groupby(grain_column_names, group_keys=False))
    df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-ntest])
    df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-ntest:])
    return df_head, df_tail


df_train, df_test = split_last_n_by_grain(df, ntest, time_col, grain_col)

print('Train:', df_train.shape)
print('Test :', df_test.shape)

Train: (13746, 5)
Test : (2826, 5)


In [37]:
df_train.head()

Unnamed: 0,countrycode,date,cases,deaths,recovered
96,AD,2020-03-02,1.0,0.0,0.0
95,AD,2020-03-03,1.0,0.0,0.0
94,AD,2020-03-04,1.0,0.0,0.0
93,AD,2020-03-05,1.0,0.0,0.0
92,AD,2020-03-06,1.0,0.0,0.0


In [42]:
df_test.head()

Unnamed: 0,countrycode,date,cases,deaths,recovered
25151,AD,2020-07-17,880.0,52.0,800.0
25340,AD,2020-07-18,880.0,52.0,800.0
25529,AD,2020-07-19,880.0,52.0,800.0
25718,AD,2020-07-20,884.0,52.0,800.0
25907,AD,2020-07-21,884.0,52.0,800.0


# Construção do Pipeline 

In [58]:
def mape(y_true, y_pred):
    return ((y_pred - y_true).abs()/y_true).mean() * 100
    
trained_models = {}
df_model_result=pd.DataFrame()
Xtrn1 = pd.DataFrame()
Xtst2 = pd.DataFrame()
result_list = []
for countrycode in countrycode_list:
    print('Processing ', countrycode, end='')
    Xtrn = df_train[df_train[grain_col] == countrycode][[time_col, target_col]].copy()
    print('Printing Xtrn ',Xtrn.shape, Xtrn.head(2))
    Xtst = df_test[df_test[grain_col] == countrycode].groupby(time_col)[target_col].sum()
    print('Printing Xtst ',Xtst.shape,Xtst.head(2))
    Xrefit = df[df[grain_col] == countrycode][[time_col, target_col]].copy()
    print('Printing XREFIT ',Xrefit.shape)
    
    Xtrn.columns = ['ds', 'y']
    if Xtrn.shape[0] < min_samples:
        print('=> less than', min_samples, '. Ignored.(', Xtrn.shape[0],')')
        continue
    
    model = fbprophet.Prophet(growth='linear').fit(Xtrn)
    
    # Evaluate Model
    n_periods = Xtst.shape[0]
    forecast_data = model.make_future_dataframe(
        periods=n_periods,
        include_history=True
        )
    df_forecast = model.predict(forecast_data).set_index('ds')
    forecast     = df_forecast.yhat[-n_periods:].rename('forecast')
    forecast_low  =  df_forecast.yhat_lower[-n_periods:].rename('forecast_lo')
    forecast_up   =  df_forecast.yhat_upper[-n_periods:].rename('forecast_up')
    
    # Score model
    score = mape(Xtst, forecast)
    print(' mape %.2f %%'%score)
    
    # Refit model
    Xrefit.columns = ['ds','y']
    trained_models[countrycode] = fbprophet.Prophet(growth='linear').fit(Xrefit)
    
    #result list
    result_list.append({'countrycode':countrycode,
                        'model_name': 'prophet',
                        'date_begin': Xrefit.ds.min(),
                        'date_end'  : Xrefit.ds.max(),
                        'score': score})
    
df_results = pd.DataFrame().from_dict(result_list)

Processing  USPrinting Xtrn  (116, 2)             date  cases
22001 2020-01-22    1.0
21812 2020-01-23    1.0
Printing Xtst  (15,) date
2020-05-17    1486376.0
2020-05-18    1506840.0
Name: cases, dtype: float64
Printing XREFIT  (131, 2)


ValueError: Capacities must be supplied for logistic growth in column "cap"

In [55]:
df_results.shape

(5, 5)

In [45]:
df_results.head()

Unnamed: 0,countrycode,model_name,date_begin,date_end,score
0,US,prophet,2020-01-22,2020-07-31,9.694024
1,CN,prophet,2020-01-22,2020-07-31,1.047769
2,BR,prophet,2020-02-26,2020-07-31,1.621774
3,IT,prophet,2020-01-31,2020-07-31,0.091595
4,FR,prophet,2020-01-24,2020-07-31,1.036974


In [24]:
 Xtrn1.head()

Unnamed: 0,ds,y


In [47]:
print(trained_models)

{'US': <fbprophet.forecaster.Prophet object at 0x0000017D0612D448>, 'CN': <fbprophet.forecaster.Prophet object at 0x0000017D069C1E48>, 'BR': <fbprophet.forecaster.Prophet object at 0x0000017D0610C988>, 'IT': <fbprophet.forecaster.Prophet object at 0x0000017D06A05D88>, 'FR': <fbprophet.forecaster.Prophet object at 0x0000017D063A8D88>}


In [22]:
 Xtrn.head()

Unnamed: 0,ds,y


# Exportar os resultados e modelagem 

In [56]:
# exportar a tabela de resultados
df_results.to_parquet(model_score_file)

filename = model_score_file.replace(".parquet","_" + df_results.date_end.max().date().strftime('%Y-%m-%d') + ".parquet") 
df_results.to_parquet(filename)


with open(model_file, 'wb') as fid:
    pickle.dump(trained_models, fid)


filename = model_file.replace(".jbl","_" + df_results.date_end.max().date().strftime('%Y-%m-%d') + ".jbl")     
with open(model_file, 'wb') as fid:
    pickle.dump(trained_models, fid)
    
    
df_results.head()

Unnamed: 0,countrycode,model_name,date_begin,date_end,score
0,US,prophet,2020-01-22,2020-05-31,3.261371
1,CN,prophet,2020-01-22,2020-05-31,1.54397
2,BR,prophet,2020-02-26,2020-05-31,18.699037
3,IT,prophet,2020-01-31,2020-05-31,4.289474
4,FR,prophet,2020-01-24,2020-05-31,4.173835
