## Base Configurations

In [1]:
import pymongo
import pandas as pd
from pymongo import MongoClient
from random import random
import json

import numpy as np
import pandas as pd
import pickle

### Define Functions

In [3]:
def _connect_mongo(host, port, db):
    conn = MongoClient(host, port)
    return conn[db]

def read_mongo(db, collection, query=None, host='localhost', port=27017, no_id=True):
    """Read a MongoDB collection into a pandas DataFrame.

    Parameters
    ----------
    db : str
        Name of the database to read from.
    collection : str
        Name of the collection inside that database.
    query : dict, optional
        Filter passed to ``find``. ``None`` (the default) behaves like ``{}``
        and matches every document.
    host : str
        MongoDB host, forwarded to ``_connect_mongo``.
    port : int
        MongoDB port, forwarded to ``_connect_mongo``.
    no_id : bool
        When true, the ``_id`` column is removed from the result if present.

    Returns
    -------
    pandas.DataFrame
        One row per returned document. Empty when nothing matches.
    """
    # Use a distinct local name so the ``db`` argument is not shadowed.
    database = _connect_mongo(host=host, port=port, db=db)
    try:
        # Materialise the cursor before the connection is released.
        documents = list(database[collection].find({} if query is None else query))
    finally:
        # The client was opened only for this call; close it so connections do not leak.
        database.client.close()

    frame = pd.DataFrame(documents)
    if no_id:
        # errors='ignore': an empty result has no '_id' column and must not raise KeyError.
        frame = frame.drop(columns='_id', errors='ignore')
    return frame

### Read Data

In [4]:
# Load both collections of the "itu" database (covid_data first, then country_info).
df_covid, df_country = (
    read_mongo("itu", collection_name)
    for collection_name in ("covid_data", "country_info")
)

In [5]:
# Left join on 'location': every row of df_country is kept and matching
# df_covid rows are attached. Only the columns listed below are retained.
df = df_country.merge(df_covid, how='left', on='location')[[
    'continent',
    'location',
    'date',
    'total_cases',
    'new_cases',
    'new_cases_smoothed',
    'total_deaths',
    'new_deaths',
    'new_deaths_smoothed',
    'total_cases_per_million',
    'new_cases_per_million',
    'new_cases_smoothed_per_million',
    'total_deaths_per_million',
    'new_deaths_per_million',
    'new_deaths_smoothed_per_million',
    'reproduction_rate',
    'icu_patients',
    'icu_patients_per_million',
    'hosp_patients',
    'hosp_patients_per_million',
    'weekly_icu_admissions',
    'weekly_icu_admissions_per_million',
    'weekly_hosp_admissions',
    'weekly_hosp_admissions_per_million',
    'total_tests',
    'new_tests',
    'total_tests_per_thousand',
    'new_tests_per_thousand',
    'new_tests_smoothed',
    'new_tests_smoothed_per_thousand',
    'positive_rate',
    'tests_per_case',
    'tests_units',
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'new_vaccinations',
    'new_vaccinations_smoothed',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'new_vaccinations_smoothed_per_million',
    'stringency_index',
    'population',
    'population_density',
    'median_age',
    'aged_65_older',
    'aged_70_older',
    'gdp_per_capita',
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'handwashing_facilities',
    'hospital_beds_per_thousand',
    'life_expectancy',
    'human_development_index',
]]

# The source frames are not used again in this cell; drop the names.
del df_covid, df_country

df.head()

Unnamed: 0,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,median_age,aged_65_older,aged_70_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,0.026,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
1,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,0.026,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
2,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,0.026,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
3,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,0.026,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
4,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,0.026,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511


In [3]:
# Cut-off dates, both written as YYYY-MM-DD strings.
date_start_test = '2022-04-01'
date_start_forecast = '2022-11-01'

# Keep only the rows whose 'date' is strictly before the forecast start.
df = df.loc[df['date'] < date_start_forecast].copy()

### Formatting

In [4]:
# Parse the 'date' column into pandas datetimes.
df['date'] = pd.to_datetime(df['date'])

In [5]:
# Earliest and latest date present in df.
(df['date'].min(), df['date'].max())

(Timestamp('2020-01-01 00:00:00'), Timestamp('2022-10-31 00:00:00'))

In [6]:
# Print df's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231633 entries, 0 to 232093
Data columns (total 55 columns):
 #   Column                                 Non-Null Count   Dtype         
---  ------                                 --------------   -----         
 0   continent                              218557 non-null  object        
 1   location                               231633 non-null  object        
 2   date                                   231633 non-null  datetime64[ns]
 3   total_cases                            218538 non-null  float64       
 4   new_cases                              218333 non-null  float64       
 5   new_cases_smoothed                     217137 non-null  float64       
 6   total_deaths                           199310 non-null  float64       
 7   new_deaths                             199262 non-null  float64       
 8   new_deaths_smoothed                    198082 non-null  float64       
 9   total_cases_per_million                217540 no

In [7]:
# Cast every column from position 3 onward, except 'tests_units', to float.
# The labels are read straight from df.columns (no intermediate frame copy)
# and a single astype call replaces the per-column loop.
float_columns = df.columns[df.columns != 'tests_units'][3:]
df = df.astype(dict.fromkeys(float_columns, float))

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231633 entries, 0 to 232093
Data columns (total 55 columns):
 #   Column                                 Non-Null Count   Dtype         
---  ------                                 --------------   -----         
 0   continent                              218557 non-null  object        
 1   location                               231633 non-null  object        
 2   date                                   231633 non-null  datetime64[ns]
 3   total_cases                            218538 non-null  float64       
 4   new_cases                              218333 non-null  float64       
 5   new_cases_smoothed                     217137 non-null  float64       
 6   total_deaths                           199310 non-null  float64       
 7   new_deaths                             199262 non-null  float64       
 8   new_deaths_smoothed                    198082 non-null  float64       
 9   total_cases_per_million                217540 no

In [8]:
# Replace missing entries with 0 across the whole frame (no column subset is given,
# so non-numeric columns are affected too), then use 'date' as the index.
df = df.replace({None: 0}).set_index('date')

In [10]:
# Preview the first rows of df.
df.head()

Unnamed: 0_level_0,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,...,median_age,aged_65_older,aged_70_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-24,Asia,Afghanistan,5.0,5.0,0.0,0.0,0.0,0.0,0.122,0.122,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
2020-02-25,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,0.0,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
2020-02-26,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,0.0,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
2020-02-27,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,0.0,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511
2020-02-28,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,0.0,...,18.6,2.581,1.337,1803.987,597.029,9.59,37.746,0.5,64.83,0.511


## Exploratory Data Analysis

### Target Value

The target value is the next day's new cases. Use the `TARGET` column as the label in your models.

In [11]:
# TARGET for a row is the 'new_cases' value of the next row of the same location.
# NOTE(review): this equals "tomorrow's cases" only if the rows of each location
# are already in consecutive chronological order -- confirm.
df['TARGET'] = df.groupby(by='location')['new_cases'].shift(periods=-1)

# Rows without a following value (at least the last row of each location) have
# a missing TARGET and are removed.
df.dropna(subset=['TARGET'], inplace=True)

df.tail()

Unnamed: 0_level_0,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,...,aged_65_older,aged_70_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,TARGET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-26,Africa,Zimbabwe,257893.0,0.0,0.0,5606.0,0.0,0.0,15801.745,0.0,...,2.822,1.882,1899.775,307.846,1.82,36.791,1.7,61.49,0.571,0.0
2022-10-27,Africa,Zimbabwe,257893.0,0.0,0.0,5606.0,0.0,0.0,15801.745,0.0,...,2.822,1.882,1899.775,307.846,1.82,36.791,1.7,61.49,0.571,0.0
2022-10-28,Africa,Zimbabwe,257893.0,0.0,0.0,5606.0,0.0,0.0,15801.745,0.0,...,2.822,1.882,1899.775,307.846,1.82,36.791,1.7,61.49,0.571,0.0
2022-10-29,Africa,Zimbabwe,257893.0,0.0,0.0,5606.0,0.0,0.0,15801.745,0.0,...,2.822,1.882,1899.775,307.846,1.82,36.791,1.7,61.49,0.571,0.0
2022-10-30,Africa,Zimbabwe,257893.0,0.0,0.0,5606.0,0.0,0.0,15801.745,0.0,...,2.822,1.882,1899.775,307.846,1.82,36.791,1.7,61.49,0.571,0.0


## Feature Extraction

Examples are given to you below. You may increase the number of features extracted.

In [12]:
# Move 'date' from the index back into a regular column.
# The guard keeps this cell re-runnable: an unconditional reset_index on a second
# run would insert a stray 'index' column into the frame.
if df.index.name == 'date':
    df.reset_index(inplace=True)

# Mean of 'new_cases' over a window of up to 3 rows within each location
# (min_periods=1, so the first rows of a location use the rows available so far).
df['new_cases_avg_3g'] = (
    df.groupby('location')['new_cases']
    .rolling(3, min_periods=1)
    .mean()
    .reset_index(0, drop=True)  # drop the 'location' level so values align with df's index
)

# new_deaths divided by total_deaths, rounded to 5 decimals.
# Rows where total_deaths is 0 are not finite (0/0 gives NaN).
df['daily_death_ratio'] = (df['new_deaths'] / df['total_deaths']).round(5)


## Modeling

In [13]:
# Train: rows strictly before date_start_test.
train_mask = df['date'].lt(date_start_test)
# Test: rows from date_start_test (inclusive) up to date_start_forecast (exclusive).
test_mask = df['date'].ge(date_start_test) & df['date'].lt(date_start_forecast)

df_train = df.loc[train_mask].copy()
df_test = df.loc[test_mask].copy()

In [14]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,date,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,TARGET,new_cases_avg_3g,daily_death_ratio
0,2020-02-24,Asia,Afghanistan,5.0,5.0,0.0,0.0,0.0,0.0,0.122,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,0.0,5.0,
1,2020-02-25,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,0.0,2.5,
2,2020-02-26,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,0.0,1.666667,
3,2020-02-27,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,0.0,0.0,
4,2020-02-28,Asia,Afghanistan,5.0,0.0,0.0,0.0,0.0,0.0,0.122,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,0.0,0.0,


In [24]:
# Earliest and latest date in the training split.
(df_train['date'].min(), df_train['date'].max())

(Timestamp('2020-01-01 00:00:00'), Timestamp('2022-03-31 00:00:00'))

In [15]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,date,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,TARGET,new_cases_avg_3g,daily_death_ratio
767,2022-04-01,Asia,Afghanistan,177782.0,35.0,65.857,7670.0,0.0,1.857,4322.57,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,21.0,41.333333,0.0
768,2022-04-02,Asia,Afghanistan,177803.0,21.0,68.857,7671.0,1.0,2.0,4323.081,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,24.0,29.0,0.00013
769,2022-04-03,Asia,Afghanistan,177827.0,24.0,43.857,7671.0,0.0,1.286,4323.664,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,70.0,26.666667,0.0
770,2022-04-04,Asia,Afghanistan,177897.0,70.0,42.143,7671.0,0.0,1.143,4325.366,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,35.0,38.333333,0.0
771,2022-04-05,Asia,Afghanistan,177932.0,35.0,39.143,7671.0,0.0,0.857,4326.217,...,1803.987,597.029,9.59,37.746,0.5,64.83,0.511,42.0,43.0,0.0


In [25]:
# Earliest and latest date in the test split.
(df_test['date'].min(), df_test['date'].max())

(Timestamp('2022-04-01 00:00:00'), Timestamp('2022-10-30 00:00:00'))

### Model Performance

### Save Model

In [None]:
# Table of the model's feature importances, largest first, with the feature
# names (x_train's column labels) in a 'Variable' column.
# NOTE(review): `model` and `x_train` are not defined in the visible cells --
# they must be created in the "Model Performance" section before this runs.
df_features = (
    pd.DataFrame({'Importance': model.feature_importances_}, index=x_train.columns)
    .sort_values(by='Importance', ascending=False)
    .reset_index()
    .rename(columns={'index': 'Variable'})
)

In [35]:
# Serialise the final model. The context manager closes the file handle
# (and flushes the data) as soon as the dump finishes.
model_file = 'finalized_model.pickle'
with open(model_file, 'wb') as model_handle:
    pickle.dump(model, model_handle)  # model is your final model object.

# Serialise the table of model variables the same way.
variable_file = 'model_variables.pickle'
with open(variable_file, 'wb') as variable_handle:
    pickle.dump(df_features, variable_handle)  # df_features is dataframe that holds final model variables.