In [1]:
import json

import pandas as pd
from datetime import datetime, timedelta

import trane
import featuretools as ft
import numpy as np
import sklearn
#print(sklearn.__version__)

In [2]:
df = pd.read_csv("covid19.csv")
df['DATE'] = df['Date'].apply(lambda x: datetime.strptime(x, "%m/%d/%y"))
df = df.sort_values(by=['Date'])
df= df.drop(['Deaths','Confirmed','Recovered','Lat','Long'], axis=1)
df.tail()

Unnamed: 0,Province/State,Country/Region,Date,DATE
11933,,Cuba,3/9/20,2020-03-09
11934,,Cyprus,3/9/20,2020-03-09
11935,,Czechia,3/9/20,2020-03-09
11922,Tibet,China,3/9/20,2020-03-09
11969,,Guyana,3/9/20,2020-03-09


In [3]:
entity_col = "Country/Region"
df_ft = df.copy()
str_col_list = ['Country/Region', 'Province/State']
str_mappers = {}
for str_col in str_col_list:
    str_to_id = {}
    id_to_str = []
    n_entity = 0

    for item in set(df_ft[str_col]):
        str_to_id[item] = n_entity
        id_to_str.append(item)
        n_entity += 1

    if str_col == entity_col:
        df[str_col] = df[str_col].apply(lambda x: str_to_id[x])
    df_ft[str_col] = df_ft[str_col].apply(lambda x: str_to_id[x])
    str_mappers[str_col] = (str_to_id, id_to_str)


meta = trane.TableMeta(json.loads(open('meta_covid.json').read()))

df_ft.tail()

Unnamed: 0,Province/State,Country/Region,Date,DATE
11933,0,142,3/9/20,2020-03-09
11934,0,37,3/9/20,2020-03-09
11935,0,119,3/9/20,2020-03-09
11922,21,151,3/9/20,2020-03-09
11969,0,40,3/9/20,2020-03-09


In [4]:
cutoff_base = datetime.strptime("2020-01-09", "%Y-%m-%d")
cutoff_end = datetime.strptime("2020-03-09", "%Y-%m-%d")
cutoff_strategy = trane.FixWindowCutoffStrategy(entity_col, cutoff_base, cutoff_end, 1)

features = trane.FeaturetoolsWrapper(df_ft, entity_col, 'DATE',
                                        {'Country/Region': ft.variable_types.Categorical,
                                         'Province/State': ft.variable_types.Categorical
                                        }, 'cases')
features.compute_features(df_ft, cutoff_strategy, 5)

Built 19 features
Elapsed: 00:15 | Progress: 100%|██████████




In [5]:
features.features


Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT(cases),MODE(cases.Province/State) = 0.0,MODE(cases.Province/State) = 8.0,MODE(cases.Province/State) = 4.0,MODE(cases.Province/State) = 1.0,MODE(cases.Province/State) is unknown,NUM_UNIQUE(cases.Province/State),MODE(cases.DAY(DATE)) = 1.0,MODE(cases.DAY(DATE)) = 22.0,MODE(cases.DAY(DATE)) = 25.0,...,MODE(cases.YEAR(Date)) = 2020.0,MODE(cases.YEAR(Date)) is unknown,NUM_UNIQUE(cases.DAY(DATE)),NUM_UNIQUE(cases.DAY(Date)),NUM_UNIQUE(cases.MONTH(DATE)),NUM_UNIQUE(cases.MONTH(Date)),NUM_UNIQUE(cases.WEEKDAY(DATE)),NUM_UNIQUE(cases.WEEKDAY(Date)),NUM_UNIQUE(cases.YEAR(DATE)),NUM_UNIQUE(cases.YEAR(Date))
Country/Region,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,2020-01-08,0.0,False,False,False,False,True,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2020-01-09,0.0,False,False,False,False,True,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2020-01-10,0.0,False,False,False,False,True,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2020-01-11,0.0,False,False,False,False,True,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2020-01-12,0.0,False,False,False,False,True,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,2020-03-03,5.0,True,False,False,False,False,1.0,True,False,False,...,True,False,5.0,5.0,2.0,2.0,5.0,5.0,1.0,1.0
176,2020-03-04,5.0,True,False,False,False,False,1.0,True,False,False,...,True,False,5.0,5.0,2.0,2.0,5.0,5.0,1.0,1.0
176,2020-03-05,5.0,True,False,False,False,False,1.0,True,False,False,...,True,False,5.0,5.0,1.0,1.0,5.0,5.0,1.0,1.0
176,2020-03-06,5.0,True,False,False,False,False,1.0,False,False,False,...,True,False,5.0,5.0,1.0,1.0,5.0,5.0,1.0,1.0


In [6]:
problem_generator = trane.PredictionProblemGenerator(
    table_meta=meta, entity_col=entity_col, time_col="Data")

problems = problem_generator.generate()

Success/Attempt = 231/1044


In [7]:
new_df = df[df['DATE'] < "2020-02-15"]
evaluator = trane.PredictionProblemEvaluator(new_df,
                                             entity_col=entity_col,
                                             cutoff_strategy=cutoff_strategy,
                                             min_train_set=20,
                                             min_test_set=20,
                                             previous_k_as_feature=2,
                                             latest_k_as_test=8
                                             )

In [8]:
for i in problems:
    print(i)

For each <Country/Region> predict the number of records
For each <Country/Region> predict the number of records with <Lat> greater than __
For each <Country/Region> predict the number of records with <Long> greater than __
For each <Country/Region> predict the number of records with <Confirmed> greater than __
For each <Country/Region> predict the number of records with <Deaths> greater than __
For each <Country/Region> predict the number of records with <Recovered> greater than __
For each <Country/Region> predict the number of records with <Lat> less than __
For each <Country/Region> predict the number of records with <Long> less than __
For each <Country/Region> predict the number of records with <Confirmed> less than __
For each <Country/Region> predict the number of records with <Deaths> less than __
For each <Country/Region> predict the number of records with <Recovered> less than __
For each <Country/Region> predict the total <Lat> in all related records
For each <Country/Region

In [None]:
result = trane.multi_process_evaluation(evaluator, problems[50:100], features)

  0%|          | 0/50 [00:02<?, ?it/s]


In [9]:
len(problems)

231

ERROR! Session/line number was not unique in database. History logging moved to new session 250


Process SpawnPoolWorker-2:
Traceback (most recent call last):
  File "/Users/sarapido/anaconda3/envs/Trane3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/sarapido/anaconda3/envs/Trane3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/sarapido/anaconda3/envs/Trane3.8/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/sarapido/anaconda3/envs/Trane3.8/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:
  File "/Users/sarapido/anaconda3/envs/Trane3.8/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
Process SpawnPoolWorker-8:
Traceback (most recent call last):
  File "/Users/sarapido/anaconda3/envs/Trane3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/sarapido/anaconda3/en