In [31]:
import json

import pandas as pd
from datetime import datetime, timedelta

import trane
import featuretools as ft
import numpy as np
import sklearn
#print(sklearn.__version__)

In [36]:
df = pd.read_csv("medical_no_show.csv")
df['scheduled_day'] = df['scheduled_day'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
df['appointment_day'] = df['appointment_day'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
df = df.sort_values(by=['scheduled_day'])
df.tail()

Unnamed: 0,appointment_id,patient_id,appointment_id.1,gender,scheduled_day,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
92442,5790461,729255200000000.0,5790461,M,2016-06-08 19:32:25,2016-06-08,54,JARDIM CAMBURI,0,0,0,0,0,0,0
88146,5790464,947614400000000.0,5790464,F,2016-06-08 19:32:56,2016-06-08,43,JARDIM CAMBURI,0,0,0,0,0,0,0
88147,5790466,356247900000.0,5790466,M,2016-06-08 19:33:23,2016-06-08,27,JARDIM CAMBURI,0,0,0,0,0,0,0
87219,5790481,234131800000.0,5790481,F,2016-06-08 19:58:52,2016-06-08,30,JARDIM CAMBURI,0,0,0,0,0,0,0
87223,5790484,5237164000000.0,5790484,F,2016-06-08 20:07:23,2016-06-08,27,JARDIM CAMBURI,0,0,0,0,0,0,0


In [38]:
entity_col = "patient_id"
df_ft = df.copy()
df_ft.head()

Unnamed: 0,appointment_id,patient_id,appointment_id.1,gender,scheduled_day,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
3764,5030230,832256400000000.0,5030230,F,2015-11-10 07:13:56,2016-05-04,51,RESISTÊNCIA,0,0,0,0,0,1,0
46292,5122866,91637470000000.0,5122866,M,2015-12-03 08:17:28,2016-05-02,34,VILA RUBIM,0,1,0,0,0,1,1
102795,5134197,1216587000000.0,5134197,F,2015-12-07 10:40:59,2016-06-03,27,SÃO CRISTÓVÃO,1,0,0,0,0,1,1
102797,5134220,31899600000000.0,5134220,F,2015-12-07 10:42:42,2016-06-03,48,MARUÍPE,0,1,1,0,0,1,0
102796,5134223,9582232000000.0,5134223,F,2015-12-07 10:43:01,2016-06-03,80,SÃO CRISTÓVÃO,0,1,1,0,0,1,0


In [39]:
str_col_list = ['patient_id', 'appointment_id', 'appointment_id.1','gender', 'neighborhood']
str_mappers = {}
for str_col in str_col_list:
    str_to_id = {}
    id_to_str = []
    n_entity = 0

    for item in set(df_ft[str_col]):
        str_to_id[item] = n_entity
        id_to_str.append(item)
        n_entity += 1

    if str_col == entity_col:
        df[str_col] = df[str_col].apply(lambda x: str_to_id[x])
    df_ft[str_col] = df_ft[str_col].apply(lambda x: str_to_id[x])
    str_mappers[str_col] = (str_to_id, id_to_str)


meta = trane.TableMeta(json.loads(open('meta.json').read()))
meta.table_meta

{'tables': [{'fields': [{'name': 'patient_id', 'type': 'text'},
    {'name': 'appointment_id', 'type': 'text'},
    {'name': 'gender', 'type': 'text'},
    {'name': 'scheduled_day', 'type': 'datetime'},
    {'name': 'appointment_day', 'type': 'datetime'},
    {'name': 'age', 'type': 'number', 'subtype': 'integer'},
    {'name': 'neighborhood', 'type': 'categorical', 'subtype': 'categorical'},
    {'name': 'scholarship', 'type': 'categorical', 'subtype': 'boolean'},
    {'name': 'hipertension', 'type': 'categorical', 'subtype': 'boolean'},
    {'name': 'diabetes', 'type': 'categorical', 'subtype': 'boolean'},
    {'name': 'alcoholism', 'type': 'categorical', 'subtype': 'boolean'},
    {'name': 'handcap', 'type': 'categorical', 'subtype': 'boolean'},
    {'name': 'sms_received', 'type': 'categorical', 'subtype': 'boolean'},
    {'name': 'no_show', 'type': 'number', 'subtype': 'integer'}]}]}

In [40]:
df_ft.tail()

Unnamed: 0,appointment_id,patient_id,appointment_id.1,gender,scheduled_day,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
92442,9376,54589,9376,1,2016-06-08 19:32:25,2016-06-08,54,67,0,0,0,0,0,0,0
88146,9378,16588,9378,0,2016-06-08 19:32:56,2016-06-08,43,67,0,0,0,0,0,0,0
88147,9379,28525,9379,1,2016-06-08 19:33:23,2016-06-08,27,67,0,0,0,0,0,0,0
87219,9382,35199,9382,0,2016-06-08 19:58:52,2016-06-08,30,67,0,0,0,0,0,0,0
87223,9383,24818,9383,0,2016-06-08 20:07:23,2016-06-08,27,67,0,0,0,0,0,0,0


In [42]:
cutoff_base = datetime.strptime("2016-05-04", "%Y-%m-%d")
cutoff_end = datetime.strptime("2016-06-08", "%Y-%m-%d")
cutoff_strategy = trane.FixWindowCutoffStrategy(entity_col, cutoff_base, cutoff_end, 1)

features = trane.FeaturetoolsWrapper(df_ft, entity_col, 'appointment_day',
                                        {'gender': ft.variable_types.Categorical,
                                         'age': ft.variable_types.Discrete,
                                         'neighborhood': ft.variable_types.Categorical,
                                         'scholarship': ft.variable_types.Categorical,
                                         'hypertension': ft.variable_types.Categorical,
                                         'diabetes': ft.variable_types.Categorical,
                                         'alcoholism': ft.variable_types.Categorical,
                                         'handicap': ft.variable_types.Categorical,
                                         'sms_received': ft.variable_types.Categorical,
                                         'no_show': ft.variable_types.Categorical,
                                        }, 'cases')
features.compute_features(df_ft, cutoff_strategy, 5)

Built 49 features
Elapsed: 31:33 | Progress: 100%|██████████




In [43]:
features.features

Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT(cases),MAX(cases.appointment_id),MAX(cases.appointment_id.1),MEAN(cases.appointment_id),MEAN(cases.appointment_id.1),MIN(cases.appointment_id),MIN(cases.appointment_id.1),MODE(cases.age) = 0.0,MODE(cases.age) = 1.0,MODE(cases.age) = 52.0,...,MODE(cases.YEAR(scheduled_day)) = 2015.0,MODE(cases.YEAR(scheduled_day)) is unknown,NUM_UNIQUE(cases.DAY(appointment_day)),NUM_UNIQUE(cases.DAY(scheduled_day)),NUM_UNIQUE(cases.MONTH(appointment_day)),NUM_UNIQUE(cases.MONTH(scheduled_day)),NUM_UNIQUE(cases.WEEKDAY(appointment_day)),NUM_UNIQUE(cases.WEEKDAY(scheduled_day)),NUM_UNIQUE(cases.YEAR(appointment_day)),NUM_UNIQUE(cases.YEAR(scheduled_day))
patient_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,2016-05-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2016-05-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2016-05-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2016-05-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2016-05-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62298,2016-06-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62298,2016-06-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62298,2016-06-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62298,2016-06-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,...,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
problem_generator = trane.PredictionProblemGenerator(
    table_meta=meta, entity_col=entity_col, time_col="Data")

problems = problem_generator.generate()

Success/Attempt = 70/3498


In [46]:
new_df = df[df['appointment_day'] < "2016-05-25"]
new_df.tail()

Unnamed: 0,appointment_id,patient_id,appointment_id.1,gender,scheduled_day,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
62665,5735221,55063,5735221,M,2016-05-24 19:16:33,2016-05-24,49,JARDIM CAMBURI,0,0,0,0,0,0,0
7709,5735229,57835,5735229,M,2016-05-24 19:30:32,2016-05-24,43,REDENÇÃO,0,0,0,0,0,0,0
62669,5735231,55488,5735231,F,2016-05-24 19:31:49,2016-05-24,47,JARDIM CAMBURI,0,0,0,0,0,0,0
62673,5735233,31930,5735233,F,2016-05-24 19:42:04,2016-05-24,39,JARDIM CAMBURI,0,0,0,0,0,0,0
20533,5735234,48295,5735234,F,2016-05-24 19:44:47,2016-05-24,47,SANTA MARTHA,0,0,0,1,0,0,0


In [47]:
evaluator = trane.PredictionProblemEvaluator(new_df,
                                             entity_col=entity_col,
                                             cutoff_strategy=cutoff_strategy,
                                             min_train_set=20,
                                             min_test_set=20,
                                             previous_k_as_feature=2,
                                             latest_k_as_test=8
                                             )

In [48]:
for i in problems:
    print(i)

For each <patient_id> predict the number of records
For each <patient_id> predict the number of records with <age> greater than __
For each <patient_id> predict the number of records with <no_show> greater than __
For each <patient_id> predict the number of records with <neighborhood> equal to __
For each <patient_id> predict the number of records with <neighborhood> not equal to __
For each <patient_id> predict the number of records with <age> less than __
For each <patient_id> predict the number of records with <no_show> less than __
For each <patient_id> predict the total <age> in all related records
For each <patient_id> predict the total <no_show> in all related records
For each <patient_id> predict the total <age> in all related records with <age> greater than __
For each <patient_id> predict the total <no_show> in all related records with <age> greater than __
For each <patient_id> predict the total <age> in all related records with <no_show> greater than __
For each <patient_id

In [None]:
result = trane.multi_process_evaluation(evaluator, problems, features)

ERROR! Session/line number was not unique in database. History logging moved to new session 252


  0%|          | 0/70 [00:12<?, ?it/s]


In [30]:
len(problems)

1