In [1]:
import json

import pandas as pd
from datetime import datetime, timedelta

import trane
import featuretools as ft
import numpy as np



In [2]:
# Load the sampled bike-trip records from disk.
df = pd.read_csv("data/bike-sampled.csv")
# Parse the date strings with a single vectorized call rather than a per-row
# datetime.strptime lambda; the explicit format keeps the parsing strict.
df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
# Order the rows by date.
df = df.sort_values(by=['date'])
df.tail()


Unnamed: 0,date,hour,usertype,gender,tripduration,temperature,from_station_id,dpcapacity_start,to_station_id,dpcapacity_end
3304,2017-01-31,10,Subscriber,Female,6.966667,37.9,247,19.0,247,19.0
3305,2017-01-31,10,Subscriber,Male,6.483333,37.9,425,15.0,426,19.0
3306,2017-01-31,10,Subscriber,Female,8.25,37.9,175,19.0,45,15.0
3299,2017-01-31,10,Subscriber,Male,16.266667,37.9,202,15.0,317,23.0
0,2017-01-31,23,Subscriber,Male,3.316667,35.1,230,19.0,131,15.0


In [3]:
entity_col = "__fake_root_entity__"
# Read the table metadata through a context manager so the file handle is
# closed as soon as parsing finishes instead of being left open.
with open('data/meta.json') as meta_file:
    meta = trane.TableMeta(json.load(meta_file))
df, meta = trane.overall_prediction_helper(df, meta)

# Map string-valued columns to integer codes on a copy of the frame.
df_ft = df.copy()
str_col_list = ['usertype', 'gender']
# str_mappers[col] = (value -> code dict, code -> value list)
str_mappers = {}
for str_col in str_col_list:
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # codes are the same on every run. Iterating a set of strings instead
    # would yield an order that varies between kernel restarts because of
    # string hash randomization.
    id_to_str = list(dict.fromkeys(df_ft[str_col]))
    str_to_id = {item: code for code, item in enumerate(id_to_str)}
    # Kept because the original loop left this counter behind in the
    # notebook namespace (number of distinct values in the current column).
    n_entity = len(id_to_str)

    # The source frame is re-encoded only when the entity column itself is
    # one of the string columns; with the entity_col set above this branch
    # is not taken for the columns listed in str_col_list.
    if str_col == entity_col:
        df[str_col] = df[str_col].map(str_to_id)
    df_ft[str_col] = df_ft[str_col].map(str_to_id)
    str_mappers[str_col] = (str_to_id, id_to_str)

df_ft.tail()

Unnamed: 0,date,hour,usertype,gender,tripduration,temperature,from_station_id,dpcapacity_start,to_station_id,dpcapacity_end,__fake_root_entity__
3304,2017-01-31,10,0,0,6.966667,37.9,247,19.0,247,19.0,0
3305,2017-01-31,10,0,1,6.483333,37.9,425,15.0,426,19.0,0
3306,2017-01-31,10,0,0,8.25,37.9,175,19.0,45,15.0,0
3299,2017-01-31,10,0,1,16.266667,37.9,202,15.0,317,23.0,0
0,2017-01-31,23,0,1,3.316667,35.1,230,19.0,131,15.0,0


In [6]:
# Boundary dates handed to the fixed-window cutoff strategy.
cutoff_base = datetime(2017, 1, 6)
cutoff_end = datetime(2017, 1, 31)
# NOTE(review): the trailing 1 is presumably the window size in days --
# confirm against the trane FixWindowCutoffStrategy signature.
cutoff_strategy = trane.FixWindowCutoffStrategy(
    entity_col, cutoff_base, cutoff_end, 1)

# Every listed column is declared Categorical to featuretools.
features = trane.FeaturetoolsWrapper(
    df_ft,
    entity_col,
    'date',
    dict.fromkeys(
        ['hour', 'usertype', 'gender', 'from_station_id', 'to_station_id'],
        ft.variable_types.Categorical,
    ),
    'bikes',
)
features.compute_features(df_ft, cutoff_strategy, 5)

Built 43 features
Elapsed: 00:08 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 3/3 chunks


In [10]:
# Configure the generator with the table metadata plus the entity and time
# columns, then ask it for the list of prediction problems.
problem_generator = trane.PredictionProblemGenerator(
    entity_col=entity_col,
    time_col="date",
    table_meta=meta,
)
problems = problem_generator.generate()


Success/Attempt = 0/0Success/Attempt = 1/1Success/Attempt = 1/2Success/Attempt = 2/3Success/Attempt = 2/4Success/Attempt = 2/5Success/Attempt = 3/6Success/Attempt = 4/7Success/Attempt = 5/8Success/Attempt = 5/9Success/Attempt = 5/10Success/Attempt = 5/11Success/Attempt = 6/12Success/Attempt = 6/13Success/Attempt = 7/14Success/Attempt = 8/15Success/Attempt = 8/16Success/Attempt = 8/17Success/Attempt = 8/18Success/Attempt = 8/19Success/Attempt = 9/20Success/Attempt = 10/21Success/Attempt = 11/22Success/Attempt = 11/23Success/Attempt = 12/24Success/Attempt = 13/25Success/Attempt = 13/26Success/Attempt = 13/27Success/Attempt = 13/28Success/Attempt = 13/29Success/Attempt = 14/30Success/Attempt = 15/31Success/Attempt = 15/32Success/Attempt = 16/33Success/Attempt = 16/34Success/Attempt = 16/35Success/Attempt = 17/36Success/Attempt = 18/37Success/Attempt = 19/38Success/Attempt = 19/39Success/Attempt = 19/40Success/Attempt = 19/41Success/Attempt = 19/4

Success/Attempt = 79/361Success/Attempt = 79/362Success/Attempt = 80/363Success/Attempt = 80/364Success/Attempt = 80/365Success/Attempt = 81/366Success/Attempt = 82/367Success/Attempt = 83/368Success/Attempt = 83/369Success/Attempt = 83/370Success/Attempt = 83/371Success/Attempt = 83/372Success/Attempt = 83/373Success/Attempt = 83/374Success/Attempt = 83/375Success/Attempt = 83/376Success/Attempt = 83/377Success/Attempt = 83/378Success/Attempt = 83/379Success/Attempt = 83/380Success/Attempt = 83/381Success/Attempt = 83/382Success/Attempt = 83/383Success/Attempt = 83/384Success/Attempt = 83/385Success/Attempt = 83/386Success/Attempt = 83/387Success/Attempt = 83/388Success/Attempt = 83/389Success/Attempt = 83/390Success/Attempt = 83/391Success/Attempt = 83/392Success/Attempt = 84/393Success/Attempt = 84/394Success/Attempt = 84/395Success/Attempt = 85/396Success/Attempt = 86/397Success/Attempt = 87/398Success/Attempt = 87/399Success/Attempt = 87/400

Success/Attempt = 271/1395Success/Attempt = 271/1396Success/Attempt = 271/1397Success/Attempt = 271/1398Success/Attempt = 271/1399Success/Attempt = 271/1400Success/Attempt = 271/1401Success/Attempt = 271/1402Success/Attempt = 272/1403Success/Attempt = 272/1404Success/Attempt = 272/1405Success/Attempt = 273/1406Success/Attempt = 274/1407Success/Attempt = 275/1408Success/Attempt = 275/1409Success/Attempt = 275/1410Success/Attempt = 275/1411Success/Attempt = 275/1412Success/Attempt = 276/1413Success/Attempt = 276/1414Success/Attempt = 276/1415Success/Attempt = 277/1416Success/Attempt = 278/1417Success/Attempt = 279/1418Success/Attempt = 279/1419Success/Attempt = 279/1420Success/Attempt = 279/1421Success/Attempt = 279/1422Success/Attempt = 279/1423Success/Attempt = 279/1424Success/Attempt = 279/1425Success/Attempt = 279/1426Success/Attempt = 279/1427Success/Attempt = 279/1428Success/Attempt = 279/1429Success/Attempt = 279/1430Success/Attempt = 279/1431

In [11]:
# Evaluator over the original frame `df` (not the integer-encoded `df_ft`),
# reusing the cutoff strategy built for feature computation.
# NOTE(review): the numeric thresholds below are passed through unchanged;
# their exact meaning is defined by trane.PredictionProblemEvaluator -- confirm
# there before tuning.
evaluator = trane.PredictionProblemEvaluator(
    df,
    entity_col=entity_col,
    cutoff_strategy=cutoff_strategy,
    min_train_set=5,
    min_test_set=5,
    previous_k_as_feature=2,
    latest_k_as_test=8,
)


In [12]:
# Run the evaluation over the generated problems with the computed features,
# then persist whatever it returns as JSON next to the notebook.
result = trane.multi_process_evaluation(evaluator, problems, features)
with open("prob_with_acc.json", "w") as out_file:
    json.dump(result, out_file)


  0%|          | 0/418 [00:00<?, ?it/s][A
  0%|          | 1/418 [00:00<02:21,  2.95it/s][A
  0%|          | 2/418 [00:01<03:36,  1.92it/s][A
  1%|          | 3/418 [00:04<09:17,  1.34s/it][A
  4%|▍         | 17/418 [00:06<02:22,  2.82it/s][A
  7%|▋         | 28/418 [00:07<01:44,  3.73it/s][A
  7%|▋         | 29/418 [00:07<01:42,  3.78it/s][A
  7%|▋         | 30/418 [00:08<01:44,  3.71it/s][A
  7%|▋         | 31/418 [00:08<01:43,  3.75it/s][A
 11%|█         | 47/418 [00:08<01:06,  5.56it/s][A
 12%|█▏        | 50/418 [00:08<01:04,  5.68it/s][A
 13%|█▎        | 56/418 [00:09<01:02,  5.83it/s][A
 14%|█▍        | 58/418 [00:09<01:01,  5.90it/s][A
 14%|█▍        | 60/418 [00:09<00:59,  6.03it/s][A
 15%|█▌        | 64/418 [00:10<00:57,  6.17it/s][A
 16%|█▌        | 66/418 [00:10<00:56,  6.22it/s][A
 16%|█▋        | 68/418 [00:10<00:55,  6.27it/s][A
 17%|█▋        | 70/418 [00:11<00:54,  6.33it/s][A
 18%|█▊        | 76/418 [00:11<00:53,  6.38it/s][A
 18%|█▊        | 77/418