In [1]:
# import packages
import pandas as pd
import json
import utils
import utils_hist
import numpy as np
from pathlib import Path
import glob
import os

In [2]:
def data_processing(file_name, data_dir = '../../../../betfair/betfair_project/data/raw/api/advanced'):
    
    print(f'Processing {file_name}')
          
    # stream in historic data
    print('Streaming data.')
    df = pd.DataFrame(utils_hist.stream_data(f'{data_dir}/{file_name}'))
    
    print('Processing data.')
    # extract time -> seconds
    df["time_sec"] = df["Time"].dt.round("1s")

    # remove duplicates (by second)
    df = df.drop_duplicates(["SelectionId", "time_sec"], "last")

    # keep races only before off
    df = df.loc[(df["Inplay"] == False) & (df["Status"] == "OPEN")].copy()

    # remove non-runners
    nrs = df.loc[df['RunnerStatus'] == 'REMOVED']['SelectionId'].unique()
    df = df.loc[~df['SelectionId'].isin(nrs)].copy()

    # remove outsiders (100/1 +)
    outs = df.loc[df['LastPriceTraded'] > 100]['SelectionId'].unique()
    df = df.loc[~df['SelectionId'].isin(outs)].copy()

    # find imp-prob
    df = df.loc[df["LastPriceTraded"] != 0].copy()
    df["imp_prob"] = df["LastPriceTraded"].apply(lambda x: round(1 / x, 4) * 100)

    # drop unnecessary columns
    df = df[["time_sec", "SelectionId", "LastPriceTraded", "imp_prob"]].copy()

    # preview data
    print('Data sample\n', df.head())

    # write out data
    out_dir = f'data/{file_name}/{file_name}.csv'
    print(f'Writing data to {out_dir}')
    Path(f"data/{file_name}").mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir, index=False)
    print('\n')
    

In [3]:
# test file
file_name = '1.168899357'
data_processing(file_name)

Processing 1.168899357
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
2612 2020-02-16 17:42:16     12249254             3.25     30.77
2617 2020-02-16 17:42:17     12249254             3.25     30.77
2622 2020-02-16 17:42:23     12249254             3.25     30.77
2627 2020-02-16 17:42:25     12249254             3.25     30.77
2632 2020-02-16 17:42:28     12249254             3.25     30.77
Writing data to data/1.168899357/1.168899357.csv




---

### Processing all files in advanced

In [4]:
all_files_path = '../../../../betfair/betfair_project/data/raw/api/advanced/*[!.bz2]'

all_files = [os.path.basename(x) for x in glob.glob(all_files_path)]

In [5]:
n = len(all_files)

for i, f in enumerate(all_files[0:500]):
    print(f'Processing file {i+1}/{n}')
    data_processing(f)

Processing file 1/1657
Processing 1.169028429
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1230 2020-02-19 17:12:04     24496216             3.95     25.32
1239 2020-02-19 17:12:10     24496216             3.95     25.32
1248 2020-02-19 17:12:21     24496216             3.95     25.32
1257 2020-02-19 17:12:27     24496216             3.95     25.32
1266 2020-02-19 17:12:32     24496216             3.95     25.32
Writing data to data/1.169028429/1.169028429.csv


Processing file 2/1657
Processing 1.168013705
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
6536 2020-01-28 17:29:06     14972106             42.0      2.38
6550 2020-01-28 17:29:09     14972106             42.0      2.38
6564 2020-01-28 17:29:10     14972106             42.0      2.38
6578 2020-01-28 17:29:20     14972106             42.0      2.38
6592 2020-01-28 17:29:33     14972106             



Processing file 17/1657
Processing 1.168618625
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
4124 2020-02-10 16:38:12     19951404             2.28     43.86
4130 2020-02-10 16:38:13     19951404             2.28     43.86
4136 2020-02-10 16:38:15     19951404             2.28     43.86
4142 2020-02-10 16:38:18     19951404             2.28     43.86
4148 2020-02-10 16:38:19     19951404             2.28     43.86
Writing data to data/1.168618625/1.168618625.csv


Processing file 18/1657
Processing 1.167624165
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
4654 2020-01-19 17:13:59     15273880              5.4     18.52
4665 2020-01-19 17:14:02     15273880              5.4     18.52
4676 2020-01-19 17:14:15     15273880              5.4     18.52
4687 2020-01-19 17:14:20     15273880              5.4     18.52
4698 2020-01-19 17:14:36     15273880         



Processing file 33/1657
Processing 1.169668056
Streaming data.
Processing data.
Data sample
 Empty DataFrame
Columns: [time_sec, SelectionId, LastPriceTraded, imp_prob]
Index: []
Writing data to data/1.169668056/1.169668056.csv


Processing file 34/1657
Processing 1.168755339
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3327 2020-02-13 17:10:30     19810280              8.2      12.2
3332 2020-02-13 17:10:34     19810280              8.2      12.2
3337 2020-02-13 17:10:59     19810280              8.2      12.2
3347 2020-02-13 17:11:07     19810280              8.2      12.2
3352 2020-02-13 17:11:11     19810280              8.2      12.2
Writing data to data/1.168755339/1.168755339.csv


Processing file 35/1657
Processing 1.170180836
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1060 2020-03-16 16:17:44      7338826             2.02      49.5
1066 2020-0



Processing file 50/1657
Processing 1.169971967
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
2409 2020-03-09 15:56:54      2744673              5.1     19.61
2416 2020-03-09 15:56:59      2744673              5.1     19.61
2423 2020-03-09 15:57:01      2744673              5.1     19.61
2430 2020-03-09 15:57:04      2744673              5.1     19.61
2437 2020-03-09 15:57:07      2744673              5.1     19.61
Writing data to data/1.169971967/1.169971967.csv


Processing file 51/1657
Processing 1.167836957
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
695 2020-01-24 16:35:08     10864518              3.5     28.57
707 2020-01-24 16:35:11     10864518              3.5     28.57
713 2020-01-24 16:35:13     10864518              3.5     28.57
725 2020-01-24 16:35:21     10864518              3.5     28.57
731 2020-01-24 16:35:23     10864518              3



Processing file 66/1657
Processing 1.168799543
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
2231 2020-02-14 18:27:48     27596976              6.8     14.71
2237 2020-02-14 18:27:49      7431550             17.0      5.88
2240 2020-02-14 18:27:49     27596976              6.8     14.71
2246 2020-02-14 18:27:50      7431550             17.0      5.88
2249 2020-02-14 18:27:50     27596976              6.8     14.71
Writing data to data/1.168799543/1.168799543.csv


Processing file 67/1657
Processing 1.167486096
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3400 2020-01-16 18:06:39       149259              9.8      10.2
3410 2020-01-16 18:06:40       149259              9.8      10.2
3430 2020-01-16 18:06:41       149259              9.8      10.2
3450 2020-01-16 18:07:08       149259              9.8      10.2
3460 2020-01-16 18:07:16       149259         



Processing file 82/1657
Processing 1.169392208
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3195 2020-02-27 17:56:00     23682829              2.2     45.45
3205 2020-02-27 17:56:01     23682829              2.2     45.45
3215 2020-02-27 17:56:02     23682829              2.2     45.45
3225 2020-02-27 17:56:06     23682829              2.2     45.45
3235 2020-02-27 17:56:16     23682829              2.2     45.45
Writing data to data/1.169392208/1.169392208.csv


Processing file 83/1657
Processing 1.168392957
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1268 2020-02-05 16:35:27     11911785             12.0      8.33
1280 2020-02-05 16:35:29     11911785             12.0      8.33
1292 2020-02-05 16:35:38     11911785             12.0      8.33
1304 2020-02-05 16:35:39     11911785             12.0      8.33
1328 2020-02-05 16:35:50     11911785         

Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
4159 2020-02-29 16:05:11     27477602             1.03     97.09
4167 2020-02-29 16:05:13     27477602             1.03     97.09
4175 2020-02-29 16:05:27     27477602             1.03     97.09
4183 2020-02-29 16:05:32     27477602             1.03     97.09
4199 2020-02-29 16:05:33     27477602             1.03     97.09
Writing data to data/1.169500956/1.169500956.csv


Processing file 99/1657
Processing 1.169101620
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
561 2020-02-21 12:18:35     12327656             11.0      9.09
573 2020-02-21 12:18:46     12327656             11.0      9.09
585 2020-02-21 12:23:09     12327656             11.0      9.09
597 2020-02-21 12:23:21     12327656             11.0      9.09
609 2020-02-21 12:24:05     12327656             11.0      9.09
Writing data to data/1.169101620/1.169101620.csv


P

Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1643 2020-02-25 17:11:12     25525995              2.8     35.71
1648 2020-02-25 17:11:14     25525995              2.8     35.71
1653 2020-02-25 17:11:16     25525995              2.8     35.71
1658 2020-02-25 17:11:18     25525995              2.8     35.71
1663 2020-02-25 17:11:27     25525995              2.8     35.71
Writing data to data/1.169309187/1.169309187.csv


Processing file 115/1657
Processing 1.169920596
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
2044 2020-03-08 11:34:22     11082737              9.0     11.11
2078 2020-03-08 11:34:24     11082737              9.0     11.11
2095 2020-03-08 11:34:28     11082737              9.0     11.11
2112 2020-03-08 11:34:34     11082737              9.0     11.11
2146 2020-03-08 11:34:41     11082737              9.0     11.11
Writing data to data/1.169920596/1.169920596.



Processing file 130/1657
Processing 1.167393963
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
662 2020-01-14 15:48:03     18172985             3.40     29.41
669 2020-01-14 15:48:05     18172985             3.55     28.17
676 2020-01-14 15:48:07     18172985             3.55     28.17
683 2020-01-14 15:48:09     18172985             3.55     28.17
690 2020-01-14 15:48:11     18172985             3.55     28.17
Writing data to data/1.167393963/1.167393963.csv


Processing file 131/1657
Processing 1.169201654
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3319 2020-02-23 14:49:37     11461775              7.0     14.29
3327 2020-02-23 14:49:41     11461775              7.0     14.29
3343 2020-02-23 14:49:42     11461775              7.0     14.29
3351 2020-02-23 14:49:51     11461775              7.0     14.29
3359 2020-02-23 14:50:14     11461775             



Processing file 146/1657
Processing 1.167929655
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3273 2020-01-26 16:33:43     12533127             2.38     42.02
3281 2020-01-26 16:36:27     12533127             2.38     42.02
3289 2020-01-26 16:36:40     12533127             2.38     42.02
3305 2020-01-26 16:36:41     12533127             2.38     42.02
3313 2020-01-26 16:37:03     12533127             2.38     42.02
Writing data to data/1.167929655/1.167929655.csv


Processing file 147/1657
Processing 1.168096573
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
670 2020-01-30 16:09:39        11381             13.0      7.69
682 2020-01-30 16:50:47        11381             13.0      7.69
694 2020-01-30 16:50:55        11381             13.0      7.69
706 2020-01-30 16:51:01        11381             13.0      7.69
718 2020-01-30 16:51:10        11381             

Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1320 2020-02-28 15:19:09     19186430              3.2     31.25
1326 2020-02-28 15:19:17     19186430              3.2     31.25
1332 2020-02-28 15:19:33     19186430              3.2     31.25
1338 2020-02-28 15:19:35     19186430              3.2     31.25
1344 2020-02-28 15:19:39     19186430              3.2     31.25
Writing data to data/1.169434623/1.169434623.csv


Processing file 163/1657
Processing 1.170069284
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
4663 2020-03-11 18:38:38        21955             2.74      36.5
4673 2020-03-11 18:38:41        21955             2.74      36.5
4683 2020-03-11 18:38:48        21955             2.74      36.5
4693 2020-03-11 18:38:49        21955             2.74      36.5
4703 2020-03-11 18:38:50        21955             2.74      36.5
Writing data to data/1.170069284/1.170069284.

Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3142 2020-02-17 16:38:21     21166844              5.0      20.0
3148 2020-02-17 16:38:29     21166844              5.0      20.0
3154 2020-02-17 16:38:43     21166844              5.0      20.0
3160 2020-02-17 16:38:48     21166844              5.0      20.0
3166 2020-02-17 16:39:11     21166844              5.0      20.0
Writing data to data/1.168937485/1.168937485.csv


Processing file 179/1657
Processing 1.168988137
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1643 2020-02-18 17:32:48     11432465              4.8     20.83
1655 2020-02-18 17:32:56     11432465              4.7     21.28
1667 2020-02-18 17:32:58     11432465              4.8     20.83
1679 2020-02-18 17:33:03     11432465              4.8     20.83
1691 2020-02-18 17:33:06     11432465              4.8     20.83
Writing data to data/1.168988137/1.168988137.



Processing file 194/1657
Processing 1.169309181
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1936 2020-02-25 13:49:54     13415080              7.0     14.29
1947 2020-02-25 13:49:56     13415080              7.0     14.29
1958 2020-02-25 13:50:08     13415080              7.0     14.29
1969 2020-02-25 13:50:12     13415080              7.0     14.29
1980 2020-02-25 13:50:16     13415080              7.0     14.29
Writing data to data/1.169309181/1.169309181.csv


Processing file 195/1657
Processing 1.166942630
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1014 2020-01-02 17:26:12     15363580             4.60     21.74
1025 2020-01-02 17:26:25     15363580             4.60     21.74
1036 2020-01-02 17:26:44     15363580             3.85     25.97
1047 2020-01-02 17:26:45     15363580             3.85     25.97
1058 2020-01-02 17:26:46     15363580       



Processing file 210/1657
Processing 1.168473820
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1363 2020-02-07 17:27:01     18856306              4.8     20.83
1368 2020-02-07 17:27:03     18856306              4.8     20.83
1373 2020-02-07 17:27:04     18856306              4.8     20.83
1383 2020-02-07 17:27:05     18856306              4.8     20.83
1388 2020-02-07 17:27:11     18856306              4.8     20.83
Writing data to data/1.168473820/1.168473820.csv


Processing file 211/1657
Processing 1.168393283
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
649 2020-02-05 15:59:24     19936281              7.0     14.29
655 2020-02-05 16:31:59      9659886             14.0      7.14
662 2020-02-05 16:31:59     19936281              7.0     14.29
668 2020-02-05 16:32:04      9659886             14.0      7.14
675 2020-02-05 16:32:04     19936281             



Processing file 226/1657
Processing 1.167106104
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
6269 2020-01-06 17:20:40     10108396              4.2     23.81
6278 2020-01-06 17:20:41     10108396              4.2     23.81
6287 2020-01-06 17:20:45     10108396              4.2     23.81
6296 2020-01-06 17:20:46     10108396              4.2     23.81
6305 2020-01-06 17:21:06     10108396              4.2     23.81
Writing data to data/1.167106104/1.167106104.csv


Processing file 227/1657
Processing 1.167486111
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1201 2020-01-16 15:57:02     17784375             3.95     25.32
1212 2020-01-16 15:59:46     17784375             3.95     25.32
1223 2020-01-16 16:00:33     17784375             3.95     25.32
1234 2020-01-16 16:01:54     17784375             3.95     25.32
1245 2020-01-16 16:02:03     17784375       



Processing file 242/1657
Processing 1.170021047
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3435 2020-03-10 17:13:07       847179              7.2     13.89
3448 2020-03-10 17:13:10       847179              7.2     13.89
3474 2020-03-10 17:13:11       847179              7.2     13.89
3487 2020-03-10 17:13:12       847179              7.2     13.89
3500 2020-03-10 17:13:14       847179              7.2     13.89
Writing data to data/1.170021047/1.170021047.csv


Processing file 243/1657
Processing 1.167300998
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3799 2020-01-11 16:12:20     22628368              4.6     21.74
3809 2020-01-11 16:12:21     22628368              4.6     21.74
3819 2020-01-11 16:12:35     22628368              4.6     21.74
3829 2020-01-11 16:12:36     22628368              4.6     21.74
3839 2020-01-11 16:12:37     22628368       



Processing file 258/1657
Processing 1.167967933
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
5192 2020-01-27 16:44:00     18976983              4.6     21.74
5199 2020-01-27 16:44:11     18976983              4.5     22.22
5206 2020-01-27 16:44:12     18976983              4.5     22.22
5220 2020-01-27 16:44:13     18976983              4.5     22.22
5227 2020-01-27 16:44:16     18976983              4.5     22.22
Writing data to data/1.167967933/1.167967933.csv


Processing file 259/1657
Processing 1.167176619
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
4185 2020-01-08 15:55:51     13164979              7.2     13.89
4199 2020-01-08 15:55:58     13164979              7.2     13.89
4213 2020-01-08 15:56:05     13164979              7.2     13.89
4227 2020-01-08 15:56:19     13164979              7.2     13.89
4241 2020-01-08 15:56:22     13164979       



Processing file 274/1657
Processing 1.166898779
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1562 2019-12-31 17:20:36     23308557             3.75     26.67
1565 2019-12-31 17:20:36       322524             6.00     16.67
1568 2019-12-31 17:20:39     23308557             3.75     26.67
1569 2019-12-31 17:20:39       619299             1.92     52.08
1571 2019-12-31 17:20:39       322524             6.00     16.67
Writing data to data/1.166898779/1.166898779.csv


Processing file 275/1657
Processing 1.166899499
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
734 2019-12-31 15:57:07     12368000             3.45     28.99
750 2019-12-31 15:57:10     12368000             3.55     28.17
758 2019-12-31 15:57:13     12368000             3.55     28.17
774 2019-12-31 15:57:14     12368000             3.65     27.40
790 2019-12-31 15:57:18     12368000             

Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1113 2020-03-02 17:23:15     17913702              3.9     25.64
1117 2020-03-02 17:23:17     17913702              3.9     25.64
1121 2020-03-02 17:23:19     17913702              3.9     25.64
1125 2020-03-02 17:23:21     17913702              3.9     25.64
1133 2020-03-02 17:23:22     17913702              3.0     33.33
Writing data to data/1.169619389/1.169619389.csv


Processing file 291/1657
Processing 1.168524719
Streaming data.
Processing data.
Data sample
                  time_sec  SelectionId  LastPriceTraded  imp_prob
41988 2020-02-08 16:27:44     27442033              4.2     23.81
42004 2020-02-08 16:27:46     27442033              4.2     23.81
42020 2020-02-08 16:27:47     27442033              4.2     23.81
42036 2020-02-08 16:27:48     27442033              4.2     23.81
42052 2020-02-08 16:27:49     27442033              4.2     23.81
Writing data to data/1.168524719/1.1685

Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
263 2020-03-10 11:38:19     17573690             2.92     34.25
271 2020-03-10 11:38:56     17573690             2.92     34.25
279 2020-03-10 11:39:01     17573690             2.92     34.25
287 2020-03-10 11:45:16     17573690             2.92     34.25
295 2020-03-10 11:45:25     17573690             2.92     34.25
Writing data to data/1.170025447/1.170025447.csv


Processing file 307/1657
Processing 1.169619374
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
8526 2020-03-02 16:58:27     11082753             2.76     36.23
8538 2020-03-02 16:58:28     11082753             2.76     36.23
8550 2020-03-02 16:58:33     11082753             2.76     36.23
8562 2020-03-02 16:58:50     11082753             2.76     36.23
8574 2020-03-02 16:58:51     11082753             2.76     36.23
Writing data to data/1.169619374/1.169619374.csv






Processing file 322/1657
Processing 1.170025440
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
645 2020-03-10 11:27:24     16333098             21.0      4.76
671 2020-03-10 11:27:50     16333098             21.0      4.76
697 2020-03-10 11:27:59     16333098             21.0      4.76
723 2020-03-10 11:28:20     16333098             21.0      4.76
749 2020-03-10 11:28:22     16333098             21.0      4.76
Writing data to data/1.170025440/1.170025440.csv


Processing file 323/1657
Processing 1.167524205
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
562 2020-01-17 16:18:43      3268165              6.8     14.71
572 2020-01-17 16:18:49      3268165              6.8     14.71
592 2020-01-17 16:18:50      3268165              6.8     14.71
602 2020-01-17 16:18:55      3268165              6.8     14.71
612 2020-01-17 16:19:10      3268165              6.8  



Processing file 338/1657
Processing 1.169056716
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3915 2020-02-20 17:34:12     13484889              8.4      11.9
3928 2020-02-20 17:34:59     13484889              8.4      11.9
3941 2020-02-20 17:35:03     13484889              8.4      11.9
3954 2020-02-20 17:36:32     13484889              8.4      11.9
3967 2020-02-20 17:36:33     13484889              8.4      11.9
Writing data to data/1.169056716/1.169056716.csv


Processing file 339/1657
Processing 1.167755367
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
2769 2020-01-22 17:12:55     27072334             1.62     61.73
2777 2020-01-22 17:12:58     27072334             1.62     61.73
2785 2020-01-22 17:13:02     27072334             1.62     61.73
2793 2020-01-22 17:13:04     27072334             1.59     62.89
2801 2020-01-22 17:13:07     27072334       



Processing file 354/1657
Processing 1.169056711
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1747 2020-02-20 17:09:28       308650              2.9     34.48
1750 2020-02-20 17:09:29       308650              2.9     34.48
1753 2020-02-20 17:09:30       308650              2.9     34.48
1756 2020-02-20 17:09:43       308650              2.9     34.48
1759 2020-02-20 17:09:46       308650              2.9     34.48
Writing data to data/1.169056711/1.169056711.csv


Processing file 355/1657
Processing 1.166898714
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1553 2019-12-31 16:23:06     13030837              3.2     31.25
1559 2019-12-31 16:23:11     13030837              3.2     31.25
1565 2019-12-31 16:23:13     13030837              3.2     31.25
1571 2019-12-31 16:23:16     13030837              3.2     31.25
1577 2019-12-31 16:23:22     13030837       



Processing file 370/1657
Processing 1.167934650
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1251 2020-01-26 15:12:34     24462986             2.04     49.02
1261 2020-01-26 15:12:35     24462986             2.04     49.02
1271 2020-01-26 15:12:37     24462986             2.04     49.02
1281 2020-01-26 15:12:45     24462986             2.04     49.02
1291 2020-01-26 15:16:10     24462986             2.04     49.02
Writing data to data/1.167934650/1.167934650.csv


Processing file 371/1657
Processing 1.167756518
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1450 2020-01-22 17:02:15      9517676              5.3     18.87
1458 2020-01-22 17:02:16      9517676              5.3     18.87
1466 2020-01-22 17:02:17      9517676              5.3     18.87
1474 2020-01-22 17:02:19      9517676              5.3     18.87
1482 2020-01-22 17:02:20      9517676       



Processing file 386/1657
Processing 1.167785127
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
961  2020-01-23 16:19:16      9500664              5.4     18.52
983  2020-01-23 16:19:34      9500664              5.4     18.52
994  2020-01-23 16:19:37      9500664              5.4     18.52
1005 2020-01-23 16:19:38      9500664              5.4     18.52
1016 2020-01-23 16:19:45      9500664              5.4     18.52
Writing data to data/1.167785127/1.167785127.csv


Processing file 387/1657
Processing 1.168431540
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3948 2020-02-06 17:29:26     23597536              4.0      25.0
3957 2020-02-06 17:29:28     23597536              4.0      25.0
3975 2020-02-06 17:29:31     23597536              4.0      25.0
3984 2020-02-06 17:29:45     23597536              4.0      25.0
3993 2020-02-06 17:30:26     23597536       



Processing file 402/1657
Processing 1.167662065
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
3684 2020-01-20 16:14:48     10512023              4.1     24.39
3698 2020-01-20 16:15:04     10512023              4.1     24.39
3712 2020-01-20 16:15:08     10512023              4.1     24.39
3726 2020-01-20 16:15:22     10512023              4.1     24.39
3740 2020-01-20 16:15:36     10512023              4.1     24.39
Writing data to data/1.167662065/1.167662065.csv


Processing file 403/1657
Processing 1.167343068
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
456 2020-01-12 15:10:26     12281462             13.0      7.69
465 2020-01-12 15:10:32     12281462             13.0      7.69
474 2020-01-12 15:10:36     12281462             13.0      7.69
483 2020-01-12 15:10:46     12281462             13.0      7.69
492 2020-01-12 15:14:35     12281462             



Processing file 418/1657
Processing 1.169211584
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
5451 2020-02-23 17:28:46     13306148             2.22     45.05
5459 2020-02-23 17:28:48     13306148             2.22     45.05
5467 2020-02-23 17:28:59     13306148             2.20     45.45
5475 2020-02-23 17:29:01     13306148             2.20     45.45
5483 2020-02-23 17:29:03     13306148             2.20     45.45
Writing data to data/1.169211584/1.169211584.csv


Processing file 419/1657
Processing 1.168845458
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
4839 2020-02-15 16:06:56     12925623              2.9     34.48
4845 2020-02-15 16:06:58     12925623              2.9     34.48
4851 2020-02-15 16:07:00     12925623              2.9     34.48
4857 2020-02-15 16:07:02     12925623              2.9     34.48
4863 2020-02-15 16:07:03     12925623       



Processing file 434/1657
Processing 1.168060371
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1307 2020-01-29 16:42:53      5628291             2.88     34.72
1316 2020-01-29 16:43:05      5628291             2.88     34.72
1325 2020-01-29 16:43:14      5628291             2.88     34.72
1326 2020-01-29 16:43:14     12126337             7.20     13.89
1334 2020-01-29 16:43:19      5628291             2.88     34.72
Writing data to data/1.168060371/1.168060371.csv


Processing file 435/1657
Processing 1.168136214
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
565 2020-01-31 16:31:39     17336468             2.32      43.1
568 2020-01-31 16:31:43     17336468             2.32      43.1
571 2020-01-31 16:31:54     17336468             2.32      43.1
574 2020-01-31 16:31:57     17336468             2.32      43.1
580 2020-01-31 16:31:58     17336468             



Processing file 450/1657
Processing 1.169721618
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
2693 2020-03-04 17:10:29     15729592             2.52     39.68
2701 2020-03-04 17:10:34     15729592             2.52     39.68
2709 2020-03-04 17:10:41     15729592             2.52     39.68
2717 2020-03-04 17:10:45     15729592             2.52     39.68
2725 2020-03-04 17:10:51     15729592             2.52     39.68
Writing data to data/1.169721618/1.169721618.csv


Processing file 451/1657
Processing 1.170070657
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
851 2020-03-11 16:52:33     18348948              3.6     27.78
857 2020-03-11 16:52:35     18348948              3.6     27.78
863 2020-03-11 16:52:37     18348948              3.6     27.78
869 2020-03-11 16:52:38     18348948              3.6     27.78
875 2020-03-11 16:52:40     18348948             



Processing file 466/1657
Processing 1.168845473
Streaming data.
Processing data.
Data sample
 Empty DataFrame
Columns: [time_sec, SelectionId, LastPriceTraded, imp_prob]
Index: []
Writing data to data/1.168845473/1.168845473.csv


Processing file 467/1657
Processing 1.167662040
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1413 2020-01-20 16:03:05     25194081              9.0     11.11
1427 2020-01-20 16:03:11     25194081              9.0     11.11
1441 2020-01-20 16:03:13     25194081              9.0     11.11
1448 2020-01-20 16:03:17     25194081              9.0     11.11
1455 2020-01-20 16:03:21     25194081              9.0     11.11
Writing data to data/1.167662040/1.167662040.csv


Processing file 468/1657
Processing 1.168013690
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
8169 2020-01-28 17:12:54      8455679              8.0      12.5
8191 202



Processing file 482/1657
Processing 1.168674259
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
477 2020-02-11 17:12:29     26641006             2.02     49.50
485 2020-02-11 17:12:32     26641006             2.02     49.50
489 2020-02-11 17:12:34     26641006             2.02     49.50
497 2020-02-11 17:12:35     26641006             2.04     49.02
501 2020-02-11 17:12:38     26641006             2.04     49.02
Writing data to data/1.168674259/1.168674259.csv


Processing file 483/1657
Processing 1.167488037
Streaming data.
Processing data.
Data sample
                time_sec  SelectionId  LastPriceTraded  imp_prob
91  2020-01-17 12:26:31     21548474              2.2     45.45
95  2020-01-17 12:27:55     21548474              2.2     45.45
99  2020-01-17 12:28:00     21548474              2.2     45.45
103 2020-01-17 12:28:11     21548474              2.2     45.45
107 2020-01-17 12:28:15     21548474              2.2  



Processing file 498/1657
Processing 1.167300983
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
2151 2020-01-11 15:34:47     12094951              7.2     13.89
2155 2020-01-11 15:34:47      7742655              5.9     16.95
2159 2020-01-11 15:34:47     15481469              5.4     18.52
2160 2020-01-11 15:34:48     12094951              7.2     13.89
2161 2020-01-11 15:34:48     10660875             30.0      3.33
Writing data to data/1.167300983/1.167300983.csv


Processing file 499/1657
Processing 1.167390614
Streaming data.
Processing data.
Data sample
                 time_sec  SelectionId  LastPriceTraded  imp_prob
1296 2020-01-14 14:32:12      8709395              9.0     11.11
1301 2020-01-14 14:32:35      8709395              9.0     11.11
1306 2020-01-14 14:33:24      8709395              9.0     11.11
1311 2020-01-14 14:34:13      8709395              9.0     11.11
1316 2020-01-14 14:34:39      8709395       

In [None]:
# Improvements
# Tidy up function scripts - speed is crucial

# Run for all files, stopped at ~500 for now!

# remove imp_prob to save on disk space, create this in-memory - remove nuemeric index too?

# keep only ime from timestamp t0 save space