In [194]:
# import packages
import pandas as pd
import numpy as np
import utils
from pathlib import Path
import os
import glob
import json

In [195]:
# read in data
def find_pairs(file_name):
    # read in data
    df = pd.read_csv(f'data/{file_name}/{file_name}.csv').astype({'time_sec' : '<M8[ns]'}) # need this?
    
    # keep data X data points before off
    period_mins = 60 # X minutes before off
    watch_pct = 0.8 # X% of period used to train/watch

    # define time slices
    period_max = df.sort_values("time_sec")["time_sec"].tail(1).values[0].copy() # off time
    period_min = period_max - np.timedelta64(period_mins, "m") # 'start' time
    watch_split = period_max - np.timedelta64(round(period_mins * (1 - watch_pct)), "m") # split between watch/act
    
    # keep only times within watch/react
    df = df.loc[(df["time_sec"] > period_min) & (df["time_sec"] < period_max)].copy()
    
    # converting to series
    df_ip = df.pivot(index="time_sec", columns="SelectionId")["imp_prob"]
    df_ip.columns = df_ip.columns.astype(str)

    
    # seperate df into watch/react
    df_ip_watch = df_ip.loc[df_ip.index < watch_split].copy()
    df_ip_watch.index = df_ip_watch.index.astype(str)
    df_ip_react = df_ip.loc[df_ip.index > watch_split].copy()
    df_ip_react.index = df_ip_react.index.astype(str)
    
    # compute correlations
    cor_mat = df_ip_watch.corr()
    
    # find inverse pairs (based on having correlation measure < X)
    cor_min = -0.9
    pairs = utils.getIndexes(cor_mat, cor_min) 
    pairs = list(set([tuple(sorted(t)) for t in pairs]))
    
    # if pairs found in watch period then output
    n_p = len(pairs)
#     print(n_p, pairs)
    
    if  n_p > 0:
        
        print(f'Pairs found = {n_p}. Writing file.')
    
        # making pairs directory for market
        Path(f"data/{file_name}/pairs").mkdir(parents=True, exist_ok=True)

        # compiling data with metadata
        out = {
            'period_mins' : period_mins,
            'watch_pct' : watch_pct,
            'cor_min' : cor_min,
            'watch_df' : df_ip_watch.to_dict(),
            'react_df' : df_ip_react.to_dict(),
            'pairs' : pairs
        }
        
        with open(f'data/{file_name}/pairs/{file_name}.json', 'w') as outfile:
            json.dump(out, outfile, sort_keys=True, indent=4)
        
    else:
        print('No pairs found.')


In [196]:
# test file?
file_name = '1.169028429'
find_pairs(file_name)

No pairs found.


In [201]:
df_ip_watch.head()

NameError: name 'df_ip_watch' is not defined

---
### Identifying pairs from processed

In [197]:
all_files_path = glob.glob('data/*[!ipynb]')
all_files = [os.path.basename(x) for x in glob.glob('data/*[!ipynb]')]

In [198]:
n = len(all_files)

for i, f in enumerate(all_files):
    print(f'Screen for pairs in  {f} : {i+1}/{n}')
    
    try:
        find_pairs(f)
        
    except IndexError:
        print('Some index error')
        
    print('\n')

Screen for pairs in  1.169028429 : 1/500
No pairs found.


Screen for pairs in  1.168013705 : 2/500
No pairs found.


Screen for pairs in  1.169341238 : 3/500
No pairs found.


Screen for pairs in  1.168473844 : 4/500
No pairs found.


Screen for pairs in  1.167971949 : 5/500
No pairs found.


Screen for pairs in  1.169057140 : 6/500
No pairs found.


Screen for pairs in  1.169866063 : 7/500
Pairs found = 1. Writing file.


Screen for pairs in  1.169675383 : 8/500
No pairs found.


Screen for pairs in  1.167449067 : 9/500
No pairs found.


Screen for pairs in  1.170188269 : 10/500
Some index error


Screen for pairs in  1.170070363 : 11/500
No pairs found.


Screen for pairs in  1.168298570 : 12/500
Pairs found = 1. Writing file.


Screen for pairs in  1.167929665 : 13/500
No pairs found.


Screen for pairs in  1.170070707 : 14/500
No pairs found.


Screen for pairs in  1.167134920 : 15/500
No pairs found.


Screen for pairs in  1.166897828 : 16/500
Pairs found = 1. Writing file.


Scr

No pairs found.


Screen for pairs in  1.168336657 : 134/500
No pairs found.


Screen for pairs in  1.169028419 : 135/500
No pairs found.


Screen for pairs in  1.168062590 : 136/500
No pairs found.


Screen for pairs in  1.168473874 : 137/500
No pairs found.


Screen for pairs in  1.168715047 : 138/500
No pairs found.


Screen for pairs in  1.169760713 : 139/500
No pairs found.


Screen for pairs in  1.169866053 : 140/500
No pairs found.


Screen for pairs in  1.167449057 : 141/500
No pairs found.


Screen for pairs in  1.167061899 : 142/500
No pairs found.


Screen for pairs in  1.168431494 : 143/500
No pairs found.


Screen for pairs in  1.169675378 : 144/500
No pairs found.


Screen for pairs in  1.168298540 : 145/500
No pairs found.


Screen for pairs in  1.167929655 : 146/500
No pairs found.


Screen for pairs in  1.168096573 : 147/500
No pairs found.


Screen for pairs in  1.168096587 : 148/500
No pairs found.


Screen for pairs in  1.168618615 : 149/500
No pairs found.


Screen

No pairs found.


Screen for pairs in  1.169028393 : 268/500
No pairs found.


Screen for pairs in  1.170166637 : 269/500
Pairs found = 1. Writing file.


Screen for pairs in  1.169619345 : 270/500
No pairs found.


Screen for pairs in  1.168846821 : 271/500
No pairs found.


Screen for pairs in  1.167787613 : 272/500
No pairs found.


Screen for pairs in  1.168433274 : 273/500
No pairs found.


Screen for pairs in  1.166898779 : 274/500
No pairs found.


Screen for pairs in  1.166899499 : 275/500
No pairs found.


Screen for pairs in  1.167576026 : 276/500
No pairs found.


Screen for pairs in  1.167485957 : 277/500
No pairs found.


Screen for pairs in  1.167878469 : 278/500
No pairs found.


Screen for pairs in  1.167629990 : 279/500
No pairs found.


Screen for pairs in  1.169341945 : 280/500
No pairs found.


Screen for pairs in  1.167177559 : 281/500
No pairs found.


Screen for pairs in  1.170070687 : 282/500
Pairs found = 1. Writing file.


Screen for pairs in  1.167076169 : 28

No pairs found.


Screen for pairs in  1.168846816 : 396/500
No pairs found.


Screen for pairs in  1.169947580 : 397/500
No pairs found.


Screen for pairs in  1.169760693 : 398/500
No pairs found.


Screen for pairs in  1.168524729 : 399/500
No pairs found.


Screen for pairs in  1.169309407 : 400/500
No pairs found.


Screen for pairs in  1.167629953 : 401/500
No pairs found.


Screen for pairs in  1.167662065 : 402/500
No pairs found.


Screen for pairs in  1.167343068 : 403/500
No pairs found.


Screen for pairs in  1.166947630 : 404/500
Pairs found = 2. Writing file.


Screen for pairs in  1.168800264 : 405/500
No pairs found.


Screen for pairs in  1.168846811 : 406/500
Pairs found = 1. Writing file.


Screen for pairs in  1.168620964 : 407/500
No pairs found.


Screen for pairs in  1.168759016 : 408/500
No pairs found.


Screen for pairs in  1.167787623 : 409/500
No pairs found.


Screen for pairs in  1.167485967 : 410/500
No pairs found.


Screen for pairs in  1.167206380 : 41

In [None]:
# Improvements
# Data analysis - 'market types' where this happends most (clustering) e.g. where two main market leaders

# Way to reduce correlation matirx size - speed!

# granger causality?

# cor_matric - indexes func - # is this efficient? ideal would be to remove duplicates from matrix

# add in something to say no pairs found 

# clear directories to write new files for different correlation levels

# write out only columns which are pairs

--- 

In [199]:
# for d in all_files:
#     print(f"data/{d}/pairs/")
#     try:
#         print('remove pairs data if exists')
#         os.remove(f"data/{d}/pairs/*")
        
#         print('remove pairs directory')
#         os.rmdir(f"data/{d}/pairs")
            
#     except FileNotFoundError:
#         print('File never written')

In [None]:
# remove pairs data
