In [1]:
import numpy as np 
import pandas as pd
import glob
import os

from matplotlib import pyplot as plt, pylab as pl
%matplotlib inline
plt.style.use("bmh")
plt.rcParams["figure.figsize"] = (12,4)
import seaborn as sns
import plotly.express as px

#loading:
from base64 import b64decode, b64encode
from gzip import decompress, compress
import json

import plotly.graph_objects as go
import plotly.express as px

import pyarrow.parquet as pq
import pyarrow as pa

from datetime import datetime

from scipy.stats import skew, kurtosis
from sklearn.linear_model import LinearRegression

In [2]:
# Vector and metric identifiers.
# Every constant holds a string equal to its own name.

# --- Traffic vectors ---
PT_LARGE_SYN, PT_SYN, PT_TCP = 'PT_LARGE_SYN', 'PT_SYN', 'PT_TCP'
PT_DNS, PT_DNS_RESPONSE = 'PT_DNS', 'PT_DNS_RESPONSE'
PT_NTP, PT_SSDP, PT_ICMP = 'PT_NTP', 'PT_SSDP', 'PT_ICMP'
PT_GENERAL, PT_TOTAL = 'PT_GENERAL', 'PT_TOTAL'
PT_UDP, PT_FRAG, PT_NETFLOW = 'PT_UDP', 'PT_FRAG', 'PT_NETFLOW'

# --- Metrics ---
PPS, BW = 'PPS', 'BW'

# All known vectors, in a fixed order.
vectors = [
    PT_LARGE_SYN,
    PT_SYN,
    PT_TCP,
    PT_DNS,
    PT_DNS_RESPONSE,
    PT_NTP,
    PT_SSDP,
    PT_ICMP,
    PT_GENERAL,
    PT_TOTAL,
    PT_UDP,
    PT_FRAG,
    PT_NETFLOW,
]

# All known metrics.
metrics = [PPS, BW]

In [3]:
# Vector / metric pair of interest (the strings match the PT_TCP and PPS constants).
VECTOR, METRIC = 'PT_TCP', 'PPS'

## Loading functions
* Loading data
* ```filter_data_frame```: filters the data frame by vector and metric; its result is used by ```get_ts_by_vec_and_metric_filtered```
* ```get_ts_by_vec_and_metric_filtered```: returns a dictionary with information about the time series; its result is used by ```add_features_to_df```
* ```add_features_to_df```: takes the dictionary of time-series data, calculates features, and adds these features to the originally loaded data frame

In [4]:
def plot_ts(df_ts, threshold_name, threshold_value, pred_lr, pred_rf,values_to_print='',  show_total=False):
    """Draw one time series with three horizontal reference lines and show it.

    Parameters
    ----------
    df_ts : DataFrame
        Must provide 'passed_val' and 'blocked_val' columns (and 'total_val'
        when ``show_total`` is true); its index is used as the x axis.
    threshold_name : str
        Legend label for the threshold line.
    threshold_value : number
        Height of the threshold line (red, dashed).
    pred_lr, pred_rf : number
        Heights of the 'pred_lr' (orange) and 'pred_rf' (purple) dashed lines.
    values_to_print : str, optional
        Accepted but not used by this function.
    show_total : bool, optional
        Also plot the 'total_val' column.

    Returns
    -------
    None. The figure is rendered through ``fig.show()``.
    """
    figure = go.Figure()

    # Data traces: (column, extra Scatter keyword arguments), in drawing order.
    series_specs = [
        ('passed_val', {}),
        ('blocked_val', {'line': dict(color='red', dash='dot')}),
    ]
    if show_total:
        series_specs.append(('total_val', {}))

    for column, extra in series_specs:
        figure.add_trace(
            go.Scatter(x=df_ts.index, y=df_ts[column], mode='lines', name=column, **extra)
        )

    # Horizontal reference lines span the full extent of the index.
    x_span = [df_ts.index.min(), df_ts.index.max()]
    reference_lines = (
        ('pred_lr', pred_lr, 'orange'),
        ('pred_rf', pred_rf, 'purple'),
        (threshold_name, threshold_value, 'red'),
    )
    for label, level, colour in reference_lines:
        figure.add_trace(
            go.Scatter(
                x=x_span,
                y=[level, level],
                mode='lines',
                line=dict(color=colour, dash='dash'),
                name=label,
            )
        )

    figure.show()

In [5]:
# Read ./tcp_pps_v6.csv into a DataFrame and display it as the cell output.
tcp_pps = pd.read_csv('./tcp_pps_v6.csv')
tcp_pps

Unnamed: 0,id,prediction_id,file,vector,metric_x,request_id,ip_ss,ip_sc,ip_nw,ip_rl,...,iqr_share,daily_max_q90_share,max_share,max_daily_q90_share,is_dominant_vector_iqr,is_dominant_vector_max_daily_q90,has_dom_iqr,has_dom_daily_q90,dom_vector_iqr,dom_vector_daily_q90
0,4717,a85101f9a44c3e7fa26ad3a8f5258b1f6b780aed7e1c38...,TimeSeriesDataB64_0_0_100.parquet,PT_TCP,PPS,395,12.0,16.0,22.0,30.0,...,0.29,0.21,0.65,0.43,True,True,True,True,False,False
1,4773,f6ab823e547c7ec805b878d2250509cb810addb988ce57...,TimeSeriesDataB64_0_0_100.parquet,PT_TCP,PPS,399,30.0,33.0,48.0,65.0,...,0.67,0.57,0.67,0.57,False,False,True,True,True,True
2,15757,ac5b3676bb10fc8ae4cd494e3724053321f533b28c3a64...,TimeSeriesDataB64_0_9900_10000.parquet,PT_TCP,PPS,991,65.0,76.0,103.0,127.0,...,0.99,0.82,0.99,0.82,False,False,True,True,True,True
3,15783,3ca2d2ca5e29360360beef068d732605d4d4499209cdf3...,TimeSeriesDataB64_0_10000_10100.parquet,PT_TCP,PPS,992,68.0,80.0,108.0,133.0,...,0.84,0.66,0.84,0.66,False,False,True,True,True,True
4,15809,4446402910b942444736abdc234a69c1be8cab947550b2...,TimeSeriesDataB64_0_10000_10100.parquet,PT_TCP,PPS,993,41.0,49.0,66.0,83.0,...,0.43,0.42,0.56,0.42,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782,105395,79296fdcd9d94f6a606db159963999bfc33cb8698ba71f...,TimeSeriesDataB64_4_6600_6700.parquet,PT_TCP,PPS,5521,45.0,51.0,68.0,80.0,...,0.12,0.11,0.88,0.87,True,True,True,True,False,False
1783,99413,161e1a1669916c2486a5d2debc2f03ae43052b01c062c6...,TimeSeriesDataB64_4_700_800.parquet,PT_TCP,PPS,4914,32.0,36.0,48.0,57.0,...,0.72,0.76,0.72,0.76,False,False,True,True,True,True
1784,99465,99b741be8b52a7ff1e704abf80d85c000b0d9d33feedc3...,TimeSeriesDataB64_4_800_900.parquet,PT_TCP,PPS,4923,87.0,98.0,129.0,151.0,...,0.19,0.69,0.80,0.69,True,False,True,True,False,True
1785,99517,84bee7e6f334fb45dfa5b2089e06cece732e898ad7e1a2...,TimeSeriesDataB64_4_800_900.parquet,PT_TCP,PPS,4925,16.7,19.0,26.0,30.0,...,0.96,0.45,0.96,0.45,False,False,True,False,True,False


In [6]:
# prediction_id column of the table read from tcp_pps_v6.csv.
tcp_v6_pred_id = tcp_pps['prediction_id']

# Shape of the array of distinct prediction ids (a 1-tuple holding their count).
tcp_v6_pred_id.unique().shape

(1787,)

In [7]:
# The file catalog is stored as two CSV parts; read both and stitch them back together.
# low_memory=False lets pandas infer each column's dtype from the whole file instead of
# chunk by chunk. NOTE(review): the stderr output previously shown under this cell looks
# like pandas' mixed-dtype warning for both reads — confirm.
data_v2_p1 = pd.read_csv('./FileCatalog_v2_p1.csv', low_memory=False)
data_v2_p2 = pd.read_csv('./FileCatalog_v2_p2.csv', low_memory=False)

# ignore_index=True: each part is read with its own 0..n-1 index, so a plain concat
# would leave every row label duplicated across the two halves.
data = pd.concat([data_v2_p1, data_v2_p2], ignore_index=True)
data


  data_v2_p1 = pd.read_csv('./FileCatalog_v2_p1.csv')
  data_v2_p2 = pd.read_csv('./FileCatalog_v2_p2.csv')


Unnamed: 0,id,prediction_id,original_file,vector,metric,request_id,ip_ss,ip_sc,ip_nw,ip_rl,...,total_val_slope,total_val_crest_factor,total_val_shape_factor,total_val_avg_first_order_diff,missing_tmstmp_percentage,percent_of_largest_dead,percent_of_zeros,total_time,total_val_median_share,super_peaks_file
0,4722,a85101f9a44c3e7fa26ad3a8f5258b1f6b780aed7e1c38...,TimeSeriesDataB64_0_0_100.parquet,PT_DNS_RESPONSE,BW,395,2.0,4.0,6.0,9.0,...,2.181106e-10,40.211269,8.916736,2.668500e-24,0.069444,0.338,0.976257,30.0,0.000000,
1,4716,a85101f9a44c3e7fa26ad3a8f5258b1f6b780aed7e1c38...,TimeSeriesDataB64_0_0_100.parquet,PT_LARGE_SYN,BW,395,0.1,0.2,0.2,0.2,...,-2.288744e-11,123.812391,46.670364,5.023059e-24,0.069444,0.240,0.999166,30.0,0.000000,
2,4730,a85101f9a44c3e7fa26ad3a8f5258b1f6b780aed7e1c38...,TimeSeriesDataB64_0_0_100.parquet,PT_UDP,BW,395,7.0,12.0,16.0,30.0,...,4.463537e-08,162.524787,10.914687,-5.258403e-09,0.069444,0.000,0.001181,30.0,0.008846,
3,4720,a85101f9a44c3e7fa26ad3a8f5258b1f6b780aed7e1c38...,TimeSeriesDataB64_0_0_100.parquet,PT_DNS,BW,395,1.0,2.0,3.0,4.0,...,5.696970e-09,24.139679,13.466244,-4.077000e-09,0.069444,0.001,0.497081,30.0,0.000821,
4,4732,a85101f9a44c3e7fa26ad3a8f5258b1f6b780aed7e1c38...,TimeSeriesDataB64_0_0_100.parquet,PT_ICMP,BW,395,5.0,7.5,10.0,20.0,...,3.013283e-10,31.148846,2.104985,-1.482545e-09,0.069444,0.000,0.000023,30.0,0.004885,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40977,99635,a081170b6a3b5a899dc186b563488c5edd626214a9f310...,TimeSeriesDataB64_4_900_1000.parquet,PT_LARGE_SYN,PPS,4941,0.4,0.6,0.6,0.6,...,7.913066e-11,66.776758,29.192142,2.208907e-24,27.299872,0.080,0.996708,30.0,0.000000,
40978,99629,a081170b6a3b5a899dc186b563488c5edd626214a9f310...,TimeSeriesDataB64_4_900_1000.parquet,PT_ICMP,PPS,4941,5.0,7.0,9.0,15.0,...,-2.653187e-10,23.467987,5.881327,7.639066e-09,0.000000,0.000,0.240486,30.0,0.000164,
40979,99631,a081170b6a3b5a899dc186b563488c5edd626214a9f310...,TimeSeriesDataB64_4_900_1000.parquet,PT_SYN,PPS,4941,10.0,12.0,18.0,20.0,...,-9.371713e-07,71.943775,1.532923,-3.078775e-08,0.000000,,0.000000,30.0,0.054190,
40980,99641,a081170b6a3b5a899dc186b563488c5edd626214a9f310...,TimeSeriesDataB64_4_900_1000.parquet,PT_SSDP,PPS,4941,0.4,0.5,0.7,0.7,...,-9.576488e-10,37.016018,3.255654,4.185067e-22,0.000000,0.000,0.059306,30.0,0.000304,


In [8]:
# Catalog rows that have a super-peaks file, belong to PT_TCP / PPS, and whose
# prediction_id is one of the ids in tcp_v6_pred_id.
data[
    data.super_peaks_file.notna()
    & data.vector.eq(PT_TCP)
    & data.metric.eq(PPS)
    & data.prediction_id.isin(tcp_v6_pred_id)
]

Unnamed: 0,id,prediction_id,original_file,vector,metric,request_id,ip_ss,ip_sc,ip_nw,ip_rl,...,total_val_slope,total_val_crest_factor,total_val_shape_factor,total_val_avg_first_order_diff,missing_tmstmp_percentage,percent_of_largest_dead,percent_of_zeros,total_time,total_val_median_share,super_peaks_file
129,15757,ac5b3676bb10fc8ae4cd494e3724053321f533b28c3a64...,TimeSeriesDataB64_0_9900_10000.parquet,PT_TCP,PPS,991,65.0,76.0,103.0,127.0,...,-0.000061,10.181630,1.153405,-1.210012e-04,0.0,,0.000000,30.0,0.487892,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
143,15783,3ca2d2ca5e29360360beef068d732605d4d4499209cdf3...,TimeSeriesDataB64_0_10000_10100.parquet,PT_TCP,PPS,992,68.0,80.0,108.0,133.0,...,0.000069,3.150070,1.214814,6.717285e-05,0.0,,0.000000,30.0,0.462755,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
167,15809,4446402910b942444736abdc234a69c1be8cab947550b2...,TimeSeriesDataB64_0_10000_10100.parquet,PT_TCP,PPS,993,41.0,49.0,66.0,83.0,...,-0.000004,5.481656,1.424059,1.461221e-04,0.0,,0.000000,30.0,0.183726,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
1679,17161,686822b21e9e3b5fd607d32c8d045a2a9ad8af90cabfb5...,TimeSeriesDataB64_0_11200_11300.parquet,PT_TCP,PPS,1046,91.0,106.0,144.0,175.0,...,-0.000042,10.600329,1.230871,-2.413806e-05,0.0,,0.000000,30.0,0.460384,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
1705,17187,2a6260ff3417353e98bdd0fcfba4a04dcde1819051847f...,TimeSeriesDataB64_0_11200_11300.parquet,PT_TCP,PPS,1047,108.0,120.0,150.0,180.0,...,0.000004,20.182156,2.869610,-6.400132e-04,0.0,,0.000000,30.0,0.223914,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40740,105395,79296fdcd9d94f6a606db159963999bfc33cb8698ba71f...,TimeSeriesDataB64_4_6600_6700.parquet,PT_TCP,PPS,5521,45.0,51.0,68.0,80.0,...,0.000071,7.656432,1.717695,1.049330e-06,0.0,,0.000000,30.0,0.157329,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
40794,99413,161e1a1669916c2486a5d2debc2f03ae43052b01c062c6...,TimeSeriesDataB64_4_700_800.parquet,PT_TCP,PPS,4914,32.0,36.0,48.0,57.0,...,0.000010,25.928804,3.437735,4.490845e-07,0.0,,0.000000,30.0,0.072127,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
40838,99465,99b741be8b52a7ff1e704abf80d85c000b0d9d33feedc3...,TimeSeriesDataB64_4_800_900.parquet,PT_TCP,PPS,4923,87.0,98.0,129.0,151.0,...,0.000003,37.453768,1.834589,-3.132017e-07,0.0,,0.000000,30.0,0.402095,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...
40884,99517,84bee7e6f334fb45dfa5b2089e06cece732e898ad7e1a2...,TimeSeriesDataB64_4_800_900.parquet,PT_TCP,PPS,4925,16.7,19.0,26.0,30.0,...,0.000011,5.081980,1.156040,4.458437e-06,0.0,0.002,0.002315,30.0,0.472412,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...


In [9]:
# PT_TCP / PPS catalog rows with a super-peaks file and a prediction_id present in
# tcp_v6_pred_id, reduced to the two file-path columns plus ip_ss.
sp_files = data.loc[
    data.super_peaks_file.notna()
    & data.vector.eq(PT_TCP)
    & data.metric.eq(PPS)
    & data.prediction_id.isin(tcp_v6_pred_id),
    ['file', 'super_peaks_file', 'ip_ss'],
]
sp_files


Unnamed: 0,file,super_peaks_file,ip_ss
129,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,65.0
143,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,68.0
167,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,41.0
1679,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,91.0
1705,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,108.0
...,...,...,...
40740,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,45.0
40794,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,32.0
40838,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,87.0
40884,./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP...,./../../Itay_&_Mila_data/super_peaks/PT_TCP_PP...,16.7


In [10]:
# Show the untruncated 'file' path of the first row of sp_files.
# NOTE(review): despite the variable name, 'file' is the first column of sp_files,
# i.e. the time-series path rather than the super-peaks path.
sp_file = sp_files['file'].iloc[0]
sp_file

'./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_ac5b3676bb10fc8ae4cd494e3724053321f533b28c3a6433df24c57bc291ff9d.parquet'

In [20]:
# Positional row of sp_files whose traffic we want to look at.
idx = 13
ts_file = sp_files['file'].iloc[idx]
sp_file = sp_files['super_peaks_file'].iloc[idx]
threshold_value = sp_files['ip_ss'].iloc[idx]

print(threshold_value)

# Time series comes from parquet; the super-peaks extract is a CSV indexed by passed_tmstmp.
ts = pd.read_parquet(ts_file)
sp = pd.read_csv(sp_file, index_col='passed_tmstmp')

# Rows with any blocked traffic, first for the time series, then for the super peaks.
display(ts.loc[ts['blocked_val'] > 0])
display(sp.loc[sp['blocked_val'] > 0])

# q_99 = ts.passed_val.quantile(0.97)
# print(ts.passed_val.max(), threshold_value - ts.passed_val.max(),(threshold_value - ts.passed_val.max())/threshold_value)

# Plot both frames against the ip_ss threshold; the prediction lines are pinned to 0.
plot_ts(ts, 'ip_ss', threshold_value, pred_lr=0, pred_rf=0)
plot_ts(sp, 'ip_ss', threshold_value, pred_lr=0, pred_rf=0)


16.4


Unnamed: 0_level_0,passed_val,blocked_val,total_val
passed_tmstmp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


Unnamed: 0_level_0,passed_val,blocked_val,total_val
passed_tmstmp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
# So I rewatched our last meeting, specifically how to handle the blocked_val.
# With the help of GPT, here is how to do the threshold.
#
# NOTE(review): `df` is not defined in the visible cells of this notebook; bind it to a
# frame that has 'blocked_val' and 'total_val' columns before running — TODO confirm which.

# Step 1: Filtering based on Blocked Traffic Ratio
threshold_ratio = 0.2  # a timestamp is a significant event when blocked/total >= this

# Ratio of blocked traffic to total traffic (0/0 yields NaN, which compares as False below).
df['blocked_ratio'] = df['blocked_val'] / df['total_val']

# True for every timestamp whose blocked ratio reaches the threshold.
significant = df['blocked_ratio'] >= threshold_ratio

# Step 2: Forward Filling with Limit
# How long after a significant timestamp the mask should stay switched on.
time_limit = pd.Timedelta(minutes=30)
# Assumed spacing between consecutive rows — TODO confirm against the real time index.
sample_period = pd.Timedelta(hours=1)

# Number of rows the window covers. Previously int(30 min / 1 h) == 0 was passed straight
# to the forward fill, which pandas rejects (limit must be greater than 0). A window
# shorter than one sample period now simply means "no extension".
ffill_limit = int(time_limit / sample_period)

if ffill_limit > 0:
    # Keep 1.0 where significant and NaN elsewhere, then carry the marks forward at most
    # `ffill_limit` rows; whatever is still NaN afterwards was never reached.
    carried = significant.astype(float).where(significant).ffill(limit=ffill_limit)
    extended = carried.notna()
else:
    extended = significant

# Step 3: Creating Final Mask
# Assign the column in one step rather than mutating df['mask'] in place through
# chained calls, which is not reliable under pandas copy-on-write.
df['mask'] = extended

Check whether blocked data is absent from all super peaks

- If yes, ask Johnathan what it means and how we should treat it

In [14]:

# Collect the time-series files that contain at least one sample with blocked_val > 0.
# NOTE(review): `sp` is loaded on every iteration but never inspected in this loop —
# confirm whether the check was also meant to look at the super-peaks data.
cnt_has_ts_only_blocked = []
cnt = 0
for cnt, (i, row) in enumerate(sp_files.iterrows(), start=1):
    sp_file, ts_file = row['super_peaks_file'], row['file']
    sp = pd.read_csv(sp_file, index_col='passed_tmstmp')
    ts = pd.read_parquet(ts_file)

    if (ts.blocked_val > 0).any():
        cnt_has_ts_only_blocked.append(ts_file)

    # Progress marker every 100 files.
    if not cnt % 100:
        print(cnt)
    



100
200
300
400
500
600
700
800
900
1000


In [15]:
# Display the collected time-series file paths (those with at least one blocked_val > 0).
cnt_has_ts_only_blocked

['./../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_ac5b3676bb10fc8ae4cd494e3724053321f533b28c3a6433df24c57bc291ff9d.parquet',
 './../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_3ca2d2ca5e29360360beef068d732605d4d4499209cdf300b121120bbdf42529.parquet',
 './../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_4446402910b942444736abdc234a69c1be8cab947550b2602acb7c362b0cfdb5.parquet',
 './../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_686822b21e9e3b5fd607d32c8d045a2a9ad8af90cabfb5aa2541a44222920553.parquet',
 './../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_fc7625f00e0fc8161b01163a14acbc6a11ec88c562b1e77c0fd92c9976544b4e.parquet',
 './../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_7328a9773b01f899232665bba2fc48429fd6afafc7453c1a24095a236a853bbf.parquet',
 './../../Itay_&_Mila_data/all_vec_met/PT_TCP_PPS/PT_TCP_PPS_ffa22788370b1ac7f54c6ece5e0e2b61dc24f1669bda81df04eb8373f279b5b9.parquet',
 './../../Itay_&_Mila_data/all_vec_met/PT_TCP_PP

In [None]:
# def extract_ts_from_dict(ts_dict):
#     passed_values = []
#     blocked_values = []

#     for day, values in ts_dict['passedDays'].items():
#         for value in values['values']:
#             passed_values.append(value)

#     for day, values in ts_dict['blockedDays'].items():
#         for value in values['values']:
#             blocked_values.append(value)

#     # Sometimes passed_values or blocked_values are empty
#     if len(passed_values) == 0:
#         passed = pd.DataFrame({'passed_val':[],'passed_tmstmp':[]})
#     else:    
#         passed = pd.DataFrame(passed_values).rename(columns={'value': 'passed_val', 'timeStamp': 'passed_tmstmp'})
    
#     if len(blocked_values) == 0:
#         blocked = pd.DataFrame({'blocked_val':[],'blocked_tmstmp':[]})
#     else:    
#         blocked = pd.DataFrame(blocked_values).rename(columns={'value': 'blocked_val', 'timeStamp': 'blocked_tmstmp'})

#     #display(blocked.blocked_val.sum())
#     ts = pd.concat([passed, blocked], axis=1).drop(['blocked_tmstmp'], axis = 1)  
#     ts.set_index(['passed_tmstmp'], inplace=True) 
#     ts.index = pd.to_datetime(ts.index, unit = 's')
#     ts.sort_index()
#     ts.fillna(0, inplace=True)
#     ts['total_val'] = ts['passed_val'] + ts['blocked_val']

#     return ts


In [None]:
# # Creating files and adding path

# data['super_peaks_file'] = np.nan

# folder_path = './../../Itay_&_Mila_data/time_series/' 

# # List all files in the folder
# files = os.listdir(folder_path)
# files

# main_folder = './../../Itay_&_Mila_data/super_peaks/'
# for c, file in enumerate(files):
#     ts = pd.read_parquet('./../../Itay_&_Mila_data/time_series/'+file, columns=['prediction_id','vector','metric','super_peak'])
#     ts_sp = ts[~ts.super_peak.isna()].copy()
#     for i, row in ts_sp.iterrows():
#         pred_id = row['prediction_id']
#         vec     = row['vector']
#         met     = row['metric']
#         ts_hash = row['super_peak']

#         sub_folder = vec + '_' + met + '/'
#         sp_file_name = vec + '_' + met + '_'+ pred_id + '.csv'
        
#         # # Extracting ts data
#         # ts_dict = json.loads(decompress(b64decode(ts_hash)))
#         # ts = extract_ts_from_dict(ts_dict=ts_dict)
        
#         # # saving the file in the folder
#         # ts.to_csv(sp_file_name)

#         # saving the folder path in data column
#         sp_path = main_folder + sub_folder + sp_file_name
#         data.loc[(data.vector == vec) & (data.metric == met) & (data.prediction_id == pred_id),['super_peaks_file']] = sp_path
#     if (c % 100) == 0:
#         print(c)


In [117]:
# # Renaming files path in data

# data['super_peaks_file'] = np.nan

# main_folder = './../../Itay_&_Mila_data/super_peaks/'

# # List all files in the folder
# files = os.listdir(main_folder)
# # cnts = []
# for file in files:
#     file_ls = file.split('_')
#     vec     = file_ls[0] + '_' +file_ls[1]
#     met     = file_ls[2]
#     pred_id = file_ls[3].split('.')[0]
#     file_path = main_folder + vec + '_' + met + '_' + file_ls[3]
#     # cnt = data.loc[(data.vector == vec) & (data.metric == met) & (data.prediction_id == pred_id)].shape[0]
#     # cnts.append(cnt)
#     data.loc[(data.vector == vec) & (data.metric == met) & (data.prediction_id == pred_id), ['super_peaks_file']] = file_path
    



In [123]:
# half = int(data.shape[0]/2)
# data[:half].to_csv('./FileCatalog_v2_p1.csv', index=False)
# data[half:].to_csv('./FileCatalog_v2_p2.csv', index=False)