In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor
import warnings
from scipy.optimize import minimize
import os
from tqdm import tqdm

In [3]:
def process_file(filename, dirname):
    """Summarise one series partition as a flat vector of descriptive statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards the ``step``
    column and runs ``DataFrame.describe()`` on the remaining columns.

    Args:
        filename: Name of the partition directory. It must contain an ``=``;
            the segment following the first ``=`` is returned as the identifier.
        dirname: Directory that contains the partition directories.

    Returns:
        A ``(stats, series_id)`` tuple. ``stats`` is the ``describe()`` table
        flattened row by row into a 1-D array, i.e. every column's value for
        the first statistic, then every column's value for the second, and
        so on. ``series_id`` is the string taken from ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    frame = pd.read_parquet(parquet_path).drop(columns='step')
    summary = frame.describe()
    series_id = filename.split('=')[1]
    # Row-major flattening: one describe() row after another.
    return summary.to_numpy().ravel(), series_id

def load_time_series(dirname):
    """Build a table of per-series summary statistics for a directory of partitions.

    Every entry of ``dirname`` is passed to ``process_file`` on a thread pool,
    and the returned statistic vectors are stacked into one DataFrame.

    Args:
        dirname: Directory whose entries are per-series partition directories
            in the form expected by ``process_file``.

    Returns:
        A DataFrame with one row per directory entry, columns ``stat_0`` ...
        ``stat_{k-1}`` (k being the length of the first statistics vector) and
        an ``id`` column holding the identifier returned by ``process_file``.
        Rows follow the sorted order of the entry names. For an empty
        directory an empty DataFrame with only the ``id`` column is returned.
    """
    # os.listdir yields entries in arbitrary order; sort so the row order is
    # reproducible across runs and machines.
    ids = sorted(os.listdir(dirname))
    if not ids:
        # With no results, `zip(*results)` cannot be unpacked into two names;
        # return an empty table that still exposes the `id` column instead.
        return pd.DataFrame({'id': pd.Series(dtype='object')})
    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so results line up with `ids`.
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    data = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    data['id'] = indexes
    return data

In [4]:
# Summarise every training series into one row of flattened describe() statistics plus its id.
train_parquet = load_time_series("../data/input/series_train.parquet")

100%|██████████| 996/996 [00:33<00:00, 30.06it/s]


In [5]:
# Distribution of each stat_* feature across all loaded series.
# The stat_* columns are the row-major flattening of each file's describe() table,
# so the leading columns are the per-column `count` values of a series; presumably
# they are identical here because the columns of a file have no missing values
# (verify against the raw data).
train_parquet.describe()

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,...,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
count,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,...,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0
mean,315832.478916,315832.478916,315832.478916,315832.478916,315832.478916,315832.478916,315832.478916,315832.478916,315832.478916,315832.478916,...,1.875645,3.674156,88.888246,0.650602,2335.652809,4180.122239,86289800000000.0,6.933735,2.615462,78.834337
std,133011.574731,133011.574731,133011.574731,133011.574731,133011.574731,133011.574731,133011.574731,133011.574731,133011.574731,133011.574731,...,0.911899,1.459785,3.313411,0.477019,898.044846,93.299368,898497100000.0,0.504877,1.158635,86.447984
min,927.0,927.0,927.0,927.0,927.0,927.0,927.0,927.0,927.0,927.0,...,0.167161,0.140138,11.565893,0.0,26.5,3996.0,69805000000000.0,2.0,1.0,-119.0
25%,253592.75,253592.75,253592.75,253592.75,253592.75,253592.75,253592.75,253592.75,253592.75,253592.75,...,1.230632,2.790156,88.972979,0.0,2512.600098,4170.0,86395000000000.0,7.0,2.0,30.0
50%,383544.0,383544.0,383544.0,383544.0,383544.0,383544.0,383544.0,383544.0,383544.0,383544.0,...,1.740934,3.806256,89.377281,1.0,2613.625,4180.0,86395000000000.0,7.0,3.0,51.0
75%,402597.0,402597.0,402597.0,402597.0,402597.0,402597.0,402597.0,402597.0,402597.0,402597.0,...,2.234637,4.593709,89.651743,1.0,2637.0,4187.0,86395000000000.0,7.0,4.0,90.0
max,756212.0,756212.0,756212.0,756212.0,756212.0,756212.0,756212.0,756212.0,756212.0,756212.0,...,8.125557,11.3262,89.98114,1.0,20445.5,6000.0,86395000000000.0,7.0,4.0,748.0


In [6]:
# Peek at the first rows; `id` holds the text after '=' in each partition directory name.
train_parquet.head()

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,...,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,id
0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,...,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0,00115b9f
1,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,...,2.952888,89.476036,1.0,2597.800049,4175.0,86395000000000.0,7.0,3.0,91.0,001f3379
2,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,...,4.041167,89.862823,1.0,2611.0,4187.0,86395000000000.0,7.0,2.0,38.0,00f332d1
3,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,...,4.097965,88.478432,1.0,2650.5,4184.0,86395000000000.0,7.0,4.0,61.0,01085eb3
4,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,...,6.307294,88.389679,0.0,2618.0,4157.0,86395000000000.0,7.0,4.0,2.0,012cadd8


In [7]:
# List the generated feature names (the positional stat_* columns followed by `id`).
train_parquet.columns

Index(['stat_0', 'stat_1', 'stat_2', 'stat_3', 'stat_4', 'stat_5', 'stat_6',
       'stat_7', 'stat_8', 'stat_9', 'stat_10', 'stat_11', 'stat_12',
       'stat_13', 'stat_14', 'stat_15', 'stat_16', 'stat_17', 'stat_18',
       'stat_19', 'stat_20', 'stat_21', 'stat_22', 'stat_23', 'stat_24',
       'stat_25', 'stat_26', 'stat_27', 'stat_28', 'stat_29', 'stat_30',
       'stat_31', 'stat_32', 'stat_33', 'stat_34', 'stat_35', 'stat_36',
       'stat_37', 'stat_38', 'stat_39', 'stat_40', 'stat_41', 'stat_42',
       'stat_43', 'stat_44', 'stat_45', 'stat_46', 'stat_47', 'stat_48',
       'stat_49', 'stat_50', 'stat_51', 'stat_52', 'stat_53', 'stat_54',
       'stat_55', 'stat_56', 'stat_57', 'stat_58', 'stat_59', 'stat_60',
       'stat_61', 'stat_62', 'stat_63', 'stat_64', 'stat_65', 'stat_66',
       'stat_67', 'stat_68', 'stat_69', 'stat_70', 'stat_71', 'stat_72',
       'stat_73', 'stat_74', 'stat_75', 'stat_76', 'stat_77', 'stat_78',
       'stat_79', 'stat_80', 'stat_81', 'stat_82',