In [1]:
import os
import warnings
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

warnings.filterwarnings('ignore')


In [2]:
def process_file(filename, dirname):
    """Summarise one series partition as a flat vector of descriptive statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, orders the rows by the
    ``step`` column, discards that column, and runs ``DataFrame.describe`` on
    what remains.

    Parameters
    ----------
    filename : str
        Name of the partition entry inside ``dirname``; expected to look like
        ``<key>=<value>`` because the part after the first ``=`` is returned
        as the identifier.
    dirname : str
        Directory that contains the partition entry.

    Returns
    -------
    tuple
        ``(stats, series_id)`` where ``stats`` is the 1-D, row-major flattening
        of the ``describe`` table (count, mean, std, min, the 5/25/50/75/95th
        percentiles, max -- each across all remaining columns) and
        ``series_id`` is the text between the first and second ``=`` of
        ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path)

    # Order by step first, then remove it so it is not summarised as a feature.
    ordered = series.sort_values(by='step', ascending=True)
    features = ordered.drop(columns='step')

    summary = features.describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
    series_id = filename.split('=')[1]
    return summary.values.reshape(-1), series_id

def load_time_series(dirname):
    """Build a table with one row of summary statistics per series partition.

    Every ``<key>=<id>`` entry of ``dirname`` is handed to ``process_file`` on
    a thread pool. The flattened statistics it returns become the columns
    ``stat_0 .. stat_{k-1}`` and the identifier becomes the ``id`` column.

    Parameters
    ----------
    dirname : str
        Directory whose entries are named ``<key>=<id>``.

    Returns
    -------
    pandas.DataFrame
        One row per partition, ordered by entry name, with the ``stat_*``
        columns followed by ``id``.

    Raises
    ------
    ValueError
        If ``dirname`` holds no ``<key>=<id>`` entries.
    """
    # os.listdir returns entries in arbitrary order, so sort them to make the
    # row order reproducible between runs and machines. Entries without '='
    # (hidden files, marker files, ...) are skipped: process_file cannot derive
    # an id from them and would fail with an IndexError.
    ids = sorted(name for name in os.listdir(dirname) if '=' in name)
    if not ids:
        # Fail with an explicit message instead of the opaque unpacking error
        # that zip(*[]) would otherwise produce below.
        raise ValueError(f"no 'key=value' partition entries found in {dirname!r}")

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the order of `ids`, so rows stay
        # aligned with the sorted entry names.
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    stats, indexes = zip(*results)
    # Column count is taken from the first partition's statistics vector.
    data = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    data['id'] = indexes
    return data

In [3]:
# Build the per-series summary-statistics tables for the train and test
# partitions (one row per series id).
train_series_dir = "../data/input/series_train.parquet"
test_series_dir = "../data/input/series_test.parquet"

train_parquet = load_time_series(train_series_dir)
test_parquet = load_time_series(test_series_dir)

100%|██████████| 996/996 [00:34<00:00, 28.71it/s]
100%|██████████| 2/2 [00:00<00:00, 14.49it/s]


In [4]:

# Sanity check: (rows, columns) of the train and test feature tables.
train_parquet.shape, test_parquet.shape

((996, 121), (2, 121))

In [5]:
# Total number of missing cells in each table (summed over all columns).
train_parquet.isna().sum().sum(), test_parquet.isna().sum().sum()

(0, 0)

In [6]:
# Display the train feature table (stat_* columns plus the series id).
train_parquet

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,...,stat_111,stat_112,stat_113,stat_114,stat_115,stat_116,stat_117,stat_118,stat_119,id
0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,...,4.004276,89.751656,0.0,2633.250000,4188.5,8.611000e+13,7.0,3.0,85.0,00115b9f
1,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,396396.0,...,2.952888,89.476036,1.0,2597.800049,4175.0,8.639500e+13,7.0,3.0,91.0,001f3379
2,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,414384.0,...,4.041167,89.862823,1.0,2611.000000,4187.0,8.639500e+13,7.0,2.0,38.0,00f332d1
3,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,370956.0,...,4.097965,88.478432,1.0,2650.500000,4184.0,8.639500e+13,7.0,4.0,61.0,01085eb3
4,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,97048.0,...,6.307294,88.389679,0.0,2618.000000,4157.0,8.639500e+13,7.0,4.0,2.0,012cadd8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,426684.0,426684.0,426684.0,426684.0,426684.0,426684.0,426684.0,426684.0,426684.0,426684.0,...,4.031051,89.431595,1.0,2527.199951,4183.0,8.639500e+13,7.0,1.0,33.0,fe9c71d8
992,414300.0,414300.0,414300.0,414300.0,414300.0,414300.0,414300.0,414300.0,414300.0,414300.0,...,1.179653,89.310356,1.0,304.000000,4181.0,8.639500e+13,7.0,1.0,24.0,fecc07d6
993,384900.0,384900.0,384900.0,384900.0,384900.0,384900.0,384900.0,384900.0,384900.0,384900.0,...,3.683850,89.768860,0.0,2644.250000,4176.0,8.639500e+13,7.0,3.0,-13.0,ff18b749
994,416275.0,416275.0,416275.0,416275.0,416275.0,416275.0,416275.0,416275.0,416275.0,416275.0,...,3.078876,89.693832,1.0,2605.750000,4185.0,8.639500e+13,7.0,1.0,72.0,ffcd4dbd


In [7]:
# List the contents of the data directory via the shell.
!ls ../data/


features  input


In [8]:
# Number of columns per dtype in each table.
train_parquet.dtypes.value_counts(), test_parquet.dtypes.value_counts()

(float64    120
 object       1
 Name: count, dtype: int64,
 float64    120
 object       1
 Name: count, dtype: int64)

In [None]:
# Standardise every statistic column using parameters fitted on the train
# table only, then apply that same fitted transform to the test table.
# 'id' is an identifier rather than a feature, so it is left untouched.
# The feature column list is computed once from train and reused for test so
# both tables are scaled over the same columns in the same order.
feature_cols = [col for col in train_parquet.columns if col != 'id']

scaler = StandardScaler()
train_parquet.loc[:, feature_cols] = scaler.fit_transform(train_parquet.loc[:, feature_cols])
test_parquet.loc[:, feature_cols] = scaler.transform(test_parquet.loc[:, feature_cols])

# Re-check the dtype counts after the in-place assignment.
train_parquet.dtypes.value_counts(), test_parquet.dtypes.value_counts()

(float64    120
 object       1
 Name: count, dtype: int64,
 float64    120
 object       1
 Name: count, dtype: int64)

In [10]:
# Preview the first five rows of the train table.
train_parquet.head()

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,...,stat_111,stat_112,stat_113,stat_114,stat_115,stat_116,stat_117,stat_118,stat_119,id
0,-2.049742,-2.049742,-2.049742,-2.049742,-2.049742,-2.049742,-2.049742,-2.049742,-2.049742,-2.049742,...,0.226256,0.260711,-1.364576,0.33155,0.08984,-0.200212,0.131316,0.332056,0.071358,00115b9f
1,0.605992,0.605992,0.605992,0.605992,0.605992,0.605992,0.605992,0.605992,0.605992,0.605992,...,-0.49434,0.177486,0.732828,0.292056,-0.054929,0.117144,0.131316,0.332056,0.140799,001f3379
2,0.741297,0.741297,0.741297,0.741297,0.741297,0.741297,0.741297,0.741297,0.741297,0.741297,...,0.251541,0.294279,0.732828,0.306761,0.073754,0.117144,0.131316,-0.531463,-0.472595,00f332d1
3,0.414635,0.414635,0.414635,0.414635,0.414635,0.414635,0.414635,0.414635,0.414635,0.414635,...,0.290468,-0.123746,0.732828,0.350768,0.041583,0.117144,0.131316,1.195574,-0.206405,01085eb3
4,-1.645679,-1.645679,-1.645679,-1.645679,-1.645679,-1.645679,-1.645679,-1.645679,-1.645679,-1.645679,...,1.804691,-0.150545,-1.364576,0.31456,-0.247953,0.117144,0.131316,1.195574,-0.889239,012cadd8


In [11]:
# Persist both feature tables as CSV (without the row index).
# Create the output directory first so a fresh checkout, where it may not
# exist yet, does not fail at the very last step of the notebook.
os.makedirs("../data/features", exist_ok=True)
train_parquet.to_csv("../data/features/train_time_series.csv", index=False)
test_parquet.to_csv("../data/features/test_time_series.csv", index=False)