In [1]:
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
from tsfresh import extract_features
from tsfresh.feature_extraction.feature_calculators import set_property
import pycatch22

from autogluon.tabular import TabularDataset, TabularPredictor

import sklearn.metrics as skm
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import random
import json

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:

random_seed = 29

old_class_list = [
    '1__single', 
    # '1__single_outside', 
    '1__double',
    '1__double_outside',
    '1__double_inside',
    '0__single',
    # '0__single_outside',
    '0__double',
    '0__double_outside',
    '0__double_inside',
    'mm__single',
    # 'mm__single_outside',
    # 'mm__single_outside__add',
    'mm__double',
    'mm__double_outside',
    'mm__double_inside',
]

old_class_short_list = [
    '1_sgl', 
    # '1_sgl_out', 
    '1_dbl',
    '1_dbl_out',
    '1_dbl_in',
    '0_sgl',
    # '0_sgl_out',
    '0_dbl',
    '0_dbl_out',
    '0_dbl_in',
    'mm_sgl',
    # 'mm_sgl_out',
    'mm_dbl',
    'mm_dbl_out',
    'mm_dbl_in',
]

old_class2new_class = {
    '1__single': 'first', 
    # '1__single_outside': 'first', 
    '1__double': 'second',
    '1__double_outside': 'second',
    '1__double_inside': 'second',
    '0__single': 'zero',
    # '0__single_outside': 'zero',
    '0__double': 'first',
    '0__double_outside': 'first',
    '0__double_inside': 'first',
    'mm__single': 'mm_1',
    # 'mm__single_outside': 'mm_1_out',
    # 'mm__single_outside__add': 'mm_1_out',
    'mm__double': 'mm_2',
    'mm__double_outside': 'mm_2',
    'mm__double_inside': 'mm_2',
}

class_list = [
    'zero',
    'first',
    'second',
    'mm_1',
    # 'mm_1_out',
    'mm_2',
]

In [4]:
def collect_data(cat_conc='norm_cat', random_seed=42):
    data_root = os.path.join(f'../Ye/previous/{cat_conc}/')
    task_names = [
        '1__single', 
        # '1__single_outside', 
        '1__double',
        '1__double_outside',
        '1__double_inside',
        '0__single',
        # '0__single_outside',
        '0__double',
        '0__double_outside',
        '0__double_inside',
        'mm__single',
        # 'mm__single_outside',
        # 'mm__single_outside__add',
        'mm__double',
        'mm__double_outside',
        'mm__double_inside',
    ]
    for task_name in tqdm(task_names):
        task_data_root = os.path.join(data_root, task_name)
        save_root = os.path.join('data', cat_conc, 'raw')
        if not os.path.exists(save_root):
            os.makedirs(save_root)

        class_name = old_class2new_class[task_name]
        new_class_data_path = os.path.join(save_root, f'{class_name}__all.csv')
        if os.path.exists(new_class_data_path):
            df_new_class = pd.read_csv(new_class_data_path)
        else:
            df_new_class = pd.DataFrame()

        data_column = ['id']
        for i in range(30):
            data_column += [f's{i}']
        for i in range(30):
            data_column += [f'p{i}']
        data = pd.DataFrame(columns=data_column)

        for file in os.listdir(task_data_root):
            if file.endswith('.json'):
                with open(os.path.join(task_data_root, file), 'r') as f:
                    task_data = json.load(f)
                
                id = f'{class_name}-{task_name}-{file[:-5]}'
                data.loc[data.shape[0]] = [id] + task_data['s'] + task_data['p']

        # data.to_csv(os.path.join(save_root, f'{task_name}__all.csv'), index=False)
        data_noDup = data.drop_duplicates('id')
        assert data_noDup.shape[0] == data.shape[0]
        print(f'{task_name}: {data.shape[0]}')

        df_new_class = pd.concat([df_new_class, data], ignore_index=True)
        df_new_class.to_csv(new_class_data_path, index=False)



# def check_A_a_B(data):
#     A = data['A']
#     a = data['a']
#     B = data['B']
#     return (a+B)/A < 0.005

# def save_data_file(task_name, save_root='data', data_size=10000, random_seed=0):
#     # print('task_name:', task_name)
#     data = collect_data(task_name)
#     data_noDup = data.drop_duplicates()
#     assert data_noDup.shape[0] == data.shape[0]
#     data.to_csv(os.path.join(save_root, f'{task_name}_data_all.csv'), index=False)
#     data = data.sample(n=data_size, random_state=random_seed)
#     data.to_csv(os.path.join(save_root, f'{task_name}_data_{data_size}.csv'), index=False)

In [5]:
save_root = 'data'
if not os.path.exists(save_root):
    os.makedirs(save_root)
    
collect_data('norm_cat', random_seed)

  8%|▊         | 1/12 [00:18<03:23, 18.52s/it]

1__single: 11404


 17%|█▋        | 2/12 [00:35<02:55, 17.59s/it]

1__double: 10710
1__double_outside: 11156


 25%|██▌       | 3/12 [00:53<02:39, 17.73s/it]

1__double_inside: 13332


 42%|████▏     | 5/12 [01:37<02:23, 20.53s/it]

0__single: 13671
0__double: 12214


 50%|█████     | 6/12 [01:57<02:01, 20.32s/it]

0__double_outside: 20369


 58%|█████▊    | 7/12 [02:34<02:07, 25.56s/it]

0__double_inside: 12902


 75%|███████▌  | 9/12 [03:12<01:06, 22.01s/it]

mm__single: 10742


 83%|████████▎ | 10/12 [03:29<00:40, 20.42s/it]

mm__double: 10728
mm__double_outside: 10772


 92%|█████████▏| 11/12 [03:46<00:19, 19.44s/it]

mm__double_inside: 11022


100%|██████████| 12/12 [04:04<00:00, 20.38s/it]


In [6]:
def stratified_sample(df, col, n_samples):
    return df.groupby(col, group_keys=False).apply(lambda x: x.sample(min(len(x), n_samples), random_state=random_seed)).reset_index(drop=True)

data_size_for_tasks = {
    '0__single': 10000,
    # '0__single': 5000,
    # '0__single_outside': 5000,

    'mm__single': 10000,

    # 'mm__single_outside': 10000,
    # 'mm__single_outside__add': 200,

    '1__single': 3220, 
    # '1__single_outside': 2415, 
    '0__double': 3220,
    '0__double_outside': 3220,
    '0__double_inside': 340,

    'mm__double': 3334,
    'mm__double_outside': 3333,
    'mm__double_inside': 3333,
    
    '1__double': 3334,
    '1__double_outside': 3333,
    '1__double_inside': 3333,
}
cat_conc = 'norm_cat'
for class_name in tqdm(class_list):
    data_path = os.path.join('data', cat_conc, 'raw', f'{class_name}__all.csv')
    data = pd.read_csv(data_path)
    data['old_class'] = data['id'].apply(lambda x: x.split('-')[1])
    sampled_data = pd.DataFrame()
    for old_class in data['old_class'].unique().tolist():
        print(old_class)
        data_ = data[data['old_class'] == old_class].sample(n=data_size_for_tasks[old_class], random_state=random_seed)
        sampled_data = pd.concat([sampled_data, data_], ignore_index=True)
    print(class_name)
    print(sampled_data.shape)
    print(sampled_data.drop_duplicates('id').shape)
    sampled_data.to_csv(os.path.join('data', cat_conc, 'raw', f'{class_name}__10k.csv'), index=False)
        # break
    # break


 20%|██        | 1/5 [00:00<00:00,  5.07it/s]

0__single
zero
(10000, 62)
(10000, 62)


 40%|████      | 2/5 [00:00<00:00,  3.91it/s]

1__single
0__double
0__double_outside
0__double_inside
first
(10000, 62)
(10000, 62)


 60%|██████    | 3/5 [00:00<00:00,  3.98it/s]

1__double
1__double_outside
1__double_inside
second
(10000, 62)
(10000, 62)
mm__single
mm_1
(10000, 62)
(10000, 62)


100%|██████████| 5/5 [00:01<00:00,  4.33it/s]

mm__double
mm__double_outside
mm__double_inside
mm_2
(10000, 62)
(10000, 62)





In [7]:
# add error
def add_error(data:pd.DataFrame, random_seed=42, ignore_cols=['id', 'class', 'label', 'old_class', 'cat_conc']):
    error_list = [0, 0.5, 1, 2]
    data_ = data.copy()
    for col in ignore_cols:
        if col in data_.columns:
            data_ = data_.drop(columns=[col])
    random.seed(random_seed)
    for i in range(len(data_)):
        error = random.choice(error_list)
        # print(error)
        data_.loc[i] += [random.gauss(0, error / 100) for i in range(data_.shape[1])]
    for col in ignore_cols:
        if col in data.columns:
            data_[col] = data[col]
    # rearrange columns
    cols = data.columns.tolist()
    data_ = data_[cols]
    return data_

In [8]:
# concat data, add error and split train_val test
for cat_conc in ['norm_cat']:
    data_root = os.path.join(f'data/{cat_conc}/raw/')
    data = pd.DataFrame()
    for task_name in tqdm(class_list):
        data_path = os.path.join(data_root, f'{task_name}__10k.csv')
        data_tmp = pd.read_csv(data_path)
        # data_tmp.drop(columns=['old_class'], inplace=True)
        data_tmp['class'] = task_name
        data = pd.concat([data, data_tmp], axis=0, ignore_index=True)
    print(cat_conc, data.shape)
    data = add_error(data)
    print('first:', data[data['class'] == 'first'].shape)
    print('second:', data[data['class'] == 'second'].shape)
    print('zero:', data[data['class'] == 'zero'].shape)
    print('mm_1:', data[data['class'] == 'mm_1'].shape)
    # print('mm_1_out:', data[data['class'] == 'mm_1_out'].shape)
    print('mm_2:', data[data['class'] == 'mm_2'].shape)
    # break
    train_val_data, test_data = train_test_split(data, test_size=0.1, random_state=random_seed, stratify=data['class'])
    train_val_data.to_csv(os.path.join('data', cat_conc, f'{cat_conc}_train_val.csv'), index=False)
    test_data.to_csv(os.path.join('data', cat_conc, f'{cat_conc}_test.csv'), index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:00<00:00, 34.94it/s]


norm_cat (50000, 63)
first: (10000, 63)
second: (10000, 63)
zero: (10000, 63)
mm_1: (10000, 63)
mm_2: (10000, 63)


In [9]:
df = pd.read_csv('data/norm_cat/norm_cat_train_val.csv')
# df[df['id'].str.contains('0__single_')].shape
# df[df['class'] == 'second'].shape
print(df.shape)
df.head()
# df[df['id'].str.contains('add')]

(45000, 63)


Unnamed: 0,id,s0,s1,s2,s3,s4,s5,s6,s7,s8,...,p22,p23,p24,p25,p26,p27,p28,p29,old_class,class
0,second-1__double-6886,1.014078,0.941292,0.874194,0.831934,0.796985,0.752483,0.710213,0.682713,0.618944,...,0.25149,0.25582,0.273534,0.26947,0.26676,0.267846,0.282937,0.28153,1__double,second
1,mm_2-mm__double-8713,1.0,0.857185,0.740969,0.644123,0.562685,0.493658,0.434723,0.384089,0.340341,...,0.303795,0.304273,0.304664,0.304983,0.305245,0.30546,0.305636,0.305781,mm__double,mm_2
2,first-0__double_outside-275,1.003719,0.943307,0.8438,0.748718,0.696003,0.690904,0.62657,0.550884,0.533442,...,0.400171,0.394868,0.370069,0.398233,0.393137,0.419216,0.387829,0.406436,0__double_outside,first
3,first-0__double_outside-6283,1.0,0.876857,0.769023,0.674643,0.59204,0.519743,0.45646,0.401073,0.35259,...,0.42624,0.42961,0.432611,0.43529,0.437688,0.439839,0.441774,0.443521,0__double_outside,first
4,mm_2-mm__double_outside-288,0.99208,0.95408,0.862516,0.828265,0.802843,0.746227,0.698473,0.675103,0.621021,...,0.237389,0.260133,0.251989,0.260898,0.263738,0.270799,0.270729,0.291732,mm__double_outside,mm_2


In [10]:
@set_property("fctype", "combiner")
def catch22(x, param):
    """
    pycatch22

    :param x: the time series to calculate the feature of
    :type x: pandas.Series
    :return: list of tuples (s, f) where s are the feature name in catch22, serialized as a string,
             and f the respective feature value as bool, int or float
    :return type: pandas.Series
    """
    data = pycatch22.catch22_all(x)

    return [(name, value) for name, value in zip(data['names'], data['values'])]

settings = ComprehensiveFCParameters()
settings[catch22] = None



In [11]:
# fit dataset format for tsfresh
for cat_conc in ['norm_cat']:
    data_root = os.path.join(f'data/{cat_conc}')
    for set_ in ['train_val', 'test']:
        data_path = os.path.join(data_root, f'{cat_conc}_{set_}.csv')
        data = pd.read_csv(data_path)
        data.drop(columns=['old_class'], inplace=True)
        data_tsfresh = pd.DataFrame(columns=['id', 't', 's', 'p'])
        data_tsfresh_ids = []
        data_tsfresh_ts = []
        data_tsfresh_ss = []
        data_tsfresh_ps = []
        for i in tqdm(range(data.shape[0]), desc=set_):
            id = data.loc[i, 'id']
            for j in range(30):
                data_tsfresh_ss.append(data.loc[i, f's{j}'])
                data_tsfresh_ps.append(data.loc[i, f'p{j}'])
            data_tsfresh_ids += [id] * 30
            data_tsfresh_ts += list(range(30))
        data_tsfresh['id'] = data_tsfresh_ids
        data_tsfresh['t'] = data_tsfresh_ts
        data_tsfresh['s'] = data_tsfresh_ss
        data_tsfresh['p'] = data_tsfresh_ps
        data_tsfresh.to_csv(os.path.join(data_root, f'{cat_conc}_{set_}_tsfresh.csv'), index=False)
    


train_val: 100%|██████████| 45000/45000 [00:08<00:00, 5071.29it/s]
test: 100%|██████████| 5000/5000 [00:00<00:00, 5109.46it/s]


In [12]:
for cat_conc in [
    'norm_cat',
]:
    data_root = os.path.join(f'data/{cat_conc}')
    for set_ in ['train_val', 'test']:
        print(cat_conc, set_)
        data_path = os.path.join(data_root, f'{cat_conc}_{set_}_tsfresh.csv')
        data = pd.read_csv(data_path)
        data_feat = extract_features(data, column_id='id', column_sort='t', default_fc_parameters=settings)
        data_feat['id'] = data_feat.index
        data_feat['class'] = data_feat['id'].apply(lambda x: x.split('-')[0])
        data_feat.to_csv(os.path.join(data_root, f'{cat_conc}_{set_}_tsfresh_feat.csv'), index=False)

norm_cat train_val


Feature Extraction: 100%|██████████| 60/60 [03:09<00:00,  3.15s/it]


norm_cat test


Feature Extraction: 100%|██████████| 60/60 [00:20<00:00,  2.91it/s]


In [13]:
for cat_conc in [
    'norm_cat',
]:
    data_root = os.path.join(f'data/{cat_conc}')
    for set_ in ['train_val', 'test']:
        print(cat_conc, set_)
        data_path = os.path.join(data_root, f'{cat_conc}_{set_}.csv')
        data = pd.read_csv(data_path)
        # data.drop(columns=['old_class'], inplace=True)
        # data.to_csv(os.path.join(data_root, f'{cat_conc}_{set_}.csv'), index=False)

norm_cat train_val
norm_cat test
