In [1]:
import numpy as np
import pandas as pd
import json

import tsfresh
from tsfresh import extract_features
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.utilities.distribution import MultiprocessingDistributor

labels = [5, 6, 7]
basedir = 'D:/lalamove/lalamove/data/Clean_extracted_240115_uncal/train'
with open(basedir + '/../data_split_params.json', 'r') as file:
    features = json.load(file)['FEATURES']
print(features)
WINDOW = 5 # seconds
SAMPLING_RATE = 20 # hz
window_length = WINDOW * SAMPLING_RATE

{'5': ['z_gyro_clean', 'y_gyro_clean', 'x_gyro_clean', 'x_acc_clean', 'y_acc_clean', 'z_acc_clean', 'acceleration', 'speed_kmh'], '6': ['z_gyro_clean', 'y_gyro_clean', 'x_gyro_clean', 'x_acc_clean', 'y_acc_clean', 'z_acc_clean', 'acceleration', 'speed_kmh'], '7': ['z_gyro_clean', 'y_gyro_clean', 'x_gyro_clean', 'x_acc_clean', 'y_acc_clean', 'z_acc_clean', 'acceleration', 'speed_kmh']}




In [8]:
for label_title in labels:
    # load our data shape (sample_id, timestamp, time series)
    data_file = basedir + f'/{label_title}/chunk_1.npy'
    data = np.load(data_file)
    print('x', data.shape)

    label_file = basedir + f'/{label_title}/train_label_{label_title}.csv'
    label = pd.read_csv(label_file)

    pos = label[(label.label == 1) & (label.type == 0)]
    pos_1 = label[(label.label == 1) & (label.type == 1)].sample(n=len(pos)*2, replace=False, random_state=123)
    neg = label[label.label == 0].sample(n=len(pos)*3, replace=False, random_state=123)
    label_ = pd.concat((pos, neg)).reset_index().label
    data_ = data[label_.index]

    print('y', label.shape, label_.shape, data_.shape)

    # Create a list of column names for the DataFrame
    column_names = ['id', 'timestamp'] + features[str(label_title)]

    # Reshape the numpy array to have 2 dimensions
    reshaped_array = data_.reshape(-1, data_.shape[2])

    # Create a pandas DataFrame
    my_df = pd.DataFrame(reshaped_array, columns=column_names[2:])
    print(my_df.shape)

    # Assign sample ids and timestamps to the DataFrame
    num_samples = data_.shape[0]
    num_timestamps = data_.shape[1]
    my_df['id'] = np.repeat(np.arange(num_samples), num_timestamps)
    my_df['timestamp'] = np.tile(np.arange(num_timestamps), num_samples)
    # my_df = my_df[column_names]

    # Print the resulting DataFrame
    Distributor = MultiprocessingDistributor(n_workers=4,
                                            disable_progressbar=False,
                                            progressbar_title="Feature Extraction")
    
              
    print('Extracting feature only...', len(my_df))
    # extract all possible features
    temp = extract_features(my_df, 
                            column_id='id', 
                            column_sort='timestamp', 
                            column_kind=None, 
                            column_value=None, 
                            impute_function=impute)    
    
    # calculate independent p-value for relevance
    relevance_table = calculate_relevance_table(temp, label_, fdr_level=0.5)

    # filter relevant features (True if the Benjamini Hochberg procedure rejected the null hypothesis [the feature is not relevant])
    relevance_table = relevance_table[relevance_table['relevant'] == True]

    # Among relevant features, sort out top window_length0 with smallest p-values
    sorted_relevance_table = relevance_table.sort_values(by='p_value', ascending=True) 
    top_extracted_features = sorted_relevance_table.head(500)
    temp = temp[top_extracted_features.feature]

    # save the features setting
    kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(temp)
    with open(basedir + f'/{label_title}/feature_config.json', 'w') as file:
        json.dump(kind_to_fc_parameters, file, indent=4)
    print(kind_to_fc_parameters)
    
    ###########################################
    # extract features using the saved features
    label_ = label.label
    data_ = data

    print('y', label.shape, label_.shape, data_.shape)

    # Create a list of column names for the DataFrame
    column_names = ['id', 'timestamp'] + features[str(label_title)]

    # Reshape the numpy array to have 2 dimensions
    reshaped_array = data_.reshape(-1, data_.shape[2])

    # Create a pandas DataFrame
    my_df = pd.DataFrame(reshaped_array, columns=column_names[2:])
    print(my_df.shape)

    # Assign sample ids and timestamps to the DataFrame
    num_samples = data_.shape[0]
    num_timestamps = data_.shape[1]
    my_df['id'] = np.repeat(np.arange(num_samples), num_timestamps)
    my_df['timestamp'] = np.tile(np.arange(num_timestamps), num_samples)
    # my_df = my_df[column_names]

    # Print the resulting DataFrame
    print(my_df.shape)
    # print(my_df.head(4))

    step = 30_000
    df_features = []
    i = 0
    for n in range(step, len(label)+step, step):
        print(n)
        print(my_df[(n-step)*window_length:n*window_length].id.nunique())
        print(label[n-step:n].shape)
        temp = extract_features(my_df[(n-step)*window_length:n*window_length], 
                                kind_to_fc_parameters=kind_to_fc_parameters, 
                                column_id='id', 
                                column_sort='timestamp', 
                                column_kind=None, 
                                column_value=None, 
                                impute_function=impute)
        temp.to_csv(basedir + f'/{label_title}/extract_features_{label_title}_{i}.csv')
        print('extracted shape', temp.shape)
        i += 1

x (381607, 100, 8)
y (381607, 6) (16448,) (16448, 100, 8)
(1644800, 8)
Extracting feature only... 1644800


Feature Extraction: 100%|██████████| 80/80 [07:34<00:00,  5.69s/it] 


{'acceleration': {'number_peaks': [{'n': 3}, {'n': 1}, {'n': 10}, {'n': 5}], 'approximate_entropy': [{'m': 2, 'r': 0.9}, {'m': 2, 'r': 0.7}, {'m': 2, 'r': 0.5}, {'m': 2, 'r': 0.3}, {'m': 2, 'r': 0.1}], 'permutation_entropy': [{'dimension': 5, 'tau': 1}, {'dimension': 6, 'tau': 1}, {'dimension': 7, 'tau': 1}, {'dimension': 4, 'tau': 1}, {'dimension': 3, 'tau': 1}], 'sample_entropy': None, 'lempel_ziv_complexity': [{'bins': 3}, {'bins': 5}, {'bins': 2}, {'bins': 10}], 'fourier_entropy': [{'bins': 2}, {'bins': 100}, {'bins': 5}, {'bins': 3}, {'bins': 10}], 'cid_ce': [{'normalize': True}, {'normalize': False}], 'benford_correlation': None, 'fft_aggregated': [{'aggtype': 'variance'}, {'aggtype': 'kurtosis'}, {'aggtype': 'skew'}, {'aggtype': 'centroid'}], 'augmented_dickey_fuller': [{'attr': 'teststat', 'autolag': 'AIC'}, {'attr': 'pvalue', 'autolag': 'AIC'}, {'attr': 'usedlag', 'autolag': 'AIC'}], 'autocorrelation': [{'lag': 9}, {'lag': 8}, {'lag': 7}, {'lag': 6}, {'lag': 5}, {'lag': 4}], '

Feature Extraction: 100%|██████████| 80/80 [02:32<00:00,  1.90s/it]


extracted shape (30000, 500)
60000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [02:31<00:00,  1.90s/it]


extracted shape (30000, 500)
90000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [02:30<00:00,  1.89s/it]


extracted shape (30000, 500)
120000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:13<00:00,  2.41s/it]


extracted shape (30000, 500)
150000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:29<00:00,  2.62s/it]


extracted shape (30000, 500)
180000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:33<00:00,  2.67s/it]


extracted shape (30000, 500)
210000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:32<00:00,  2.65s/it]


extracted shape (30000, 500)
240000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:33<00:00,  2.67s/it]


extracted shape (30000, 500)
270000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:30<00:00,  2.63s/it]


extracted shape (30000, 500)
300000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:33<00:00,  2.67s/it]


extracted shape (30000, 500)
330000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:35<00:00,  2.69s/it]


extracted shape (30000, 500)
360000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:36<00:00,  2.71s/it]


extracted shape (30000, 500)
390000
21607
(21607, 6)


Feature Extraction: 100%|██████████| 80/80 [02:13<00:00,  1.67s/it]


extracted shape (21607, 500)
x (389287, 100, 8)
y (389287, 6) (17456,) (17456, 100, 8)
(1745600, 8)
Extracting feature only... 1745600


Feature Extraction: 100%|██████████| 80/80 [06:33<00:00,  4.91s/it] 


{'z_acc_clean': {'quantile': [{'q': 0.1}, {'q': 0.3}, {'q': 0.2}, {'q': 0.4}, {'q': 0.6}, {'q': 0.7}, {'q': 0.8}, {'q': 0.9}], 'count_above': [{'t': 0}], 'count_below': [{'t': 0}], 'range_count': [{'max': 0, 'min': -1000000000000.0}, {'max': 1000000000000.0, 'min': 0}], 'fft_coefficient': [{'attr': 'angle', 'coeff': 0}, {'attr': 'real', 'coeff': 0}, {'attr': 'abs', 'coeff': 0}, {'attr': 'abs', 'coeff': 37}, {'attr': 'abs', 'coeff': 38}, {'attr': 'abs', 'coeff': 40}, {'attr': 'abs', 'coeff': 39}, {'attr': 'abs', 'coeff': 35}, {'attr': 'abs', 'coeff': 42}, {'attr': 'abs', 'coeff': 46}, {'attr': 'abs', 'coeff': 47}, {'attr': 'abs', 'coeff': 25}, {'attr': 'abs', 'coeff': 43}, {'attr': 'abs', 'coeff': 49}, {'attr': 'abs', 'coeff': 45}, {'attr': 'abs', 'coeff': 26}, {'attr': 'abs', 'coeff': 41}, {'attr': 'abs', 'coeff': 48}, {'attr': 'abs', 'coeff': 34}, {'attr': 'abs', 'coeff': 44}, {'attr': 'abs', 'coeff': 1}, {'attr': 'abs', 'coeff': 21}, {'attr': 'abs', 'coeff': 8}, {'attr': 'abs', 'coef

Feature Extraction: 100%|██████████| 80/80 [03:30<00:00,  2.63s/it]


extracted shape (30000, 500)
60000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:28<00:00,  2.61s/it]


extracted shape (30000, 500)
90000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:29<00:00,  2.62s/it]


extracted shape (30000, 500)
120000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:26<00:00,  2.58s/it]


extracted shape (30000, 500)
150000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:27<00:00,  2.60s/it]


extracted shape (30000, 500)
180000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:28<00:00,  2.61s/it]


extracted shape (30000, 500)
210000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:28<00:00,  2.60s/it]


extracted shape (30000, 500)
240000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:26<00:00,  2.58s/it]


extracted shape (30000, 500)
270000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:16<00:00,  2.45s/it]


extracted shape (30000, 500)
300000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:29<00:00,  2.62s/it]


extracted shape (30000, 500)
330000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:29<00:00,  2.62s/it]


extracted shape (30000, 500)
360000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:27<00:00,  2.59s/it]


extracted shape (30000, 500)
390000
29287
(29287, 6)


Feature Extraction: 100%|██████████| 80/80 [03:19<00:00,  2.49s/it]


extracted shape (29287, 500)
x (381067, 100, 8)
y (381067, 6) (16364,) (16364, 100, 8)
(1636400, 8)
Extracting feature only... 1636400


Feature Extraction: 100%|██████████| 80/80 [07:31<00:00,  5.64s/it] 


{'z_acc_clean': {'fft_coefficient': [{'attr': 'angle', 'coeff': 0}, {'attr': 'real', 'coeff': 0}, {'attr': 'abs', 'coeff': 0}, {'attr': 'abs', 'coeff': 47}, {'attr': 'abs', 'coeff': 37}], 'range_count': [{'max': 1000000000000.0, 'min': 0}, {'max': 0, 'min': -1000000000000.0}], 'count_below': [{'t': 0}], 'count_above': [{'t': 0}], 'sum_values': None, 'mean': None, 'median': None, 'quantile': [{'q': 0.3}, {'q': 0.4}, {'q': 0.6}, {'q': 0.2}, {'q': 0.7}, {'q': 0.8}, {'q': 0.1}, {'q': 0.9}], 'agg_linear_trend': [{'attr': 'intercept', 'chunk_len': 50, 'f_agg': 'mean'}, {'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'mean'}, {'attr': 'intercept', 'chunk_len': 5, 'f_agg': 'mean'}, {'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'max'}, {'attr': 'intercept', 'chunk_len': 50, 'f_agg': 'max'}, {'attr': 'intercept', 'chunk_len': 5, 'f_agg': 'max'}, {'attr': 'intercept', 'chunk_len': 5, 'f_agg': 'min'}, {'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'min'}, {'attr': 'intercept', 'chunk_len': 50, 

Feature Extraction: 100%|██████████| 80/80 [03:14<00:00,  2.44s/it]


extracted shape (30000, 500)
60000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:14<00:00,  2.44s/it]


extracted shape (30000, 500)
90000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:16<00:00,  2.45s/it]


extracted shape (30000, 500)
120000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:14<00:00,  2.44s/it]


extracted shape (30000, 500)
150000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:15<00:00,  2.45s/it]


extracted shape (30000, 500)
180000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:17<00:00,  2.46s/it]


extracted shape (30000, 500)
210000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:18<00:00,  2.48s/it]


extracted shape (30000, 500)
240000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:16<00:00,  2.45s/it]


extracted shape (30000, 500)
270000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:13<00:00,  2.41s/it]


extracted shape (30000, 500)
300000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:15<00:00,  2.45s/it]


extracted shape (30000, 500)
330000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:15<00:00,  2.45s/it]


extracted shape (30000, 500)
360000
30000
(30000, 6)


Feature Extraction: 100%|██████████| 80/80 [03:18<00:00,  2.48s/it]


extracted shape (30000, 500)
390000
21067
(21067, 6)


Feature Extraction: 100%|██████████| 80/80 [02:17<00:00,  1.72s/it]


extracted shape (21067, 500)
