# Random Forest - baseline

* Referencing: 
> **Bibliography:** Zanella-Calzada, L.A., Galván-Tejada, C.E., Chávez-Lamas, N.M., Gracia-Cortés, M. del C., Magallanes-Quintanar, R., Celaya-Padilla, J.M., Galván-Tejada, J.I. and Gamboa-Rosales, H. (2019) Feature Extraction in Motor Activity Signal: Towards a Depression Episodes Detection in Unipolar and Bipolar Patients. Diagnostics [online]. 9 (1), p. 8. Available from: https://www.mdpi.com/2075-4418/9/1/8 [Accessed 28 November 2023].
* [article notes](../literature/Zanella-FeatureExtraction.md)

Objective: 

* To fit a modified Random Forest model, taking inspiration from the above article

## Plan 

1. Load and process `depresjon`
   * load into pandas df
   * select `control` and `condition` -> it seems that they used first 4 control and first 5 condition participants
   * normalise data (mean = 0, std = 1)
   * remove incomplete cases
   * take only first value of each hour??  maybe mean for each hour

2. Extract features - 14 features
   * mean
   * standard deviation
   * variance
   * trimmed mean
   * coefficient of variation
   * inversse coefficient of variation
   * kurtosis
   * skewness
   * quantailes (1, 5, 25, 75, 95, 99)

3. 

In [96]:
# libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis, mstats
from statsmodels.robust import mad


## Load and preprocess data

### Load data files

In [10]:
def extract_folder(folderpath, add_scores=False, downsample=None):
    """
    Extract CSV data from folder and subfolders into a dataframe.

    Args:
      folderpath (str): Path to the folder containing CSV files.
      add_scores (bool, optional): Boolean to add scores.csv to the dataframe. Defaults to False.
      downsample (int, optional): Number of rows to downsample CSVs to. Defaults to None.

    Returns:
      pandas.DataFrame: DataFrame of concatenated CSV data.
    """

    # Dict to store dataframes by condition  
    dfs = {'control': [], 'condition': []}

    try:
        # Handle top-level scores CSV
        if add_scores and 'scores.csv' in os.listdir(folderpath):
            scores_path = os.path.join(folderpath, 'scores.csv')  
            dfs['scores'] = pd.read_csv(scores_path)

        # Get subfolders
        subfolders = [f for f in os.listdir(folderpath) if os.path.isdir(os.path.join(folderpath, f))]

        for subfolder in subfolders:
            subfolderpath = os.path.join(folderpath, subfolder)  

            # Get list of CSV files
            files = os.listdir(subfolderpath)

            for file in files:
                filepath = os.path.join(subfolderpath, file)

                # Extract ID from filename 
                id = file.split('.')[0]

                df = pd.read_csv(filepath)

                # Downsample if needed
                if downsample:
                    df = df.sample(downsample)

                # Add ID column - this is the filename without the extension
                df['id'] = id

                # Add 'condition' column
                df['condition'] = subfolder

                # Convert 'timestamp' and 'date' to datetime
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df['date'] = pd.to_datetime(df['date'])

                # Append to dict by condition
                if subfolder == 'control':
                    dfs['control'].append(df)
                else:  
                    dfs['condition'].append(df)

    except OSError:
        print(f"Error reading folder: {folderpath}")

    # concatenate dfs for each condition
    dfs['control'] = pd.concat(dfs['control'])
    dfs['condition'] = pd.concat(dfs['condition'])

    # Reset index on the final df
    df = pd.concat([dfs['control'], dfs['condition']]).reset_index(drop=True)

    # add label column
    df['label'] = 0
    df.loc[df['condition'] == 'condition', 'label'] = 1
    
    # remove old 'condition' column
    df.drop('condition', axis=1, inplace=True)

    # Final concat
    return df

In [43]:
# set folder path
folderpath = '../data/depresjon/'
# extract all files
all_files = extract_folder(folderpath)
# print rows 21-24
#print(all_files.iloc[21:25])
print(all_files.head(-5))


                  timestamp       date  activity           id  label
0       2003-03-18 15:00:00 2003-03-18        60    control_1      0
1       2003-03-18 15:01:00 2003-03-18         0    control_1      0
2       2003-03-18 15:02:00 2003-03-18       264    control_1      0
3       2003-03-18 15:03:00 2003-03-18       662    control_1      0
4       2003-03-18 15:04:00 2003-03-18       293    control_1      0
...                     ...        ...       ...          ...    ...
1571696 2004-06-10 14:58:00 2004-06-10         0  condition_9      1
1571697 2004-06-10 14:59:00 2004-06-10         0  condition_9      1
1571698 2004-06-10 15:00:00 2004-06-10         0  condition_9      1
1571699 2004-06-10 15:01:00 2004-06-10         5  condition_9      1
1571700 2004-06-10 15:02:00 2004-06-10         0  condition_9      1

[1571701 rows x 5 columns]


### Select subset

In [63]:
control_subjects = ['control_1', 'control_2', 'control_3', 'control_4']
condition_subjects = ['condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5']

# Filter for control subjects
control_df = all_files[all_files['id'].isin(control_subjects)] 

# Filter for condition subjects
condition_df = all_files[all_files['id'].isin(condition_subjects)]

# Concatenate 
subset_df = pd.concat([control_df, condition_df])

# print the first 5 rows
print(subset_df.head(5), '\n')

# print info
print(subset_df.info(), '\n')

# print random 10 rows
print(subset_df.sample(10), '\n')

# print number of rows by 'id'
print(subset_df['id'].value_counts(), '\n')

# print number of rows by 'label'
print(subset_df['label'].value_counts())

# print proportion of 'label' column
print(subset_df['label'].value_counts(normalize=True))

            timestamp       date  activity         id  label
0 2003-03-18 15:00:00 2003-03-18        60  control_1      0
1 2003-03-18 15:01:00 2003-03-18         0  control_1      0
2 2003-03-18 15:02:00 2003-03-18       264  control_1      0
3 2003-03-18 15:03:00 2003-03-18       662  control_1      0
4 2003-03-18 15:04:00 2003-03-18       293  control_1      0 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306813 entries, 0 to 1488480
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   timestamp  306813 non-null  datetime64[ns]
 1   date       306813 non-null  datetime64[ns]
 2   activity   306813 non-null  int64         
 3   id         306813 non-null  object        
 4   label      306813 non-null  int64         
dtypes: datetime64[ns](2), int64(2), object(1)
memory usage: 14.0+ MB
None 

                  timestamp       date  activity           id  label
307776  2002-10-04 11:01:00 2002-10-04 

### Resample to hourly 

"In the selection of the samples, only the first value of the 60 values acquired during 1 h was kept, equivalent to the minutes correspondent to that time lapse, counting now the activity in intervals of 1 h. This procedure was performed for each hour of the total data."

-> take first value of each hour
-> take mean of hour

#### Version 1 - first value on the hour (as per article?)

In [65]:
# Copy the subset_df DataFrame
subset_first_hour_df = subset_df.copy()

# Set 'timestamp' as index
#subset_first_hour_df = subset_first_hour_df.set_index('timestamp')

# Convert index to datetime 
#subset_first_hour_df.index = pd.to_datetime(subset_first_hour_df.index)

# floor and resample
#subset_first_hour_df.index = subset_first_hour_df.index.floor('H')
#subset_first_hour_df = subset_first_hour_df.resample('H').first()

# floor and resample
subset_first_hour_df['timestamp'] = subset_first_hour_df['timestamp'].dt.floor('H')
subset_first_hour_df = subset_first_hour_df.groupby(['id', 'timestamp']).first().reset_index()

# print the first 5 rows
print(subset_first_hour_df, '\n')

# print distribution of 'label' column
print(subset_first_hour_df['label'].value_counts(normalize=True))


               id           timestamp       date  activity  label
0     condition_1 2003-05-07 12:00:00 2003-05-07         0      1
1     condition_1 2003-05-07 13:00:00 2003-05-07       306      1
2     condition_1 2003-05-07 14:00:00 2003-05-07         0      1
3     condition_1 2003-05-07 15:00:00 2003-05-07        38      1
4     condition_1 2003-05-07 16:00:00 2003-05-07        15      1
...           ...                 ...        ...       ...    ...
5115    control_4 2002-11-18 07:00:00 2002-11-18         5      0
5116    control_4 2002-11-18 08:00:00 2002-11-18       106      0
5117    control_4 2002-11-18 09:00:00 2002-11-18         3      0
5118    control_4 2002-11-18 10:00:00 2002-11-18         3      0
5119    control_4 2002-11-18 11:00:00 2002-11-18         0      0

[5120 rows x 5 columns] 

0    0.586328
1    0.413672
Name: label, dtype: float64


#### Version 2 - mean of hour

In [66]:
# copy dataframe
subset_mean_hour_df = subset_df.copy()

# resample to hourly mean
subset_mean_hour_df['timestamp'] = subset_mean_hour_df['timestamp'].dt.floor('H')
subset_mean_hour_df = subset_mean_hour_df.groupby(['id', 'timestamp']).mean().reset_index()

# print the first 5 rows
print(subset_mean_hour_df.head(5), '\n')

            id           timestamp    activity  label
0  condition_1 2003-05-07 12:00:00  346.550000    1.0
1  condition_1 2003-05-07 13:00:00  284.566667    1.0
2  condition_1 2003-05-07 14:00:00  279.183333    1.0
3  condition_1 2003-05-07 15:00:00  218.783333    1.0
4  condition_1 2003-05-07 16:00:00  238.550000    1.0 



  subset_mean_hour_df = subset_mean_hour_df.groupby(['id', 'timestamp']).mean().reset_index()


### Normalise subset (z-score)

Could use: 

* `sklearn.preprocessing.scale` - Standardises features by removing the mean and scaling to unit variance (similar to manual z-score normalisation)
* `sklearn.preprocessing.minmax_scale` - Transforms features to a given range (often 0-1 for minmax scaling)
* `sklearn.preprocessing.normalize` - L2 vector normalisation

* pandas -> df.normalize(axis=0)

#### v1 first hour

In [67]:
# calculate mean and standard deviation
mu = subset_first_hour_df['activity'].mean()
sigma = subset_first_hour_df['activity'].std()

# normalise
subset_first_hour_df['activity_norm'] = (subset_first_hour_df['activity'] - mu)/sigma

# print summary statistics
print(subset_first_hour_df.describe())

          activity        label  activity_norm
count  5120.000000  5120.000000   5.120000e+03
mean    176.685742     0.413672  -1.110223e-17
std     364.604038     0.492539   1.000000e+00
min       0.000000     0.000000  -4.845962e-01
25%       0.000000     0.000000  -4.845962e-01
50%       4.000000     0.000000  -4.736254e-01
75%     184.000000     1.000000   2.006083e-02
max    4009.000000     1.000000   1.051089e+01


#### v2 hour mean

In [68]:
# calculate mean and standard deviation
mu = subset_mean_hour_df['activity'].mean()
sigma = subset_mean_hour_df['activity'].std()

# normalise
subset_mean_hour_df['activity_norm'] = (subset_mean_hour_df['activity'] - mu)/sigma

# print summary statistics
print(subset_mean_hour_df.describe())

          activity        label  activity_norm
count  5120.000000  5120.000000   5.120000e+03
mean    168.363036     0.413672  -2.220446e-17
std     256.015579     0.492539   1.000000e+00
min       0.000000     0.000000  -6.576281e-01
25%       1.495833     0.000000  -6.517853e-01
50%      19.616667     0.000000  -5.810051e-01
75%     269.012500     1.000000   3.931380e-01
max    2095.983333     1.000000   7.529309e+00


### Missing values

In [69]:
# Check if dataframe has any NaN 
print(subset_first_hour_df.isnull().values.any(), '\n')

# Count number of NaN per column
print(subset_first_hour_df.isnull().sum(), '\n')

# See indices of NaN values 
print(subset_first_hour_df[subset_first_hour_df.isnull().any(axis=1)].index)

False 

id               0
timestamp        0
date             0
activity         0
label            0
activity_norm    0
dtype: int64 

Int64Index([], dtype='int64')


In [70]:
# Check if dataframe has any NaN 
print(subset_mean_hour_df.isnull().values.any(), '\n')

# Count number of NaN per column
print(subset_mean_hour_df.isnull().sum(), '\n')

# See indices of NaN values 
print(subset_mean_hour_df[subset_mean_hour_df.isnull().any(axis=1)].index)

False 

id               0
timestamp        0
activity         0
label            0
activity_norm    0
dtype: int64 

Int64Index([], dtype='int64')


In [55]:
# drop NaN values rows
subset_hour_df.dropna(inplace=True)

# Check if dataframe has any NaN
print(subset_hour_df.isnull().values.any())

# print info
print(subset_hour_df.info())

False
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3938 entries, 2002-10-02 15:00:00 to 2003-06-27 08:00:00
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           3938 non-null   datetime64[ns]
 1   activity       3938 non-null   float64       
 2   id             3938 non-null   object        
 3   label          3938 non-null   float64       
 4   activity_norm  3938 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 184.6+ KB
None


## Extract features (14)

**Mean**:

* Calculation: Average value of the activity data.
* Significance: Represents the central tendency of the data.

**Trimmed Standard Deviation:**
* Calculation: A measure of the amount of variation in the data, with a specified percentage of outliers removed.
* Significance: More robust to outliers than the regular standard deviation.

**Trimmed Variance:**

* Calculation: Similar to trimmed standard deviation but squared, providing a measure of the spread of the data.
* Significance: Robust measure of data spread with specified outliers removed.

**Quantiles (1st, 5th, 25th, 75th, 95th, 99th):**

* Calculation: Values below which a given percentage of the data falls.
* Significance: Describes the distribution of the data and helps identify potential outliers.

**Skewness:**

* Calculation: A measure of the asymmetry of the data distribution.
* Significance: Positive skewness indicates a right-skewed distribution (tail to the right), negative skewness indicates a left-skewed distribution (tail to the left).

**Kurtosis:**

* Calculation: A measure of the "tailedness" of the data distribution.
* Significance: High kurtosis indicates heavy tails and more outliers compared to a normal distribution. Low kurtosis indicates light tails.

**Coefficient of Variation (Coef_Var):**

* Calculation: Standard deviation divided by the mean, expressing the relative variability of the data.
* Significance: Useful for comparing the variability of datasets with different units or scales.

**Inverse Coefficient of Variation (Inverse_Coef_Var):**

* Calculation: The reciprocal of the coefficient of variation.
* Significance: Measures the efficiency of data representation; a higher value suggests more efficient data representation.

In [95]:
print(subset_first_hour_df.columns, '\n')
print(subset_first_hour_df.head(5), '\n')
print(subset_first_hour_df.info(), '\n')
print(subset_mean_hour_df.columns, '\n')
print(subset_mean_hour_df.head(5), '\n')
print(subset_mean_hour_df.info(), '\n')

Index(['id', 'timestamp', 'date', 'activity', 'label', 'activity_norm'], dtype='object') 

            id           timestamp       date  activity  label  activity_norm
0  condition_1 2003-05-07 12:00:00 2003-05-07         0      1      -0.484596
1  condition_1 2003-05-07 13:00:00 2003-05-07       306      1       0.354670
2  condition_1 2003-05-07 14:00:00 2003-05-07         0      1      -0.484596
3  condition_1 2003-05-07 15:00:00 2003-05-07        38      1      -0.380374
4  condition_1 2003-05-07 16:00:00 2003-05-07        15      1      -0.443456 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5120 entries, 0 to 5119
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             5120 non-null   object        
 1   timestamp      5120 non-null   datetime64[ns]
 2   date           5120 non-null   datetime64[ns]
 3   activity       5120 non-null   int64         
 4   label          5120 no

## SCRAPS - to be deleted

### v1 first hour

In [None]:
# Groupby to calculate other features
features_first_hour = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].agg([np.mean,
                                lambda x: mstats.trimmed_std(x, 0.1),  # Adjust the trim parameter as needed
                                lambda x: mstats.trimmed_var(x, 0.1),
                                lambda x: np.quantile(x, 0.01), 
                                lambda x: np.quantile(x, 0.05),
                                lambda x: np.quantile(x, 0.25),
                                lambda x: np.quantile(x, 0.75),
                                lambda x: np.quantile(x, 0.95),
                                lambda x: np.quantile(x, 0.99)]) 

features_first_hour.columns = ['mean', 'std', 'variance', 'quantile_01', 'quantile_05', 'quantile_25', 'quantile_75', 'quantile_95', 'quantile_99']

# Calculate skewness and kurtosis separately
skewness_values = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].apply(mstats.skew)
kurtosis_values = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].apply(mstats.kurtosis)

# Assign skewness and kurtosis to the dataframe
features_first_hour['skewness'] = skewness_values
features_first_hour['kurtosis'] = kurtosis_values

# Calculate remaining features
features_first_hour['coef_var'] = features_first_hour['std'] / features_first_hour['mean'] 
features_first_hour['inverse_coef_var'] = features_first_hour['mean'] / (np.abs(features_first_hour['std']) + 1e-9)


# print the first 5 rows
print(features_first_hour.head(5), '\n')


       mean  std  variance  quantile_01  quantile_05  quantile_25  \
0 -0.484596  0.0       0.0    -0.484596    -0.484596    -0.484596   
1  0.354670  0.0       0.0     0.354670     0.354670     0.354670   
2 -0.484596  0.0       0.0    -0.484596    -0.484596    -0.484596   
3 -0.380374  0.0       0.0    -0.380374    -0.380374    -0.380374   
4 -0.443456  0.0       0.0    -0.443456    -0.443456    -0.443456   

   quantile_75  quantile_95  quantile_99 skewness  kurtosis  coef_var  \
0    -0.484596    -0.484596    -0.484596      0.0      -3.0      -0.0   
1     0.354670     0.354670     0.354670      0.0      -3.0       0.0   
2    -0.484596    -0.484596    -0.484596      0.0      -3.0      -0.0   
3    -0.380374    -0.380374    -0.380374      0.0      -3.0      -0.0   
4    -0.443456    -0.443456    -0.443456      0.0      -3.0      -0.0   

   inverse_coef_var  
0     -4.845962e+08  
1      3.546704e+08  
2     -4.845962e+08  
3     -3.803736e+08  
4     -4.434557e+08   



### v2 mean hour

In [None]:
# Groupby to calculate other features
features_mean_hour = subset_mean_hour_df.groupby(subset_mean_hour_df.index)['activity_norm'].agg([np.mean,
                                lambda x: mstats.trimmed_std(x, 0.1),  # Adjust the trim parameter as needed
                                lambda x: mstats.trimmed_var(x, 0.1),
                                lambda x: np.quantile(x, 0.01), 
                                lambda x: np.quantile(x, 0.05),
                                lambda x: np.quantile(x, 0.25),
                                lambda x: np.quantile(x, 0.75),
                                lambda x: np.quantile(x, 0.95),
                                lambda x: np.quantile(x, 0.99)]) 

features_mean_hour.columns = ['mean', 'std', 'variance', 'quantile_01', 'quantile_05', 'quantile_25', 'quantile_75', 'quantile_95', 'quantile_99']

# skewness and kurtosis 
skewness_values = subset_mean_hour_df.groupby(subset_mean_hour_df.index)['activity_norm'].apply(mstats.skew)
kurtosis_values = subset_mean_hour_df.groupby(subset_mean_hour_df.index)['activity_norm'].apply(mstats.kurtosis)

# assign skewness and kurtosis to the dataframe
features_mean_hour['skewness'] = skewness_values
features_mean_hour['kurtosis'] = kurtosis_values

# coef and inverse_coef features
features_mean_hour['coef_var'] = features_mean_hour['std'] / features_mean_hour['mean'] 
features_mean_hour['inverse_coef_var'] = features_mean_hour['mean'] / (np.abs(features_mean_hour['std']) + 1e-9)


# print the first 5 rows
print(features_mean_hour.head(5), '\n')


       mean  std  variance  quantile_01  quantile_05  quantile_25  \
0  0.696000  0.0       0.0     0.696000     0.696000     0.696000   
1  0.453893  0.0       0.0     0.453893     0.453893     0.453893   
2  0.432865  0.0       0.0     0.432865     0.432865     0.432865   
3  0.196942  0.0       0.0     0.196942     0.196942     0.196942   
4  0.274151  0.0       0.0     0.274151     0.274151     0.274151   

   quantile_75  quantile_95  quantile_99 skewness  kurtosis  coef_var  \
0     0.696000     0.696000     0.696000      0.0      -3.0       0.0   
1     0.453893     0.453893     0.453893      0.0      -3.0       0.0   
2     0.432865     0.432865     0.432865      0.0      -3.0       0.0   
3     0.196942     0.196942     0.196942      0.0      -3.0       0.0   
4     0.274151     0.274151     0.274151      0.0      -3.0       0.0   

   inverse_coef_var  
0      6.960005e+08  
1      4.538928e+08  
2      4.328654e+08  
3      1.969423e+08  
4      2.741511e+08   



In [None]:
print(features_first_hour.shape)
print(features_mean_hour.shape)

(5120, 13)
(5120, 13)


In [None]:
features_first_hour = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].agg([np.mean, np.std, np.var, 
                                lambda x: np.quantile(x, 0.01), 
                                lambda x: np.quantile(x, 0.05),
                                lambda x: np.quantile(x, 0.25),
                                lambda x: np.quantile(x, 0.75),
                                lambda x: np.quantile(x, 0.95),
                                lambda x: np.quantile(x, 0.99),
                                skew, kurtosis]) 
features_first_hour.columns = ['mean', 'std', 'variance', 'quantile_01', 'quantile_05', 'quantile_25', 'quantile_75', 'quantile_95', 'quantile_99','skewness', 'kurtosis']

features_first_hour['coef_var'] = features_first_hour['std'] / features_first_hour['mean'] 
features_first_hour['inverse_coef_var'] = features_first_hour['mean'] / features_first_hour['std']

# print the first 5 rows
print(features_first_hour.head(5), '\n')

  f = lambda x: func(x, *args, **kwargs)


       mean  std  variance  quantile_01  quantile_05  quantile_25  \
0 -0.484596  NaN       NaN    -0.484596    -0.484596    -0.484596   
1  0.354670  NaN       NaN     0.354670     0.354670     0.354670   
2 -0.484596  NaN       NaN    -0.484596    -0.484596    -0.484596   
3 -0.380374  NaN       NaN    -0.380374    -0.380374    -0.380374   
4 -0.443456  NaN       NaN    -0.443456    -0.443456    -0.443456   

   quantile_75  quantile_95  quantile_99  skewness  kurtosis  coef_var  \
0    -0.484596    -0.484596    -0.484596       NaN       NaN       NaN   
1     0.354670     0.354670     0.354670       NaN       NaN       NaN   
2    -0.484596    -0.484596    -0.484596       NaN       NaN       NaN   
3    -0.380374    -0.380374    -0.380374       NaN       NaN       NaN   
4    -0.443456    -0.443456    -0.443456       NaN       NaN       NaN   

   inverse_coef_var  
0               NaN  
1               NaN  
2               NaN  
3               NaN  
4               NaN   



### v1 first hour

In [97]:




# Calculate Mean
mean_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].mean()

# Calculate Standard Deviation
std_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].std()

# Calculate Variance
variance_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].var()

# Calculate Trimmed Mean (using 5% trimming as an example)
trimmed_mean_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].apply(lambda x: np.mean(x[(x >= np.percentile(x, 5)) & (x <= np.percentile(x, 95))]))

# Calculate Coefficient of Variation
coef_var_value = std_value / mean_value

# Calculate Inverse Coefficient of Variation
inverse_coef_var_value = mean_value / std_value

# Calculate Kurtosis
kurtosis_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].apply(lambda x: kurtosis(x))

# Calculate Skewness
skewness_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].apply(lambda x: skew(x))

# Calculate Quantiles (1%, 5%, 25%, 75%, 95%, 99%)
quantiles_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].quantile([0.01, 0.05, 0.25, 0.75, 0.95, 0.99]).unstack(level=-1)

# Create a dataframe to store the features
features_first_hour = pd.DataFrame({
    'mean': mean_value,
    'std': std_value,
    'variance': variance_value,
    'trimmed_mean': trimmed_mean_value,
    'coef_var': coef_var_value,
    'inverse_coef_var': inverse_coef_var_value,
    'kurtosis': kurtosis_value,
    'skewness': skewness_value
})

# Add Quantiles to the dataframe
features_first_hour = pd.concat([features_first_hour, quantiles_value], axis=1)

# Rename columns for clarity
features_first_hour.columns = ['mean', 'std', 'variance', 'trimmed_mean', 'coef_var', 'inverse_coef_var', 'kurtosis', 'skewness', 'quantile_01', 'quantile_05', 'quantile_25', 'quantile_75', 'quantile_95', 'quantile_99']

# Print or further use features_hour
print(features_first_hour)


  kurtosis_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].apply(lambda x: kurtosis(x))
  skewness_value = subset_first_hour_df.groupby(subset_first_hour_df.index)['activity_norm'].apply(lambda x: skew(x))


          mean  std  variance  trimmed_mean  coef_var  inverse_coef_var  \
0    -0.484596  NaN       NaN     -0.484596       NaN               NaN   
1     0.354670  NaN       NaN      0.354670       NaN               NaN   
2    -0.484596  NaN       NaN     -0.484596       NaN               NaN   
3    -0.380374  NaN       NaN     -0.380374       NaN               NaN   
4    -0.443456  NaN       NaN     -0.443456       NaN               NaN   
...        ...  ...       ...           ...       ...               ...   
5115 -0.470883  NaN       NaN     -0.470883       NaN               NaN   
5116 -0.193870  NaN       NaN     -0.193870       NaN               NaN   
5117 -0.476368  NaN       NaN     -0.476368       NaN               NaN   
5118 -0.476368  NaN       NaN     -0.476368       NaN               NaN   
5119 -0.484596  NaN       NaN     -0.484596       NaN               NaN   

      kurtosis  skewness  quantile_01  quantile_05  quantile_25  quantile_75  \
0          NaN     

# Calculate features per time window
features = []
for idx, df_window in df.groupby(df.index): # grouped by time window
    features_dict = {
        'mean': df_window['activity_norm'].mean(),
        'std': df_window['activity_norm'].std(),
        'variance': df_window['activity_norm'].var(),
        'variance': df_window['activity_norm'].var(),
        'trimmed_mean': df_window['activity_norm'].quantile(0.05),
        'coef_var': df_window['activity_norm'].std() / df_window['activity_norm'].mean(),
        'inverse_coef_var': df_window['activity_norm'].mean() / df_window['activity_norm'].std(),
        'kurtosis': df_window['activity_norm'].kurtosis(),
        'skewness': df_window['activity_norm'].skew(),
        'quantile_1': df_window['activity_norm'].quantile(0.01),
        'quantile_5': df_window['activity_norm'].quantile(0.05),
        'quantile_25': df_window['activity_norm'].quantile(0.25),
        'quantile_75': df_window['activity_norm'].quantile(0.75),
        'quantile_95': df_window['activity_norm'].quantile(0.95),
        'quantile_99': df_window['activity_norm'].quantile(0.99)
    }
    
    features.append(features_dict) 

features_df = pd.DataFrame(features)

In [59]:
print(features_hour.head(5))

                         mean  std  variance  quantile_01  quantile_05  \
timestamp                                                                
2002-10-02 15:00:00  0.030959  NaN       NaN     0.030959     0.030959   
2002-10-02 16:00:00  1.656566  NaN       NaN     1.656566     1.656566   
2002-10-02 17:00:00  2.274023  NaN       NaN     2.274023     2.274023   
2002-10-02 18:00:00  3.926951  NaN       NaN     3.926951     3.926951   
2002-10-02 19:00:00  1.126536  NaN       NaN     1.126536     1.126536   

                     quantile_25  quantile_75  quantile_95  quantile_99  \
timestamp                                                                 
2002-10-02 15:00:00     0.030959     0.030959     0.030959     0.030959   
2002-10-02 16:00:00     1.656566     1.656566     1.656566     1.656566   
2002-10-02 17:00:00     2.274023     2.274023     2.274023     2.274023   
2002-10-02 18:00:00     3.926951     3.926951     3.926951     3.926951   
2002-10-02 19:00:00     1.12653