# Recreating Rodriguez-Ruiz 

* Using Tom's code/plan
* And my code

## Extract data

In [1]:

def extract_from_folder(folderpath, downsample=None, save_to_csv=False, output_csv_path=None):
    """
    Extract CSV data from folder and subfolders into a dataframe.

    Args:
      folderpath (str): folder containing CSV files.
      downsample (int, optional): number of rows to downsample CSVs to. Defaults to None.
      save_to_csv (bool, optional): save the updated df to a CSV file? defaults to False.
      output_csv_path (str, optional): csv filepath. required if save_to_csv is True.

    Returns:
      pandas.DataFrame: DataFrame of concatenated CSV data.
    """
    import os
    import pandas as pd
    
    # dict to store dataframes by condition  
    dfs = {'control': [], 'condition': []}

    try:
        # subfolders
        subfolders = [f for f in os.listdir(folderpath) if os.path.isdir(os.path.join(folderpath, f))]

        for subfolder in subfolders:
            subfolderpath = os.path.join(folderpath, subfolder)  

            # list of CSV files
            files = os.listdir(subfolderpath)

            for file in files:
                filepath = os.path.join(subfolderpath, file)

                # extract ID from filename 
                id = file.split('.')[0]

                df = pd.read_csv(filepath)

                # optional downsample 
                if downsample:
                    df = df.sample(downsample)

                # ID column - this is the filename without the extension
                df['id'] = id

                # 'condition' column
                df['condition'] = subfolder

                # convert 'timestamp' and 'date' to datetime
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df['date'] = pd.to_datetime(df['date'])

                # append to dict by condition
                if subfolder == 'control':
                    dfs['control'].append(df)
                else:  
                    dfs['condition'].append(df)

    except OSError:
        print(f"Error reading folder: {folderpath}")

    # concatenate dfs for each condition
    dfs['control'] = pd.concat(dfs['control'])
    dfs['condition'] = pd.concat(dfs['condition'])

    # reset index on the final df
    df = pd.concat([dfs['control'], dfs['condition']]).reset_index(drop=True)

    # add label column
    df['label'] = 0
    df.loc[df['condition'] == 'condition', 'label'] = 1
    
    # remove old 'condition' column
    df.drop('condition', axis=1, inplace=True)


    try:
        if save_to_csv:
            if output_csv_path:
                df.to_csv(output_csv_path, index=False)
                print(f"df saved to {output_csv_path}")
            else:
                print("Error: Please provide an output CSV path.")
        
        
        return df
    except OSError:
        print("Error saving to CSV.")


In [2]:
# extraction of all the activity data into one data frame
folderpath = '../data/depresjon'
# full ds, no csv
df = extract_from_folder(folderpath)


In [3]:
#print(df.head())
#print(df.info())

## Preprocessing

1. equal observations per subject
2. segmentation into hourly
3. day, night, full 
4. NaNs and standardisation

### Dataset Reduction

>"For the pre-processing stage, the next step are proposed. Since the total amount of data recorded for each subject is different, a new subset of data is extracted, adjusting the number of observations to be equal for each subject."

This step is not adequately described as there are many ways to approach this.

Below, I have tried three approaches: 

1. Reducing data to the maximum viable number of rows - that is, finding the minimum of all ids and reducing all other id's rows to this value. 
   * this can be done by `head()` or `sample()` methods
2. Reducing to 'full days' first and then minimising the dataset
3. Reducing to match 'num_days' in scores.csv and then minimising the dataset. 

Finally, I did none of the above and simply used the whole dataset.

#### min rows per id

* this is reducing to maximum viable number of rows, no other processing

In [4]:
# reduce df to min number of rows per id - so each id has the same number of rows
min_rows = df['id'].value_counts().min()
trim = df.groupby('id').apply(lambda x: x.head(min_rows), include_groups=False).reset_index()

# drop the 'level_1' column
trim = trim.drop(columns='level_1')

In [5]:
# print unique row counts per id
print(trim['id'].value_counts().unique())


[19299]


In [6]:
#print(trim.info())
#print(trim.head())

#### full days

This includes the 'preprocess to full days' step before reducing to max viable.

In [7]:
def preprocess_full_days(df, save_to_csv=False, output_csv_path=None, print_info=True):
    """
    Extracts full days from a dataframe.

    Args::
    df (DataFrame): input df.
    save_to_csv (bool, optional): save the updated df to a CSV file? defaults to False.
    output_csv_path (str, optional): csv filepath. required if save_to_csv is True.
    print_info (bool, optional): print info about the df. defaults to True.

    Returns:
    DataFrame: df containing only full days (1440 rows per day).

    """
    

    # group by id and date, count rows, and filter where count equals 1440
    full_days_df = df.groupby(['id', 'date']).filter(lambda x: len(x) == 1440)

    # set index to timestamp
    #full_days_df.set_index(['timestamp'], inplace=True)
    
    if print_info:
        # print id and date combinations that don't have 1440 rows
        not_full_days = df.groupby(['id', 'date']).size().reset_index(name='count').query('count != 1440')
        print("\nid and date combinations that don't have 1440 rows and have been removed:\n")
        print(not_full_days)

        # print info
        print("\nfull_days_df info:\n")
        print(full_days_df.info())

        #print full days per id
        print("\nfull days per id:\n")
        print(full_days_df.groupby('id').size()/1440)

        # print min number of days
        print("\nmin number of days per id:\n")
        print(full_days_df.groupby('id').size().min()/1440)
        

    try:
        if save_to_csv:
            if output_csv_path:
                full_days_df.to_csv(output_csv_path, index=False)
                print(f"df saved to {output_csv_path}")
            else:
                print("Error: Please provide an output CSV path.")
        
        
        return full_days_df
    except OSError:
        print("Error saving to CSV.")

    return full_days_df


In [8]:
# full days
full = preprocess_full_days(df, print_info=False)
#print(full.info())

# reduce df to min number of rows per id - so each id has the same number of rows
min_rows = full['id'].value_counts().min()
trim2 = full.groupby('id').apply(lambda x: x.head(min_rows), include_groups=False).reset_index()

# drop the 'level_1' column
trim2 = trim2.drop(columns='level_1')

# print unique row counts per id
print(trim2['id'].value_counts().unique())

[17280]


#### reduce to 'number of days' from scores

This step reduces to num_days as per scores.csv

In [9]:
import pandas as pd

def extract_days_per_scores(df, scores_csv_path='..\data\depresjon\scores.csv', save_to_csv=True, output_csv_path=None):
    """
    Extract the number of days per ID from the 'scores' data.

    Args:
        df (pd.DataFrame): df containing the 'id' column.
        scores_csv_path (str, optional): path to the 'scores' CSV file. Defaults to '..\data\depresjon\scores.csv'.
        save_to_csv (bool, optional): save the updated df to a CSV file? Defaults to True.
        output_csv_path (str, optional): csv filepath. Required if save_to_csv is True.
        

    Returns:
        pd.DataFrame: df with the specified number of days per ID based on 'scores'.
    """
    # scores from the CSV file
    scores_df = pd.read_csv(scores_csv_path)

    # merge scores with the df based on the 'id' column
    merged_df = pd.merge(df, scores_df, left_on='id', right_on='number', how='left')

    # filter rows to keep the specified number of days
    df_filtered = merged_df.groupby('id', group_keys=False, as_index=False, sort=False).apply(lambda group: group.head(group['days'].min() * 1440)).reset_index(drop=True)

    # drop cols number, days, gender, age, afftype, melanch, inpatient, edu, marriage, work, madrs1, madrs2
    cols = ['number', 'number', 'days', 'gender', 'age', 'afftype', 'melanch', 'inpatient', 'edu', 'marriage', 'work', 'madrs1', 'madrs2']
    df_filtered.drop(cols, axis=1, inplace=True)
    

    # save to CSV
    if save_to_csv:
        if output_csv_path:
            df_filtered.to_csv(output_csv_path, index=False)
            print(f"\n\ndf saved to {output_csv_path}")
        else:
            print("Error: Please provide an output CSV path.")

    return df_filtered


In [10]:
num_days = extract_days_per_scores(df, save_to_csv=False, output_csv_path=None)

#print(num_days.info())

# reduce df to min number of rows per id - so each id has the same number of rows
min_rows = num_days['id'].value_counts().min()
trim3 = num_days.groupby('id').apply(lambda x: x.head(min_rows), include_groups=False).reset_index()

# drop the 'level_1' column
trim3 = trim3.drop(columns='level_1')

# print unique row counts per id
print(trim3['id'].value_counts().unique())

  df_filtered = merged_df.groupby('id', group_keys=False, as_index=False, sort=False).apply(lambda group: group.head(group['days'].min() * 1440)).reset_index(drop=True)


[7200]


### Segment into hourly intervals

"The structure of the data for every observation is contained by 61 columns; one column for the monitored hour and one column for each minute (60 columns) of motor activity. This segmentation allowed the classification of depressive episodes per hour."

#### No trimming

In [11]:
# copy df
no_trim = df.copy()

# extract hour and minute from timestamp
no_trim['hour'] = no_trim['timestamp'].dt.hour
no_trim['minute'] = no_trim['timestamp'].dt.minute

# pivot the DataFrame
no_trim_piv = no_trim.pivot(index=['date', 'id', 'label', 'hour'], columns='minute', values='activity')

# rename columns
no_trim_piv.columns = [f'min_{minute:02d}' for minute in range(60)]

# Reset index
no_trim_piv.reset_index(inplace=True)

#  NaN with 0 (for missing minute values)
no_trim = no_trim_piv.fillna(0)

# print hourly_data shape
print(no_trim.shape)

# print info
#print(no_trim.info())


(26230, 64)


In [12]:
print(no_trim.head(1))

        date            id  label  hour  min_00  min_01  min_02  min_03  \
0 2002-05-24  condition_20      1    11     0.0     0.0     0.0     0.0   

   min_04  min_05  ...  min_50  min_51  min_52  min_53  min_54  min_55  \
0     0.0     0.0  ...    83.0     0.0     0.0     0.0     3.0     0.0   

   min_56  min_57  min_58  min_59  
0   249.0   209.0   360.0    36.0  

[1 rows x 64 columns]


#### Trim 1 - no processing 

In [13]:
# extract hour and minute from timestamp
trim['hour'] = trim['timestamp'].dt.hour
trim['minute'] = trim['timestamp'].dt.minute

# pivot the DataFrame
df_pivot = trim.pivot(index=['date', 'id', 'label', 'hour'], columns='minute', values='activity')

# rename columns
df_pivot.columns = [f'min_{minute:02d}' for minute in range(60)]

# Reset index
df_pivot.reset_index(inplace=True)

In [14]:
#print(df_pivot.head(5))
#print(df_pivot.info())
#print(df_pivot.shape)

In [15]:
# print any missing minute data
missing = df_pivot[df_pivot.isnull().any(axis=1)]
#print(missing)

In [16]:
#  NaN with 0 (for missing minute values)
trim1_piv = df_pivot.fillna(0)

# print hourly_data shape
print(trim1_piv.shape)

# print info
#print(df.info())

(17722, 64)


#### Trim2 - full days

In [17]:
# extract hour and minute from timestamp
trim2['hour'] = trim2['timestamp'].dt.hour
trim2['minute'] = trim2['timestamp'].dt.minute

# pivot the DataFrame
df_pivot2 = trim2.pivot(index=['date', 'id', 'label', 'hour'], columns='minute', values='activity')

# rename columns
df_pivot2.columns = [f'min_{minute:02d}' for minute in range(60)]

# Reset index
df_pivot2.reset_index(inplace=True)

#  NaN with 0 (for missing minute values)
trim2_piv = df_pivot2.fillna(0)

# print hourly_data shape
print(trim2_piv.shape)

# print info
#print(df2.info())


(15840, 64)


#### Trim 3 - 'num_days'

In [18]:
# extract hour and minute from timestamp
trim3['hour'] = trim3['timestamp'].dt.hour
trim3['minute'] = trim3['timestamp'].dt.minute

# pivot the DataFrame
df_pivot3 = trim3.pivot(index=['date', 'id', 'label', 'hour'], columns='minute', values='activity')

# rename columns
df_pivot3.columns = [f'min_{minute:02d}' for minute in range(60)]

# Reset index
df_pivot3.reset_index(inplace=True)

#  NaN with 0 (for missing minute values)
trim3_piv = df_pivot3.fillna(0)

# print hourly_data shape
print(trim3_piv.shape)

# print info
#print(df3.info())


(6613, 64)


As can be seen, these dataframes are the following lengths:

* no trim - 26230
* trim to min - 17722
* trim to min of full days - 15840
* trim to min of num_days - 6613

### Create new dataframes - day, night, full

"Therefore, based on the hourly segmentation, three different subsets are constructed; night motor activity (from 21 to 7 h taking into account the sunrise standard hours) [21], day motor activity (from 8 to 20 h) and finally all day motor activity with the total day hours."

* 8 am - 8 pm (12 hours)
* 9 pm - 7 am (10 hours)

Why do it this way? 

#### Trim 1

In [19]:
print(trim.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061445 entries, 0 to 1061444
Data columns (total 7 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   id         1061445 non-null  object        
 1   timestamp  1061445 non-null  datetime64[ns]
 2   date       1061445 non-null  datetime64[ns]
 3   activity   1061445 non-null  int64         
 4   label      1061445 non-null  int64         
 5   hour       1061445 non-null  int32         
 6   minute     1061445 non-null  int32         
dtypes: datetime64[ns](2), int32(2), int64(2), object(1)
memory usage: 48.6+ MB
None


In [20]:
#  subsets based on time ranges
trim1_day_df = trim1_piv[(trim1_piv['hour'] >= 8) & (trim1_piv['hour'] < 20)]  # day: 8 am to 8 pm
trim1_night_df = trim1_piv[(trim1_piv['hour'] >= 21) | (trim1_piv['hour'] < 7)]  # night: 9 pm to 7 am
trim1_full_day_df = trim1_piv  # full day:  24 hours

# print shapes
print(trim1_day_df.shape)
print(trim1_night_df.shape)
print(trim1_full_day_df.shape)

(9055, 64)
(7213, 64)
(17722, 64)


This does not match Rodriquez dataframe lengths: 

* day = 14168 obs
* night = 11945 obs
* full day = 26113 obs

![](2024-03-23-22-15-02.png)

#### Trim 2 and 3 

* will also not match

#### No trim

This is much closer to to the dataset lengths reported by Rodgriguez.

Questions:

* Did they actually do what they have written?
* What exactly did they do?
* How have they created the day/night segments? Are hours inclusive?

In [21]:
#  subsets based on time ranges
no_trim_day_df = no_trim[(no_trim['hour'] >= 8) & (no_trim['hour'] < 20)]  # day: 8 am to 8 pm
no_trim_night_df = no_trim[(no_trim['hour'] >= 21) | (no_trim['hour'] < 7)]  # night: 9 pm to 7 am
no_trim_full_day_df = no_trim  # full day:  24 hours

# print shapes
print(no_trim_day_df.shape)
print(no_trim_night_df.shape)
print(no_trim_full_day_df.shape)

(13172, 64)
(10880, 64)
(26230, 64)


In [22]:
# print missing data from no_trim dfs
missing_no_trim_day = no_trim_day_df[no_trim_day_df.isnull().any(axis=1)]
missing_no_trim_night = no_trim_night_df[no_trim_night_df.isnull().any(axis=1)]
missing_no_trim_full_day = no_trim_full_day_df[no_trim_full_day_df.isnull().any(axis=1)]
print(missing_no_trim_day)
print(missing_no_trim_night)
print(missing_no_trim_full_day)



Empty DataFrame
Columns: [date, id, label, hour, min_00, min_01, min_02, min_03, min_04, min_05, min_06, min_07, min_08, min_09, min_10, min_11, min_12, min_13, min_14, min_15, min_16, min_17, min_18, min_19, min_20, min_21, min_22, min_23, min_24, min_25, min_26, min_27, min_28, min_29, min_30, min_31, min_32, min_33, min_34, min_35, min_36, min_37, min_38, min_39, min_40, min_41, min_42, min_43, min_44, min_45, min_46, min_47, min_48, min_49, min_50, min_51, min_52, min_53, min_54, min_55, min_56, min_57, min_58, min_59]
Index: []

[0 rows x 64 columns]
Empty DataFrame
Columns: [date, id, label, hour, min_00, min_01, min_02, min_03, min_04, min_05, min_06, min_07, min_08, min_09, min_10, min_11, min_12, min_13, min_14, min_15, min_16, min_17, min_18, min_19, min_20, min_21, min_22, min_23, min_24, min_25, min_26, min_27, min_28, min_29, min_30, min_31, min_32, min_33, min_34, min_35, min_36, min_37, min_38, min_39, min_40, min_41, min_42, min_43, min_44, min_45, min_46, min_47, min_4

### Standardisation of motor activity

* Going to proceed with `trim1_` day/night/full and `no_trim_` day/night/full dataframes.


* standardisation: $[ z_i = \frac{{x_i - \bar{x}}}{{s}} ]$

Where:
* $(z_i)$ is the standardized value for observation $(i)$.
* $(x_i)$ is the original value for observation $(i)$.
* $(\bar{x})$ is the mean (average) of the entire dataset.
* $(s)$ is the standard deviation of the entire dataset

In [23]:
# print head(1) of trim1_day_df
print(trim1_day_df.head(1))

        date            id  label  hour  min_00  min_01  min_02  min_03  \
0 2002-05-24  condition_20      1    11     0.0     0.0     0.0     0.0   

   min_04  min_05  ...  min_50  min_51  min_52  min_53  min_54  min_55  \
0     0.0     0.0  ...    83.0     0.0     0.0     0.0     3.0     0.0   

   min_56  min_57  min_58  min_59  
0   249.0   209.0   360.0    36.0  

[1 rows x 64 columns]


In [24]:
# list of df
dataframes = [no_trim_day_df, no_trim_night_df, no_trim_full_day_df,
              trim1_day_df, trim1_night_df, trim1_full_day_df]

# list of new df names
new_df_names = ['no_trim_day_stand', 'no_trim_night_stand', 'no_trim_full_day_stand',
                'trim1_day_stand', 'trim1_night_stand', 'trim1_full_day_stand']

# standardise each df
for df, new_df_name in zip(dataframes, new_df_names):
    # mean and standard deviation for the entire dataset
    mean_values = df.loc[:, 'min_00':'min_59'].mean()
    std_values = df.loc[:, 'min_00':'min_59'].std()

    # create new df
    new_df = df.copy()

    # standardise each minute column
    for minute in range(60):
        column_name = f'min_{minute:02d}'
        new_df[column_name] = (df[column_name] - mean_values[minute]) / std_values[minute]
    
    # assign new df to variable with new df name
    globals()[new_df_name] = new_df



  new_df[column_name] = (df[column_name] - mean_values[minute]) / std_values[minute]


## Features

Time Features:

* mean
* median
* std
* variance
* kurtosis
* coefficient of variance
* interquartile range 
* min
* max
* trimmed mean

Frequency Features: 

* Spectral density
* Entropy
* Skewness
* Spectral flatness

In [25]:
# print lengths of new dataframes
for df_name in new_df_names:
    print(f"{df_name}: {len(globals()[df_name])}")

no_trim_day_stand: 13172
no_trim_night_stand: 10880
no_trim_full_day_stand: 26230
trim1_day_stand: 9055
trim1_night_stand: 7213
trim1_full_day_stand: 17722


In [31]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew
from scipy.signal import welch
from scipy.stats import entropy



def calculate_all_features(df):
    # Fast Fourier Transform (FFT) to each row
    fft_columns = df.iloc[:, 4:].apply(lambda row: np.fft.fft(row), axis=1)
    
    # power spectral density (PSD)
    psd = fft_columns.apply(lambda row: np.abs(row) ** 2)
    
    # time domain features
    df['TDmean'] = df.iloc[:, 4:].mean(axis=1)
    df['TDmedian'] = df.iloc[:, 4:].median(axis=1)
    df['TDstd'] = df.iloc[:, 4:].std(axis=1)
    df['TDvariance'] = df.iloc[:, 4:].var(axis=1)
    df['TDmin'] = df.iloc[:, 4:].min(axis=1)
    df['TDmax'] = df.iloc[:, 4:].max(axis=1)
    # trimmed mean 20%
    df['TDtrimmed_mean'] = df.iloc[:, 4:].apply(lambda row: np.mean(row[(row >= np.percentile(row, 10)) & (row <= np.percentile(row, 90))]), axis=1)
    df['TDkurtosis'] = df.iloc[:, 4:].apply(lambda row: kurtosis(row), axis=1)
    #df['TDskewness'] = df.iloc[:, 4:].apply(lambda row: skew(row), axis=1)
    df['TDcoefficient_of_variance'] = df['TDstd'] / df['TDmean']
    df['TDinterquartile_range'] = df.iloc[:, 4:].apply(lambda row: np.percentile(row, 75) - np.percentile(row, 25), axis=1)
    
    # frequency domain features
    df['FDmean'] = psd.apply(np.mean)
    df['FDmedian'] = psd.apply(np.median)
    df['FDstd'] = psd.apply(np.std)
    df['FDvariance'] = psd.apply(np.var)
    df['FDmin'] = psd.apply(np.min)
    df['FDmax'] = psd.apply(np.max)
    df['FDtrimmed_mean'] = psd.apply(lambda row: np.mean(row[(row >= np.percentile(row, 10)) & (row <= np.percentile(row, 90))]))
    df['FDkurtosis'] = psd.apply(lambda row: kurtosis(row))
    df['FDskewness'] = psd.apply(lambda row: skew(row))
    df['FDcoefficient_of_variance'] = df['FDstd'] / df['FDmean']
    df['FDinterquartile_range'] = psd.apply(lambda row: np.percentile(row, 75) - np.percentile(row, 25))
    df['FDentropy'] = fft_columns.apply(lambda row: entropy(np.abs(row)))
    df['FDskewness'] = fft_columns.apply(lambda row: skew(np.abs(row)))
    df['FDspectral_flatness'] = fft_columns.apply(lambda row: np.exp(np.mean(np.log(np.abs(row) + 1e-10))))
    
    return df


In [32]:

#  all features for each dataframe
trim_day = calculate_all_features(trim1_day_stand)
trim_night = calculate_all_features(trim1_night_stand)
trim_full = calculate_all_features(trim1_full_day_stand)

# all features for each no trim dataframe
no_trim_day = calculate_all_features(no_trim_day_stand)
no_trim_night = calculate_all_features(no_trim_night_stand)
no_trim_full = calculate_all_features(no_trim_full_day_stand)


In [33]:
# print shapes of new dataframes
for df_name in ['trim_day', 'trim_night', 'trim_full', 'no_trim_day', 'no_trim_night', 'no_trim_full']:
    print(f"{df_name}: {globals()[df_name].shape}")

trim_day: (9055, 87)
trim_night: (7213, 87)
trim_full: (17722, 87)
no_trim_day: (13172, 87)
no_trim_night: (10880, 87)
no_trim_full: (26230, 87)


In [None]:
# list of dataframes
df_names = ['trim_day', 'trim_night', 'trim_full', 'no_trim_day', 'no_trim_night', 'no_trim_full']

# drop minute (min_00 to min_59) and 'hour', 'date' columns from dfs
for df_name in df_names:
    # drop minute columns
    globals()[df_name].drop(columns=[f'min_{minute:02d}' for minute in range(60)], inplace=True)
    # drop 'hour', 'date' columns
    globals()[df_name].drop(columns=['hour', 'date'], inplace=True)

In [41]:
# save all dfs to csv
for df_name in df_names:
    globals()[df_name].to_csv(f'../data/petter/{df_name}.csv', index=False)
    print(f"{df_name}.csv saved")

trim_day.csv saved
trim_night.csv saved
trim_full.csv saved
no_trim_day.csv saved
no_trim_night.csv saved
no_trim_full.csv saved


## LOAD CSVs HERE

In [50]:
from sklearn.model_selection import train_test_split

In [42]:
# load the data
trim_day = pd.read_csv('../data/petter/trim_day.csv')
trim_night = pd.read_csv('../data/petter/trim_night.csv')
trim_full = pd.read_csv('../data/petter/trim_full.csv')
no_trim_day = pd.read_csv('../data/petter/no_trim_day.csv')
no_trim_night = pd.read_csv('../data/petter/no_trim_night.csv')
no_trim_full = pd.read_csv('../data/petter/no_trim_full.csv')


In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, matthews_corrcoef

def split(df, group_by_id=True):
    """
    Split the given DataFrame into training and testing datasets.

    Parameters:
    - df (DataFrame): The input DataFrame to be split.
    - group_by_id (bool): If True, group the data by 'id' before splitting. Default is True.  This will allow prediction of depression state by person.

    Returns:
    - X_train (DataFrame): The features of the training dataset.
    - y_train (Series): The labels of the training dataset.
    - X_test (DataFrame): The features of the testing dataset.
    - y_test (Series): The labels of the testing dataset.
    """

    if group_by_id:
        # group by 'id' and split
        train_ids, test_ids = train_test_split(df['id'].unique(), test_size=0.2, random_state=42)
        train_data = df[df['id'].isin(train_ids)]
        test_data = df[df['id'].isin(test_ids)]
    else:
        # split without grouping by 'id'
        train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

    # separate features and labels
    X_train = train_data.drop(['label', 'id'], axis=1)
    y_train = train_data['label']
    X_test = test_data.drop(['label', 'id'], axis=1)
    y_test = test_data['label']

    return X_train, y_train, X_test, y_test

In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_score, recall_score, roc_auc_score, classification_report
from sklearn.preprocessing import OneHotEncoder

def fit_and_evaluate(X_train, y_train, X_test, y_test):
    # separate numerical and categorical columns
    numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # one-hot encode categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])

    # preprocess the data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # initialize 
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # fit 
    rf.fit(X_train_processed, y_train)

    # predictions
    y_pred = rf.predict(X_test_processed)

    # evaluate 
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    specificity = report['0']['recall']
    support = report['1']['support']

    return accuracy, f1, conf_matrix, recall, mcc, precision, roc_auc, specificity, support


In [65]:
# split, fit and eval - trim_day
X_train, y_train, X_test, y_test = split(trim_day, group_by_id=True)
accuracy, f1, conf_matrix, recall, mcc, precision, roc_auc, specificity, support = fit_and_evaluate(X_train, y_train, X_test, y_test)
print("Dataset used: trim_day")
print("Predicting 'label' by 'id':")
print(f"Accuracy: {accuracy:.4f}, \nF1-score: {f1:.4f}, \nConfusion Matrix:\n{conf_matrix}\nRecall: {recall:.4f}, \nMCC: {mcc:.4f}")

Dataset used: trim_day
Predicting 'label' by 'id':
Accuracy: 0.6715, 
F1-score: 0.4255, 
Confusion Matrix:
[[994 319]
 [275 220]]
Recall: 0.4444, 
MCC: 0.1964


In [66]:
# split, fit and eval - trim_night
X_train, y_train, X_test, y_test = split(trim_night, group_by_id=True)
accuracy, f1, conf_matrix, recall, mcc, precision, roc_auc, specificity, support = fit_and_evaluate(X_train, y_train, X_test, y_test)
print("Dataset used: trim_night")
print("Predicting 'label' by 'id':")
print(f"Accuracy: {accuracy:.4f}, \nF1-score: {f1:.4f}, \nConfusion Matrix:\n{conf_matrix}\nRecall: {recall:.4f}, \nMCC: {mcc:.4f}")

Dataset used: trim_night
Predicting 'label' by 'id':
Accuracy: 0.6235, 
F1-score: 0.4087, 
Confusion Matrix:
[[713 339]
 [205 188]]
Recall: 0.4784, 
MCC: 0.1443


In [67]:
# split, fit and eval - trim_full
X_train, y_train, X_test, y_test = split(trim_full, group_by_id=True)
accuracy, f1, conf_matrix, recall, mcc, precision, roc_auc, specificity, support = fit_and_evaluate(X_train, y_train, X_test, y_test)
print("Dataset used: trim_full")
print("Predicting 'label' by 'id':")
print(f"Accuracy: {accuracy:.4f}, \nF1-score: {f1:.4f}, \nConfusion Matrix:\n{conf_matrix}\nRecall: {recall:.4f}, \nMCC: {mcc:.4f}")


Dataset used: trim_full
Predicting 'label' by 'id':
Accuracy: 0.6402, 
F1-score: 0.4017, 
Confusion Matrix:
[[1841  735]
 [ 540  428]]
Recall: 0.4421, 
MCC: 0.1488


In [71]:
no_trim_dfs = [no_trim_day, no_trim_night, no_trim_full]
df_names = ['no_trim_day', 'no_trim_night', 'no_trim_full']

group_by_id = True  

for df, df_name in zip(no_trim_dfs, df_names):
    X_train, y_train, X_test, y_test = split(df, group_by_id=group_by_id)
    accuracy, f1, conf_matrix, recall, mcc, precision, roc_auc, specificity, support = fit_and_evaluate(X_train, y_train, X_test, y_test)
    print(f"Dataset used: {df_name}")
    if group_by_id:
        print("Predicting 'label' for 'id':")
    else: 
        print("Predicting 'label' for rows:")

    print(f"Accuracy: {accuracy:.4f}, \nF1-score: {f1:.4f}, \nConfusion Matrix:\n{conf_matrix}\nRecall: {recall:.4f}, \nMCC: {mcc:.4f}")
    print("\n")

Dataset used: no_trim_day
Predicting 'label' for 'id':
Accuracy: 0.7289, 
F1-score: 0.3984, 
Confusion Matrix:
[[1745  443]
 [ 297  245]]
Recall: 0.4520, 
MCC: 0.2293


Dataset used: no_trim_night
Predicting 'label' for 'id':
Accuracy: 0.7095, 
F1-score: 0.3531, 
Confusion Matrix:
[[1423  385]
 [ 271  179]]
Recall: 0.3978, 
MCC: 0.1706


Dataset used: no_trim_full
Predicting 'label' for 'id':
Accuracy: 0.7131, 
F1-score: 0.3672, 
Confusion Matrix:
[[3426  932]
 [ 629  453]]
Recall: 0.4187, 
MCC: 0.1877




In [47]:
dataframes = [trim_day, trim_night, trim_full, no_trim_day, no_trim_night, no_trim_full]

In [58]:
for df in dataframes:
    print(f"Evaluating: {df.head(0)}")
    
    # Predict the 'label' by 'id'
    X_train, y_train, X_test, y_test = split(df, group_by_id=True)
    accuracy, f1, conf_matrix, recall, mcc = fit_and_evaluate(X_train, y_train, X_test, y_test)
    print("Predicting 'label' by 'id':")
    print(f"Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}, Confusion Matrix:\n{conf_matrix}\nRecall: {recall:.4f}, MCC: {mcc:.4f}")

    # Predict the 'label' by row
    X_train, y_train, X_test, y_test = split(df, group_by_id=False)
    accuracy, f1, conf_matrix, recall, mcc = fit_and_evaluate(X_train, y_train, X_test, y_test)
    print("\nPredicting 'label' by row:")
    print(f"Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}, Confusion Matrix:\n{conf_matrix}\nRecall: {recall:.4f}, MCC: {mcc:.4f}")
    print("-" * 80)

Evaluating: Empty DataFrame
Columns: [id, label, TDmean, TDmedian, TDstd, TDvariance, TDmin, TDmax, TDtrimmed_mean, TDkurtosis, TDcoefficient_of_variance, TDinterquartile_range, FDmean, FDmedian, FDstd, FDvariance, FDmin, FDmax, FDtrimmed_mean, FDkurtosis, FDentropy, FDskewness, FDspectral_flatness, FDcoefficient_of_variance, FDinterquartile_range]
Index: []

[0 rows x 25 columns]
Predicting 'label' by 'id':
Accuracy: 0.6715, F1-score: 0.4255, Confusion Matrix:
[[994 319]
 [275 220]]
Recall: 0.4444, MCC: 0.1964

Predicting 'label' by row:
Accuracy: 0.6433, F1-score: 0.5305, Confusion Matrix:
[[800 270]
 [376 365]]
Recall: 0.4926, MCC: 0.2475
--------------------------------------------------------------------------------
Evaluating: Empty DataFrame
Columns: [id, label, TDmean, TDmedian, TDstd, TDvariance, TDmin, TDmax, TDtrimmed_mean, TDkurtosis, TDcoefficient_of_variance, TDinterquartile_range, FDmean, FDmedian, FDstd, FDvariance, FDmin, FDmax, FDtrimmed_mean, FDkurtosis, FDentropy, F

### NEXT Steps

TODO

1. ~~new no trim dataframes~~
2. ~~drop cols (minutes, hour)~~
3. ~~prep dataframes for predicting person depression and by row~~
4. train rf model for 'trim' and 'no trim' dataframes - by rows (as per Rodriguez)
5. train rf model for 'trim' and 'no trom' dataframes - to predict persons
6. compare results to Tom's and article results
7. discussion - what is this all about?

### scraps

In [None]:
def fit_and_evaluate(X_train, y_train, X_test, y_test):
    # model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # fit the model
    rf.fit(X_train, y_train)

    # predictions
    y_pred = rf.predict(X_test)

    # evaluate
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    return accuracy, f1, conf_matrix, recall, mcc