In [1]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import datetime
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
def extract_folder(folderpath, add_scores=False, downsample=None):
    """
    Extract CSV data from folder and subfolders into a dataframe.

    Args:
      folderpath (str): Path to the folder containing CSV files.
      add_scores (bool, optional): Boolean to add scores.csv to the dataframe. Defaults to False.
      downsample (int, optional): Number of rows to downsample CSVs to. Defaults to None.

    Returns:
      pandas.DataFrame: DataFrame of concatenated CSV data.
    """
    import os
    import pandas as pd
    
    # Dict to store dataframes by condition  
    dfs = {'control': [], 'condition': []}

    try:
        # Handle top-level scores CSV
        if add_scores and 'scores.csv' in os.listdir(folderpath):
            scores_path = os.path.join(folderpath, 'scores.csv')  
            dfs['scores'] = pd.read_csv(scores_path)

        # Get subfolders
        subfolders = [f for f in os.listdir(folderpath) if os.path.isdir(os.path.join(folderpath, f))]

        for subfolder in subfolders:
            subfolderpath = os.path.join(folderpath, subfolder)  

            # Get list of CSV files
            files = os.listdir(subfolderpath)

            for file in files:
                filepath = os.path.join(subfolderpath, file)

                # Extract ID from filename 
                id = file.split('.')[0]

                df = pd.read_csv(filepath)

                # Downsample if needed
                if downsample:
                    df = df.sample(downsample)

                # Add ID column - this is the filename without the extension
                df['id'] = id

                # Add 'condition' column
                df['condition'] = subfolder

                # Convert 'timestamp' and 'date' to datetime
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df['date'] = pd.to_datetime(df['date'])

                # Append to dict by condition
                if subfolder == 'control':
                    dfs['control'].append(df)
                else:  
                    dfs['condition'].append(df)

    except OSError:
        print(f"Error reading folder: {folderpath}")

    # concatenate dfs for each condition
    dfs['control'] = pd.concat(dfs['control'])
    dfs['condition'] = pd.concat(dfs['condition'])

    # Reset index on the final df
    df = pd.concat([dfs['control'], dfs['condition']]).reset_index(drop=True)

    # add label column
    df['label'] = 0
    df.loc[df['condition'] == 'condition', 'label'] = 1
    
    # remove old 'condition' column
    df.drop('condition', axis=1, inplace=True)

    # Final concat
    return df

In [3]:
df = extract_folder('../data/depresjon')
df.head(5)

Unnamed: 0,timestamp,date,activity,id,label
0,2003-03-18 15:00:00,2003-03-18,60,control_1,0
1,2003-03-18 15:01:00,2003-03-18,0,control_1,0
2,2003-03-18 15:02:00,2003-03-18,264,control_1,0
3,2003-03-18 15:03:00,2003-03-18,662,control_1,0
4,2003-03-18 15:04:00,2003-03-18,293,control_1,0


In [4]:
# Assuming 'timestamp' is your timestamp column, 'activity' is the data you want to sum,
# 'id' is the identifier column, and 'label' is the label column
# Replace these with your actual column names
df['timestamp'] = pd.to_datetime(df['timestamp'])  # Convert timestamp to datetime if it's not already
df.set_index('timestamp', inplace=True)  # Set timestamp as the index

# Define aggregation functions for each column
agg_functions = {'activity': 'sum', 'id': 'first', 'label': 'first'}

# Resample data to hourly increments and apply aggregation functions
hourly_aggregated = df.resample('H').agg(agg_functions)

# Print or use the resulting DataFrame
print(hourly_aggregated)

                     activity            id  label
timestamp                                         
2002-05-24 11:00:00      4359  condition_20    1.0
2002-05-24 12:00:00      7718  condition_20    1.0
2002-05-24 13:00:00      8124  condition_20    1.0
2002-05-24 14:00:00      4321  condition_20    1.0
2002-05-24 15:00:00     14982  condition_20    1.0
...                       ...           ...    ...
2006-02-14 09:00:00     16947    control_30    0.0
2006-02-14 10:00:00     16015    control_30    0.0
2006-02-14 11:00:00     20795    control_30    0.0
2006-02-14 12:00:00     13587    control_30    0.0
2006-02-14 13:00:00      9889    control_30    0.0

[32691 rows x 3 columns]
