# Extract day night and sleep data

In [1]:
# Import modules

import os
import pandas as pd
import numpy as np
import scipy as sp
import datetime
import seaborn as sns
from datetime import date, timedelta
from matplotlib import pyplot as plt
from scipy.stats import skew

In [2]:
# Petter's code for extracting all the data files and then combining them.

def extract_folder(folderpath, add_scores=False, downsample=None):
    """
    Extract CSV data from a folder's subfolders into a single dataframe.

    Every ``*.csv`` file found directly inside each subfolder of
    ``folderpath`` is read, tagged with an ``id`` (the file name up to its
    first ``.``) and a ``label`` (1 when the subfolder is named
    ``condition``, otherwise 0), and has its ``timestamp`` and ``date``
    columns converted to datetimes. Rows from the ``control`` subfolder come
    first in the result, followed by rows from every other subfolder.

    Args:
      folderpath (str): Path to the folder containing the subfolders of CSV files.
      add_scores (bool, optional): Accepted for backward compatibility only.
        It has no effect on the returned dataframe: scores.csv is not merged
        into it. Defaults to False.
      downsample (int, optional): Number of rows to randomly sample from each
        CSV. Files with fewer rows are kept whole. Defaults to None.

    Returns:
      pandas.DataFrame: Concatenated CSV data with a fresh 0..n-1 index and
      the added ``id`` and ``label`` columns.

    Raises:
      ValueError: If no CSV data could be read from ``folderpath``.
    """
    # Frames are collected per group so that control rows precede the rest
    control_frames = []
    other_frames = []

    try:
        # Get subfolders
        subfolders = [
            entry for entry in os.listdir(folderpath)
            if os.path.isdir(os.path.join(folderpath, entry))
        ]

        for subfolder in subfolders:
            subfolderpath = os.path.join(folderpath, subfolder)

            for file in os.listdir(subfolderpath):
                filepath = os.path.join(subfolderpath, file)

                # Skip anything that is not a CSV file (stray files, nested
                # directories) instead of letting it abort the whole load
                if not file.lower().endswith('.csv') or not os.path.isfile(filepath):
                    continue

                df = pd.read_csv(filepath)

                # Downsample if needed, never asking for more rows than exist
                if downsample:
                    df = df.sample(min(downsample, len(df)))

                # Add ID column - this is the filename without the extension
                df['id'] = file.split('.')[0]

                # Label is 1 only for the 'condition' subfolder
                df['label'] = 1 if subfolder == 'condition' else 0

                # Convert 'timestamp' and 'date' to datetime
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df['date'] = pd.to_datetime(df['date'])

                if subfolder == 'control':
                    control_frames.append(df)
                else:
                    other_frames.append(df)

    except OSError:
        # Best effort: report the problem and keep whatever was read so far
        print(f"Error reading folder: {folderpath}")

    frames = control_frames + other_frames
    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        raise ValueError(f"No CSV data found in subfolders of: {folderpath}")

    return pd.concat(frames, ignore_index=True)

In [3]:
# Extract all the activity data into one data frame

df = extract_folder(folderpath='../data/depresjon')
df.shape


(1571706, 5)

In [4]:
# Function to classify each row of data as either day or night

def day_or_night (dataframe,day_start,day_end):
    """
    Return a copy of `dataframe` with an added 'day_night' column.

    A row is labelled 'day' when the hour of its 'timestamp' satisfies
    day_start <= hour < day_end, and 'night' otherwise. The input frame is
    not modified.
    """

    def label_for(moment):
        # Half-open interval: day_start is included, day_end is excluded
        return 'day' if day_start <= moment.hour < day_end else 'night'

    # Work on a copy so the caller's frame keeps its original columns
    labelled = dataframe.copy()
    labelled['day_night'] = labelled['timestamp'].apply(label_for)
    return labelled

# Label rows as day when the hour is in [8, 20), night otherwise
DN_df = day_or_night(dataframe=df, day_start=8, day_end=20)

In [16]:
# Function to create a field of active and non-active time

def active_nonactive (dataframe, activity_threshold=5, window=11, min_active_minutes=2):
    """
    Return a copy of `dataframe` with an added 'active_inactive' column.

    Each row is first flagged as active (1) unless its 'activity' value is
    below `activity_threshold`. The flags are then summed over a centred
    rolling window of `window` rows, and the row is labelled:

      * 'active'   when the window sum is >= `min_active_minutes`
      * 'inactive' when the window sum is <  `min_active_minutes`
      * 'NaN'      (the string) when the window is incomplete, which happens
                   for the first and last window // 2 rows of the frame

    Args:
      dataframe (pandas.DataFrame): Frame with a numeric 'activity' column.
      activity_threshold (int, optional): Rows with activity below this value
        are flagged inactive. Defaults to 5.
      window (int, optional): Size of the centred rolling window, in rows.
        Defaults to 11.
      min_active_minutes (int, optional): Minimum number of active rows in
        the window for the period to count as active. Defaults to 2.

    Returns:
      pandas.DataFrame: Copy of the input with the 'active_inactive' column.
    """
    # Create a copy of the data frame
    new_df = dataframe.copy()

    # Integer 0/1 flags (not strings) so the rolling sum is a real numeric sum.
    # Written as "not below threshold" so missing activity counts as active.
    minute_is_active = (~(new_df['activity'] < activity_threshold)).astype(int)

    # Number of active rows in the window centred on each row.
    # NOTE(review): the window follows row order over the whole frame, so it
    # spans the boundary between consecutive participants - confirm intended.
    active_in_window = minute_is_active.rolling(window=window, center=True).sum()

    # Both comparisons are False for an incomplete window, giving 'NaN'
    new_df['active_inactive'] = np.where(
        active_in_window >= min_active_minutes,
        'active',
        np.where(active_in_window < min_active_minutes, 'inactive', 'NaN'),
    )

    return new_df

# Add the active / inactive labels to the day-night frame
active_df = active_nonactive(dataframe=DN_df)


In [6]:
# Preview the first 100 rows
active_df.iloc[:100]

Unnamed: 0,timestamp,date,activity,id,label,day_night,active_notactive
0,2003-03-18 15:00:00,2003-03-18,60,control_1,0,day,
1,2003-03-18 15:01:00,2003-03-18,0,control_1,0,day,
2,2003-03-18 15:02:00,2003-03-18,264,control_1,0,day,
3,2003-03-18 15:03:00,2003-03-18,662,control_1,0,day,
4,2003-03-18 15:04:00,2003-03-18,293,control_1,0,day,
...,...,...,...,...,...,...,...
95,2003-03-18 16:35:00,2003-03-18,194,control_1,0,day,active
96,2003-03-18 16:36:00,2003-03-18,208,control_1,0,day,active
97,2003-03-18 16:37:00,2003-03-18,146,control_1,0,day,active
98,2003-03-18 16:38:00,2003-03-18,30,control_1,0,day,active


In [7]:
# Function to crop to only full days of data

def fullDays(dataframe, sample_interval=pd.Timedelta(minutes=1)):
    """
    Crop each participant's data to whole calendar days.

    For every value of 'id', the first day is dropped when the earliest
    'timestamp' falls after the earliest 'date' (i.e. recording started after
    00:00), and the last day is dropped when the latest 'timestamp' is
    earlier than the final expected sample of that day
    (last date + 1 day - `sample_interval`).

    Args:
      dataframe (pandas.DataFrame): Frame with 'id', 'timestamp' and 'date'
        datetime columns.
      sample_interval (pandas.Timedelta, optional): Spacing between samples,
        used to decide whether the last day is complete. Defaults to one
        minute.  # NOTE(review): assumes 'date' is midnight of the row's day

    Returns:
      pandas.DataFrame: The retained rows, in participant order of first
      appearance, with their original index values.
    """
    one_day = pd.Timedelta(days=1)

    # Collect the cropped subsets and concatenate once at the end
    kept = []

    # sort=False keeps participants in their order of first appearance
    for _, person_df in dataframe.groupby('id', sort=False):

        first_timestamp = person_df['timestamp'].min()
        first_date = person_df['date'].min()

        # Remove all values before 00:00 on the first full day
        if first_date < first_timestamp:
            person_df = person_df[person_df['date'] >= first_date + one_day]

        if person_df.empty:
            continue

        last_timestamp = person_df['timestamp'].max()
        last_date = person_df['date'].max()

        # The last day is complete only if it reaches its final sample; a day
        # ending at 23:59 is kept, a day holding only 00:00 is removed
        if last_timestamp < last_date + one_day - sample_interval:
            person_df = person_df[person_df['date'] < last_date]

        kept.append(person_df)

    if not kept:
        # Nothing survived: return an empty frame that keeps the columns
        return dataframe.iloc[0:0].copy()

    return pd.concat(kept)

# Keep only whole days of data for every participant
fulldays_df = fullDays(dataframe=active_df)

In [8]:
# Read scores.csv in as a dataframe.
# Uses the same '../data/depresjon' folder the activity data is loaded from,
# so both reads resolve from one working directory.

scores = pd.read_csv('../data/depresjon/scores.csv')

# Function to match the number of dates stated in the scores.csv with the activity data
def trueDates (dataframe, scores_df=None):
    """
    Crop each participant's data to the number of days listed for them.

    For every value of 'id', the 'days' value is looked up in `scores_df`
    (row where 'number' equals the id; the first match is used) and only rows
    whose 'date' is earlier than (first date + days) are kept.

    Args:
      dataframe (pandas.DataFrame): Frame with 'id' and 'date' columns.
      scores_df (pandas.DataFrame, optional): Table with 'number' and 'days'
        columns. Defaults to the notebook-level `scores` dataframe.

    Returns:
      pandas.DataFrame: The retained rows, in participant order of first
      appearance, with their original index values.

    Raises:
      IndexError: If a participant has no row in `scores_df`.
    """
    # Fall back to the notebook-level table when none is passed explicitly
    if scores_df is None:
        scores_df = scores

    # Collect the cropped subsets and concatenate once at the end
    kept = []

    # sort=False keeps participants in their order of first appearance
    for participant, person_df in dataframe.groupby('id', sort=False):

        # Identify the expected number of days for this participant
        listed_days = scores_df.loc[scores_df['number'] == participant, 'days']
        if listed_days.empty:
            raise IndexError(f"Participant {participant!r} not found in scores 'number' column")
        true_days = int(listed_days.iloc[0])

        # First date not allowed: first recorded date plus the expected days
        end_date = person_df['date'].min() + pd.Timedelta(days=true_days)

        # Remove all data outside the expected range
        kept.append(person_df[person_df['date'] < end_date])

    if not kept:
        # No participants: return an empty frame that keeps the columns
        return dataframe.iloc[0:0].copy()

    return pd.concat(kept)

# Crop every participant to the number of days listed in scores.csv
trueDates_df = trueDates(dataframe=fulldays_df)

In [9]:
# Show the data frame after cropping to full days and to the expected day count
trueDates_df

Unnamed: 0,timestamp,date,activity,id,label,day_night,active_notactive
540,2003-03-19 00:00:00,2003-03-19,0,control_1,0,night,nonactive
541,2003-03-19 00:01:00,2003-03-19,0,control_1,0,night,nonactive
542,2003-03-19 00:02:00,2003-03-19,0,control_1,0,night,nonactive
543,2003-03-19 00:03:00,2003-03-19,0,control_1,0,night,active
544,2003-03-19 00:04:00,2003-03-19,175,control_1,0,night,active
...,...,...,...,...,...,...,...
1570793,2004-06-09 23:55:00,2004-06-09,169,condition_9,1,night,active
1570794,2004-06-09 23:56:00,2004-06-09,169,condition_9,1,night,active
1570795,2004-06-09 23:57:00,2004-06-09,169,condition_9,1,night,active
1570796,2004-06-09 23:58:00,2004-06-09,169,condition_9,1,night,active


In [15]:
# Count rows still carrying the 'NaN' placeholder label.
# The column written by active_nonactive is named 'active_inactive'.
print((trueDates_df['active_inactive'] == 'NaN').sum())

0
