# IGP 5 Models

## Preprocessing to Wide

In [1]:
# Load the preprocessing helpers: the %run magic executes the script and makes the
# names it defines available to later cells (path is relative to the working directory).
# The next cell calls extract_from_folder, preprocess_full_days, extract_days_per_scores
# and pivot_dataframe, which are presumably defined by this script — TODO confirm.
%run ../code/preprocess.py

In [2]:
import pandas as pd
# locations used by this section: the data folder, the folder for generated CSVs,
# and the scores file handed to extract_days_per_scores
folderpath = '../depresjon'
output_csv_path = '../output/'
scores_csv_path = '../depresjon/scores.csv'

# extract files
# (the four helpers called below are not defined in this notebook; presumably they
# come from ../code/preprocess.py loaded via %run — TODO confirm)
df = extract_from_folder(folderpath)

# extract full days (true days)
full_df = preprocess_full_days(df)

# extract days per scores with 7 day min cut-off
final = extract_days_per_scores(full_df, scores_csv_path)

# pivot df to wide format
final_pivot = pivot_dataframe(final)

In [3]:
# write the wide and the long frame to the output folder, without the index column
for frame, filename in (
    (final_pivot, 'preprocessed-wide.csv'),
    (final, 'preprocessed-long.csv'),
):
    frame.to_csv(output_csv_path + filename, index=False)

In [4]:
# names of the intermediate frames that are no longer needed
var_list = ['df', 'full_df',  'final', 'final_pivot']

# remove each name from the notebook namespace; names that do not exist are skipped
for var in var_list:
    globals().pop(var, None)


### Notes

* Kept all (id, date) combinations to maximise the amount of data
* Will split into train, validation and test sets
* Will keep proportions across the splits



## Import from CSV

In [1]:
import pandas as pd
# folder holding the CSVs written by the preprocessing section, and the scores file path
output_csv_path = '../output/'
scores_csv_path = '../depresjon/scores.csv'

# import the long-format data from csv, parsing 'timestamp' and 'date' as datetimes
df = pd.read_csv(output_csv_path + 'preprocessed-long.csv', parse_dates=['timestamp', 'date'])

## Features



In [2]:
# Load the feature helpers: the %run magic executes the script and makes the names
# it defines available to later cells (path is relative to the working directory).
# calculate_all_features, used in the next cell, is presumably defined here — TODO confirm.
%run ../code/features.py

In [6]:
# calculate features from the long-format frame loaded above
# NOTE(review): sunlight_df is not defined in any cell visible above, and the execution
# counter jumps from In [2] to In [6] — the cell that created it appears to be missing.
# Unless ../code/features.py defines sunlight_df, this cell raises NameError on a
# fresh kernel. TODO restore the cell that loads sunlight_df.
features_full = calculate_all_features(df, sunlight_df)


In [7]:
# DataFrame.info() prints its summary itself and returns None, so call it directly;
# wrapping it in print() appends a stray "None" line to the cell output
features_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693 entries, 0 to 692
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           693 non-null    datetime64[ns]
 1   id             693 non-null    object        
 2   label          693 non-null    int64         
 3   days           693 non-null    int64         
 4   gender         693 non-null    int64         
 5   inactiveDay    693 non-null    float64       
 6   activeNight    693 non-null    float64       
 7   inactiveLight  693 non-null    float64       
 8   activeDark     693 non-null    float64       
 9   mean           693 non-null    float64       
 10  std            693 non-null    float64       
 11  percent_zero   693 non-null    float64       
 12  kurtosis       693 non-null    float64       
dtypes: datetime64[ns](1), float64(8), int64(3), object(1)
memory usage: 70.5+ KB
None


In [48]:
# NOTE(review): the saved output of this cell (In [48]) shows minute-level rows with a
# timestamp column and an index running past 1,464,000, while features_full.info()
# above reports 693 rows and 13 columns — the output appears to come from a different
# kernel state. Re-run after Restart & Run All to refresh it.
features_full

Unnamed: 0,timestamp,date,activity,id,label,gender,day_night,active_inactive,active_inactive_period,month,...,sunset_time,light_dark,inactiveDay,activeNight,inactiveLight,activeDark,mean,std,percent_zero,kurtosis
0,2003-03-19 00:00:00,2003-03-19,0,control_1,0,2,1,0,0,3,...,18:21:00,1,0.144444,0.336111,0.257022,0.451923,185.568056,346.555786,38.680556,16.792497
1,2003-03-19 00:01:00,2003-03-19,0,control_1,0,2,1,0,0,3,...,18:21:00,1,0.144444,0.336111,0.257022,0.451923,185.568056,346.555786,38.680556,16.792497
2,2003-03-19 00:02:00,2003-03-19,0,control_1,0,2,1,0,0,3,...,18:21:00,1,0.144444,0.336111,0.257022,0.451923,185.568056,346.555786,38.680556,16.792497
3,2003-03-19 00:03:00,2003-03-19,0,control_1,0,2,1,0,0,3,...,18:21:00,1,0.144444,0.336111,0.257022,0.451923,185.568056,346.555786,38.680556,16.792497
4,2003-03-19 00:04:00,2003-03-19,175,control_1,0,2,1,1,0,3,...,18:21:00,1,0.144444,0.336111,0.257022,0.451923,185.568056,346.555786,38.680556,16.792497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1464475,2004-06-09 23:55:00,2004-06-09,169,condition_9,1,2,1,1,0,6,...,22:44:00,1,0.261111,0.412500,0.358907,0.333333,168.656250,305.014675,39.722222,12.310389
1464476,2004-06-09 23:56:00,2004-06-09,169,condition_9,1,2,1,1,0,6,...,22:44:00,1,0.261111,0.412500,0.358907,0.333333,168.656250,305.014675,39.722222,12.310389
1464477,2004-06-09 23:57:00,2004-06-09,169,condition_9,1,2,1,1,0,6,...,22:44:00,1,0.261111,0.412500,0.358907,0.333333,168.656250,305.014675,39.722222,12.310389
1464478,2004-06-09 23:58:00,2004-06-09,169,condition_9,1,2,1,1,0,6,...,22:44:00,1,0.261111,0.412500,0.358907,0.333333,168.656250,305.014675,39.722222,12.310389


In [None]:

# find proportion of inactive/active rows at day/night/light/dark
def active_at_periods(dataframe):
    """Compute per-(id, date) activity proportions for day, night, light and dark.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'id', 'date', 'day_night', 'light_dark' and
        'active_inactive'. Values are interpreted the same way as in the
        script cell of this notebook: day_night 0 = day / 1 = night,
        light_dark 0 = light / 1 = dark, active_inactive 0 = inactive / 1 = active.

    Returns
    -------
    pandas.DataFrame
        One row per (id, date), sorted by those keys, with the columns
        'id', 'date', 'inactiveDay', 'activeNight', 'inactiveLight', 'activeDark'.
        Each feature is a fraction in [0, 1]; it is NaN when the (id, date)
        has no rows in the corresponding period.
    """
    # (output column, period column, period value, active_inactive value)
    feature_specs = (
        ('inactiveDay', 'day_night', 0, 0),
        ('activeNight', 'day_night', 1, 1),
        ('inactiveLight', 'light_dark', 0, 0),
        ('activeDark', 'light_dark', 1, 1),
    )
    keys = ['id', 'date']

    # Start from every (id, date) present so that days lacking a period are kept
    result = dataframe[keys].drop_duplicates().sort_values(keys).reset_index(drop=True)

    for name, period_col, period_value, state in feature_specs:
        in_period = dataframe.loc[dataframe[period_col] == period_value]
        # The mean of a boolean mask is the share of rows in the wanted state
        proportion = (
            in_period['active_inactive'].eq(state)
            .groupby([in_period['id'], in_period['date']])
            .mean()
            .rename(name)
            .reset_index()
        )
        result = result.merge(proportion, on=keys, how='left')

    return result



In [None]:


# Add four per-(id, date) proportion columns to `dataframe`:
#   inactiveDay   - share of day rows (day_night == 0) that are inactive (active_inactive == 0)
#   activeNight   - share of night rows (day_night == 1) that are active (active_inactive == 1)
#   inactiveLight - share of light rows (light_dark == 0) that are inactive
#   activeDark    - share of dark rows (light_dark == 1) that are active
# NOTE(review): `dataframe` is not defined in any visible cell — it must already
# exist in the kernel before this cell runs. TODO confirm where it comes from.


def _add_period_proportions(frame):
    """Return `frame` with the four period-proportion columns merged in.

    Each proportion is computed per (id, date) as
    (rows in the period with the wanted active_inactive value) / (rows in the period)
    and is NaN for an (id, date) that has no rows in that period.
    """
    # (output column, period column, period value, active_inactive value)
    specs = (
        ('inactiveDay', 'day_night', 0, 0),
        ('activeNight', 'day_night', 1, 1),
        ('inactiveLight', 'light_dark', 0, 0),
        ('activeDark', 'light_dark', 1, 1),
    )
    keys = ['id', 'date']

    # Drop columns left over from a previous run so that re-running the cell does
    # not create suffixed duplicates (e.g. inactiveDay_x / inactiveDay_y) in the merge
    result = frame.drop(columns=[spec[0] for spec in specs], errors='ignore')

    for name, period_col, period_value, state in specs:
        in_period = frame.loc[frame[period_col] == period_value]
        # The mean of a boolean mask equals matching-row count / period-row count
        proportion = (
            in_period['active_inactive'].eq(state)
            .groupby([in_period['id'], in_period['date']])
            .mean()
            .rename(name)
            .reset_index()
        )
        result = result.merge(proportion, on=keys, how='left')

    return result


dataframe = _add_period_proportions(dataframe)

