# IGP 5 Models

## Preprocessing to Wide

In [1]:
# Execute ../code/preprocess.py inside this notebook's namespace via the IPython
# %run magic. The helper functions called in the following cells
# (extract_from_folder, preprocess_full_days, extract_days_per_scores,
# pivot_dataframe) are not defined in this notebook - presumably they come from
# that file.
%run ../code/preprocess.py

In [2]:
import pandas as pd
# Input and output locations, given relative to the notebook.
# output_csv_path is joined to file names by plain string concatenation in the
# cells below, so it has to keep its trailing slash.
folderpath = '../depresjon'
output_csv_path = '../output/'
scores_csv_path = '../depresjon/scores.csv'

# extract files (helper not defined in this notebook - presumably from preprocess.py)
df = extract_from_folder(folderpath)

# extract full days (true days)
full_df = preprocess_full_days(df)

# extract days per scores with 7 day min cut-off
final = extract_days_per_scores(full_df, scores_csv_path, min_days=7)

# pivot df to wide format; `final` itself is kept as the long-format table
final_pivot = pivot_dataframe(final)

In [3]:
# Write both layouts of the preprocessed data to the output folder,
# leaving the DataFrame index out of the files.
final_pivot.to_csv(f"{output_csv_path}preprocessed-wide.csv", index=False)
final.to_csv(f"{output_csv_path}preprocessed-long.csv", index=False)

In [4]:
# Build the same long/wide pair for a single participant, id 'condition_8'.
# Unlike the full export above, no min_days argument is passed here, so the
# helper's own default applies.
condition_8 = extract_days_per_scores(full_df, scores_csv_path, id='condition_8')
condition_8_pivot = pivot_dataframe(condition_8)

# Write the participant's wide and long tables (index omitted).
condition_8_pivot.to_csv(f"{output_csv_path}condition_8-wide.csv", index=False)
condition_8.to_csv(f"{output_csv_path}condition_8-long.csv", index=False)



In [5]:
# list of variable names to delete
var_list = ['df', 'full_df',  'final', 'final_pivot', 'condition_8', 'condition_8_pivot']

# Remove the preprocessing intermediates from the notebook namespace so the
# large frames can be garbage-collected before the CSV is re-imported.
# globals() is used instead of locals(): at the top level of a cell they are
# the same namespace, but Python documents modifications of locals() as
# unreliable. pop() with a default silently skips names that were never
# defined, so the cell can be re-run safely.
for var in var_list:
    globals().pop(var, None)


## Import from CSV

In [19]:
import pandas as pd
# Paths are declared again here so this section can be run without the
# preprocessing section above.
output_csv_path = '../output/'
scores_csv_path = '../depresjon/scores.csv'

# Load the long-format table written by the preprocessing section.
df = pd.read_csv(f"{output_csv_path}preprocessed-long.csv")

In [7]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464480 entries, 0 to 1464479
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   timestamp  1464480 non-null  object
 1   date       1464480 non-null  object
 2   activity   1464480 non-null  int64 
 3   id         1464480 non-null  object
 4   label      1464480 non-null  int64 
 5   gender     1464480 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 67.0+ MB
None


## Features

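* Per participant and day: mean, standard deviation, percentage of zero readings (%0) and kurtosis of activity
* Mean activity split by day/night and by active/inactive readings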

In [32]:
import pandas as pd
from datetime import datetime

# date parser function (turns an 'HH:MM' string into a datetime); kept for
# interactive use, but no longer handed to read_csv - see below
date_parser = lambda x: datetime.strptime(x, '%H:%M')

# CSV file
# `date_format` expects a strftime-style format string (or a dict mapping
# column -> format), not a callable, so the format itself is passed for the
# two columns listed in parse_dates.
sunlight_df = pd.read_csv('../norway/Norway_Sunlight.csv', parse_dates=['sunrise', 'sunset'], date_format='%H:%M')


Troubleshooting:

1. check the functions work
2. day or night - 0 if day, 1 if night - new column
3. active/non-active - activity < 5 is inactive (0), otherwise active (1) - new column
   1. rolling_sum - window summing 5 rows up/down
   2. active_inactive window - if rolling_sum >= 2 then active (1), else inactive (0)
4. sunlight - bring in sunrise and sunset times for each month
5. light/dark - 0 if light, 1 if dark (the encoding used by `light_dark` below)
6. proportion of activity at night, day, light, dark

In [37]:
import pandas as pd
import scipy.stats as sp

# classify each row of data as either day (0) or night (1)
def day_or_night(dataframe, day_start, day_end):
    """Label every row as day (0) or night (1) from the hour of its timestamp.

    A row is day when ``day_start <= hour < day_end``; any other hour is
    night. The labels are written to a ``day_night`` column on the frame that
    was passed in (it is modified in place), and that same frame is returned.

    Parameters
    ----------
    dataframe : DataFrame with a datetime ``timestamp`` column.
    day_start : first hour counted as day (inclusive).
    day_end : first hour no longer counted as day (exclusive).
    """
    def label_for(hour):
        if day_start <= hour < day_end:
            return 0
        return 1

    hours = dataframe['timestamp'].dt.hour
    dataframe['day_night'] = hours.apply(label_for)
    return dataframe


# create a field of active (1) and non-active (0) time
def active_nonactive(dataframe, activity_threshold=5, rolling_window=None, rolling_threshold=2):
    """Flag each reading as active (1) or inactive (0), optionally smoothed.

    This is a single definition replacing two same-named functions, of which
    only the later, simpler one was ever in effect. With the default
    arguments the behaviour is exactly that of the simpler version: only the
    ``active_inactive`` column is added.

    Parameters
    ----------
    dataframe : DataFrame with a numeric ``activity`` column. It is modified
        in place and also returned.
    activity_threshold : readings ``>=`` this value count as active.
    rolling_window : if given, additionally write ``active_inactive_period``:
        a centred rolling sum of ``active_inactive`` over this many rows is
        compared with ``rolling_threshold``. ``None`` (default) skips it.
    rolling_threshold : minimum number of active readings inside the window
        for the row to be marked as part of an active period.

    Returns
    -------
    The same DataFrame, with the new column(s).
    """
    is_active = dataframe['activity'] >= activity_threshold
    dataframe['active_inactive'] = is_active.astype('int64')

    if rolling_window is not None:
        # Rows are taken in frame order; the window is not reset per
        # participant or day. Incomplete windows at the ends of the frame
        # give NaN, which compares False and is therefore labelled 0.
        rolling_sum = dataframe['active_inactive'].rolling(window=rolling_window, center=True).sum()
        dataframe['active_inactive_period'] = (rolling_sum >= rolling_threshold).astype('int64')

    return dataframe


# calculate the percentage of zeros in a series
def percent_zero(series):
    """Return the share of entries equal to zero, on a 0-100 scale."""
    is_zero = series == 0
    return is_zero.sum() / series.size * 100

# extract statistical features 
def extract_features(dataframe):
    """Summarise ``activity`` for every (id, date) pair.

    Returns a DataFrame with one row per pair and the columns ``id``,
    ``date``, ``mean``, ``std``, ``percent_zero`` and ``kurtosis``. Kurtosis
    is computed with ``fisher=False``, i.e. Pearson's definition, under which
    a normal distribution scores 3.
    """
    def pearson_kurtosis(values):
        return sp.kurtosis(values, fisher=False)

    # output column name -> aggregation applied to each group's activity
    aggregations = {
        'mean': 'mean',
        'std': 'std',
        'percent_zero': percent_zero,
        'kurtosis': pearson_kurtosis,
    }
    daily_activity = dataframe.groupby(['id', 'date'])['activity']
    return daily_activity.agg(**aggregations).reset_index()

# find percentage activity at night/day/light/dark
def active_at_periods(dataframe):
    """Average activity per participant-day, split by day/night and active/inactive.

    Expects the columns ``id``, ``date``, ``day_night``, ``active_inactive``
    and ``activity``. Returns one row per (id, date) with four value columns,
    one per (day_night, active_inactive) combination.

    Despite the 'percentage' naming used internally, each value is the summed
    activity of a group divided by its row count, i.e. the group's mean
    activity. The ``light_dark`` column is not read by this function.

    NOTE(review): the four labels assigned at the end are positional. Assuming
    both flags are 0/1 and the pivot orders its columns by
    (day_night, active_inactive), the slots are (day, inactive), (day, active),
    (night, inactive), (night, active) - so 'activeNight', 'inactiveLight' and
    'activeDark' appear to hold day-active, night-inactive and night-active
    values respectively. Confirm before relying on these names.
    """
    # Number of rows in every (id, date, day_night) group, with one column per
    # active_inactive value; combinations that never occur are filled with 0
    period_lengths = dataframe.groupby(['id', 'date', 'day_night', 'active_inactive']).size().unstack(fill_value=0)
    
    # Total of the 'activity' column for the same groups, laid out the same way
    activity_sums = dataframe.groupby(['id', 'date', 'day_night', 'active_inactive'])['activity'].sum().unstack(fill_value=0)
    
    # Sum divided by row count = mean activity per group (0/0 yields NaN for
    # combinations that have no rows)
    period_activity = activity_sums.div(period_lengths, level=['id', 'date', 'day_night'])
    
    # Back to long format: one row per (id, date, day_night, active_inactive)
    period_activity_flat = period_activity.stack().reset_index(name='percentage')
    
    # One row per (id, date) and one column per (day_night, active_inactive)
    # pair; pairs missing for a given day are filled with 0
    period_activity_pivot = period_activity_flat.pivot_table(index=['id', 'date'], columns=['day_night', 'active_inactive'], values='percentage', fill_value=0)
    
    # Flatten the two-level column index by assigning four names positionally.
    # This requires exactly four columns: pandas raises a length-mismatch
    # error if any combination is absent from the whole dataset.
    period_activity_pivot.columns = ['inactiveDay', 'activeNight', 'inactiveLight', 'activeDark']
    
    return period_activity_pivot.reset_index()


# classify each row of data as either light (0) or dark (1)
def light_dark(dataframe, sunlight_df):
    """Attach monthly sunrise/sunset data and label each row light (0) or dark (1).

    A row is light when ``sunrise <= time of day < sunset`` for the month of
    its timestamp, and dark otherwise.

    Parameters
    ----------
    dataframe : DataFrame with a datetime ``timestamp`` column. A ``month``
        column is added to it in place (used as the merge key).
    sunlight_df : DataFrame with ``month``, ``sunrise`` and ``sunset``
        columns; sunrise/sunset may be strings or datetimes - anything
        ``pd.to_datetime`` accepts.

    Returns
    -------
    A new DataFrame: the left merge of ``dataframe`` with ``sunlight_df`` plus
    the columns ``sunrise_time``, ``sunset_time`` (``datetime.time`` values)
    and ``light_dark``.
    """
    # merge the sunlight data with the main df
    dataframe['month'] = dataframe['timestamp'].dt.month
    merged_df = pd.merge(dataframe, sunlight_df, left_on='month', right_on='month', how='left')

    # parse once and reuse for both the time columns and the comparison
    sunrise = pd.to_datetime(merged_df['sunrise'])
    sunset = pd.to_datetime(merged_df['sunset'])
    merged_df['sunrise_time'] = sunrise.dt.time
    merged_df['sunset_time'] = sunset.dt.time

    # Compare times of day as offsets from midnight. This is a column-wise
    # replacement for a row-by-row DataFrame.apply, which is very slow on
    # minute-level data. Rows without sunlight data (NaT) compare False and
    # are therefore labelled dark (1).
    timestamps = merged_df['timestamp']
    time_of_day = timestamps - timestamps.dt.normalize()
    after_sunrise = time_of_day >= (sunrise - sunrise.dt.normalize())
    before_sunset = time_of_day < (sunset - sunset.dt.normalize())
    merged_df['light_dark'] = (~(after_sunrise & before_sunset)).astype('int64')

    return merged_df

# function to calculate all features
def calculate_all_features(dataframe, sunlight_df, day_start=8, day_end=20):
    """Run the whole feature pipeline and return one row per (id, date).

    Steps, in order: parse ``timestamp``; add light/dark labels from the
    sunlight table; add day/night labels; add active/inactive labels; compute
    the per-day statistics; compute the per-period activity averages; join the
    two feature tables.

    Parameters
    ----------
    dataframe : long-format readings with ``timestamp``, ``date``,
        ``activity`` and ``id`` columns. Its ``timestamp`` column is converted
        to datetime in place, so the caller's frame is modified.
    sunlight_df : monthly sunrise/sunset table passed on to ``light_dark``.
    day_start, day_end : hours delimiting day time (``day_start`` inclusive,
        ``day_end`` exclusive). The defaults keep the 8:00 - 20:00 window.

    Returns
    -------
    DataFrame holding the period features and the statistical features,
    inner-joined on ``id`` and ``date``.
    """
    # convert 'timestamp' to datetime (in place on the caller's frame)
    dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'])

    # light/dark classification using Norway sunlight data
    dataframe = light_dark(dataframe, sunlight_df)

    # day/night classification
    dataframe = day_or_night(dataframe, day_start, day_end)

    # active/non-active classification
    dataframe = active_nonactive(dataframe)

    # statistical features
    statistical_features = extract_features(dataframe)

    # active/inactive periods
    period_features = active_at_periods(dataframe)

    # merge all features; the inner join keeps only (id, date) pairs present
    # in both tables
    all_features = pd.merge(period_features, statistical_features, on=['id', 'date'], how='inner')

    return all_features




In [11]:


# Re-read the sunlight table without any date parsing options; this replaces
# any earlier `sunlight_df`. light_dark() converts sunrise/sunset itself.
sunlight_df = pd.read_csv('../norway/Norway_Sunlight.csv')  
# Build the per-participant, per-day feature table from the long-format data.
# Note: this call converts df['timestamp'] to datetime in place.
features_full = calculate_all_features(df, sunlight_df)

  merged_df['sunrise_time'] = pd.to_datetime(merged_df['sunrise']).dt.time
  merged_df['sunset_time'] = pd.to_datetime(merged_df['sunset']).dt.time
  kurtosis=lambda x: sp.kurtosis(x, fisher=False)


In [12]:
# Show the feature table (one row per id and date) as the cell's rich output
features_full

Unnamed: 0,id,date,inactiveDay,activeNight,inactiveLight,activeDark,mean,std,percent_zero,kurtosis
0,condition_1,2003-05-08,0.115385,320.543831,0.089286,127.240741,156.247222,229.109777,40.902778,8.792571
1,condition_1,2003-05-09,0.138249,266.994036,0.045161,174.141176,124.135417,211.241278,46.180556,10.550960
2,condition_1,2003-05-10,0.233333,260.119048,0.113924,123.556911,134.961806,230.954732,37.430556,15.449014
3,condition_1,2003-05-11,0.259615,200.099026,0.091255,102.355670,99.439583,177.719972,42.013889,21.223210
4,condition_1,2003-05-12,0.262500,662.801563,0.053678,147.732719,316.874306,496.184847,39.375000,7.679689
...,...,...,...,...,...,...,...,...,...,...
1012,control_9,2003-11-26,2.582888,98.266417,0.819512,55.525806,48.894444,145.182237,22.500000,22.516061
1013,control_9,2003-11-27,0.000000,5.912500,0.000000,5.650000,5.781250,8.760265,0.000000,1404.039760
1014,control_9,2003-11-28,0.000000,6.241667,0.000000,6.561111,6.401389,0.916226,0.000000,1.768223
1015,control_9,2003-11-29,0.000000,7.000000,0.000000,6.775000,6.887500,0.460968,0.000000,15.837380
