In [1]:
# import packages
import pandas as pd
import numpy as np
import scipy as scp
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib_venn as venn
from helper import *
import ipywidgets as widgets

# variables
time_period = (1, 2, 3, 4, 5) # 5 different implementation of progression order  
n_patients = 0 # number of patients included in the model  
n_sessions = 0 # number of sessions included in the model  
usage_time: how many months the patient has been using the app  
usage_freq: how frequent the patient uses the app  

Clean Data

In [2]:
# querying from the SQL database, 2 mins for q3 table

# from connection import *

# con = connect()
# df = SQL("select * from constant_therapy.q3", con)
# df = df.rename(columns={'session_id':"id"})

In [3]:
# 1 min, data saved from previous runs
df = pd.read_csv("data/context_action.csv")
df.drop(df.columns[[0, 1]], axis=1, inplace=True)
df.head()

Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,domain_id,condition_since,birth_year,disorder_id,start_time,start_time_min,end_time,end_time_min,deficit_id
0,635715,15666,20,1,3,0.875,1,1y,1964,2,2014-07-31,2014-07-31 00:38:38,2014-07-31,2014-07-31 00:39:24,"{1, 2, 3, 4, 8}"
1,635715,15666,20,1,3,0.875,1,1y,1964,3,2014-07-31,2014-07-31 00:38:38,2014-07-31,2014-07-31 00:39:24,"{1, 2, 3, 4, 8}"
2,695385,15666,20,1,5,0.8,1,1y,1964,2,2014-08-18,2014-08-18 22:03:20,2014-08-18,2014-08-18 22:05:21,"{1, 2, 3, 4, 8}"
3,695385,15666,20,1,5,0.8,1,1y,1964,3,2014-08-18,2014-08-18 22:03:20,2014-08-18,2014-08-18 22:05:21,"{1, 2, 3, 4, 8}"
4,705234,15666,20,1,5,0.85,1,1y,1964,2,2014-08-21,2014-08-21 17:51:23,2014-08-21,2014-08-21 17:52:40,"{1, 2, 3, 4, 8}"


In [4]:
# 20 seconds
disorder_ids = df.groupby("patient_id")["disorder_id"].apply(set).reset_index()
df = df.drop(columns="disorder_id")
df = df.merge(disorder_ids, on="patient_id", how="left")

In [5]:
# 1 min
domain_ids = df.groupby("id")["domain_id"].apply(set).reset_index()
df = df.drop(columns="domain_id")
df = df.merge(domain_ids, on="id", how="left")

In [6]:
df = df.drop_duplicates(subset="id")

In [7]:
# df.to_csv("data/consolidate_data.csv")
## careful when reading, will read set as string
# df = pd.read_csv("data/consolidate_data.csv", index_col=[0])
df

Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time,start_time_min,end_time,end_time_min,deficit_id,disorder_id,domain_id
0,635715,15666,20,1,3,0.875,1y,1964,2014-07-31,2014-07-31 00:38:38,2014-07-31,2014-07-31 00:39:24,"{1, 2, 3, 4, 8}","{2, 3}",{1}
2,695385,15666,20,1,5,0.800,1y,1964,2014-08-18,2014-08-18 22:03:20,2014-08-18,2014-08-18 22:05:21,"{1, 2, 3, 4, 8}","{2, 3}",{1}
4,705234,15666,20,1,5,0.850,1y,1964,2014-08-21,2014-08-21 17:51:23,2014-08-21,2014-08-21 17:52:40,"{1, 2, 3, 4, 8}","{2, 3}",{1}
6,4929693,15666,20,1,5,0.800,1y,1964,2016-10-19,2016-10-19 20:47:01,2016-10-19,2016-10-19 20:48:29,"{1, 2, 3, 4, 8}","{2, 3}",{1}
32,5429562,15666,37,1,5,0.800,1y,1964,2016-12-13,2016-12-13 01:14:14,2016-12-13,2016-12-13 01:16:24,"{1, 2, 3, 4, 8}","{2, 3}","{1, 10, 11}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61860683,14061511,217018,7,5,10,0.908,6m,1943,2018-11-05,2018-11-05 15:52:25,2018-11-05,2018-11-05 16:08:50,{3},"{3, 6}",{14}
61860685,14062555,217018,7,5,10,0.978,6m,1943,2018-11-05,2018-11-05 16:16:53,2018-11-05,2018-11-05 16:27:40,{3},"{3, 6}",{14}
61860687,14002360,217018,28,5,7,0.973,6m,1943,2018-11-01,2018-11-01 14:40:43,2018-11-01,2018-11-01 14:41:49,{3},"{3, 6}",{14}
61860707,19565141,324494,173,6,1,1.000,10y,1947,2019-08-26,2019-08-26 14:17:41,,,{6},{9},{14}


All data has been consolidated, there should be no duplicate sessions for different disorders/domains.
We want to add time_period, usage_time, and usage_freq to the dataset.

**This part doesn't need to be run if it's claire's data**

In [8]:
progression_order_df = pd.read_csv("data/progression_order.csv", index_col=[0])
progression_order_df.rename(columns={'time_implemented':'start_time'}, inplace=True)
progression_order_df["start_time"] = pd.to_datetime(progression_order_df["start_time"])
df["start_time"] = pd.to_datetime(df["start_time"])

In [9]:
def time_period_convert(dd):
    dates = sorted(pd.unique(progression_order_df["start_time"]))
    if dd < pd.to_datetime(dates[1]):
        return 1
    elif dd < pd.to_datetime(dates[2]):
        return 2
    elif dd < pd.to_datetime(dates[3]):
        return 3
    elif dd < pd.to_datetime(dates[4]):
        return 4
    else:
        return 5

In [10]:
# added time period, takes around 8-9 minutes to run
df["time_period"] = df["start_time"].apply(time_period_convert)
df

Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time,start_time_min,end_time,end_time_min,deficit_id,disorder_id,domain_id,time_period
0,635715,15666,20,1,3,0.875,1y,1964,2014-07-31,2014-07-31 00:38:38,2014-07-31,2014-07-31 00:39:24,"{1, 2, 3, 4, 8}","{2, 3}",{1},1
2,695385,15666,20,1,5,0.800,1y,1964,2014-08-18,2014-08-18 22:03:20,2014-08-18,2014-08-18 22:05:21,"{1, 2, 3, 4, 8}","{2, 3}",{1},1
4,705234,15666,20,1,5,0.850,1y,1964,2014-08-21,2014-08-21 17:51:23,2014-08-21,2014-08-21 17:52:40,"{1, 2, 3, 4, 8}","{2, 3}",{1},1
6,4929693,15666,20,1,5,0.800,1y,1964,2016-10-19,2016-10-19 20:47:01,2016-10-19,2016-10-19 20:48:29,"{1, 2, 3, 4, 8}","{2, 3}",{1},1
32,5429562,15666,37,1,5,0.800,1y,1964,2016-12-13,2016-12-13 01:14:14,2016-12-13,2016-12-13 01:16:24,"{1, 2, 3, 4, 8}","{2, 3}","{1, 10, 11}",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61860683,14061511,217018,7,5,10,0.908,6m,1943,2018-11-05,2018-11-05 15:52:25,2018-11-05,2018-11-05 16:08:50,{3},"{3, 6}",{14},4
61860685,14062555,217018,7,5,10,0.978,6m,1943,2018-11-05,2018-11-05 16:16:53,2018-11-05,2018-11-05 16:27:40,{3},"{3, 6}",{14},4
61860687,14002360,217018,28,5,7,0.973,6m,1943,2018-11-01,2018-11-01 14:40:43,2018-11-01,2018-11-01 14:41:49,{3},"{3, 6}",{14},4
61860707,19565141,324494,173,6,1,1.000,10y,1947,2019-08-26,2019-08-26 14:17:41,,,{6},{9},{14},4


In [11]:
progression_order_df["time_period"] = progression_order_df["start_time"].apply(time_period_convert)
progression_order_df

Unnamed: 0,domain_id,root_id,task_type_id,task_level,progression_order,start_time,time_period
0,1,20,20,1,0,2012-01-01,1
1,1,20,20,2,1,2012-01-01,1
2,1,20,20,3,2,2012-01-01,1
3,1,20,20,4,3,2012-01-01,1
4,1,20,20,5,4,2012-01-01,1
...,...,...,...,...,...,...,...
1170,14,6,6,5,25,2019-10-08,5
1171,14,29,174,6,26,2019-10-08,5
1172,14,7,169,6,27,2019-10-08,5
1173,14,28,173,6,28,2019-10-08,5


Adding Usage Time
* this is by patient
* need a patient dataframe vs session dataframe (df)

In [12]:
df["start_time"] = pd.to_datetime(df["start_time"])

In [13]:
patients = (df.groupby("patient_id")["start_time"].max() - df.groupby("patient_id")["start_time"].min()).reset_index()
patients.columns = ["patient_id", "usage_time"]
patients["usage_time"] = patients["usage_time"].dt.days + 1
patients

Unnamed: 0,patient_id,usage_time
0,14189,243
1,14561,722
2,14683,219
3,14685,490
4,14696,2180
...,...,...
97749,415551,14
97750,415603,16
97751,415758,3
97752,415867,22


Add usage_freq for patient dataframe
* using the simplest way of calculating frequency
* how to get rid of outlier

In [14]:
patients["session_count"] = df.groupby("patient_id")["id"].count().reset_index(name="session_count")["session_count"]
patients["unique_days"] = df.groupby("patient_id")["start_time"].nunique().reset_index(name="days")["days"]
patients

Unnamed: 0,patient_id,usage_time,session_count,unique_days
0,14189,243,292,34
1,14561,722,3,2
2,14683,219,513,25
3,14685,490,456,64
4,14696,2180,13988,1866
...,...,...,...,...
97749,415551,14,10,6
97750,415603,16,74,6
97751,415758,3,23,3
97752,415867,22,34,12


In [15]:
patients["usage_freq"] = patients["unique_days"] / patients["usage_time"]
patients

Unnamed: 0,patient_id,usage_time,session_count,unique_days,usage_freq
0,14189,243,292,34,0.139918
1,14561,722,3,2,0.002770
2,14683,219,513,25,0.114155
3,14685,490,456,64,0.130612
4,14696,2180,13988,1866,0.855963
...,...,...,...,...,...
97749,415551,14,10,6,0.428571
97750,415603,16,74,6,0.375000
97751,415758,3,23,3,1.000000
97752,415867,22,34,12,0.545455


How to remove outlier

In [16]:
# modified from https://stackoverflow.com/questions/56750841/how-to-trim-outliers-in-dates-in-python
def datetime_outlier(data):
    qa = data["start_time"].quantile(0.1) #lower 10%
    qb = data["start_time"] #higher 10%
    #remove outliers
    xf = data[(data.start_time >= qa) & (data.start_time <= qb)]
    return xf

In [17]:
# about a min
patients_v2 = df.groupby("patient_id").apply(datetime_outlier).reset_index(drop=True)

In [18]:
patients_v2

Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time,start_time_min,end_time,end_time_min,deficit_id,disorder_id,domain_id,time_period
0,585724,14189,20,2,5,0.940,5y,1970,2014-07-13,2014-07-13 16:53:02,2014-07-13,2014-07-13 16:54:18,"{2, 4}",{3},{1},1
1,588925,14189,20,2,5,0.960,5y,1970,2014-07-14,2014-07-14 21:57:40,2014-07-14,2014-07-14 21:58:54,"{2, 4}",{3},{1},1
2,771833,14189,20,3,5,0.584,5y,1970,2014-10-14,2014-10-14 22:04:59,2014-10-14,2014-10-14 22:10:10,"{2, 4}",{3},{1},1
3,891587,14189,20,3,2,0.000,5y,1970,2014-10-15,2014-10-15 18:42:43,2014-10-15,2014-10-15 18:42:47,"{2, 4}",{3},{1},1
4,899521,14189,20,3,2,0.000,5y,1970,2014-10-17,2014-10-17 21:34:25,2014-10-17,2014-10-17 21:34:36,"{2, 4}",{3},{1},1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6869823,25172712,416292,79,3,10,0.400,6m,1953,2020-06-28,2020-06-28 19:59:51,2020-06-28,2020-06-28 20:44:53,"{2, 3, 5, 6, 7, 8, 10}",{2},{9},5
6869824,25192411,416292,79,3,10,0.600,6m,1953,2020-06-30,2020-06-30 01:05:42,2020-06-30,2020-06-30 01:30:24,"{2, 3, 5, 6, 7, 8, 10}",{2},{9},5
6869825,25207353,416292,79,3,10,0.701,6m,1953,2020-06-30,2020-06-30 20:40:34,2020-06-30,2020-06-30 20:56:16,"{2, 3, 5, 6, 7, 8, 10}",{2},{9},5
6869826,25242119,416292,25,2,6,0.906,6m,1953,2020-07-02,2020-07-02 19:58:29,2020-07-02,2020-07-02 20:02:46,"{2, 3, 5, 6, 7, 8, 10}",{2},{12},5


In [19]:
patients_filtered = (patients_v2.groupby("patient_id")["start_time"].max() - patients_v2.groupby("patient_id")["start_time"].min()).reset_index()
patients_filtered.columns = ["patient_id", "usage_time"]
patients_filtered["usage_time"] = patients_filtered["usage_time"].dt.days + 1
patients_filtered

Unnamed: 0,patient_id,usage_time
0,14189,241
1,14561,722
2,14683,24
3,14685,488
4,14696,2032
...,...,...
97749,415551,14
97750,415603,16
97751,415758,3
97752,415867,22


In [20]:
patients_filtered["session_count"] = patients_v2.groupby("patient_id")["id"].count().reset_index(name="session_count")["session_count"]
patients_filtered["unique_days"] = patients_v2.groupby("patient_id")["start_time"].nunique().reset_index(name="days")["days"]
patients_filtered["usage_freq"] = patients_filtered["unique_days"] / patients_filtered["usage_time"]

Check that the length of patients_filtered is less than the length of patients

In [21]:
patients

Unnamed: 0,patient_id,usage_time,session_count,unique_days,usage_freq
0,14189,243,292,34,0.139918
1,14561,722,3,2,0.002770
2,14683,219,513,25,0.114155
3,14685,490,456,64,0.130612
4,14696,2180,13988,1866,0.855963
...,...,...,...,...,...
97749,415551,14,10,6,0.428571
97750,415603,16,74,6,0.375000
97751,415758,3,23,3,1.000000
97752,415867,22,34,12,0.545455


In [22]:
patients_filtered

Unnamed: 0,patient_id,usage_time,session_count,unique_days,usage_freq
0,14189,241,270,33,0.136929
1,14561,722,3,2,0.002770
2,14683,24,498,22,0.916667
3,14685,488,423,63,0.129098
4,14696,2032,12590,1732,0.852362
...,...,...,...,...,...
97749,415551,14,10,6,0.428571
97750,415603,16,74,6,0.375000
97751,415758,3,23,3,1.000000
97752,415867,22,34,12,0.545455


## INTERACT
Filter seems to work for time outlier, so we using patient_filtered

In [23]:
# interact to figure out what thresholds to use

@widgets.interact_manual(usage_time=(1, 365), usage_freq=(0.0, 1.0))
def visualize(usage_time, usage_freq):
    temp = patients_filtered[patients_filtered.usage_time > usage_time]
    temp = temp[temp.usage_freq > usage_freq]
    return "number of patients: %d" %(temp["patient_id"].nunique())

interactive(children=(IntSlider(value=183, description='usage_time', max=365, min=1), FloatSlider(value=0.5, d…

In [24]:
# create the according filtered patients dataframe

usage_time = input("usage time: ")
print("inputted ", usage_time)
usage_freq = input("usage_freq: ")
print("inputted ", usage_freq)

inputted  60
inputted  0.1


In [25]:
filtered_patients_list = patients_filtered[patients_filtered.usage_time > float(usage_time)]
filtered_patients_list = filtered_patients_list[filtered_patients_list.usage_freq > float(usage_freq)]

In [26]:
filtered_patients_list.nunique()

patient_id       7185
usage_time       1203
session_count    1976
unique_days       616
usage_freq       5226
dtype: int64

In [27]:
# create filtered session dataframe from filtered patients data
temp_lst = filtered_patients_list["patient_id"]
sessions_filter_df = df[df.patient_id.isin(temp_lst)]
sessions_filter_df

Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time,start_time_min,end_time,end_time_min,deficit_id,disorder_id,domain_id,time_period
2826,639141,15796,20,1,2,0.032,5y,1950,2014-08-02,2014-08-02 21:11:37,2014-08-02,2014-08-02 21:11:40,"{1, 2, 3, 4, 8}","{2, 3}",{1},1
2834,11909161,15796,20,4,5,0.801,5y,1950,2018-06-18,2018-06-18 18:06:06,2018-06-18,2018-06-18 18:12:48,"{1, 2, 3, 4, 8}","{2, 3}",{1},4
2836,12021473,15796,20,4,6,0.928,5y,1950,2018-06-25,2018-06-25 17:01:08,2018-06-25,2018-06-25 17:10:41,"{1, 2, 3, 4, 8}","{2, 3}",{1},4
2838,12166235,15796,20,4,7,0.919,5y,1950,2018-07-09,2018-07-09 18:41:12,2018-07-13,2018-07-13 19:25:29,"{1, 2, 3, 4, 8}","{2, 3}",{1},4
2840,12427387,15796,20,4,6,0.965,5y,1950,2018-07-23,2018-07-23 19:05:27,2018-07-23,2018-07-23 19:13:06,"{1, 2, 3, 4, 8}","{2, 3}",{1},4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61855759,13536520,208098,28,4,5,0.000,5y,1948,2018-10-03,2018-10-03 20:32:58,2018-10-03,2018-10-03 20:36:59,{3},{6},{14},4
61855760,13589947,208098,28,4,2,0.000,5y,1948,2018-10-06,2018-10-06 20:04:34,2018-10-06,2018-10-06 20:00:22,{3},{6},{14},4
61855761,13647964,208098,28,4,4,0.000,5y,1948,2018-10-10,2018-10-10 15:24:09,2018-10-10,2018-10-10 15:31:41,{3},{6},{14},4
61855762,13781806,208098,28,4,3,0.000,5y,1948,2018-10-18,2018-10-18 16:39:13,2018-10-23,2018-10-23 17:00:21,{3},{6},{14},4


In [28]:
# filter based on time_period
time_period_n = input("time period: ")
sessions_filter_df = sessions_filter_df[sessions_filter_df.time_period == int(time_period_n)]

In [29]:
# sessions_filter_df.to_csv("data/PLACEHOLDER.csv")

In [30]:
sessions_filter_df

Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time,start_time_min,end_time,end_time_min,deficit_id,disorder_id,domain_id,time_period
2914,24003939,15796,20,5,5,0.850,5y,1950,2020-04-17,2020-04-17 16:16:37,2020-04-17,2020-04-17 16:22:28,"{1, 2, 3, 4, 8}","{2, 3}",{1},5
3270,24003940,15796,96,5,3,0.521,5y,1950,2020-04-17,2020-04-17 16:26:25,2020-04-17,2020-04-17 16:33:58,"{1, 2, 3, 4, 8}","{2, 3}","{1, 3}",5
3566,24003941,15796,12,1,5,1.000,5y,1950,2020-04-17,2020-04-17 16:34:41,2020-04-17,2020-04-17 16:36:25,"{1, 2, 3, 4, 8}","{2, 3}",{2},5
3568,24575532,15796,12,1,10,1.000,5y,1950,2020-05-25,2020-05-25 00:48:15,2020-05-25,2020-05-25 00:56:49,"{1, 2, 3, 4, 8}","{2, 3}",{2},5
4270,24115155,15796,12,2,5,1.000,5y,1950,2020-04-25,2020-04-25 00:07:03,2020-04-25,2020-04-25 00:09:19,"{1, 2, 3, 4, 8}","{2, 3}",{2},5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61845371,21058423,312055,87,1,10,0.802,1y,1936,2019-11-14,2019-11-14 20:54:18,2019-11-14,2019-11-14 20:56:15,"{10, 2, 3, 6}","{2, 3}","{12, 13}",5
61845373,21113156,312055,87,1,10,0.798,1y,1936,2019-11-14,2019-11-14 21:21:56,2019-11-21,2019-11-21 20:15:42,"{10, 2, 3, 6}","{2, 3}","{12, 13}",5
61845375,21274202,312055,87,1,10,0.917,1y,1936,2019-11-22,2019-11-22 21:57:56,2019-11-22,2019-11-22 21:59:52,"{10, 2, 3, 6}","{2, 3}","{12, 13}",5
61845377,21274779,312055,87,1,10,0.811,1y,1936,2019-11-25,2019-11-25 16:47:12,2019-11-25,2019-11-25 16:49:26,"{10, 2, 3, 6}","{2, 3}","{12, 13}",5


Now that we have the filtered dataframe, we want to start performance metric calculation
* add progression order  
**for now**  
* calculate percentile of each session for each domain
* for each patient, average percentile value across all domains **at the time**, which is our final performance metric calculation

This part doesn not need to be run if it's claire's data

In [31]:
# expand sessions due to compressed domain
sessions_filter_df.explode("domain_id").reset_index()

Unnamed: 0,index,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time,start_time_min,end_time,end_time_min,deficit_id,disorder_id,domain_id,time_period
0,2914,24003939,15796,20,5,5,0.850,5y,1950,2020-04-17,2020-04-17 16:16:37,2020-04-17,2020-04-17 16:22:28,"{1, 2, 3, 4, 8}","{2, 3}",1,5
1,3270,24003940,15796,96,5,3,0.521,5y,1950,2020-04-17,2020-04-17 16:26:25,2020-04-17,2020-04-17 16:33:58,"{1, 2, 3, 4, 8}","{2, 3}",1,5
2,3270,24003940,15796,96,5,3,0.521,5y,1950,2020-04-17,2020-04-17 16:26:25,2020-04-17,2020-04-17 16:33:58,"{1, 2, 3, 4, 8}","{2, 3}",3,5
3,3566,24003941,15796,12,1,5,1.000,5y,1950,2020-04-17,2020-04-17 16:34:41,2020-04-17,2020-04-17 16:36:25,"{1, 2, 3, 4, 8}","{2, 3}",2,5
4,3568,24575532,15796,12,1,10,1.000,5y,1950,2020-05-25,2020-05-25 00:48:15,2020-05-25,2020-05-25 00:56:49,"{1, 2, 3, 4, 8}","{2, 3}",2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692206,61845375,21274202,312055,87,1,10,0.917,1y,1936,2019-11-22,2019-11-22 21:57:56,2019-11-22,2019-11-22 21:59:52,"{10, 2, 3, 6}","{2, 3}",13,5
692207,61845377,21274779,312055,87,1,10,0.811,1y,1936,2019-11-25,2019-11-25 16:47:12,2019-11-25,2019-11-25 16:49:26,"{10, 2, 3, 6}","{2, 3}",12,5
692208,61845377,21274779,312055,87,1,10,0.811,1y,1936,2019-11-25,2019-11-25 16:47:12,2019-11-25,2019-11-25 16:49:26,"{10, 2, 3, 6}","{2, 3}",13,5
692209,61845379,21311266,312055,87,1,10,0.897,1y,1936,2019-11-29,2019-11-29 18:09:46,2019-11-29,2019-11-29 18:11:39,"{10, 2, 3, 6}","{2, 3}",12,5


In [32]:
progression_order_df[progression_order_df.time_period == int(time_period_n)]

Unnamed: 0,domain_id,root_id,task_type_id,task_level,progression_order,start_time,time_period
871,1,20,20,1,0,2019-10-08,5
872,1,20,20,2,1,2019-10-08,5
873,1,20,20,3,2,2019-10-08,5
874,1,20,20,4,3,2019-10-08,5
875,1,20,20,5,4,2019-10-08,5
...,...,...,...,...,...,...,...
1170,14,6,6,5,25,2019-10-08,5
1171,14,29,174,6,26,2019-10-08,5
1172,14,7,169,6,27,2019-10-08,5
1173,14,28,173,6,28,2019-10-08,5


In [33]:
sessions_filter_df.dtypes

id                               int64
patient_id                       int64
task_type_id                     int64
task_level                       int64
completed_task_count             int64
accuracy                       float64
condition_since                 object
birth_year                       int64
start_time              datetime64[ns]
start_time_min                  object
end_time                        object
end_time_min                    object
deficit_id                      object
disorder_id                     object
domain_id                       object
time_period                      int64
dtype: object

In [34]:
def set_to_int(s):
    return s.pop()

In [35]:
sessions_filter_df["domain_id"] = sessions_filter_df["domain_id"].apply(set_to_int)
sessions_filter_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions_filter_df["domain_id"] = sessions_filter_df["domain_id"].apply(set_to_int)


Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time,start_time_min,end_time,end_time_min,deficit_id,disorder_id,domain_id,time_period
2914,24003939,15796,20,5,5,0.850,5y,1950,2020-04-17,2020-04-17 16:16:37,2020-04-17,2020-04-17 16:22:28,"{1, 2, 3, 4, 8}","{2, 3}",1,5
3270,24003940,15796,96,5,3,0.521,5y,1950,2020-04-17,2020-04-17 16:26:25,2020-04-17,2020-04-17 16:33:58,"{1, 2, 3, 4, 8}","{2, 3}",1,5
3566,24003941,15796,12,1,5,1.000,5y,1950,2020-04-17,2020-04-17 16:34:41,2020-04-17,2020-04-17 16:36:25,"{1, 2, 3, 4, 8}","{2, 3}",2,5
3568,24575532,15796,12,1,10,1.000,5y,1950,2020-05-25,2020-05-25 00:48:15,2020-05-25,2020-05-25 00:56:49,"{1, 2, 3, 4, 8}","{2, 3}",2,5
4270,24115155,15796,12,2,5,1.000,5y,1950,2020-04-25,2020-04-25 00:07:03,2020-04-25,2020-04-25 00:09:19,"{1, 2, 3, 4, 8}","{2, 3}",2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61845371,21058423,312055,87,1,10,0.802,1y,1936,2019-11-14,2019-11-14 20:54:18,2019-11-14,2019-11-14 20:56:15,"{10, 2, 3, 6}","{2, 3}",12,5
61845373,21113156,312055,87,1,10,0.798,1y,1936,2019-11-14,2019-11-14 21:21:56,2019-11-21,2019-11-21 20:15:42,"{10, 2, 3, 6}","{2, 3}",12,5
61845375,21274202,312055,87,1,10,0.917,1y,1936,2019-11-22,2019-11-22 21:57:56,2019-11-22,2019-11-22 21:59:52,"{10, 2, 3, 6}","{2, 3}",12,5
61845377,21274779,312055,87,1,10,0.811,1y,1936,2019-11-25,2019-11-25 16:47:12,2019-11-25,2019-11-25 16:49:26,"{10, 2, 3, 6}","{2, 3}",12,5


In [36]:
# add progression order
sessions_filter_df = sessions_filter_df.merge(progression_order_df[progression_order_df.time_period == int(time_period_n)], on=["task_type_id", "task_level", "domain_id"]).reset_index()
sessions_filter_df

Unnamed: 0,index,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time_x,...,end_time,end_time_min,deficit_id,disorder_id,domain_id,time_period_x,root_id,progression_order,start_time_y,time_period_y
0,0,24003939,15796,20,5,5,0.850,5y,1950,2020-04-17,...,2020-04-17,2020-04-17 16:22:28,"{1, 2, 3, 4, 8}","{2, 3}",1,5,20,4,2019-10-08,5
1,1,20401349,27987,20,5,5,0.862,5y,1953,2019-10-09,...,2019-10-09,2019-10-09 18:44:53,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5
2,2,20456915,27987,20,5,5,0.888,5y,1953,2019-10-12,...,2019-10-12,2019-10-12 18:03:47,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5
3,3,20481221,27987,20,5,5,0.963,5y,1953,2019-10-14,...,2019-10-14,2019-10-14 15:22:18,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5
4,4,20481239,27987,20,5,2,0.719,5y,1953,2019-10-16,...,2019-10-16,2019-10-16 23:23:21,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391903,391903,21153223,226386,185,3,10,0.900,1y,1956,2019-11-17,...,2019-11-17,2019-11-17 15:23:27,{2},"{2, 3, 6}",7,5,38,13,2019-10-08,5
391904,391904,20391808,168943,183,4,7,0.825,1y,1965,2019-10-09,...,2019-10-09,2019-10-09 13:54:51,"{1, 2, 3, 4}",{2},7,5,40,11,2019-10-08,5
391905,391905,20413789,168943,183,4,7,0.825,1y,1965,2019-10-11,...,2019-10-11,2019-10-11 13:43:00,"{1, 2, 3, 4}",{2},7,5,40,11,2019-10-08,5
391906,391906,20442005,168943,183,4,7,1.000,1y,1965,2019-10-14,...,2019-10-14,2019-10-14 13:52:26,"{1, 2, 3, 4}",{2},7,5,40,11,2019-10-08,5


Get Percentile

In [37]:
# get percentile of a domain
def get_percentile(data):
    data["percentile"] = data["progression_order"].rank(pct=True)
    return data

In [38]:
## get percentile for each domain
df_pct = sessions_filter_df.groupby("domain_id").apply(get_percentile).reset_index(drop=True)
df_pct

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_pct = sessions_filter_df.groupby("domain_id").apply(get_percentile).reset_index(drop=True)


Unnamed: 0,index,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time_x,...,end_time_min,deficit_id,disorder_id,domain_id,time_period_x,root_id,progression_order,start_time_y,time_period_y,percentile
0,0,24003939,15796,20,5,5,0.850,5y,1950,2020-04-17,...,2020-04-17 16:22:28,"{1, 2, 3, 4, 8}","{2, 3}",1,5,20,4,2019-10-08,5,0.474274
1,1,20401349,27987,20,5,5,0.862,5y,1953,2019-10-09,...,2019-10-09 18:44:53,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5,0.474274
2,2,20456915,27987,20,5,5,0.888,5y,1953,2019-10-12,...,2019-10-12 18:03:47,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5,0.474274
3,3,20481221,27987,20,5,5,0.963,5y,1953,2019-10-14,...,2019-10-14 15:22:18,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5,0.474274
4,4,20481239,27987,20,5,2,0.719,5y,1953,2019-10-16,...,2019-10-16 23:23:21,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,5,20,4,2019-10-08,5,0.474274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391903,391903,21153223,226386,185,3,10,0.900,1y,1956,2019-11-17,...,2019-11-17 15:23:27,{2},"{2, 3, 6}",7,5,38,13,2019-10-08,5,0.905797
391904,391904,20391808,168943,183,4,7,0.825,1y,1965,2019-10-09,...,2019-10-09 13:54:51,"{1, 2, 3, 4}",{2},7,5,40,11,2019-10-08,5,0.456522
391905,391905,20413789,168943,183,4,7,0.825,1y,1965,2019-10-11,...,2019-10-11 13:43:00,"{1, 2, 3, 4}",{2},7,5,40,11,2019-10-08,5,0.456522
391906,391906,20442005,168943,183,4,7,1.000,1y,1965,2019-10-14,...,2019-10-14 13:52:26,"{1, 2, 3, 4}",{2},7,5,40,11,2019-10-08,5,0.456522


In [39]:
## clean dataframe, does not need to be run if Claire's data
df_pct.drop(df_pct.columns[[0, 11, 16, 17, 19, 20]], axis=1, inplace=True)
df_pct

Unnamed: 0,id,patient_id,task_type_id,task_level,completed_task_count,accuracy,condition_since,birth_year,start_time_x,start_time_min,end_time_min,deficit_id,disorder_id,domain_id,progression_order,percentile
0,24003939,15796,20,5,5,0.850,5y,1950,2020-04-17,2020-04-17 16:16:37,2020-04-17 16:22:28,"{1, 2, 3, 4, 8}","{2, 3}",1,4,0.474274
1,20401349,27987,20,5,5,0.862,5y,1953,2019-10-09,2019-10-09 18:18:17,2019-10-09 18:44:53,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,4,0.474274
2,20456915,27987,20,5,5,0.888,5y,1953,2019-10-12,2019-10-12 17:59:21,2019-10-12 18:03:47,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,4,0.474274
3,20481221,27987,20,5,5,0.963,5y,1953,2019-10-14,2019-10-14 15:05:07,2019-10-14 15:22:18,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,4,0.474274
4,20481239,27987,20,5,2,0.719,5y,1953,2019-10-16,2019-10-16 23:17:37,2019-10-16 23:23:21,"{1, 2, 3, 4, 5, 6, 7, 8}",{3},1,4,0.474274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391903,21153223,226386,185,3,10,0.900,1y,1956,2019-11-17,2019-11-17 15:16:09,2019-11-17 15:23:27,{2},"{2, 3, 6}",7,13,0.905797
391904,20391808,168943,183,4,7,0.825,1y,1965,2019-10-09,2019-10-09 13:53:03,2019-10-09 13:54:51,"{1, 2, 3, 4}",{2},7,11,0.456522
391905,20413789,168943,183,4,7,0.825,1y,1965,2019-10-11,2019-10-11 13:41:48,2019-10-11 13:43:00,"{1, 2, 3, 4}",{2},7,11,0.456522
391906,20442005,168943,183,4,7,1.000,1y,1965,2019-10-14,2019-10-14 13:51:18,2019-10-14 13:52:26,"{1, 2, 3, 4}",{2},7,11,0.456522


In [None]:
df_pct.to_csv("data/raw_percentile_time5.csv", index=False)

Get overall

In [None]:
# take in individual dataframe and output dataframe with overall performance metric
def get_score(data):
    updated_domain_pct = dict() # keeps updated domain pct
    score = 0 # score for each session, an average of all available domains
    scores = []

    data = data.sort_values(by="start_time_min").reset_index() # sort data by time

    for idx, row in data.iterrows():
        updated_domain_pct[row["domain_id"]] = row["percentile"] # update domain pct to the latest one
        # find sum of all domain pct
        for k, v in updated_domain_pct.items():
            score += float(v)
        # take average of domain pct, add to list, reset score to 0
        score /= len(updated_domain_pct)
        scores.append(score)
        score = 0
    # set score to the score list
    data["score"] = scores
    return data

In [None]:
## get performance metric by each patient for each session
final_df = df_pct.groupby("patient_id").apply(get_score).reset_index(drop=True)
final_df

## Question
* right now there are duplicate sessions since domain percentile was calulated separately, how do we feed this to the model?
* task changes visualization -> try to explain the fluctuations
* add overall timeline

* update only the domains that have been practiced
* keep other domains constant
* how much they switch domains

In [None]:
import random
pid = random.choice(pd.unique(final_df["patient_id"]))
print(pid)

In [None]:
sns.scatterplot(data=final_df[final_df.patient_id == pid], x="start_time_min", y="score", hue="domain_id", palette="bright").set_title(pid)
plt.plot(final_df[final_df.patient_id == pid]["start_time_min"], final_df[final_df.patient_id == pid]["score"], 'k')

In [None]:
sns.lineplot(data=final_df[final_df.patient_id == pid], x="start_time_min", y="progression_order", hue="domain_id", palette="bright").set_title(pid)

Second metric calculation:
* keep other domains constant 
* how do we start off each domain -> average of when people first start out or just average

In [None]:
df_pct

First: average across all sessions

In [None]:
df_pct.groupby("domain_id")["percentile"].mean()

In [None]:
# take in individual dataframe and output dataframe with overall performance metric
def get_score_avg(data):
    # initialize score with average
    score = [0.5 for i in range(14)]
    scores = []
    

    data = data.sort_values(by="start_time_min") # sort data by time

    for idx, row in data.iterrows():
        score[row["domain_id"] - 1] = row["percentile"]
        # set score to the score list average
        scores.append(sum(score) / len(score))
    data["score"] = scores
    return data

In [None]:
## get performance metric by each patient for each session
avg_final_df = df_pct.groupby("patient_id").apply(get_score_avg).reset_index(drop=True)
avg_final_df

Let's look at results

In [None]:
import random
pid = random.choice(pd.unique(avg_final_df["patient_id"]))
print(pid)
sns.scatterplot(data=avg_final_df[avg_final_df.patient_id == pid], x="start_time_min", y="score", hue="domain_id", palette="bright").set_title(pid)
plt.plot(avg_final_df[avg_final_df.patient_id == pid]["start_time_min"], avg_final_df[avg_final_df.patient_id == pid]["score"])

Second average method: average starting point

In [None]:
df_pct

In [None]:
domain_avg = df_pct.sort_values(by="start_time_min")
domain_avg = domain_avg.drop_duplicates(subset=["patient_id", "domain_id"])

In [None]:
domain_avg["domain_id"].unique()

In [None]:
lst = list(domain_avg.groupby("domain_id")["percentile"].mean().reset_index()["percentile"])
lst

In [None]:
lst = lst[:10] + [0] + lst[10:]
lst

In [None]:
# take in individual dataframe and output dataframe with overall performance metric
def get_score_avg2(data):
    # initialize score with average
    global lst
    score = lst
    scores = []
    
    data = data.sort_values(by="start_time_min") # sort data by time

    for idx, row in data.iterrows():
        score[row["domain_id"] - 1] = row["percentile"]
        # set score to the score list average
        scores.append(sum(score) / len(score))
    data["score"] = scores
    return data

In [None]:
## get performance metric by each patient for each session
avg_final_df2 = df_pct.groupby("patient_id").apply(get_score_avg2).reset_index(drop=True)
avg_final_df2

In [None]:
import random
pid = random.choice(pd.unique(avg_final_df2["patient_id"]))

print(pid)
sns.scatterplot(data=avg_final_df2[avg_final_df2.patient_id == pid], x="start_time_min", y="score", hue="domain_id", palette="bright").set_title(pid)
plt.plot(avg_final_df2[avg_final_df2.patient_id == pid]["start_time_min"], avg_final_df2[avg_final_df2.patient_id == pid]["score"])

Normalization

In [None]:
df_pct

In [None]:
sorted_df = df_pct.sort_values(by="start_time_min")
sorted_df

In [None]:
# initial score
initial = lst

In [None]:
column_names = ["domain %d score" % i for i in range(1, 15)]
column_names

In [None]:
# create domain score columns, input a patient's session data
def create_domain_scores(data):
    global initial
    global column_names
    score = initial
    data = data.sort_values(by="start_time_min").reset_index() # sort data by time
    scores = np.zeros((len(data), 14))
    i = 0

    for idx, row in data.iterrows():
        score[row["domain_id"] - 1] = row["percentile"]
        # set score to the score list average
        scores[i] = score
        i += 1
    data = pd.concat([data, pd.DataFrame(scores, columns=column_names)], axis=1)
    return data

In [None]:
scores_df = sorted_df.groupby("patient_id").apply(create_domain_scores).reset_index(drop=True)
scores_df

In [None]:
# nomralize scores
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# scores only df
scores_df_minmax = MinMaxScaler().fit_transform(scores_df[column_names])

In [None]:
minmax_df = scores_df
minmax_df[column_names] = scores_df_minmax

In [None]:
minmax_df.columns

In [None]:
minmax_df['score'] = minmax_df[column_names].mean(axis=1)
minmax_df

In [None]:
avg_final_df2[avg_final_df2.patient_id == pid]

In [None]:
minmax_df[minmax_df.patient_id == pid]

In [None]:
import random
pid = random.choice(pd.unique(minmax_df["patient_id"]))
print(pid)
sns.scatterplot(data=minmax_df[minmax_df.patient_id == pid], x="start_time_min", y="score", hue="domain_id", palette="bright").set_title(pid)
plt.plot(minmax_df[minmax_df.patient_id == pid]["start_time_min"], minmax_df[minmax_df.patient_id == pid]["score"])

In [None]:
temp = minmax_df[minmax_df.patient_id == pid]

In [None]:
sns.lineplot(data=temp, x="start_time_min", y="score", hue="domain_id", palette="bright").set_title(pid)

In [None]:
sns.lineplot(data=temp, x="start_time_min", y="progression_order", hue="domain_id", palette="bright").set_title(pid)

In [None]:
temp[["domain_id", "task_type_id", "task_level", "score", "progression_order", "domain 14 score", "start_time_min", "id"]]

Create day-to-day score -> combine sessions of the same domain -> average that then average across domains -> one score for each day instead of each session

Filter out fluctations

In [None]:
# return false if there's fluctation given a person's data
def fluctate(data):
    data = data.sort_values(by="start_time_min").reset_index() # sort data by time
    d = dict()
    for idx, row in data.iterrows():
        if row["start_time"] not in d:
            d[row["start_time"]] = [row["domain_id"]]
        else:
            if row["domain_id"] in d[row["start_time"]]:
                return False
            else:
                d[row["start_time"]].append(row["domain_id"])
    return True

In [None]:
filter_lst = minmax_df.groupby("patient_id").apply(fluctate).reset_index()

In [None]:
filter_lst = filter_lst.rename(columns={0: "a"})
filter_lst

In [None]:
filter_lst[filter_lst.a]["patient_id"]

In [None]:
flutuate_data = minmax_df.loc[minmax_df["patient_id"].isin(filter_lst[filter_lst.a]["patient_id"])].reset_index()
flutuate_data

Filtering doesn't seem to work, work on incorporating accuracy

In [None]:
df_pct

In [None]:
def new_score(row):
    score = None
    p = [1.05, 1.0, 0.9, 0.8]
    if row.accuracy > .90:
        score = row.percentile * p[0]
    elif row.accuracy > .60:
        score = row.percentile * p[1]
    elif row.accuracy > .40:
        score = row.percentile * p[2]
    else:
        score = row.percentile * p[3]
    return score

In [None]:
# 12 seconds
df_pscore = df_pct.copy(deep=True)
df_pscore["percentile"] = df_pscore.apply(new_score, axis=1)
df_pscore

In [None]:
def combine(data):
    data = data.sort_values(by="start_time_min").reset_index() # sort data by time
    return data.groupby(["domain_id", "start_time"])["percentile"].mean().reset_index()

In [None]:
# create domain score columns, input a patient's session data
def create_domain_scores2(data):
    global initial
    global column_names
    score = initial
    data = data.sort_values(by="start_time_min").reset_index() # sort data by time
    i = 0
    combined_scores = combine(data)
    scores = np.zeros((len(combined_scores), 14))
    combined_scores = combined_scores.sort_values(by="start_time").reset_index()
    for idx, row in combined_scores.iterrows():
        score[row["domain_id"] - 1] = row["percentile"]
        # set score to the score list average
        scores[i] = score
        i += 1
    data = data.drop_duplicates(subset=["domain_id", "start_time"]).reset_index()
    data = pd.concat([data, pd.DataFrame(scores, columns=column_names)], axis=1)
    return data

In [None]:
df_pscore2 = df_pscore.groupby("patient_id").apply(create_domain_scores2).reset_index(drop=True)
df_pscore2

In [None]:
column_n = column_names

In [None]:
scores_df_minmax2 = MinMaxScaler().fit_transform(df_pscore2[column_n])

In [None]:
minmax_df2 = df_pscore2
minmax_df2[column_n] = scores_df_minmax2

In [None]:
minmax_df2['score'] = minmax_df2[column_n].mean(axis=1)
minmax_df2

In [None]:
import random
pid = random.choice(pd.unique(minmax_df2["patient_id"]))
print(pid)
sns.scatterplot(data=minmax_df2[minmax_df2.patient_id == pid], x="start_time_min", y="score", hue="domain_id", palette="bright").set_title(pid)
plt.plot(minmax_df2[minmax_df2.patient_id == pid]["start_time_min"], minmax_df2[minmax_df2.patient_id == pid]["score"], 'k')

In [None]:
minmax_df2

In [None]:
temp = df_pct[df_pct.patient_id == 30679]
temp = temp.sort_values(by="start_time_min").reset_index()
temp[temp.domain_id == 10][["accuracy", "progression_order", "percentile"]][temp.progression_order == 3]

In [None]:
temp = create_domain_scores2(df_pscore[df_pscore.patient_id == 87299])
temp[temp.domain_id == 4][["accuracy", "progression_order", "domain 4 score", "start_time"]]