In [3]:
from datetime import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from statsmodels.nonparametric.smoothers_lowess import lowess
from collections import defaultdict
import pywt
import scaleogram as scg
from ssqueezepy import cwt
from ssqueezepy.visuals import plot, imshow
from sklearn.metrics import mean_squared_error 

## ignore warnings
import warnings
warnings.filterwarnings('ignore')

# import the written functions
from CGMfunctions import *

## Hall 2018 data
This study analyzes how blood glucose fluctuates in healthy individuals by using a CGM to monitor glucose. Standardized meals (breakfast only) were given to a subset of patients in order to monitor the effect of meals on the glucose readings of healthy individuals. The subjects in this study had no prior diabetes diagnosis. <br>
Sample size: 57 <br>
Diabetes Type: Healthy <br>
Population Group: Adult 18+ <br>
CGM device: Dexcom G4 <br>
Duration: varies <br> 
Interval: 5 minutes

In [4]:
# Load the processed Hall 2018 readings and rename the raw columns
# ('gl' -> 'Glucose', 'time' -> 'Time') in a single expression.
cgm_hall2018 = pd.read_csv(
    "./data/Hall2018_processed.csv", header=0, index_col=False
).rename(columns={'gl': 'Glucose', 'time': 'Time'})
cgm_hall2018.head(3)

Unnamed: 0,id,Time,Glucose
0,1636-69-001,2014-02-03 03:42:12,93.0
1,1636-69-001,2014-02-03 03:47:12,93.0
2,1636-69-001,2014-02-03 03:52:12,93.0


In [5]:
# Number of distinct subject ids in the Hall 2018 frame.
len(cgm_hall2018["id"].unique())

57

## tsalikian2005 data
The purpose of this study was to find out how often low blood sugar (hypoglycemia) occurs during the night after exercise in the late afternoon for children aged 10 to 18 with type 1 diabetes. The total sample size was 50 participants. The OneTouch Ultra Meter was used to continuously monitor glucose levels during two separate 24-hour periods. <br>
Sample size: 50 <br>
Diabetes Type: Type 1 <br>
Population Group: Children (10-18) <br>
CGM device: OneTouch Ultra Meter <br>
Duration: 48 hours <br> 
Interval: 5 minutes

In [4]:
# Load the processed Tsalikian 2005 readings and rename the raw columns
# ('gl' -> 'Glucose', 'time' -> 'Time').
tsalikian2005 = pd.read_csv(
    "./data/Tsalikian2005_processed.csv", header=0, index_col=False
).rename(columns={'gl': 'Glucose', 'time': 'Time'})

# Only the last expression of a cell is rendered, so this cell reports the
# subject ids: the count is printed and the id array is the displayed result.
tsalikian2005_ids = tsalikian2005["id"].unique()
print(len(tsalikian2005_ids))
tsalikian2005_ids

50


array([ 7, 43,  2, 27, 38, 15, 16, 19, 22, 32, 11,  8, 20, 44, 35, 18, 42,
       17, 36, 46, 33, 48,  5, 53, 14, 40, 13, 41, 25, 26, 28,  3, 50, 29,
       24, 12,  9, 23,  1, 10, 54, 39,  4, 37, 31, 30,  6, 52, 21, 45],
      dtype=int64)

## Anderson2016 data
This study was designed to test a closed-loop control-to-range artificial pancreas (AP) system. <br>
Sample size: 30 (phase 1), 14 (phase 2) <br>
Diabetes Type: Type 1 <br>
Population Group: Adult 18+ <br>
CGM device: Dexcom G4 Platinum <br>
Duration: 6-9 weeks (Phase 1), 20+ weeks (Phase 2) <br> 
Interval: 5 minutes

In [7]:
# Load the processed Anderson 2016 readings and rename the raw columns
# ('gl' -> 'Glucose', 'time' -> 'Time') in a single expression.
Anderson2016 = pd.read_csv(
    "./data/Anderson2016_processed.csv", header=0, index_col=False
).rename(columns={'gl': 'Glucose', 'time': 'Time'})
Anderson2016.head(3)

Unnamed: 0,id,Time,Glucose
0,1,2013-09-29 17:36:48,194
1,1,2013-09-29 17:41:48,204
2,1,2013-09-29 17:46:48,201


In [8]:
# Distinct subject ids, in order of first appearance.
Anderson2016["id"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 12, 14, 15, 16, 17,
       18, 19, 20, 26, 27, 28, 29, 30, 21, 22, 23, 24, 25], dtype=int64)

## Buckingham2007 data
This study was designed as a pilot study to analyze the use of a CGM for children with diabetes. The subjects first established a baseline during a week of blinded use, followed by at-home use for 3 months. <br>
Sample size: 30 (insulin pump), 30 (MDI) <br>
Diabetes Type: Type 1 <br>
Population Group: Children (3-17) <br>
CGM device: FreeStyle Navigator	 <br>
Duration: 13 Weeks <br> 
Interval: 10 minutes


In [9]:
# Load the processed Buckingham 2007 readings and rename the raw columns
# ('gl' -> 'Glucose', 'time' -> 'Time') in a single expression.
Buckingham2007 = pd.read_csv(
    "./data/Buckingham2007_processed.csv", header=0, index_col=False
).rename(columns={'gl': 'Glucose', 'time': 'Time'})
Buckingham2007.head(3)

Unnamed: 0,id,Time,Glucose
0,2,2000-01-10 16:13:00,108
1,2,2000-01-15 22:57:02,136
2,2,2000-01-10 16:22:55,108


In [10]:
# Distinct subject ids, in order of first appearance.
Buckingham2007["id"].unique()

array([ 2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
       21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40,
       41, 43, 44, 47, 50, 51, 52, 53, 54, 57, 58, 59, 60, 62],
      dtype=int64)

## Data information

Choose a dataset and get the basic information for each subject

In [5]:
# Select the dataset to analyse by passing one of the loaded frames; the
# alternatives are listed in the trailing comment.
# data_subject_info prints a per-subject summary (missing values, records per
# day). Later cells index its return value by subject id (e.g. dfs_good[2]) --
# presumably a mapping of subject id -> DataFrame; TODO confirm in CGMfunctions.
dfs_good = data_subject_info(tsalikian2005)  #tsalikian2005 #cgm_hall2018 # Anderson2016 #Buckingham2007

There are in total  50 subjects.
The information for subject:  1
There are missing values in the dataset:
- Time: 24 missing values
Missing time range is  [(168, 179), (24626, 24637)]
Records per day:
          Date  Number of Records
0   2000-04-01                288
1   2000-04-02                144
2   2000-04-07                298
3   2000-04-08                174
4   2013-09-29                 77
..         ...                ...
93  2014-02-22                 28
94  2014-02-23                287
95  2014-02-24                279
96  2014-02-25                285
97  2014-02-26                 96

[98 rows x 2 columns]

Total number of days: 98
The information for subject:  2
There are no missing values in the dataset.
Records per day:
           Date  Number of Records
0    2000-02-06                212
1    2000-02-07                146
2    2000-02-15                298
3    2000-02-16                194
4    2013-08-26                 23
..          ...                ...
247 

## Discrete Wavelet Transform

Define the wavelet candidates and the level candidates for the later DWT analysis

In [10]:
# Candidate wavelets for the DWT parameter search, one family per group.
wavelet_candidates = [
    # biorthogonal
    'bior1.1', 'bior1.3', 'bior1.5',
    'bior2.2', 'bior2.4', 'bior2.6', 'bior2.8',
    'bior3.1', 'bior3.3', 'bior3.5', 'bior3.7', 'bior3.9',
    'bior4.4',
    # Coiflets
    'coif2', 'coif10', 'coif16',
    # Daubechies
    'db2', 'db3', 'db4', 'db5', 'db6', 'db7', 'db8', 'db9', 'db10', 'db11',
    'db12', 'db15', 'db25',
    # Haar
    'haar',
    # reverse biorthogonal
    'rbio1.5', 'rbio3.1', 'rbio6.8',
]
# Candidate decomposition levels: 2 through 6 inclusive.
level_candidates = list(range(2, 7))

Check the level of decomposition for a specific subject on a specific wavelet

In [13]:
# Maximum useful DWT decomposition level for subject 2's series with the
# 'bior3.9' wavelet; the series length is shown as the cell result.
w = pywt.Wavelet('bior3.9')
series_length = len(dfs_good[2])
print(pywt.dwt_max_level(data_len=series_length, filter_len=w.dec_len))
series_length

11


67420

Get the best parameters for all subjects 

In [14]:
# Run the parameter search over every subject in dfs_good for all wavelet and
# level candidates. Returns a pair: a DataFrame (it has a 'subject' column,
# read when a subject is picked for visualization) and a dict -- presumably
# keyed by subject id; TODO confirm in CGMfunctions.
all_param_df,all_param_dict = get_best_parameters_allsubject(dfs_good,wavelet_candidates,level_candidates)

In [24]:
# (rows, columns) of the all-subject summary. Raises NameError unless the
# all-subject parameter search has been run in the current kernel.
all_param_df.shape

NameError: name 'all_param_df' is not defined

### Choose One subject for visualization

In [134]:
# Pick one subject id from the all-subject summary.
# NOTE(review): ['subject'][14] is a label lookup when the index is the
# default integer range -- confirm the index of all_param_df.
subject_1 = all_param_df['subject'][14]
print(subject_1)
# Records for the chosen subject.
df_good = dfs_good[subject_1]
# Parameter search restricted to this subject. The first return value is the
# per-metric summary displayed in the next cell; the second presumably holds
# the results for every candidate -- TODO confirm in CGMfunctions.
one_param_df, one_param_alldf = get_best_parameters_onesub(df_good,wavelet_candidates,level_candidates)

15


In [47]:
# Show the per-metric summary (Metric, Value, Name, Thresh) for the chosen subject.
one_param_df

Unnamed: 0,Metric,Value,Name,Thresh
0,MSE,0.688325,bior3.9_level_2,soft
1,PSNR,53.684951,bior3.9_level_2,hard
2,SNR,46.683729,bior3.9_level_2,hard
3,R^2,0.999843,bior3.9_level_2,hard


In [48]:
# Returns a tuple: (DataFrame of Date / Number of Records, total number of days).
count_records_per_day(df_good)

(           Date  Number of Records
 0    2000-03-05                300
 1    2000-03-06                206
 2    2000-03-19                300
 3    2000-03-20                174
 4    2013-07-27                 71
 ..          ...                ...
 111  2013-12-11                287
 112  2013-12-12                287
 113  2013-12-13                287
 114  2013-12-14                238
 115  2013-12-15                 80
 
 [116 rows x 2 columns],
 116)

In [49]:
# Denoise the chosen subject's glucose series using the wavelet and level
# returned by best_parameter, with hard thresholding.
best_wavelet, best_level = best_parameter(one_param_df)
print(best_wavelet, best_level)
final_reconstructed = dwt_denoise(
    df_good["Glucose"],
    wavelet=best_wavelet,
    level=best_level,
    thresholding='hard',
)

bior3.9 2


In [115]:
#plot_compare_zoom(df_good, final_reconstructed)

In [114]:
#interactive_compare(df_good,final_reconstructed)

## Split all data into daily data then do DWT

Calculate the metrics for each subject on each day and collect them into one large DataFrame

In [52]:
# no need to run every time
# Per-subject, per-day parameter search over all candidates. The returned dict
# is indexed by subject id and yields a frame with Metric / Value / Name /
# Thresh / subject / date columns (as printed by the next code cell).
allsub_bydate_df, allsub_bydate_dict = get_best_parameters_allsubject_alldates(dfs_good,wavelet_candidates, level_candidates)


Print the results for one subject: all metrics for each of that subject's days

In [53]:
# First six rows of the per-day metrics for the subject stored in subject_1.
print(allsub_bydate_dict[subject_1].head(6))

  Metric      Value             Name Thresh  subject        date
0    MSE   0.347005  bior2.4_level_2   soft       29  2013-09-29
1   PSNR  54.562857  bior2.4_level_2   hard       29  2013-09-29
2    SNR  50.224372  bior2.4_level_2   hard       29  2013-09-29
3    R^2   0.999906  bior2.4_level_2   hard       29  2013-09-29
4    MSE   0.290529  bior2.8_level_2   soft       29  2013-09-30
5   PSNR  54.939433  bior2.8_level_2   hard       29  2013-09-30


### Test on one subject on daily data

In [1]:
# Unpack the (per-day record counts, number of days) pair for the current
# df_good and print both. Needs the CGMfunctions import and df_good to exist
# in the running kernel; on a fresh kernel this cell raises NameError.
records_per_day_df, total_days = count_records_per_day(df_good)
print("Records per day:")
print(records_per_day_df)
print("\nTotal number of days:", total_days)

NameError: name 'count_records_per_day' is not defined

In [143]:
# get one date one subject data
daily_df_dict = split_by_date_onesub(df_good) # for subject 1 selected before
try_date =  pd.to_datetime('2012-03-29').date()
oneday_df = daily_df_dict[try_date]
print(len(oneday_df))
oneday_df.iloc[55:65]

301


Unnamed: 0,id,Time,Glucose
55,15,2012-03-29 04:35:14,281
56,15,2012-03-29 04:40:14,281
57,15,2012-03-29 04:45:14,283
58,15,2012-03-29 04:50:14,281
59,15,2012-03-29 04:55:14,280
60,15,2012-03-29 05:00:14,277
61,15,2012-03-29 05:05:14,273
62,15,2012-03-29 05:10:14,270
63,15,2012-03-29 05:15:14,264
64,15,2012-03-29 05:20:14,260


In [23]:
# Rebind df_good to subject 16. Every later use of df_good now refers to this
# subject, not to the subject stored in subject_1.
df_good = dfs_good[16]

Unnamed: 0,id,Time,Glucose
0,16,2000-02-13 10:50:00,142
1,16,2000-02-13 10:55:00,141
2,16,2000-02-13 11:00:00,146
3,16,2000-02-13 11:05:00,152
4,16,2000-02-13 11:10:00,162
...,...,...,...
35810,16,2000-03-11 11:02:00,151
35811,16,2000-03-11 11:07:00,151
35812,16,2000-03-11 11:12:00,151
35813,16,2000-03-11 11:17:00,151


In [20]:
# Per-day parameter search for the current df_good. Returns two objects;
# daily_param_dict is what the reconstruction step consumes. Presumably both
# are keyed by date -- TODO confirm in CGMfunctions.
daily_metrics_dict, daily_param_dict = get_best_parameters_onesubject_alldates(df_good,wavelet_candidates, level_candidates)

In [21]:
# Build the denoised reconstructions for df_good from the per-day parameters
# (presumably one reconstruction per day -- TODO confirm in CGMfunctions).
daily_reconstructions= get_reconstruct_daily(df_good,daily_param_dict)


In [22]:
## This cell gives the interactive plots for each day of one subject,
## comparing df_good with its daily reconstructions.
interactive_daily_compare(df_good,daily_reconstructions)

## Add noise to the data

In [40]:
# Noise parameter forwarded to add_noise_to_all_dfs (its exact meaning, e.g. a
# standard deviation in glucose units, is defined in CGMfunctions -- TODO confirm).
NOISE_LEVEL = 3
# Fixed seed so the synthetic noise is the same on every run.
# NOTE(review): this only takes effect if add_noise_to_all_dfs draws from
# NumPy's global RNG (np.random.*); confirm, or pass a seed to the helper.
NOISE_SEED = 42
np.random.seed(NOISE_SEED)

# Noisy copy of every subject's data, indexed by subject id in later cells.
noisy_dfs = add_noise_to_all_dfs(dfs_good, NOISE_LEVEL)

In [45]:
# Repeat the all-subject parameter search on the noisy data. This is the
# expensive step; the recorded run was interrupted (KeyboardInterrupt).
# NOTE(review): 'all__noise_df' has a doubled underscore -- likely a typo for
# 'all_noise_df'; check later cells before renaming.
all__noise_df,all_noise_dict = get_best_parameters_allsubject(noisy_dfs,wavelet_candidates,level_candidates)
# NOTE(review): this indexes the all-subject (non-daily) result by date. The
# per-day results elsewhere come from get_best_parameters_allsubject_alldates,
# so confirm that this dict is actually keyed by subject and then by date.
all_noise_dict[subject_1][try_date]

KeyboardInterrupt: 

### add noise to one day data

In [42]:
# one subject data
noisy_sub1_df = noisy_dfs[subject_1]
# split into daily data
noisy_daily_df_dict = split_by_date_onesub(noisy_sub1_df)
# get one day data
noisy_oneday_df = noisy_daily_df_dict[try_date]
noisy_oneday_df

Unnamed: 0,id,Time,Glucose
0,8,2000-04-01 00:02:00,49.112719
1,8,2000-04-01 00:07:00,39.581044
2,8,2000-04-01 00:12:00,43.141119
3,8,2000-04-01 00:17:00,41.320390
4,8,2000-04-01 00:22:00,45.093667
...,...,...,...
163,8,2000-04-01 06:37:00,136.000000
164,8,2000-04-01 06:42:00,143.000000
165,8,2000-04-01 06:47:00,149.000000
166,8,2000-04-01 06:52:00,140.000000


In [43]:
# Parameter search on the single noisy day, then denoise it with the wavelet
# and level returned by best_parameter (hard thresholding).
one_para_df, one_allresult = get_best_parameters_onesub(noisy_oneday_df,wavelet_candidates,level_candidates)
best_wavelet, best_level = best_parameter(one_para_df)
print(best_wavelet,best_level)
# NOTE(review): this rebinds best_wavelet, best_level and final_reconstructed,
# which were previously set for the full-subject series.
final_reconstructed = dwt_denoise(noisy_oneday_df["Glucose"],wavelet=best_wavelet, level=best_level, thresholding='hard')
# NOTE(review): the metrics compare the reconstruction with the NOISY input,
# not with the clean signal (oneday_df['Glucose']); confirm this is intended.
print(get_metrics(noisy_oneday_df['Glucose'],final_reconstructed))

db2 2
        MSE       PSNR        SNR       R^2
0  3.281079  38.303558  34.706819  0.996963


In [44]:
# Interactive comparison of three series for the chosen day: the clean
# readings, the noisy readings, and the denoised reconstruction.
interactive_compare_three(oneday_df,noisy_oneday_df,final_reconstructed)