# Denoise CGM Using Wavelet Transform on Different time segment 
We are trying to clean the data using wavelet transform

* [Cleaning data](#cleaning_data)
* [Denoise Data with DWT ](#denoise_data)
* Adding noise

In [19]:
from datetime import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from statsmodels.nonparametric.smoothers_lowess import lowess
from collections import defaultdict
import pywt
import scaleogram as scg
from ssqueezepy import cwt
from ssqueezepy.visuals import plot, imshow
from sklearn.metrics import mean_squared_error 
# import the written functions
from CGMfunctions import *
## ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [20]:
tsalikian2005 = pd.read_csv("./data/Tsalikian2005_processed.csv",header = 0,index_col=False)  
tsalikian2005.rename(columns = {'gl': 'Glucose','time':'Time'},inplace = True)
tsalikian2005.id.unique()

array([ 7, 43,  2, 27, 38, 15, 16, 19, 22, 32, 11,  8, 20, 44, 35, 18, 42,
       17, 36, 46, 33, 48,  5, 53, 14, 40, 13, 41, 25, 26, 28,  3, 50, 29,
       24, 12,  9, 23,  1, 10, 54, 39,  4, 37, 31, 30,  6, 52, 21, 45],
      dtype=int64)

In [21]:
Anderson2016 = pd.read_csv("./data/Anderson2016_processed.csv",header = 0,index_col=False)  
Anderson2016.rename(columns = {'gl': 'Glucose','time':'Time'},inplace = True)
Anderson2016.head(3)

Unnamed: 0,id,Time,Glucose
0,1,2013-09-29 17:36:48,194
1,1,2013-09-29 17:41:48,204
2,1,2013-09-29 17:46:48,201


In [22]:
dfs_good = data_subject_info(Anderson2016)
allsub_daily_dict = split_by_date_allsub(dfs_good)

There are in total  30 subjects.
The information for subject:  1
There are no missing values in the dataset.
Records per day:
          Date  Number of Records
0   2013-09-29                 77
1   2013-09-30                284
2   2013-10-01                285
3   2013-10-02                277
4   2013-10-03                288
..         ...                ...
89  2014-02-22                 28
90  2014-02-23                287
91  2014-02-24                279
92  2014-02-25                285
93  2014-02-26                 96

[94 rows x 2 columns]

Total number of days: 94
The information for subject:  2
There are no missing values in the dataset.
Records per day:
           Date  Number of Records
0    2013-08-26                 23
1    2013-08-27                283
2    2013-08-28                271
3    2013-08-29                267
4    2013-08-30                282
..          ...                ...
243  2014-10-17                288
244  2014-10-18                286
245  2014

In [23]:
allsub_daily_240_dict = filter_dfdict_by_record_count(allsub_daily_dict,144) 
allsub_chunk_dict = split_crossover_timechunk(allsub_daily_240_dict)
allsub_chunk_dict2 = remove_dup_dicts(allsub_chunk_dict)

In [24]:
def split_into_8hour_segments(df):
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    df = df.dropna(subset=['Time'])
    df.set_index('Time', inplace=True)
    segments = [("00:00", "07:59"), ("08:00", "15:59"), ("16:00", "23:59")]
    segmented_dfs = {}
    for start, end in segments:
        segment_start = start
        segment_end = end
        if segment_end == "00:00":
            segment_end = "23:59:59"
        else:
            segment_end = segment_end
            
        segment_df = df.between_time(segment_start, segment_end).reset_index()
        key = f"{segment_start}-{segment_end}"
        segmented_dfs[key] = segment_df
    
    return segmented_dfs

def split_into_8hour_dict(df_dicts):
    splited_dict = {}
    for date, date_dfs in df_dicts.items():
        middle_dict = {}
        for subject, df in date_dfs.items():
            final = split_into_8hour_segments(df)
            # for i, df in enumerate(df_list):
            #     new_subject = subject if i == 0 else f"{subject}_2"
            #     middle_dict[new_subject] = split_into_6hour_segments(df)
            middle_dict[subject] = final
        splited_dict[date] = middle_dict
    return splited_dict

def metricdf_by_timesegment(allsub_6hour_2keydict,denoised_signal_dict,org_dict_labels,denoised_dict_labels):
    subjects = []
    mse_values = []
    psnr_values = []
    snr_values = []
    org_labels = []
    denoise_labels = []
    for date, dict in allsub_6hour_2keydict.items():
        for subject, df in dict.items():
            mse_1= get_metrics(df['Glucose'],denoised_signal_dict[date][subject])['MSE'].values 
            psnr_1= get_metrics(df['Glucose'],denoised_signal_dict[date][subject])['PSNR'].values
            snr_1= get_metrics(df['Glucose'],denoised_signal_dict[date][subject])['SNR'].values
            subjects.append(subject)
            mse_values.append(np.round(mse_1,6))
            psnr_values.append(np.round(psnr_1,6))
            snr_values.append(np.round(snr_1,6))
            org_labels.append(org_dict_labels[date][subject]['Label'][0])
            denoise_labels.append(denoised_dict_labels[date][subject]['Label'][0])

    result_df = pd.DataFrame({
        'subject': subjects,
        'MSE_values': mse_values,
        'PSNR_values': psnr_values,
        'SNR_values': snr_values,
        'org_label':org_labels,
        'denoise_label':denoise_labels
    })        
    return result_df

def summarydf_timesegment(df):
    df['time_segment'] = df['subject'].apply(lambda x: x.split('_')[-1])
    result = df.groupby(['time_segment','denoise_label']).agg( mean_MSE=('MSE_values', 'mean'), mean_PSNR=('PSNR_values', 'mean'), mean_SNR=('SNR_values', 'mean'),
                                                              count=('MSE_values', 'size')).reset_index() #,
                                            
    result2 = df.groupby(['time_segment']).agg(mean_MSE=('MSE_values', 'mean'), mean_PSNR=('PSNR_values', 'mean'), mean_SNR=('SNR_values', 'mean'),
                                               org_1s = ('org_label','sum'), denoise_1s = ('denoise_label','sum'), count=('MSE_values', 'size')).reset_index()
    return result, result2

def split_into_4hour_segments(df):
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    df = df.dropna(subset=['Time'])
    df.set_index('Time', inplace=True)
    segments = [("00:00", "03:59"), ("04:00", "7:59"), ("8:00", "11:59"),
                ("12:00","15:59"),("16:00","19:59"),("20:00","23:59")]
    segmented_dfs = {}
    for start, end in segments:
        segment_start = start
        segment_end = end
        if segment_end == "00:00":
            segment_end = "23:59:59"
        else:
            segment_end = segment_end
            
        segment_df = df.between_time(segment_start, segment_end).reset_index()
        key = f"{segment_start}-{segment_end}"
        segmented_dfs[key] = segment_df
    
    return segmented_dfs

def split_into_4hour_dict(df_dicts):
    splited_dict = {}
    for date, date_dfs in df_dicts.items():
        middle_dict = {}
        for subject, df in date_dfs.items():
            final = split_into_4hour_segments(df)
            # for i, df in enumerate(df_list):
            #     new_subject = subject if i == 0 else f"{subject}_2"
            #     middle_dict[new_subject] = split_into_6hour_segments(df)
            middle_dict[subject] = final
        splited_dict[date] = middle_dict
    return splited_dict

In [25]:
allsub_8hours_dict = split_into_8hour_dict(allsub_chunk_dict2)

In [26]:
allsub_8hour_combined = combine_3keys(allsub_8hours_dict)
allsub_8hour_96_combined = filter_6hourdict_by_record_count(allsub_8hour_combined,96)
len(allsub_8hour_96_combined)

5909

In [27]:
allsub_8hour_2keydict = split_combined_3keys(allsub_8hour_96_combined)

In [28]:
wavelet_candidates = ['bior1.1', 'bior1.3', 'bior1.5', 'bior2.2', 'bior2.4', 'bior2.6', 'bior2.8', 'bior3.1', 'bior3.3', 'bior3.5', 'bior3.7',
                          'bior3.9', 'bior4.4','coif2','coif10', 'coif16','db2', 'db3', 'db4', 'db5', 'db6', 'db7', 'db8', 'db9', 'db10', 'db11',
                          'db12', 'db15', 'db25',  'haar', 'rbio1.5', 'rbio3.1', 'rbio6.8']
level_candidates = [2,3,4]

In [29]:
all_param_daily_df,all_param_daily_dict = get_best_parameters_allsubject_alldates(allsub_8hour_96_combined,wavelet_candidates,level_candidates)
denoised_signal_dict = denoised_cgm_allsub_daily(allsub_8hour_2keydict,all_param_daily_df)

In [30]:
allsub_8hour_2keydict_labels = label_glucose(allsub_8hour_2keydict)
denoised_signal_dict_labels = label_denoised_glucose(denoised_signal_dict)

In [31]:
metricdf_8hour = metricdf_by_timesegment(allsub_8hour_2keydict,denoised_signal_dict,
                                         allsub_8hour_2keydict_labels,denoised_signal_dict_labels)
summarydf_8hour,summarydf_8hour2 = summarydf_timesegment(metricdf_8hour)
print(summarydf_8hour)
print(summarydf_8hour2)

  time_segment  denoise_label               mean_MSE             mean_PSNR  \
0  00:00-07:59              0  [0.39221390692969155]   [52.80269063378865]   
1  00:00-07:59              1  [0.48842248750000006]       [52.0595697175]   
2  08:00-15:59              0   [0.5931439991980758]  [50.420965534081866]   
3  08:00-15:59              1    [0.595744973360656]   [49.91865355737699]   
4  16:00-23:59              0   [0.6277757405271823]   [50.93766338467872]   
5  16:00-23:59              1   [0.6703144665523152]   [49.83195435849059]   

              mean_SNR  count  
0  [50.37061154476484]   1977  
1  [48.55402401499999]    400  
2  [47.50317561186848]   1247  
3  [46.21181575819672]    488  
4  [47.96254112602958]   1214  
5  [46.00859215437393]    583  
  time_segment              mean_MSE             mean_PSNR  \
0  00:00-07:59  [0.4084038237273873]     [52.677638733698]   
1  08:00-15:59  [0.5938755700288191]  [50.279681243227586]   
2  16:00-23:59   [0.641576562604342]   [50.

In [32]:
allsub_4hours_dict = split_into_4hour_dict(allsub_chunk_dict2)
allsub_4hour_combined = combine_3keys(allsub_4hours_dict)
allsub_4hour_48_combined = filter_6hourdict_by_record_count(allsub_4hour_combined,48)
print(len(allsub_4hour_48_combined))
allsub_4hour_2keydict = split_combined_3keys(allsub_4hour_48_combined)

15457


In [33]:
all_param_daily_df4,all_param_daily_dict4 = get_best_parameters_allsubject_alldates(allsub_4hour_48_combined,wavelet_candidates,level_candidates)
denoised_signal_dict4 = denoised_cgm_allsub_daily(allsub_4hour_2keydict,all_param_daily_df4)

In [34]:
denoised_signal_dict4 = denoised_cgm_allsub_daily(allsub_4hour_2keydict,all_param_daily_df4)

In [35]:
allsub_4hour_2keydict_labels = label_glucose(allsub_4hour_2keydict)
denoised_signal_dict_labels4 = label_denoised_glucose(denoised_signal_dict4)

In [36]:
metricdf_4hour = metricdf_by_timesegment(allsub_4hour_2keydict,denoised_signal_dict4,
                                         allsub_4hour_2keydict_labels,denoised_signal_dict_labels4)
summarydf_4hour,summarydf_4hour2 = summarydf_timesegment(metricdf_4hour)
print(summarydf_4hour)
print(summarydf_4hour2)

   time_segment  denoise_label               mean_MSE             mean_PSNR  \
0   00:00-03:59              0  [0.27572212109815697]                 [inf]   
1   00:00-03:59              1  [0.30885341967213115]   [53.40552834754099]   
2    04:00-7:59              0   [0.3276337413793106]   [54.99789973333326]   
3    04:00-7:59              1  [0.40995084558823547]   [53.57549702205882]   
4   12:00-15:59              0  [0.48473119889502775]   [51.89740888749374]   
5   12:00-15:59              1  [0.44239126293103487]   [50.70661480172417]   
6   16:00-19:59              0   [0.4851912838609242]   [51.47055119252721]   
7   16:00-19:59              1   [0.5086359134808855]  [50.506836853118706]   
8   20:00-23:59              0  [0.48503906867167945]   [51.99894123157902]   
9   20:00-23:59              1   [0.5102536201373002]    [49.7554589130435]   
10   8:00-11:59              0  [0.44863249974683567]   [51.71484600000007]   
11   8:00-11:59              1   [0.4161034400000001