In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy.fftpack import fft, ifft,rfft
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, RepeatedKFold
from joblib import dump, load
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

In [2]:
insulin1_df=pd.read_csv('InsulinData.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])
cgm1_df=pd.read_csv('CGMData.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])

In [3]:
insulin1_df['date_time_stamp']=pd.to_datetime(insulin1_df['Date'] + ' ' + insulin1_df['Time'])
cgm1_df['date_time_stamp']=pd.to_datetime(cgm1_df['Date'] + ' ' + cgm1_df['Time'])

In [4]:
insulin2_df=pd.read_csv('Insulin_patient2.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])
cgm2_df=pd.read_csv('CGM_patient2.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])

In [5]:
insulin2_df['date_time_stamp']=pd.to_datetime(insulin2_df['Date'] + ' ' + insulin2_df['Time'])
cgm2_df['date_time_stamp']=pd.to_datetime(cgm2_df['Date'] + ' ' + cgm2_df['Time'])

In [6]:
def mealDataExtraction(insulin_df, cgm_df, date_identifier):
    insulin_df = insulin_df.set_index('date_time_stamp')
    valid_timestamps = []
    find_carb_inputs = insulin_df.sort_values(by='date_time_stamp', ascending=True).dropna().reset_index()
    find_carb_inputs['BWZ Carb Input (grams)'].replace(0.0, np.nan, inplace=True)
    find_carb_inputs = find_carb_inputs.dropna().reset_index().drop(columns='index')
    
    for i, timestamp in enumerate(find_carb_inputs['date_time_stamp']):
        try:
            time_diff = (find_carb_inputs['date_time_stamp'][i+1] - timestamp).seconds / 60.0
            if find_carb_inputs.loc[i, 'BWZ Carb Input (grams)'] > 0 and time_diff >= 120:
                valid_timestamps.append(timestamp)
        except KeyError:
            pass
    
    meal_data = []
    if date_identifier == 1:
        for timestamp in valid_timestamps:
            start = pd.to_datetime(timestamp - timedelta(minutes=30))
            end = pd.to_datetime(timestamp + timedelta(minutes=120))
            date_str = timestamp.date().strftime('%#m/%#d/%Y')
            meal_data.append(cgm_df.loc[cgm_df['Date'] == date_str].set_index('date_time_stamp').between_time(start_time=start.strftime('%#H:%#M:%#S'),end_time=end.strftime('%#H:%#M:%#S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(meal_data)
    else:
        for timestamp in valid_timestamps:
            start = pd.to_datetime(timestamp - timedelta(minutes=30))
            end = pd.to_datetime(timestamp + timedelta(minutes=120))
            date_str = timestamp.date().strftime('%Y-%m-%d')
            meal_data.append(cgm_df.loc[cgm_df['Date'] == date_str].set_index('date_time_stamp').between_time(start_time=start.strftime('%H:%M:%S'),end_time=end.strftime('%H:%M:%S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(meal_data)

In [7]:
meal_data1 = mealDataExtraction(insulin1_df, cgm1_df, 1)
meal_data2 = mealDataExtraction(insulin2_df, cgm2_df, 2)
meal_data1 = meal_data1.iloc[:,0:30]
meal_data2 = meal_data2.iloc[:,0:30]
print(meal_data1)

        0      1      2      3      4      5      6      7      8      9   \
0    312.0  311.0  311.0  311.0  309.0  310.0  314.0    NaN    NaN    NaN   
1    196.0  203.0  198.0  195.0  190.0  184.0  178.0  169.0  164.0  168.0   
2    278.0  283.0  284.0  274.0  267.0  267.0  269.0  274.0  277.0  270.0   
3     81.0   77.0   74.0   67.0   70.0   72.0   74.0   75.0   71.0   67.0   
4    209.0  210.0  209.0  210.0  210.0  213.0  216.0  212.0  213.0  210.0   
..     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
598  202.0  197.0  191.0  185.0  197.0  206.0  209.0  207.0  174.0  168.0   
599  208.0  215.0  212.0  206.0  203.0  196.0  190.0  180.0  184.0  180.0   
600  107.0  106.0  123.0  128.0  137.0  147.0  145.0  142.0    NaN    NaN   
601  177.0  194.0  200.0  201.0  189.0  162.0  166.0  173.0  176.0  169.0   
602    NaN   93.0   92.0   86.0   84.0   80.0   87.0   98.0  105.0  113.0   

     ...     20     21     22     23     24     25     26     27     28    

In [36]:
def noMealDataExtraction(insulin_data, cgm_data):
    insulin_no_meal = insulin_data.copy()
    insulin_no_meal = insulin_no_meal.sort_values(by='date_time_stamp', ascending=True).replace(0.0,np.nan).dropna().copy()
    insulin_no_meal = insulin_no_meal.reset_index().drop(columns='index')
    valid_timestamps = []
    for i, timestamp in enumerate(insulin_no_meal['date_time_stamp']):
        try:
            time_diff = (insulin_no_meal['date_time_stamp'][i+1]-timestamp).seconds // 3600
            if time_diff >= 4:
                valid_timestamps.append(timestamp)
        except KeyError:
            pass
    
    no_meal_data = []
    for i, timestamp in enumerate(valid_timestamps):
        counter = 1
        try:
            length = len(cgm_data.loc[(cgm_data['date_time_stamp'] >= valid_timestamps[i]+pd.Timedelta(minutes=120))&(cgm_data['date_time_stamp'] < valid_timestamps[i+1])]) // 24
            while counter <= length:
                if counter == 1:
                    no_meal_data.append(cgm_data.loc[(cgm_data['date_time_stamp'] >= valid_timestamps[i]+pd.Timedelta(minutes=120))&(cgm_data['date_time_stamp'] < valid_timestamps[i+1])]['Sensor Glucose (mg/dL)'][:counter*24].values.tolist())
                    counter += 1
                else:
                    no_meal_data.append(cgm_data.loc[(cgm_data['date_time_stamp'] >= valid_timestamps[i]+pd.Timedelta(minutes=120))&(cgm_data['date_time_stamp'] < valid_timestamps[i+1])]['Sensor Glucose (mg/dL)'][(counter-1)*24:(counter)*24].values.tolist())
                    counter += 1
        except IndexError:
            break
    return pd.DataFrame(no_meal_data)

In [37]:
no_meal_data1 = noMealDataExtraction(insulin1_df, cgm1_df)
no_meal_data2 = noMealDataExtraction(insulin2_df, cgm2_df)

In [28]:
def create_meal_feature_matrix(meal_data):
    cleaned_data = meal_data.drop(meal_data.isna().sum(axis=1).replace(0, np.nan).dropna().where(lambda x: x > 6).dropna().index).reset_index().drop(columns='index')
    cleaned_data = cleaned_data.interpolate(method='linear', axis=1)
    index_to_drop = cleaned_data.isna().sum(axis=1).replace(0, np.nan).dropna().index
    cleaned_data = cleaned_data.drop(meal_data.index[index_to_drop]).reset_index().drop(columns='index')
    tau_time = (cleaned_data.iloc[:, 22:25].idxmin(axis=1) - cleaned_data.iloc[:, 5:19].idxmax(axis=1)) * 5
    difference_in_glucose_normalized = (cleaned_data.iloc[:, 5:19].max(axis=1) - cleaned_data.iloc[:, 22:25].min(axis=1)) / (cleaned_data.iloc[:, 22:25].min(axis=1))
    cleaned_data = cleaned_data.dropna().reset_index().drop(columns='index') 
    power_at_first_max = []
    index_of_first_max = []
    power_at_second_max = []
    index_of_second_max = []
    for i in range(len(cleaned_data)):
        fft_results = abs(rfft(cleaned_data.iloc[:, 0:30].iloc[i].values.tolist())).tolist()
        sorted_fft_results = fft_results.copy()
        sorted_fft_results.sort()
        power_at_first_max.append(sorted_fft_results[-2])
        power_at_second_max.append(sorted_fft_results[-3])
        index_of_first_max.append(fft_results.index(sorted_fft_results[-2]))
        index_of_second_max.append(fft_results.index(sorted_fft_results[-3]))
    
    maximum = cleaned_data.iloc[:, 5:19].idxmax(axis=1)
    tm = cleaned_data.iloc[:, 22:25].idxmin(axis=1)
    first_differential = []
    second_differential = []
    for i in range(len(cleaned_data)):
        data = cleaned_data.iloc[:, maximum[i]:tm[i]].iloc[i].tolist()
        first_differential.append(np.diff(data).max())
        second_differential.append(np.diff(np.diff(data)).max())
    
    meal_feature_matrix = pd.DataFrame({
        'tau_time': tau_time,
        'difference_in_glucose_normalized': difference_in_glucose_normalized,
        'power_at_first_max': power_at_first_max,
        'power_at_second_max': power_at_second_max,
        'index_of_first_max': index_of_first_max,
        'index_of_second_max': index_of_second_max,
        'first_differential': first_differential,
        'second_differential': second_differential,
    })
    
    return meal_feature_matrix


In [29]:
meal_feature_matrix1 = create_meal_feature_matrix(meal_data1)
meal_feature_matrix2 = create_meal_feature_matrix(meal_data2)
meal_feature_matrix = pd.concat([meal_feature_matrix1, meal_feature_matrix2]).reset_index().drop(columns='index')

In [30]:
def create_no_meal_feature_matrix(non_meal_data):
    index_to_remove = non_meal_data.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x:x>5).dropna().index
    non_meal_data_cleaned = non_meal_data.drop(non_meal_data.index[index_to_remove]).reset_index().drop(columns='index')
    non_meal_data_cleaned = non_meal_data_cleaned.interpolate(method='linear',axis=1)
    index_to_drop_again = non_meal_data_cleaned.isna().sum(axis=1).replace(0,np.nan).dropna().index
    non_meal_data_cleaned = non_meal_data_cleaned.drop(non_meal_data_cleaned.index[index_to_drop_again]).reset_index().drop(columns='index')
    non_meal_data_cleaned['tau_time'] = (24 - non_meal_data_cleaned.iloc[:,0:19].idxmax(axis=1)) * 5
    non_meal_data_cleaned['difference_in_glucose_normalized'] = (non_meal_data_cleaned.iloc[:,0:19].max(axis=1) - non_meal_data_cleaned.iloc[:,24]) / (non_meal_data_cleaned.iloc[:,24])
    power_at_first_max = []
    index_of_first_max = []
    power_at_second_max = []
    index_of_second_max = []
    for i in range(len(non_meal_data_cleaned)):
        array = abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array = abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
        power_at_first_max.append(sorted_array[-2])
        power_at_second_max.append(sorted_array[-3])
        index_of_first_max.append(array.index(sorted_array[-2]))
        index_of_second_max.append(array.index(sorted_array[-3]))
    first_differential = []
    second_differential = []
    for i in range(len(non_meal_data_cleaned)):
        first_differential.append(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist()).max())
        second_differential.append(np.diff(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist())).max())
    no_meal_feature_matrix = pd.DataFrame({
        'tau_time': non_meal_data_cleaned['tau_time'],
        'difference_in_glucose_normalized': non_meal_data_cleaned['difference_in_glucose_normalized'],
        'power_at_first_max': power_at_first_max,
        'power_at_second_max': power_at_second_max,
        'index_of_first_max': index_of_first_max,
        'index_of_second_max': index_of_second_max,
        'first_differential': first_differential,
        'second_differential': second_differential,
    })
    return no_meal_feature_matrix

In [31]:
non_meal_feature_matrix1 = create_no_meal_feature_matrix(no_meal_data1)
non_meal_feature_matrix2 = create_no_meal_feature_matrix(no_meal_data2)
non_meal_feature_matrix = pd.concat([non_meal_feature_matrix1, non_meal_feature_matrix2]).reset_index().drop(columns='index')

In [32]:
meal_feature_matrix['label'] = 1
non_meal_feature_matrix['label'] = 0
total_data = pd.concat([meal_feature_matrix, non_meal_feature_matrix]).reset_index().drop(columns='index')
dataset = shuffle(total_data, random_state=1).reset_index().drop(columns='index')
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
principaldata = dataset.drop(columns='label')
accuracy = []
precision = []
recall = []
f1score = []
model = DecisionTreeClassifier(criterion="entropy")
for train_index, test_index in kfold.split(principaldata):
    data_train, data_test, label_train, label_test = principaldata.loc[train_index],principaldata.loc[test_index], dataset.label.loc[train_index],dataset.label.loc[test_index]
    model.fit(data_train, label_train)
    predictions = model.predict(data_test)
    accuracy.append(accuracy_score(label_test, predictions)*100)
    precision.append(precision_score(label_test, predictions)*100)
    recall.append(recall_score(label_test, predictions)*100)
    f1score.append(f1_score(label_test, predictions)*100)
dump(model, 'DecisionTreeClassifier.pickle')

['DecisionTreeClassifier.pickle']

In [33]:
print("Accuracy:", np.mean(accuracy))
print("Precision:", np.mean(precision))
print("Recall:", np.mean(recall))
print("F1 Score:", np.mean(f1score))

Accuracy: 97.55022574476057
Precision: 95.3682799050088
Recall: 95.3727766340643
F1 Score: 95.34630587822393
