In [33]:
import numpy as np
import pandas as pd
import datetime
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [34]:
# load CGM data
cgm_data_df = pd.read_excel('./CGMData670GPatient3.xlsx', usecols=["Date", "Time", "Sensor Glucose (mg/dL)"])

# datetime formatting
cgm_data_df['DateTime'] = pd.to_datetime(cgm_data_df['Date'].astype(str) + ' ' + cgm_data_df['Time'].astype(str))
cgm_data_df.drop(columns=["Date", "Time"], inplace=True)

# load CGM data
cgm_data_p_df = pd.read_csv('./CGMData.csv', usecols=["Date", "Time", "Sensor Glucose (mg/dL)"])

# datetime formatting
cgm_data_p_df['DateTime'] = pd.to_datetime(cgm_data_p_df['Date'].astype(str) + ' ' + cgm_data_p_df['Time'].astype(str))
cgm_data_p_df.drop(columns=["Date", "Time"], inplace=True)

In [35]:
# load Insulin data
insulin_data_df = pd.read_excel("./InsulinAndMealIntake670GPatient3.xlsx", usecols=["Date","Time","BWZ Carb Input (grams)"])

# datetime formatting
insulin_data_df['DateTime'] = pd.to_datetime(insulin_data_df['Date'].astype(str) + ' ' + insulin_data_df['Time'].astype(str))
insulin_data_df.drop(columns=["Date", "Time"], inplace=True)

# load Insulin data
insulin_data_p_df = pd.read_csv("./InsulinData.csv", usecols=["Date","Time","BWZ Carb Input (grams)"])

# datetime formatting
insulin_data_p_df['DateTime'] = pd.to_datetime(insulin_data_p_df['Date'].astype(str) + ' ' + insulin_data_p_df['Time'].astype(str))
insulin_data_p_df.drop(columns=["Date", "Time"], inplace=True)

In [36]:
# get datetime of all meals
filt = insulin_data_df['BWZ Carb Input (grams)'].notnull() & insulin_data_df['BWZ Carb Input (grams)'] != 0
insulin_all_meal_df = insulin_data_df.loc[filt]["DateTime"].sort_values()

# get datetime of all meals
filt = insulin_data_p_df['BWZ Carb Input (grams)'].notnull() & insulin_data_p_df['BWZ Carb Input (grams)'] != 0
insulin_p_all_meal_df = insulin_data_p_df.loc[filt]["DateTime"].sort_values()

In [37]:
# filter meal times if another meal is not within 2hours
filt = []
for i in range(len(insulin_all_meal_df) - 1):
    td = insulin_all_meal_df.iloc[i+1] - insulin_all_meal_df.iloc[i]
    if td.total_seconds() <= 2*60*60:
        filt.append(False)
    else:
        filt.append(True)
filt.append(True)
insulin_meal_df = insulin_all_meal_df[filt]

# filter meal times if another meal is not within 2hrs
filt = []
for i in range(len(insulin_p_all_meal_df) - 1):
    td = insulin_p_all_meal_df.iloc[i+1] - insulin_p_all_meal_df.iloc[i]
    if td.total_seconds() <= 2*60*60:
        filt.append(False)
    else:
        filt.append(True)
filt.append(True)
insulin_p_meal_df = insulin_p_all_meal_df[filt]


In [38]:
# extracting meal data from CGM data 
cols = list(range(1,31))
meal_data_df = pd.DataFrame()
for i in range(len(insulin_meal_df)):
    ll = insulin_meal_df.iloc[i] - datetime.timedelta(seconds=30*60)
    ul = insulin_meal_df.iloc[i] + datetime.timedelta(seconds=2*60*60)
    filt = (cgm_data_df["DateTime"] >= ll) & (cgm_data_df["DateTime"] < ul)
    filter_cgm_data_df = cgm_data_df[filt]
    if len(filter_cgm_data_df.index) == 30 and filter_cgm_data_df.isnull().values.any() == False:
        filter_cgm_data_df = filter_cgm_data_df.sort_values(by="DateTime").T
        filter_cgm_data_df.drop('DateTime', inplace=True)
        filter_cgm_data_df.reset_index(drop=True, inplace=True)
        filter_cgm_data_df.columns = cols
        meal_data_df = meal_data_df.append(filter_cgm_data_df, ignore_index=True)

meal_data_df = meal_data_df.apply(pd.to_numeric)

# extracting meal data from CGM data
cols = list(range(1,31))
meal_data_p_df = pd.DataFrame()
for i in range(len(insulin_p_meal_df)):
    ll = insulin_p_meal_df.iloc[i] - datetime.timedelta(seconds=30*60)
    ul = insulin_p_meal_df.iloc[i] + datetime.timedelta(seconds=2*60*60)
    filt = (cgm_data_p_df["DateTime"] >= ll) & (cgm_data_p_df["DateTime"] < ul)
    filter_cgm_data_df = cgm_data_p_df[filt]
    if len(filter_cgm_data_df.index) == 30 and filter_cgm_data_df.isnull().values.any() == False:
        filter_cgm_data_df = filter_cgm_data_df.sort_values(by="DateTime").T
        filter_cgm_data_df.drop('DateTime', inplace=True)
        filter_cgm_data_df.reset_index(drop=True, inplace=True)
        filter_cgm_data_df.columns = cols
        meal_data_p_df = meal_data_p_df.append(filter_cgm_data_df, ignore_index=True)

meal_data_p_df = meal_data_p_df.apply(pd.to_numeric)

In [39]:
# extracting meal data from CGM data
cols = list(range(1,25))
meal_data_p_test_df = pd.DataFrame()
for i in range(len(insulin_p_meal_df)):
    ll = insulin_p_meal_df.iloc[i]
    ul = insulin_p_meal_df.iloc[i] + datetime.timedelta(seconds=2*60*60)
    filt = (cgm_data_p_df["DateTime"] >= ll) & (cgm_data_p_df["DateTime"] < ul)
    filter_cgm_data_df = cgm_data_p_df[filt]
    if len(filter_cgm_data_df.index) == 24 and filter_cgm_data_df.isnull().values.any() == False:
        filter_cgm_data_df = filter_cgm_data_df.sort_values(by="DateTime").T
        filter_cgm_data_df.drop('DateTime', inplace=True)
        filter_cgm_data_df.reset_index(drop=True, inplace=True)
        filter_cgm_data_df.columns = cols
        meal_data_p_test_df = meal_data_p_test_df.append(filter_cgm_data_df, ignore_index=True)

meal_data_p_test_df = meal_data_p_test_df.apply(pd.to_numeric)

In [40]:
# filter no meal times by checking if another meal is not happening within 4hours, and then storing all start times within 2hours
no_meal_times = []
for i in range(len(insulin_all_meal_df) - 1):
    td = insulin_all_meal_df.iloc[i+1] - insulin_all_meal_df.iloc[i]
    if td.total_seconds() > 4*60*60:
        tc = insulin_all_meal_df.iloc[i] + datetime.timedelta(seconds=2*60*60)
        while (insulin_all_meal_df.iloc[i+1] - tc).total_seconds() > 2*60*60:
            no_meal_times.append(tc)
            tc = tc + datetime.timedelta(seconds=2*60*60)

# filter no meal times by checking if another meal is not happening within 4hours, and then storing all start times within 2hours
no_meal_times_p = []
for i in range(len(insulin_p_all_meal_df) - 1):
    td = insulin_p_all_meal_df.iloc[i+1] - insulin_p_all_meal_df.iloc[i]
    if td.total_seconds() > 4*60*60:
        tc = insulin_p_all_meal_df.iloc[i] + datetime.timedelta(seconds=2*60*60)
        while (insulin_p_all_meal_df.iloc[i+1] - tc).total_seconds() > 2*60*60:
            no_meal_times_p.append(tc)
            tc = tc + datetime.timedelta(seconds=2*60*60)

In [41]:
# extracting no meal data from CGM data
cols = list(range(1,25))
no_meal_data_df = pd.DataFrame()
for i in range(len(no_meal_times)):
    lb = no_meal_times[i]
    ub = no_meal_times[i] + datetime.timedelta(seconds=2*60*60)
    filt = (cgm_data_df["DateTime"] >= lb) & (cgm_data_df["DateTime"] < ub)
    filter_cgm_data_df = cgm_data_df[filt]
    if len(filter_cgm_data_df.index) == 24 and filter_cgm_data_df.isnull().values.any() == False:
        filter_cgm_data_df = filter_cgm_data_df.sort_values(by="DateTime")
        filter_cgm_data_df = filter_cgm_data_df.T
        filter_cgm_data_df.drop('DateTime', inplace=True)
        filter_cgm_data_df.reset_index(drop=True, inplace=True)
        filter_cgm_data_df.columns = cols
        no_meal_data_df = no_meal_data_df.append(filter_cgm_data_df, ignore_index=True)

no_meal_data_df = no_meal_data_df.apply(pd.to_numeric)

# extracting no meal data from CGM data
cols = list(range(1,25))
no_meal_data_p_df = pd.DataFrame()
for i in range(len(no_meal_times_p)):
    lb = no_meal_times_p[i]
    ub = no_meal_times_p[i] + datetime.timedelta(seconds=2*60*60)
    filt = (cgm_data_p_df["DateTime"] >= lb) & (cgm_data_p_df["DateTime"] < ub)
    filter_cgm_data_df = cgm_data_p_df[filt]
    if len(filter_cgm_data_df.index) == 24 and filter_cgm_data_df.isnull().values.any() == False:
        filter_cgm_data_df = filter_cgm_data_df.sort_values(by="DateTime")
        filter_cgm_data_df = filter_cgm_data_df.T
        filter_cgm_data_df.drop('DateTime', inplace=True)
        filter_cgm_data_df.reset_index(drop=True, inplace=True)
        filter_cgm_data_df.columns = cols
        no_meal_data_p_df = no_meal_data_p_df.append(filter_cgm_data_df, ignore_index=True)

no_meal_data_p_df = no_meal_data_p_df.apply(pd.to_numeric)

In [42]:
total_data_X = np.concatenate((meal_data_p_test_df.to_numpy(), no_meal_data_p_df.to_numpy()), axis=0)
mY = np.ones(meal_data_p_test_df.shape[0])
nmY = np.zeros(no_meal_data_p_df.shape[0])
total_data_Y = np.concatenate((mY, nmY), axis=0)
np.savetxt("test.csv", total_data_X, delimiter=",", fmt="%10.2f")
np.savetxt("testResult.csv", total_data_Y, delimiter=",", fmt="%d")

In [43]:
# Feature extraction
# tmax - tm
# CGM_max - CGM_min
# max CGM velocity, time at which the velocity is max
# FTT - half sinusoidal - get two most dominant frequency buckets
# windowed mean - window size = 6: you will get 4 and 5 means
# take middle 5 means - window size = 3

def absorption_time(row):
    if row.size == 30:
        newrow = row.iloc[6:30]
        return 5*int(newrow.idxmax(skipna = True))
    else:
        return 5*int(row.idxmax(skipna = True))

def CGM_max_velocity(row):
    vmax = None
    vmaxtime = None
    for i in range(row.size):
        if i == 0:
            v = (row.iloc[i+1] - row.iloc[i])/5
        elif i == row.size-1:
            v = (row.iloc[i] - row.iloc[i-1])/5
        else:
            v = (row.iloc[i+1] - row.iloc[i-1])/10
        if vmax == None or v > vmax:
            vmax = v
            vmaxtime = i*5
    return (vmax, vmaxtime)

def CGM_FFT(row):
    sp = np.fft.fft(row)
    power = np.square(sp.real) + np.square(sp.imag)
    freq = np.fft.fftfreq(row.size, d=300)
    mp = 0
    mp2 = 0
    mpi = None
    mp2i = None
    for i in range(1,row.size):
        p = power[i]
        f = freq[i]
        
        if p > mp:
            mp2 = mp
            mp2i = mpi
            mp = p
            mpi = f
        elif p > mp2:
            mp2 = p
            mp2i = f
            
    return (mpi, mp2i)

# take mean of middle 5 windows of 3 length
def windowed_mean(row):
    if row.size == 30:
        newrow = row.iloc[7:22]
    else:
        newrow = row.iloc[4:19]
    avgs = []
    for i in range(5):
        m = (newrow.iloc[i*3] + newrow.iloc[i*3 + 1] + newrow.iloc[i*3 + 2])/3
        avgs.append(m)
    return (avgs[0], avgs[1], avgs[2], avgs[3], avgs[4])


def extract_data(input_df):
    out_df = pd.DataFrame()
    out_df['absorption_time (mins)'] = input_df.apply(lambda row: absorption_time(row), axis=1)
    out_df['CGM_range'] = input_df.apply(lambda row: row.max() - row.min(), axis=1)

    cgm_velocity_data = input_df.apply(lambda row: CGM_max_velocity(row), axis=1)
    cgm_max_vel,cgm_max_vel_time = list(zip(*cgm_velocity_data))
    out_df['CGM_max_vel'] = cgm_max_vel
    out_df['CGM_max_vel_time'] = cgm_max_vel_time

    cgm_fft_data = input_df.apply(lambda row: CGM_FFT(row), axis=1)
    cgm_max_freq,cgm_max2_freq = list(zip(*cgm_fft_data))
    out_df['CGM_max_freq'] = cgm_max_freq
    out_df['CGM_max2_freq'] = cgm_max2_freq

    cgm_wm_data = input_df.apply(lambda row: windowed_mean(row), axis=1)
    cgm_wm1,cgm_wm2,cgm_wm3,cgm_wm4,cgm_wm5 = list(zip(*cgm_wm_data))
    out_df['CGM_wm1'] = cgm_wm1
    out_df['CGM_wm2'] = cgm_wm2
    out_df['CGM_wm3'] = cgm_wm3
    out_df['CGM_wm4'] = cgm_wm4
    out_df['CGM_wm5'] = cgm_wm5
    
    return out_df

meal_data_ext_df = extract_data(meal_data_df)
no_meal_data_ext_df = extract_data(no_meal_data_df)
meal_data_p_ext_df = extract_data(meal_data_p_df)
no_meal_data_p_ext_df = extract_data(no_meal_data_p_df)

In [44]:
# training SVM
total_data_X = np.concatenate((meal_data_ext_df.to_numpy(), no_meal_data_ext_df.to_numpy(), meal_data_p_ext_df.to_numpy(), no_meal_data_p_ext_df.to_numpy()), axis=0)
mY = np.ones(meal_data_ext_df.shape[0])
nmY = np.zeros(no_meal_data_ext_df.shape[0])
mpY = np.ones(meal_data_p_ext_df.shape[0])
nmpY = np.zeros(no_meal_data_p_ext_df.shape[0])
total_data_Y = np.concatenate((mY, nmY, mpY, nmpY), axis=0)

x_train,x_test,y_train,y_test = train_test_split(total_data_X,total_data_Y,test_size=0.2,random_state=123)

clf = svm.SVC()
clf.fit(x_train, y_train)

filename = 'finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [45]:
yhat = clf.predict(x_test)
# evaluate predictions
acc = accuracy_score(y_test, yhat)

In [46]:
acc

0.7315689981096408

In [47]:
yhat = np.loadtxt('Result.csv')
y_test = np.loadtxt('testResult.csv')
acc = accuracy_score(y_test, yhat)

In [48]:
acc

0.7

In [49]:
true_file = pd.read_csv(r'/Users/tanujsingh/Desktop/FALL 2021/CSE 572 - DM/Assignment2/assignment 2/Result.csv')
my_file = pd.read_csv(r'Result.csv')