In [1]:
import pandas as pd
import numpy as np
import math
from datetime import timedelta
from scipy.fftpack import fft, ifft,rfft
from scipy.stats import entropy
from scipy.stats import iqr
from scipy.signal import periodogram
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from joblib import dump, load
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import cluster
import collections
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
insulin_df=pd.read_csv('InsulinData.csv',low_memory=False,usecols=['Date','Time','BWZ Carb Input (grams)'])
cgm_df=pd.read_csv('CGMData.csv',low_memory=False,usecols=['Date','Time','Sensor Glucose (mg/dL)'])

In [3]:
insulin_df['date_time_stamp']=pd.to_datetime(insulin_df['Date'] + ' ' + insulin_df['Time'])
cgm_df['date_time_stamp']=pd.to_datetime(cgm_df['Date'] + ' ' + cgm_df['Time'])

In [4]:
insulin_df = insulin_df.set_index('date_time_stamp')
valid_timestamps = []
find_carb_inputs = insulin_df.sort_values(by='date_time_stamp', ascending=True).dropna().reset_index()
find_carb_inputs['BWZ Carb Input (grams)'].replace(0.0, np.nan, inplace=True)
find_carb_inputs = find_carb_inputs.dropna().reset_index().drop(columns='index')
ground_truth = []

for i, timestamp in enumerate(find_carb_inputs['date_time_stamp']):
    try:
        time_diff = (find_carb_inputs['date_time_stamp'][i+1] - timestamp).seconds / 60.0
        if find_carb_inputs.loc[i, 'BWZ Carb Input (grams)'] > 0 and time_diff >= 120:
            valid_timestamps.append(timestamp)
            ground_truth.append(find_carb_inputs.loc[find_carb_inputs['date_time_stamp'] == timestamp, 'BWZ Carb Input (grams)'].values[0])
    except KeyError:
        pass
    
meal_data = []
for timestamp in valid_timestamps:
    start = pd.to_datetime(timestamp - timedelta(minutes=30))
    end = pd.to_datetime(timestamp + timedelta(minutes=120))
    date_str = timestamp.date().strftime('%#m/%#d/%Y')
    meal_data.append(cgm_df.loc[cgm_df['Date'] == date_str].set_index('date_time_stamp').between_time(start_time=start.strftime('%#H:%#M:%#S'),end_time=end.strftime('%#H:%#M:%#S'))['Sensor Glucose (mg/dL)'].values.tolist())
meal_data = pd.DataFrame(meal_data)
meal_data = meal_data.iloc[:,0:30]
meal_data['carbs_ground_truth'] = ground_truth

In [5]:
cleaned_data = meal_data.drop(meal_data.isna().sum(axis=1).replace(0, np.nan).dropna().where(lambda x: x > 6).dropna().index).reset_index().drop(columns='index')
cleaned_data = cleaned_data.interpolate(method='linear', axis=1)
index_to_drop = cleaned_data.isna().sum(axis=1).replace(0, np.nan).dropna().index
cleaned_data = cleaned_data.drop(meal_data.index[index_to_drop]).reset_index().drop(columns='index')

In [6]:
min_truth = cleaned_data['carbs_ground_truth'].min()
max_truth = cleaned_data['carbs_ground_truth'].max()
number_of_bins = math.ceil((max_truth - min_truth)/20)
bin_values = []
for i in cleaned_data['carbs_ground_truth'].tolist():
    bin_values.append(int((i-min_truth)/20))
ground_truth = cleaned_data['carbs_ground_truth'].tolist()
cleaned_data = cleaned_data.drop('carbs_ground_truth', axis = 1)

In [7]:
first_differential_max = []
first_differential_min = []
first_differential_avg = []
second_differential_max = []
second_differential_min = []
second_differential_avg = []
entropies = []
iqr_by_row = cleaned_data.apply(iqr, axis=1, nan_policy='omit')
fft_1 = []
fft_2 = []
fft_3 = []
fft_4 = []
fft_5 = []
fft_6 = []
psd1_mean = []
psd2_mean = []
psd3_mean = []

for i in range(len(cleaned_data)):
    fft_results = abs(rfft(cleaned_data.iloc[:, 0:30].iloc[i].values.tolist())).tolist()
    fft_results.sort(reverse=True)
    fft_1.append(fft_results[0])
    fft_2.append(fft_results[1])
    fft_3.append(fft_results[2])
    fft_4.append(fft_results[3])
    fft_5.append(fft_results[4])
    fft_6.append(fft_results[5])
    
for i in range(len(cleaned_data)):
    data = cleaned_data.iloc[:,0:30].iloc[i].values.tolist()
    #Velocity
    first_differential_max.append(np.diff(data).max())
    first_differential_min.append(np.diff(data).min())
    first_differential_avg.append(np.diff(data).sum()/len(np.diff(data)))
    #Acceleration
    second_differential_max.append(np.diff(np.diff(data)).max())
    second_differential_min.append(np.diff(np.diff(data)).min())
    second_differential_avg.append(np.diff(np.diff(data)).sum()/len(np.diff(np.diff(data))))
    #Entropy
    entropies.append(entropy(data, base=2))
    #PSD
    frequencies, psd = periodogram((data))
    psd1_mean.append(np.mean(psd[0:5]))
    psd2_mean.append(np.mean(psd[5:10]))
    psd3_mean.append(np.mean(psd[10:16]))

In [8]:
meal_features = pd.DataFrame({
    'first_differential_max': first_differential_max,
    'first_differential_min': first_differential_min,
    'first_differential_avg': first_differential_avg,
    'second_differential_max': second_differential_max,
    'second_differential_min': second_differential_min,
    'second_differential_avg': second_differential_avg,
    'entropies': entropies,
    'iqr_by_row': iqr_by_row,
    'fft_1': fft_1,
    'fft_2': fft_2,
    'fft_3': fft_3,
    'fft_4': fft_4,
    'fft_5': fft_5,
    'fft_6': fft_6,
    'psd1_mean': psd1_mean,
    'psd2_mean': psd2_mean,
    'psd3_mean': psd3_mean
})
scaler = RobustScaler()
meal_features_scaled = scaler.fit_transform(meal_features)

In [9]:
def calculate_entropy(y_true, y_pred, base = 2):
    contingency_matrix = cluster.contingency_matrix(y_true, y_pred)
    base = e if base is None else base
    Entropy = []
    for i in range(0, len(contingency_matrix)):
        p = contingency_matrix[i,:]
        p = pd.Series(p).value_counts(normalize=True, sort=False)
        Entropy.append((-p/p.sum() * np.log(p/p.sum())/np.log(2)).sum())
    TotalP = sum(contingency_matrix,1);
    WholeEntropy = 0;
    for i in range(0, len(contingency_matrix)):
        p = contingency_matrix[i,:]
        WholeEntropy = WholeEntropy + ((sum(p))/(sum(TotalP)))*Entropy[i]
    return WholeEntropy

In [10]:
def calculate_purity_score(y_true, y_pred):
    contingency_matrix = cluster.contingency_matrix(y_true, y_pred)
    Purity = []
    for i in range(0, len(contingency_matrix)):
        p = contingency_matrix[i,:]
        Purity.append(p.max()/p.sum())
    TotalP = sum(contingency_matrix,1);
    WholePurity = 0;
    for i in range(0, len(contingency_matrix)):
        p = contingency_matrix[i,:]
        WholePurity = WholePurity + ((sum(p))/(sum(TotalP)))*Purity[i]
    return WholePurity

In [19]:
kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(meal_features_scaled)
KMeans_df = pd.DataFrame({'Ground Truth': bin_values})
KMeans_df['KmeanCluster'] = kmeans.labels_
kmean_sse = kmeans.inertia_
kmean_entropy = calculate_entropy(KMeans_df['Ground Truth'], KMeans_df['KmeanCluster'])
kmean_purity = calculate_purity_score(KMeans_df['Ground Truth'], KMeans_df['KmeanCluster'])
labels = kmeans.labels_
n_clusters = len(set(labels))
bin_matrix = np.zeros([6, n_clusters])

for i in range(len(labels)):
    if labels[i] != -1:
        bin_matrix[bin_values[i]][labels[i]]+=1
print("K Means Bin Cluster matrix:\n", bin_matrix)

K Means Bin Cluster matrix:
 [[92.  7.  1. 20. 31.  4.]
 [85.  7.  0. 16. 34.  3.]
 [66.  7.  0. 10. 35.  0.]
 [29.  3.  0.  7. 18.  1.]
 [18.  5.  0.  1.  6.  0.]
 [ 2.  1.  0.  1.  1.  0.]]


In [12]:
dbscan = DBSCAN(eps=1.115, min_samples=3, metric='euclidean')
dbscan_model = dbscan.fit(meal_features_scaled)
DBScan = pd.DataFrame({'Ground Truth': bin_values})
DBScan['DBScan'] = dbscan_model.labels_
cluster_centers = []
for label in np.unique(dbscan_model.labels_):
    if label != -1:
        cluster_centers.append(np.mean(meal_features_scaled[dbscan_model.labels_ == label], axis=0))

# Calculate the sum of squared distances for each cluster
sse_values = []
for i, center in enumerate(cluster_centers):
    sse = np.sum(pairwise_distances(meal_features_scaled[dbscan_model.labels_ == i], [center])**2)
    sse_values.append(sse)
DBScan_sse = np.mean(sse_values)
DBScan_entropy = calculate_entropy(DBScan['Ground Truth'], DBScan['DBScan'])
DBScan_purity = calculate_purity_score(DBScan['Ground Truth'], DBScan['DBScan'])
# Compute bin cluster matrix
labels = dbscan_model.labels_
n_clusters = len(set(labels))
bin_matrix = np.zeros([6, n_clusters])

for i in range(len(labels)):
    if labels[i] != -1:
        bin_matrix[bin_values[i]][labels[i]]+=1
print("DB Scan Bin Cluster matrix:\n", bin_matrix)

DB Scan Bin Cluster matrix:
 [[44.  5.  2.  0.  2.  0.]
 [30.  3.  2.  0.  1.  0.]
 [20.  1.  1.  4.  0.  0.]
 [12.  1.  0.  0.  0.  0.]
 [ 7.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]]


In [18]:
ldct_result = {}
ldct_result['SSE for Kmeans'] =  kmean_sse
ldct_result['SSE for DBSCAN'] =  DBScan_sse
ldct_result['Entropy for Kmeans'] =  kmean_entropy
ldct_result['Entropy for DBSCAN'] =  DBScan_entropy
ldct_result['Purity for K means'] =  kmean_purity
ldct_result['Purity for DBSCAN'] =  DBScan_purity
ldf_result = pd.DataFrame(ldct_result, index=[0])
ldf_result.to_csv('Result.csv',index=False,header=False)
print(ldf_result)

   SSE for Kmeans  SSE for DBSCAN  Entropy for Kmeans  Entropy for DBSCAN  \
0     4449.163556       46.202987            2.448653            2.225344   

   Purity for K means  Purity for DBSCAN  
0            0.564797           0.725338  


In [14]:
print("Ground Truth Cluster Alloted:", bin_values)

Ground Truth Cluster Alloted: [5, 3, 2, 1, 2, 2, 3, 3, 2, 2, 5, 1, 0, 1, 2, 4, 0, 1, 1, 3, 3, 3, 1, 1, 2, 2, 1, 3, 2, 1, 0, 3, 1, 1, 1, 3, 3, 2, 3, 2, 0, 0, 0, 1, 1, 3, 0, 2, 1, 1, 2, 2, 1, 2, 3, 1, 0, 1, 2, 2, 1, 2, 0, 1, 3, 0, 1, 0, 1, 4, 1, 2, 4, 1, 1, 1, 2, 2, 0, 2, 4, 2, 4, 2, 2, 1, 0, 0, 1, 4, 4, 2, 1, 4, 0, 0, 4, 1, 3, 5, 5, 3, 1, 1, 1, 1, 0, 2, 4, 3, 1, 0, 1, 1, 2, 3, 0, 3, 1, 2, 1, 3, 3, 1, 1, 4, 2, 0, 4, 0, 2, 1, 4, 2, 2, 2, 0, 4, 2, 0, 2, 0, 2, 2, 2, 2, 2, 1, 0, 2, 1, 4, 3, 2, 1, 4, 2, 2, 0, 2, 1, 1, 1, 1, 0, 0, 1, 2, 0, 2, 0, 1, 1, 0, 1, 1, 1, 2, 2, 3, 2, 0, 2, 0, 4, 0, 1, 2, 1, 0, 3, 2, 3, 2, 3, 1, 3, 3, 1, 3, 2, 2, 0, 3, 2, 1, 0, 1, 1, 4, 0, 0, 3, 1, 0, 3, 1, 2, 3, 3, 2, 1, 0, 0, 1, 0, 3, 1, 2, 1, 1, 0, 1, 0, 0, 2, 4, 0, 1, 0, 2, 1, 2, 1, 2, 3, 0, 0, 2, 1, 1, 0, 0, 0, 3, 2, 0, 0, 2, 2, 2, 1, 2, 4, 0, 2, 0, 0, 2, 0, 0, 2, 1, 0, 0, 2, 2, 1, 0, 2, 0, 0, 2, 3, 0, 1, 1, 3, 1, 3, 0, 3, 1, 5, 0, 0, 3, 0, 3, 0, 3, 1, 0, 0, 0, 0, 0, 0, 4, 4, 1, 1, 3, 3, 2, 1, 3, 1, 0, 1, 0, 1, 0, 

In [15]:
bins = pd.DataFrame(bin_values)
print("Ground Truth:", bins[0].value_counts().tolist())
print("K Means Cluster Count:", KMeans_df['KmeanCluster'].value_counts().tolist())
print("DBScan Cluster Count:", DBScan['DBScan'].value_counts().tolist())

Ground Truth: [155, 145, 118, 58, 30, 5]
K Means Cluster Count: [292, 125, 55, 30, 8, 1]
DBScan Cluster Count: [375, 113, 10, 5, 5, 3]
