In [78]:
# load in the required libraries
import os
import pandas as pd
import numpy as np
from itertools import combinations
from math import factorial

In [79]:
# USER SETTINGS:
# minimum number of duplicates needed to say that the uploads having duplicate cgm data
minThreshold = 96  # 288

# define results dataframe:
results = pd.DataFrame(columns=["uploadId_A", "uploadId_B",
                                "nDuplicates",
                                "averageTimeDifference",
                                "startIndex_A", "endIndex_A",
                                "startIndex_B", "endIndex_B"])

In [80]:
# load in an exmaple dataset
hashID = "0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15"
dataPath = os.path.join(".", "data")
data = pd.read_csv(os.path.join(dataPath, hashID + ".csv"), low_memory=False)
# work with just the cgm data (though this can be modified for pump data)
cgm = data.loc[data.type == "cbg", ["deviceTime", "time", "value", "uploadId"]]
# convert mmol/L to mg/dL
cgm = cgm.rename(columns={"value": "mmol_L"})
cgm["mg_dL"] = (cgm["mmol_L"] * 18.01559).astype(int)
cgm.head()

Unnamed: 0,deviceTime,time,mmol_L,uploadId,mg_dL
0,2015-09-20T12:54:49,2015-09-20T19:57:51.000Z,10.046854,upid_3c41703c2d3a8b97f479afdb6ccf799f,180
1,2015-09-20T12:54:49,2015-09-20T19:57:51.000Z,10.046854,upid_3fc32e5ad912a8ea7efced9151804bdb,180
2,2015-09-20T12:59:49,2015-09-20T20:02:51.000Z,10.102361,upid_3c41703c2d3a8b97f479afdb6ccf799f,182
3,2015-09-20T12:59:49,2015-09-20T20:02:51.000Z,10.102361,upid_3fc32e5ad912a8ea7efced9151804bdb,182
4,2015-09-20T13:04:49,2015-09-20T20:07:51.000Z,10.768451,upid_3c41703c2d3a8b97f479afdb6ccf799f,193


In [81]:
# round UTC time (i.e., time) to the nearest 30 seconds to deal with rounding errors
cgm["roundedTime30sec"] = pd.to_datetime(cgm["time"]).dt.round("30S")
# and then round to nearest 5 minutes
ns5min=5*60*1E9
cgm["roundedTime5min"] = \
    pd.to_datetime((pd.to_datetime(cgm["roundedTime30sec"]).astype(np.int64) // ns5min + 1) * ns5min)
cgm.head()


Unnamed: 0,deviceTime,time,mmol_L,uploadId,mg_dL,roundedTime30sec,roundedTime5min
0,2015-09-20T12:54:49,2015-09-20T19:57:51.000Z,10.046854,upid_3c41703c2d3a8b97f479afdb6ccf799f,180,2015-09-20 19:58:00,2015-09-20 20:00:00
1,2015-09-20T12:54:49,2015-09-20T19:57:51.000Z,10.046854,upid_3fc32e5ad912a8ea7efced9151804bdb,180,2015-09-20 19:58:00,2015-09-20 20:00:00
2,2015-09-20T12:59:49,2015-09-20T20:02:51.000Z,10.102361,upid_3c41703c2d3a8b97f479afdb6ccf799f,182,2015-09-20 20:03:00,2015-09-20 20:05:00
3,2015-09-20T12:59:49,2015-09-20T20:02:51.000Z,10.102361,upid_3fc32e5ad912a8ea7efced9151804bdb,182,2015-09-20 20:03:00,2015-09-20 20:05:00
4,2015-09-20T13:04:49,2015-09-20T20:07:51.000Z,10.768451,upid_3c41703c2d3a8b97f479afdb6ccf799f,193,2015-09-20 20:08:00,2015-09-20 20:10:00


In [82]:
# get a count of all the unique uploadIDs
uniqueUploads = pd.DataFrame(cgm.groupby(by="uploadId").mg_dL.count())
uniqueUploads = uniqueUploads.rename(columns={"mg_dL":"counts"})
uniqueUploads = uniqueUploads[uniqueUploads.counts > minThreshold].reset_index()
print("there are %d unique uploadIds" % len(uniqueUploads))
uniqueUploads.head()

there are 302 unique uploadIds


Unnamed: 0,uploadId,counts
0,upid_008d3670b5d22f7c8c9224c7e3e2cb6b,288
1,upid_0147bc4243e2e3265136ea742707e226,211
2,upid_0165f5fecabcc9afe71634ed8ec516bb,285
3,upid_02905a123ef649c17aa9f2fced2f1f68,288
4,upid_0398d37fae900252fb8fcd0a504986f9,1000


In [83]:
# calculate the number of unique combinations of uploadIds
combos = combinations(uniqueUploads.uploadId, 2)
nCombos = int(factorial(len(uniqueUploads))/(factorial(2) * factorial(len(uniqueUploads) - 2)))
print("there are %d unique combinations...oh my" % nCombos)

there are 45451 unique combinations...oh my


Given that there are so many unique combinations, we will only show one combination; however, you could loop through the rest of the code with the following:

In [84]:
# for cIndex, combo in enumerate(list(combos)):
    # assign the uploadId with the largest time span to A, and shortest to B
    #        duration0 = pd.to_datetime(cgm[cgm.uploadId == combo[0]].time.max()) - \
    #          pd.to_datetime(cgm[cgm.uploadId == combo[0]].time.min())
    #        duration1 = pd.to_datetime(cgm[cgm.uploadId == combo[1]].time.max()) - \
    #          pd.to_datetime(cgm[cgm.uploadId == combo[1]].time.min())
    #        if duration0 >= duration1:
    #            results.loc[cIndex, ["uploadId_A"]] = combo[0]
    #            results.loc[cIndex, ["span_uploadId_A"]] = duration0
    #            results.loc[cIndex, ["n_uploadId_A"]] = \
    #                uniqueUploads[uniqueUploads.uploadId == combo[0]].counts.values[0]
    #            results.loc[cIndex, ["uploadId_B"]] = combo[1]
    #            results.loc[cIndex, ["span_uploadId_B"]] = duration1
    #            results.loc[cIndex, ["n_uploadId_B"]] = \
    #                uniqueUploads[uniqueUploads.uploadId == combo[1]].counts.values[0]
    #        else:
    #            results.loc[cIndex, ["uploadId_A"]] = combo[1]
    #            results.loc[cIndex, ["span_uploadId_A"]] = duration1
    #            results.loc[cIndex, ["n_uploadId_A"]] = \
    #                uniqueUploads[uniqueUploads.uploadId == combo[1]].counts.values[0]
    #            results.loc[cIndex, ["uploadId_B"]] = combo[0]
    #            results.loc[cIndex, ["span_uploadId_B"]] = duration0
    #            results.loc[cIndex, ["n_uploadId_B"]] = \
    #                uniqueUploads[uniqueUploads.uploadId == combo[0]].counts.values[0]
    #
    #        uploadId_A = results.loc[cIndex, "uploadId_A"]
    #        uploadId_B = results.loc[cIndex, "uploadId_B"]
uploadId_A = "upid_ff6bf4b6fde9c9bc45bb211de131d225"
uploadId_B = "upid_12164f5817e09ab7bffb439d8c260131"

In [85]:
# get the cgm data of these two time series, and
# keep track of the original index
cgm_A = cgm[cgm.uploadId == uploadId_A].reset_index().rename(columns={"index":"originalIndex"})
cgm_B = cgm[cgm.uploadId == uploadId_B].reset_index().rename(columns={"index":"originalIndex"})
cgm_A.head()

Unnamed: 0,originalIndex,deviceTime,time,mmol_L,uploadId,mg_dL,roundedTime30sec,roundedTime5min
0,28728,2016-01-21T11:54:48,2016-01-21T19:57:24.000Z,5.606255,upid_ff6bf4b6fde9c9bc45bb211de131d225,101,2016-01-21 19:57:30,2016-01-21 20:00:00
1,28730,2016-01-21T11:59:49,2016-01-21T20:02:24.000Z,6.216838,upid_ff6bf4b6fde9c9bc45bb211de131d225,112,2016-01-21 20:02:30,2016-01-21 20:05:00
2,28732,2016-01-21T12:04:48,2016-01-21T20:07:24.000Z,7.04945,upid_ff6bf4b6fde9c9bc45bb211de131d225,127,2016-01-21 20:07:30,2016-01-21 20:10:00
3,28735,2016-01-21T12:09:48,2016-01-21T20:12:24.000Z,7.549017,upid_ff6bf4b6fde9c9bc45bb211de131d225,136,2016-01-21 20:12:30,2016-01-21 20:15:00
4,28736,2016-01-21T12:14:48,2016-01-21T20:17:24.000Z,7.604525,upid_ff6bf4b6fde9c9bc45bb211de131d225,136,2016-01-21 20:17:30,2016-01-21 20:20:00


In [88]:
# create a continguous time series from the first to last data point
# first for A
contiguousBeginDateTime_A = min(cgm_A.roundedTime5min)
contiguousEndDateTime_A = max(cgm_A.roundedTime5min)
rng_A = pd.date_range(contiguousBeginDateTime_A, contiguousEndDateTime_A, freq="5min")
contiguousData_A = pd.DataFrame(rng_A, columns=["dateTime"])
# then for B
contiguousBeginDateTime_B = min(cgm_B.roundedTime5min)
contiguousEndDateTime_B = max(cgm_B.roundedTime5min)
rng_B = pd.date_range(contiguousBeginDateTime_B, contiguousEndDateTime_B, freq="5min")
contiguousData_B = pd.DataFrame(rng_B, columns=["dateTime"])

# merge cgm data with contiguous time series
contiguousData_A = pd.merge(contiguousData_A, cgm_A,
                            left_on="dateTime", right_on="roundedTime5min",
                            how="left")
contiguousData_B = pd.merge(contiguousData_B, cgm_B,
                            left_on="dateTime", right_on="roundedTime5min",
                            how="left")

In [89]:
contiguousData_A

Unnamed: 0,dateTime,originalIndex,deviceTime,time,mmol_L,uploadId,mg_dL,roundedTime30sec,roundedTime5min
0,2016-01-21 20:00:00,28728.0,2016-01-21T11:54:48,2016-01-21T19:57:24.000Z,5.606255,upid_ff6bf4b6fde9c9bc45bb211de131d225,101.0,2016-01-21 19:57:30,2016-01-21 20:00:00
1,2016-01-21 20:05:00,28730.0,2016-01-21T11:59:49,2016-01-21T20:02:24.000Z,6.216838,upid_ff6bf4b6fde9c9bc45bb211de131d225,112.0,2016-01-21 20:02:30,2016-01-21 20:05:00
2,2016-01-21 20:10:00,28732.0,2016-01-21T12:04:48,2016-01-21T20:07:24.000Z,7.049450,upid_ff6bf4b6fde9c9bc45bb211de131d225,127.0,2016-01-21 20:07:30,2016-01-21 20:10:00
3,2016-01-21 20:15:00,28735.0,2016-01-21T12:09:48,2016-01-21T20:12:24.000Z,7.549017,upid_ff6bf4b6fde9c9bc45bb211de131d225,136.0,2016-01-21 20:12:30,2016-01-21 20:15:00
4,2016-01-21 20:20:00,28736.0,2016-01-21T12:14:48,2016-01-21T20:17:24.000Z,7.604525,upid_ff6bf4b6fde9c9bc45bb211de131d225,136.0,2016-01-21 20:17:30,2016-01-21 20:20:00
5,2016-01-21 20:25:00,28738.0,2016-01-21T12:19:48,2016-01-21T20:22:24.000Z,6.993942,upid_ff6bf4b6fde9c9bc45bb211de131d225,125.0,2016-01-21 20:22:30,2016-01-21 20:25:00
6,2016-01-21 20:30:00,28740.0,2016-01-21T12:24:49,2016-01-21T20:27:24.000Z,7.382495,upid_ff6bf4b6fde9c9bc45bb211de131d225,133.0,2016-01-21 20:27:30,2016-01-21 20:30:00
7,2016-01-21 20:35:00,28742.0,2016-01-21T12:29:49,2016-01-21T20:32:24.000Z,7.271480,upid_ff6bf4b6fde9c9bc45bb211de131d225,131.0,2016-01-21 20:32:30,2016-01-21 20:35:00
8,2016-01-21 20:40:00,28745.0,2016-01-21T12:34:49,2016-01-21T20:37:24.000Z,7.215972,upid_ff6bf4b6fde9c9bc45bb211de131d225,129.0,2016-01-21 20:37:30,2016-01-21 20:40:00
9,2016-01-21 20:45:00,28746.0,2016-01-21T12:39:48,2016-01-21T20:42:24.000Z,7.326987,upid_ff6bf4b6fde9c9bc45bb211de131d225,131.0,2016-01-21 20:42:30,2016-01-21 20:45:00


In [90]:
# TL is the longer time series, and Ts is the shorter
TL = np.array(contiguousData_A.mg_dL)
Ts = np.array(contiguousData_B.mg_dL)

print(TL)
print(Ts)

[101. 112. 127. ...  78.  78.  78.]
[207.  nan 196. ...  nan  nan 233.]


In [91]:
# add NaNs to the beginning and end of TL, which is necessary
# for covering all possible duplicate locations
# (ask Ed to share a picture that describes this)
addNaNs = np.repeat(np.nan, len(Ts) - minThreshold)
n_addNaNs = len(addNaNs)

TL = np.append(addNaNs, TL)
TL = np.append(TL, addNaNs)
print(TL)

[nan nan nan ... nan nan nan]


In [96]:
# shift Ts over TL to see if there are any duplicate sections
indices = list(range(n_addNaNs, len(TL) - len(Ts)))
indices.extend(range(0, n_addNaNs))
combined = pd.DataFrame()
j = -1
while ((len(combined) == 0) | (j < len(indices))):
    j += 1 
    i = indices[j]

    # calculate the arry-element difference between the mg/dL values at each shift
    tempDiff = TL[i:(len(Ts) + i)] - Ts
    # calculate how many times the difference is zero
    nZeros = sum(tempDiff == 0)
    if nZeros >= minThreshold:
        nDuplicates = nZeros
        # "case 1, Ts before TL"
        if (i - n_addNaNs) < 0:
            dupTs = contiguousData_B[(n_addNaNs - i):]
            dupTL = contiguousData_A[:(len(dupTs))]
        else:
            # "case 2, Ts within TL"
            if (i + len(Ts)) < len(TL):
                dupTs = contiguousData_B
                dupTL = contiguousData_A[(i - n_addNaNs):((i - n_addNaNs) + (len(dupTs)))]
            else:
                # "case 3, Ts extends TL"
                dupTL = contiguousData_A[(i - n_addNaNs):(n_addNaNs + len(TL))]
                dupTs = contiguousData_B[:(len(dupTL))]
        # combine the results together
        combined = \
            pd.concat([dupTL.reset_index(drop=True).add_suffix(".TL"),
                       dupTs.reset_index(drop=True).add_suffix(".Ts")], axis=1)
      

IndexError: list index out of range

In [94]:
combined.head()

Unnamed: 0,dateTime.TL,originalIndex.TL,deviceTime.TL,time.TL,mmol_L.TL,uploadId.TL,mg_dL.TL,roundedTime30sec.TL,roundedTime5min.TL,dateTime.Ts,originalIndex.Ts,deviceTime.Ts,time.Ts,mmol_L.Ts,uploadId.Ts,mg_dL.Ts,roundedTime30sec.Ts,roundedTime5min.Ts
0,2016-01-22 11:05:00,29090.0,2016-01-22T02:59:46,2016-01-22T11:02:23.000Z,11.490048,upid_ff6bf4b6fde9c9bc45bb211de131d225,207.0,2016-01-22 11:02:30,2016-01-22 11:05:00,2016-01-22 11:05:00,29091.0,2016-01-22T02:59:46,2016-01-22T11:02:23.000Z,11.490048,upid_12164f5817e09ab7bffb439d8c260131,207.0,2016-01-22 11:02:30,2016-01-22 11:05:00
1,2016-01-22 11:10:00,29093.0,2016-01-22T03:04:46,2016-01-22T11:07:23.000Z,10.657436,upid_ff6bf4b6fde9c9bc45bb211de131d225,191.0,2016-01-22 11:07:30,2016-01-22 11:10:00,2016-01-22 11:10:00,,,,,,,NaT,NaT
2,2016-01-22 11:15:00,29095.0,2016-01-22T03:09:46,2016-01-22T11:12:22.000Z,10.934974,upid_ff6bf4b6fde9c9bc45bb211de131d225,196.0,2016-01-22 11:12:30,2016-01-22 11:15:00,2016-01-22 11:15:00,29094.0,2016-01-22T03:09:46,2016-01-22T11:12:22.000Z,10.934974,upid_12164f5817e09ab7bffb439d8c260131,196.0,2016-01-22 11:12:30,2016-01-22 11:15:00
3,2016-01-22 11:20:00,29096.0,2016-01-22T03:14:46,2016-01-22T11:17:22.000Z,9.935839,upid_ff6bf4b6fde9c9bc45bb211de131d225,178.0,2016-01-22 11:17:30,2016-01-22 11:20:00,2016-01-22 11:20:00,29097.0,2016-01-22T03:14:46,2016-01-22T11:17:22.000Z,9.935839,upid_12164f5817e09ab7bffb439d8c260131,178.0,2016-01-22 11:17:30,2016-01-22 11:20:00
4,2016-01-22 11:25:00,29098.0,2016-01-22T03:19:46,2016-01-22T11:22:22.000Z,8.714674,upid_ff6bf4b6fde9c9bc45bb211de131d225,157.0,2016-01-22 11:22:30,2016-01-22 11:25:00,2016-01-22 11:25:00,29099.0,2016-01-22T03:19:46,2016-01-22T11:22:22.000Z,8.714674,upid_12164f5817e09ab7bffb439d8c260131,157.0,2016-01-22 11:22:30,2016-01-22 11:25:00


In [95]:
# get stats on the duplicates (if they exist)
if len(combined) > 0:
    cIndex = 0
    results.loc[cIndex, ["uploadId_A"]] = uploadId_A
    results.loc[cIndex, ["uploadId_B"]] = uploadId_B
    results.loc[cIndex, ["nDuplicates"]] = nDuplicates
    results.loc[cIndex, ["startIndex_A"]] = \
        combined.loc[combined["mg_dL.TL"].notnull(), "originalIndex.TL"].min()
    results.loc[cIndex, ["endIndex_A"]] = \
        combined.loc[combined["mg_dL.TL"].notnull(), "originalIndex.TL"].max()

    results.loc[cIndex, ["startIndex_B"]] = \
        combined.loc[combined["mg_dL.Ts"].notnull(), "originalIndex.Ts"].min()
    results.loc[cIndex, ["endIndex_B"]] = \
        combined.loc[combined["mg_dL.Ts"].notnull(), "originalIndex.Ts"].max()

    cTimeDifference = pd.to_datetime(combined["time.TL"]) - \
                        pd.to_datetime(combined["time.Ts"])

    averageTimeDifference = cTimeDifference.dt.seconds.mean()
    results.loc[cIndex, ["averageTimeDifference"]] = averageTimeDifference
    
results.head()

Unnamed: 0,uploadId_A,uploadId_B,nDuplicates,averageTimeDifference,startIndex_A,endIndex_A,startIndex_B,endIndex_B
0,upid_ff6bf4b6fde9c9bc45bb211de131d225,upid_12164f5817e09ab7bffb439d8c260131,5,0,29090,30212,29091,30213


NameError: name 'combined' is not defined

Unnamed: 0,dateTime,originalIndex,deviceTime,time,mmol_L,uploadId,mg_dL,roundedTime30sec,roundedTime5min
0,2016-01-21 20:00:00,28728.0,2016-01-21T11:54:48,2016-01-21T19:57:24.000Z,5.606255,upid_ff6bf4b6fde9c9bc45bb211de131d225,101.0,2016-01-21 19:57:30,2016-01-21 20:00:00
1,2016-01-21 20:05:00,28730.0,2016-01-21T11:59:49,2016-01-21T20:02:24.000Z,6.216838,upid_ff6bf4b6fde9c9bc45bb211de131d225,112.0,2016-01-21 20:02:30,2016-01-21 20:05:00
2,2016-01-21 20:10:00,28732.0,2016-01-21T12:04:48,2016-01-21T20:07:24.000Z,7.04945,upid_ff6bf4b6fde9c9bc45bb211de131d225,127.0,2016-01-21 20:07:30,2016-01-21 20:10:00
3,2016-01-21 20:15:00,28735.0,2016-01-21T12:09:48,2016-01-21T20:12:24.000Z,7.549017,upid_ff6bf4b6fde9c9bc45bb211de131d225,136.0,2016-01-21 20:12:30,2016-01-21 20:15:00
4,2016-01-21 20:20:00,28736.0,2016-01-21T12:14:48,2016-01-21T20:17:24.000Z,7.604525,upid_ff6bf4b6fde9c9bc45bb211de131d225,136.0,2016-01-21 20:17:30,2016-01-21 20:20:00
