In [None]:
def plotIndividuals(data, ids):
    """Plot the Hb time series of selected donors, one subplot per donor.

    The subplots are laid out on a roughly square grid. Every measurement
    time is marked with a thin grey vertical line, and a red horizontal
    threshold line is added for donors found in the module-level
    collections ``id_f`` (at 7.8) or ``id_m`` (at 8.4).
    NOTE(review): ``id_f`` / ``id_m`` are defined outside this function;
    presumably the ids of female / male donors -- confirm.

    In:  data: dataframe with donation data; the columns 'KeyID',
               'TimeSinceFirst' and 'Hb' are read
         ids : list of donor ids that should be plotted
    Out: nothing, shows plots
    """
    n = len(ids)
    if n == 0:
        # Nothing to draw; the grid computation below would divide by zero.
        return

    # Grid with `a` rows and `b` columns, a * b >= n.
    a = int(np.floor(n ** 0.5))
    b = int(np.ceil(n / float(a)))

    fig = plt.figure(figsize=(5. * b, 3. * a))

    for i, s_id in enumerate(ids, start=1):
        df_sub = data.loc[data['KeyID'] == s_id, :]
        ax = fig.add_subplot(a, b, i)
        ax.plot(df_sub['TimeSinceFirst'], df_sub['Hb'])
        # Mark each individual measurement moment.
        for t in df_sub['TimeSinceFirst']:
            ax.axvline(t, color='grey', lw=0.8)
        ax.set_title('Donor ' + str(s_id))
        ax.set_ylim(6, 12)
        ax.set_xlabel('Days since first measurement')
        ax.set_ylabel('Hb Value')
        if s_id in id_f:
            ax.axhline(y=7.8, color='red')
        elif s_id in id_m:
            ax.axhline(y=8.4, color='red')

    plt.tight_layout()

In [None]:
def pad(seq, target_length, padding=None):
    """Extend `seq` in place with `padding` up to `target_length` items.

    A sequence that is already at least `target_length` long is left
    untouched. The very same (mutated) sequence object is returned.
    """
    shortfall = target_length - len(seq)
    # A non-positive shortfall yields an empty filler list.
    filler = [padding] * shortfall
    seq.extend(filler)
    return seq

def makeSeriesDf(data, ids, interval, xmax):
    """ Turns a long-format dataframe into one with subjects as rows and 
        measurements by time in the columns.

        For every donor the 'Hb' values are averaged per bin of `interval`
        days (bins are formed by resampling on 'TimeSinceFirst'), gaps
        between bins are linearly interpolated, and the resulting series is
        truncated or padded with None to the number of columns.
        
        Args:
        data     : pandas dataframe in long-format; the columns 'KeyID',
                   'TimeSinceFirst' (in days) and 'Hb' are read
        ids      : donors to be selected
        interval : the desired interval to sample at (in days)
        xmax     : the maximum time to be considered (in days)
        
        Returns:
        y: dataframe with subjects as rows and measurements in columns;
           the columns are range(0, xmax, interval)
    """
    y = pd.DataFrame(columns=list(range(0, xmax, interval)))
    n_cols = y.shape[1]
    step = pd.Timedelta(days=interval)
    start = datetime.datetime.now()
    counter = 0

    for d_id in ids:
        # Use the dataframe that was passed in, not a module-level global.
        df_sub = data.loc[data['KeyID'] == d_id, ['TimeSinceFirst', 'Hb']].dropna()
        df_sub['TimeSinceFirst'] = pd.to_timedelta(df_sub['TimeSinceFirst'], unit='D')
        # Sample at the requested interval so the bins line up with the columns.
        df_sub = df_sub.resample(step, on='TimeSinceFirst').mean().interpolate()

        # Drop bins beyond xmax and fill short series up to the row width;
        # a row of any other length cannot be assigned to y.
        values = list(df_sub['Hb'])[:n_cols]
        values.extend([None] * (n_cols - len(values)))
        y.loc[d_id] = values

        # Progress report at fixed counts, or when 15 minutes have passed
        # since the previous report.
        counter += 1
        now = datetime.datetime.now()
        if counter in [10, 100, 1000, 10000] or (now - start) > datetime.timedelta(minutes=15):
            print(datetime.datetime.now())
            print(counter)
            start = datetime.datetime.now()

    return y

In [None]:
# Distance measures
# Adapted from http://alexminnaar.com/time-series-classification-and-clustering-with-python.html

def DTWDistance(s1, s2, w):
    """Windowed dynamic-time-warping distance between two pandas Series.

    Both series are cut to the first n positions, where n is the smaller
    of their non-NaN counts. Element i of `s1` may only be matched with
    elements j of `s2` for which max(0, i-w) <= j < min(n, i+w).

    Args:
    s1, s2 : pandas Series (anything offering .dropna() and .iloc)
    w      : integer window size that limits the warping

    Returns:
    Square root of the minimal accumulated squared difference; 0 when
    n == 0, and inf when the window admits no path to the last cell
    (e.g. w == 0 with n > 0).
    """
    n = min(len(s1.dropna()), len(s2.dropna()))
    # Positional truncation: a plain s[:n] is label-based on a float index.
    a = s1.iloc[:n].to_numpy(dtype=float)
    b = s2.iloc[:n].to_numpy(dtype=float)

    inf = float('inf')
    # Only cells inside the window are stored; any other cell counts as inf.
    DTW = {(-1, -1): 0.0}

    for i in range(n):
        for j in range(max(0, i - w), min(n, i + w)):
            dist = (a[i] - b[j]) ** 2
            DTW[(i, j)] = dist + min(DTW.get((i - 1, j), inf),
                                     DTW.get((i, j - 1), inf),
                                     DTW.get((i - 1, j - 1), inf))

    return np.sqrt(DTW.get((n - 1, n - 1), inf))

def LB_Keogh(s1, s2, r):
    LB_sum = 0
    for index, value in enumerate(s1):
        lower_bound = min(s2[(index-r if index-r >= 0 else 0):(index+r)])
        upper_bound = max(s2[(index-r if index-r >= 0 else 0):(index+r)])

        if value > upper_bound:
            LB_sum = LB_sum + (value-upper_bound) ** 2
        elif value < lower_bound:
            LB_sum = LB_sum + (value-lower_bound) ** 2

    return np.sqrt(LB_sum)

In [None]:
# Clustering

def k_means_clust(data, num_clust, num_iter, w=5, plot=False, ntry=0):
    """K-means clustering of time series using a windowed DTW distance.

    Centroids are initialised with uniform random values in [5, 12). In
    each iteration every row of `data` (NaNs dropped) is assigned to the
    centroid with the smallest DTWDistance, where LB_Keogh is used to skip
    centroids that cannot beat the best distance found so far; afterwards
    each centroid becomes the column-wise mean of its assigned rows.

    If any cluster ends up without data points the whole procedure is
    restarted with fresh random centroids (at most 3 restarts) and the
    result of that restart is returned.

    Args:
    data      : dataframe with one time series per row
    num_clust : number of clusters
    num_iter  : number of iterations
    w         : window size passed to LB_Keogh and DTWDistance
    plot      : if True, plot the final centroids
    ntry      : number of restarts already done (used by the recursion)

    Returns:
    (centroids, assignments, distances) where `centroids` is a dataframe
    with one row per cluster, `assignments` maps cluster number to a list
    of row labels and `distances` holds, per row of `data`, its cluster
    and its DTW distance to that cluster's centroid.
    If a cluster is still empty after the last allowed restart, the string
    "4 tries where a cluster got no data points" is returned instead.
    """
    centroids = pd.DataFrame(np.random.uniform(low=5, high=12, size=(num_clust, len(data.columns))), 
                             index=list(range(0, num_clust)), 
                             columns=list(data.columns))
    for n in range(num_iter):
        print('Starting iteration', n+1)
        print(datetime.datetime.now())
        assignments = {k: [] for k in range(0, num_clust)} 
        
        # Assign data points to clusters
        for s_index, s_row in data.iterrows():
            s_row = s_row.dropna()
            min_dist = float('inf')
            closest_clust = None
            for c_index, c_row in centroids.iterrows():
                # Compare against the part of the centroid the series covers
                c_row = c_row[:len(s_row)]
                # Cheap lower bound first; full DTW only if it could win
                if LB_Keogh(s_row.squeeze(), c_row.squeeze(), w) < min_dist:
                    cur_dist = DTWDistance(s_row.squeeze(), c_row, w)
                    if cur_dist < min_dist:
                        min_dist = cur_dist
                        closest_clust = c_index
            assignments[closest_clust].append(s_index)
        
        # If any clusters has zero data points, start over (max 3 times)
        if any([is_empty(assignments[key]) for key in assignments.keys()]):
            if ntry < 3:
                print("Try", ntry+1, "failed, starting over")
                # Hand back the restarted run's result; continuing here
                # would report the failed attempt with its empty cluster.
                return k_means_clust(data, num_clust, num_iter, w, plot, ntry+1)
            else:
                return("4 tries where a cluster got no data points") 
            
        # Recalculate centroids of clusters
        for c_index in centroids.index:
            s_rows = data.loc[data.index.isin(assignments[c_index])]           
            centroids.loc[c_index] = s_rows.mean(axis=0).values  
                
    # After assigning all data points, calculate within-cluster distances
    distances = pd.DataFrame(index=data.index, columns=['cluster', 'distance'])
    for key in assignments.keys():
        for s_index in assignments[key]:
            s_row = data.loc[data.index == s_index].dropna(axis=1).squeeze()
            c_row = centroids.loc[key][:len(s_row)].squeeze()
            dist = DTWDistance(s_row, c_row, w)
            distances.loc[distances.index == s_index, 'cluster'] = key
            distances.loc[distances.index == s_index, 'distance'] = dist
        
    # Plot centroids
    if plot:
        for index, row in centroids.iterrows():
            plt.plot(centroids.loc[index])
        plt.show()
    
    return centroids, assignments, distances

In [None]:
def normalizeFeatures(df):
    """Return a copy of `df` with its values passed through a MinMaxScaler.

    A fresh preprocessing.MinMaxScaler is fitted on the values of `df`;
    the scaled values are wrapped in a new dataframe that keeps the index
    and the columns of the input.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(df.values)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [None]:
def calcCentroids(df_features, df, k):
    """Cluster on features and compute the cluster means in another frame.

    For every cluster count in `k` a KMeans model is fitted on
    `df_features`; the resulting labels are then used to average the rows
    of `df` per cluster.
    NOTE(review): the labels are applied to `df` as a boolean mask, so
    `df` is assumed to have its rows in the same order as `df_features`
    -- confirm against the caller.

    Args:
    df_features : data the KMeans models are fitted on
    df          : dataframe whose rows are averaged per cluster
    k           : iterable with the numbers of clusters to try

    Returns:
    list with, per entry of `k`, a dataframe holding one row of column
    means of `df` for each cluster
    """
    centroids = []
    for num_clust in k:
        kmeans = KMeans(n_clusters=num_clust)
        kmeans.fit(df_features)
        labels = kmeans.predict(df_features)
        # Start from a float frame: filling an integer frame with float
        # means is an incompatible-dtype assignment in recent pandas.
        centroidsk = pd.DataFrame(0.0, index=list(range(0, num_clust)), columns=list(df.columns))

        for c_index in range(0, num_clust):
            df_cluster = df.loc[labels == c_index]
            centroidsk.loc[c_index] = df_cluster.mean(axis=0).values

        centroids.append(centroidsk)

    return centroids

In [None]:
def translateClusters(column, order):
    """Look up every entry of `column` in `order`.

    Returns a list with `order[x]` for each `x` in `column`, in the
    original sequence.
    """
    translated = []
    for label in column:
        translated.append(order[label])
    return translated