In [1]:
from anomaly import prepare_data, fetch_anomalies_and_preceding_data

In [2]:
import pandas as pd
from json import load, dumps
import numpy as np

In [15]:
# # Anomaly Detection
#
# - Create a 'vitality' feature, which is an average of all the vitality metrics.
# - Plot a graph of this over time
# - Determine the standard deviation & mean
# - Identify specific points which fall outside of (mean +/- sigma), and return the 7 day period immediately preceding each one (separate function).
#     - One function which detects anomalies, and another function which takes the anomalies, and uses the date feature to extract 7 (or otherwise specified) periods of measured metric data immediately preceding the anomaly's date.
#     - Quantify changes in correlated metrics (% change in the last few days compared to a global average or baseline)

import pandas as pd
import numpy as np
import math
# import matplotlib.pyplot as plt

# Data preparation
# data = pd.read_csv('sample_agg_health_and_checkin_data.csv')
#
# pd.read_json('sampleAggHealthAndCheckinData.json')

# Columns that are averaged row-wise into the 'vitality' feature (see summarise_vitality / prepare_data).
vitality_metrics = ["generalFeeling", "mood", "focus", "energy"]
# Names of the non-vitality metric columns. Starts empty and is reassigned (as a module global)
# every time prepare_data() runs.
measured_metrics = []

# Engineers the 'vitality' feature: the per-row average of the given vitality metric columns.
def summarise_vitality(data, vitality_metrics):
    """Return an (n_rows, 2) array of [data frame index, mean vitality] per row.

    Only NaN entries are left out of each row's mean; zeros are averaged like
    any other value. A row whose vitality metrics are all NaN yields NaN.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing every column named in ``vitality_metrics``.
    vitality_metrics : list of str
        Column names to average.
    """
    row_means = []
    for row in data[vitality_metrics].values:
        recorded = row[~np.isnan(row)]
        row_means.append(np.mean(recorded))

    vitality_column = np.array(row_means).reshape(-1, 1)
    # Keep the original index alongside each value so results can be re-merged later.
    index_column = np.array(data.index.to_series().values).reshape(-1, 1)
    return np.concatenate((index_column, vitality_column), axis=1)


# Given a pd data frame containing aggregated health data (or JSON / CSV / dict input that can be
# parsed into one), creates the 'vitality' feature, and interpolates weight to remove NaN values.
def prepare_data(data):
    """Clean aggregated health data and add the 'vitality' column.

    Steps, in order:
      1. If ``data`` is not a DataFrame, try ``pd.read_json``, then ``pd.read_csv``,
         then ``pd.DataFrame.from_dict``, logging each failed attempt.
      2. Drop the 'attributesAndCounts' column when present.
      3. Reassign the module-level ``measured_metrics`` to every column except
         'startOfDate' and the vitality metrics.
      4. Add 'vitality' (row-wise mean of the vitality metrics).
      5. Treat weight == 0 as missing and linearly interpolate weight in both directions.
      6. Drop rows containing any NaN, then rows where 'basalEnergyBurned' is 0.

    Returns a new DataFrame with a fresh 0..n-1 index.

    Raises AssertionError when the input cannot be turned into a DataFrame, and
    KeyError when 'startOfDate', 'weight' or 'basalEnergyBurned' is missing.
    """
    global measured_metrics

    if not isinstance(data, pd.DataFrame):
        raw_input = data
        parsers = (
            ("JSON", pd.read_json),
            ("CSV", pd.read_csv),
            ("dict", pd.DataFrame.from_dict),
        )
        for label, parser in parsers:
            try:
                data = parser(raw_input)
                break
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            except Exception:
                log(f"Failed to parse data as {label}.")

    assert isinstance(data, pd.DataFrame), "data could not be parsed into a DataFrame"

    # Remove 'attributesAndCounts' if present.
    if "attributesAndCounts" in data.columns:
        data = data.drop("attributesAndCounts", axis=1)

    # Reassign measured_metrics to be used across the package.
    measured_metrics = [col for col in data.columns.drop('startOfDate') if col not in vitality_metrics]

    data_df = data.copy()

    # Add vitality to the data (column 1 of the summary holds the values, column 0 the index).
    data_df["vitality"] = summarise_vitality(data, vitality_metrics)[:, 1]

    # Zeros are placeholders for "not recorded": blank them out BEFORE interpolating, otherwise
    # NaN gaps next to a 0 are interpolated toward 0 and survive as bogus non-zero weights.
    weight = data_df["weight"]
    data_df["weight"] = weight.mask(weight == 0).interpolate(limit_direction='both')

    # Drop all NaN rows.
    data_df = data_df.dropna()

    # Remove rows which have not been enriched with added health data (e.g. Apple Watch was not worn that day).
    data_df = data_df.loc[data_df["basalEnergyBurned"] != 0]

    return data_df.reset_index(drop=True)


# # Detecting anomalies.
# For each anomaly detected, return the immediate 'X' day period preceding the occurrence of the anomaly.
# We merge the vitality for each day with all the other metrics observed or measured on a particular day / row.
def get_data_preceding_anomaly(data, anomaly_idx, preceding_num_days=7):
    """Return up to ``preceding_num_days`` rows located just before ``anomaly_idx``.

    The slice is positional (``iloc``) and excludes the anomaly row itself.
    When fewer rows exist before the anomaly, the window starts at row 0, so
    the result may be shorter than requested (empty for ``anomaly_idx == 0``).
    """
    window_start = max(anomaly_idx - preceding_num_days, 0)
    return data.iloc[window_start:anomaly_idx]

# For each measured metric preceding an anomaly, find the most correlated features which contribute to the anomaly itself.
# For this, we will need to append 'vitality' as an extra column on the pandas dataframe.
def get_most_correlated_complement(corr_matrix, metric):
    """Find the non-vitality column most strongly correlated with ``metric``.

    Parameters
    ----------
    corr_matrix : pd.DataFrame
        Correlation matrix that has ``metric`` as a column.
    metric : str
        Column whose strongest partner is wanted.

    Returns
    -------
    tuple
        ``(column_name, correlation)`` with the largest absolute correlation,
        skipping ``metric`` itself and anything in ``vitality_metrics``.
        ``("", 0)`` when no candidate has a non-zero, non-NaN correlation.
    """
    most_corr_metric, most_corr_val = ("", 0)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for col_name, value in corr_matrix[metric].items():
        if col_name == metric or col_name in vitality_metrics:
            continue
        # NaN never compares greater, so NaN correlations are skipped automatically.
        if abs(value) > abs(most_corr_val):
            most_corr_metric, most_corr_val = (col_name, value)
    return (most_corr_metric, most_corr_val)

# Given a data frame and a desired metric, returns the correlation of each measured metric with the
# desired metric, sorted in terms of the highest absolute correlation.
def fetch_most_correlated_to(data_df, desired_metric):
    """Rank metrics by the strength of their correlation with ``desired_metric``.

    Parameters
    ----------
    data_df : pd.DataFrame
        Must contain every column in the module-level ``measured_metrics``
        plus ``desired_metric``.
    desired_metric : str
        Column to correlate against.

    Returns
    -------
    pd.DataFrame
        One column named ``desired_metric`` holding the signed correlations,
        indexed by metric name and ordered by absolute correlation, strongest
        first. The row for ``desired_metric`` itself (correlation 1) is
        included; callers drop it when they do not want it.
    """
    # Do not select the desired metric twice when it is itself a measured metric (e.g. "sleepHours"):
    # duplicate column labels would make the correlation lookup and the sort ambiguous.
    metrics_to_select = [metric for metric in measured_metrics if metric != desired_metric]
    metrics_to_select.append(desired_metric)

    correlation = data_df[metrics_to_select].corr()[desired_metric]

    # Order by magnitude, but report the signed values.
    ranked_metrics = correlation.abs().sort_values(ascending=False).index
    return correlation.reindex(ranked_metrics).to_frame()

# Given the full data set and a subset of it, determines the relative change of each numeric column's
# subset mean when compared to its global mean.
def determine_global_percentage_change_for_subset(all_data, subset_data):
    """Return ``subset mean / global mean - 1`` for every numeric column.

    Parameters
    ----------
    all_data : pd.DataFrame
        The complete data set (the baseline).
    subset_data : pd.DataFrame
        The window being compared against the baseline.

    Returns
    -------
    pd.DataFrame
        Single column (labelled ``0``) indexed by column name. A global mean
        of 0 produces inf or NaN for that column.
    """
    # numeric_only=True keeps text columns such as 'startOfDate' out of the mean;
    # pandas >= 2.0 raises instead of silently skipping them.
    subset_mean = subset_data.mean(numeric_only=True)
    global_mean = all_data.mean(numeric_only=True)
    return pd.DataFrame((subset_mean / global_mean) - 1)

# Determines the local percentage change for a subset of the data, by fitting a line of best fit and determining
# the relative change between the first and the last value.
def determine_local_percentage_change_for_subset(subset_data):
    """Relative change along a least-squares line fitted to each numeric column.

    The row index is used as the x axis. For every column a line of best fit
    is computed and the result is ``line(last index) / line(first index) - 1``.

    Parameters
    ----------
    subset_data : pd.DataFrame
        Non-empty frame with a numeric index and a 'startOfDate' column
        (dropped before fitting); the remaining columns are expected to be numeric.

    Returns
    -------
    pd.DataFrame
        Single column (labelled ``0``) indexed by column name. A one-row
        subset has no x variance, so every value is NaN in that case.
    """
    positions = subset_data.index.values
    x_mean = positions.mean()
    x_centered = positions - x_mean

    # numeric_only=True keeps the text 'startOfDate' column out of the mean;
    # pandas >= 2.0 raises instead of silently skipping it.
    y_mean = subset_data.mean(numeric_only=True)
    y_centered = subset_data.drop("startOfDate", axis=1) - y_mean

    # Ordinary least squares: slope = sum(dx * dy) / sum(dx^2), computed per column.
    slope = (x_centered.reshape(-1, 1) * y_centered).sum() / (x_centered ** 2).sum()
    intercept = y_mean - slope * x_mean

    lobf_first_point = slope * positions[0] + intercept
    lobf_last_point = slope * positions[-1] + intercept

    return pd.DataFrame(lobf_last_point / lobf_first_point - 1)

# Obtains correlated metrics along with the % change in each correlated metric, both against the
# global average and along the local trend line, and ranks them by a normalised 'importance' score.
def fetch_most_important_metrics_related_to(all_data, subset_data, desired_metric, importance_threshold=0.4):
    """Rank the metrics related to ``desired_metric`` by importance.

    Importance is |correlation| * |global % change| * |local % change|,
    divided by the largest such product so the top metric scores 1.

    Parameters
    ----------
    all_data : pd.DataFrame
        Full data set, used as the baseline for the global change.
    subset_data : pd.DataFrame
        Window in which correlation and local change are measured.
    desired_metric : str
        Metric the others are compared against; its own row is removed.
    importance_threshold : float
        Accepted but not used here; every metric is returned and filtering
        is left to the caller.

    Returns
    -------
    pd.DataFrame
        Columns 'correlation', 'local_percentage_change',
        'global_percentage_change' and 'importance', sorted by importance,
        highest first.
    """
    ranking = fetch_most_correlated_to(subset_data, desired_metric)
    global_change = determine_global_percentage_change_for_subset(all_data, subset_data)
    local_change = determine_local_percentage_change_for_subset(subset_data)

    # Each change frame has a single column; assignment aligns it on the metric-name index.
    ranking["local_percentage_change"] = local_change
    ranking["global_percentage_change"] = global_change
    ranking.columns = ["correlation", "local_percentage_change", "global_percentage_change"]

    # The desired metric trivially correlates with itself, so remove its row.
    ranking = ranking.drop(desired_metric, axis=0)

    # A correlation only matters here when the metric also moved recently, hence
    # the weighting by both percentage changes.
    raw_importance = (
        ranking["correlation"].abs()
        * ranking["global_percentage_change"].abs()
        * ranking["local_percentage_change"].abs()
    )
    ranking['importance'] = raw_importance / raw_importance.max()

    return ranking.sort_values(by='importance', ascending=False)

# Fetch the most correlated metrics, and display a comparison for each.
def fetch_most_correlated_pairs(data):
    """List each column's most correlated partner, reporting every pair once.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose numeric (and boolean) columns are correlated pairwise.

    Returns
    -------
    list of tuple
        ``(column, partner, correlation)`` entries in column order. A pair
        already reported in either orientation is skipped.
    """
    # Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises on text columns
    # such as 'startOfDate', whereas older versions skipped them silently.
    corr = data.select_dtypes(include=["number", "bool"]).corr()

    correlated_metrics = []
    seen_pairs = set()
    for col in corr.columns:
        pair, corr_val = get_most_correlated_complement(corr, col)
        # Order-insensitive key, so (a, b) and (b, a) count as the same pair. The original
        # `not in ... or not in ...` test was always true and never removed duplicates.
        pair_key = frozenset((col, pair))
        if pair_key in seen_pairs:
            continue
        seen_pairs.add(pair_key)
        correlated_metrics.append((col, pair, corr_val))
    return correlated_metrics

# Compare metrics which are most correlated to the desired metric visually, displaying the values for both
# metrics across time, as well as labelling the correlation between the two metrics.
# def compare_most_correlated_to_visually(data, desired_metric, max_num_correlated_metrics=3):

#     most_correlated_metrics = fetch_most_correlated_to(data, desired_metric)
#     most_correlated_metrics = list(zip(most_correlated_metrics.index, most_correlated_metrics.values.reshape(-1)))[1:]
#     if max_num_correlated_metrics > len(most_correlated_metrics):
#         max_num_correlated_metrics = len(most_correlated_metrics)

#     # Fetch the `max_num_correlated_metrics` top most correlated metrics, skipping the first which is
#     # trivially the desired_metric itself.
#     most_correlated_metrics = most_correlated_metrics[0:max_num_correlated_metrics]

#     plot_idx = 0
#     for correlated_metric, corr_score in most_correlated_metrics:
#         plt.figure(plot_idx)
#         compare_metrics(data, desired_metric, correlated_metric, corr_score=corr_score)
#         plot_idx += 1

# def compare_most_correlated_visually(data):
#     correlated_metrics = fetch_most_correlated_pairs(data)
#     plotIndex = 0
#     for (metricOne, metricTwo, _) in correlated_metrics:
#         plt.figure(plotIndex)
#         plotIndex += 1
#         compare_metrics(data, metricOne, metricTwo)

# Plotting restingHeartRate against the 4 vitality indicators.
# def compare_metrics(df, metricOne, metricTwo, corr_score=None):
#     feelingVHR = df[[metricOne, metricTwo]].dropna()
#     metricOneMax = feelingVHR[metricOne].max()
#     metricTwoMax = feelingVHR[metricTwo].max()
#     plt.plot(feelingVHR[metricOne] / metricOneMax)
#     plt.plot(feelingVHR[metricTwo] / metricTwoMax)
#     plt.xlabel('Day')
#     plt.ylabel('Normalised value')
#     if corr_score is not None:
#         plt.text((feelingVHR.index.min() + feelingVHR.index.max()) / 2, 1, f'Correlation: {corr_score}', horizontalalignment='center', verticalalignment='center')
#     plt.legend()

# Given the data, fetch the anomalies and the preceding data as an array of dictionaries
# {
#     anomaly_idx,
#     anomaly_value,
#     preceding_data,
#     most_important_metrics,
#     most_important_preceding_data
# }
def fetch_anomalies_and_preceding_data(data, desired_metric="vitality", preceding_num_days=7, std_deviations=1, importance_threshold=0.4):
    """Detect anomalies in ``desired_metric`` and describe the days before each one.

    Parameters
    ----------
    data : pd.DataFrame or anything ``prepare_data`` can parse
        Raw aggregated health data.
    desired_metric : str
        Column in which to look for anomalies. ``None`` means "vitality".
    preceding_num_days : int
        Size of the window taken before each anomaly. ``None`` means 7.
    std_deviations : float
        Width of the accepted band around the mean, in standard deviations.
        ``None`` means 1.
    importance_threshold : float
        Minimum importance a metric needs to be reported. ``None`` means 0.4.

    Returns
    -------
    list of dict
        One entry per anomaly with a non-empty preceding window, with keys
        'desired_metric', 'anomaly_index', 'anomaly_value', 'preceding_data',
        'most_important_metrics' and 'most_important_preceding_data'.
    """
    # Callers may pass None explicitly, so fall back to the defaults.
    if desired_metric is None:
        desired_metric = "vitality"

    if preceding_num_days is None:
        preceding_num_days = 7

    if std_deviations is None:
        std_deviations = 1

    if importance_threshold is None:
        importance_threshold = 0.4

    data = prepare_data(data)

    anomalies = []

    anomaly_idxs = detect_anomaly_indeces(data=data, desired_metric=desired_metric, std_deviations=std_deviations)

    for anomaly_index in anomaly_idxs:
        anomaly_value = data[desired_metric].iloc[anomaly_index]
        preceding_data = get_data_preceding_anomaly(data, anomaly_index, preceding_num_days=preceding_num_days)

        # If the preceding data is empty for whatever reason, skip the current anomaly.
        if preceding_data.empty:
            log(f'Preceding data empty for anomaly index {anomaly_index} and value {anomaly_value}.')
            continue

        important_metrics = fetch_most_important_metrics_related_to(
            data, preceding_data, desired_metric, importance_threshold=importance_threshold
        )
        # Honour the caller's threshold; it used to be hard-coded to 0.4 here.
        most_important_metrics = important_metrics.loc[important_metrics["importance"] >= importance_threshold]
        most_important_preceding_data = preceding_data[most_important_metrics.index.values]

        anomalies.append({
            "desired_metric": desired_metric,
            # Cast numpy scalars to built-in types for the returned records.
            "anomaly_index": int(anomaly_index),
            "anomaly_value": float(anomaly_value),
            "preceding_data": preceding_data.to_dict(),
            "most_important_metrics": most_important_metrics.to_dict(),
            "most_important_preceding_data": most_important_preceding_data.to_dict()
        })

    return anomalies

def log(message):
    """Print ``message`` to stdout behind this module's fixed log prefix."""
    print('ANOMLAY | {}'.format(message))


In [3]:
# Load ./sample_data.json (path relative to the notebook's working directory) into a DataFrame.
data = pd.read_json("./sample_data.json")
# with open('./sample_data.json', 'r+') as json_file:
#     json_dict = load(json_file)
# #     print(json_dict["result"])
#     json_file.seek(0)
#     json_file.truncate(0)
#     json_file.write(dumps(json_dict["result"]))

In [7]:
# Clean the raw frame and add the 'vitality' column; note this also reassigns the global measured_metrics.
data_prepped = prepare_data(data)

In [14]:
# Show the prepared rows whose caloricIntake is exactly 0.
data_prepped.loc[data_prepped["caloricIntake"] == 0]

Unnamed: 0,startOfDate,generalFeeling,mood,energy,focus,activeEnergyBurned,basalEnergyBurned,caloricIntake,dietaryCarbohydrates,dietaryFats,dietaryProtein,hrv,lowHeartRateEvents,restingHeartRate,sleepHours,weight,vitality
2,2020-07-05T00:00:00.000Z,2.5,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.300055,1.875
16,2020-06-20T00:00:00.000Z,1.666667,2.333333,1.666667,1.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.053869,1.833333
17,2020-06-19T00:00:00.000Z,3.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.138476,3.0
18,2020-06-18T00:00:00.000Z,4.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.223083,4.0
19,2020-06-17T00:00:00.000Z,3.5,3.5,3.5,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.307691,3.5
29,2020-05-26T00:00:00.000Z,3.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.153763,3.0
30,2020-05-25T00:00:00.000Z,3.5,4.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.23837,3.375
31,2020-05-24T00:00:00.000Z,4.0,5.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.322977,4.25
36,2020-05-18T00:00:00.000Z,3.0,4.0,3.0,3.0,179.67,1303.657,0.0,0.0,0.0,0.0,77.3304,1.0,40.0,8.183194,59.746013,3.25
38,2020-05-10T00:00:00.000Z,4.0,5.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.915227,4.0


In [11]:
# # Anomaly Detection
#
# - Create a 'vitality' feature, which is an average of all the vitality metrics.
# - Plot a graph of this over time
# - Determine the standard deviation & mean
# - Identify specific points which fall outside of (mean +/- sigma), and return the 7 day period immediately preceding each one (separate function).
#     - One function which detects anomalies, and another function which takes the anomalies, and uses the date feature to extract 7 (or otherwise specified) periods of measured metric data immediately preceding the anomaly's date.
#     - Quantify changes in correlated metrics (% change in the last few days compared to a global average or baseline)

import pandas as pd
import numpy as np
import math
# import matplotlib.pyplot as plt

# Data preparation
# data = pd.read_csv('sample_agg_health_and_checkin_data.csv')
#
# pd.read_json('sampleAggHealthAndCheckinData.json')

# Columns that are averaged row-wise into the 'vitality' feature (see summarise_vitality / prepare_data).
vitality_metrics = ["generalFeeling", "mood", "focus", "energy"]
# Names of the non-vitality metric columns. Starts empty and is reassigned (as a module global)
# every time prepare_data() runs.
measured_metrics = []

# Engineers the 'vitality' feature: the per-row average of the given vitality metric columns.
def summarise_vitality(data, vitality_metrics):
    """Return an (n_rows, 2) array of [data frame index, mean vitality] per row.

    Only NaN entries are left out of each row's mean; zeros are averaged like
    any other value. A row whose vitality metrics are all NaN yields NaN.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing every column named in ``vitality_metrics``.
    vitality_metrics : list of str
        Column names to average.
    """
    row_means = []
    for row in data[vitality_metrics].values:
        recorded = row[~np.isnan(row)]
        row_means.append(np.mean(recorded))

    vitality_column = np.array(row_means).reshape(-1, 1)
    # Keep the original index alongside each value so results can be re-merged later.
    index_column = np.array(data.index.to_series().values).reshape(-1, 1)
    return np.concatenate((index_column, vitality_column), axis=1)


# Given a pd data frame containing aggregated health data (or JSON / CSV / dict input that can be
# parsed into one), creates the 'vitality' feature, and interpolates weight to remove NaN values.
def prepare_data(data):
    """Clean aggregated health data and add the 'vitality' column.

    Steps, in order:
      1. If ``data`` is not a DataFrame, try ``pd.read_json``, then ``pd.read_csv``,
         then ``pd.DataFrame.from_dict``, logging each failed attempt.
      2. Drop the 'attributesAndCounts' column when present.
      3. Reassign the module-level ``measured_metrics`` to every column except
         'startOfDate' and the vitality metrics.
      4. Add 'vitality' (row-wise mean of the vitality metrics).
      5. Treat weight == 0 as missing and linearly interpolate weight in both directions.
      6. Drop rows containing any NaN, then rows where 'basalEnergyBurned' is 0.

    Returns a new DataFrame with a fresh 0..n-1 index.

    Raises AssertionError when the input cannot be turned into a DataFrame, and
    KeyError when 'startOfDate', 'weight' or 'basalEnergyBurned' is missing.
    """
    global measured_metrics

    if not isinstance(data, pd.DataFrame):
        raw_input = data
        parsers = (
            ("JSON", pd.read_json),
            ("CSV", pd.read_csv),
            ("dict", pd.DataFrame.from_dict),
        )
        for label, parser in parsers:
            try:
                data = parser(raw_input)
                break
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            except Exception:
                log(f"Failed to parse data as {label}.")

    assert isinstance(data, pd.DataFrame), "data could not be parsed into a DataFrame"

    # Remove 'attributesAndCounts' if present.
    if "attributesAndCounts" in data.columns:
        data = data.drop("attributesAndCounts", axis=1)

    # Reassign measured_metrics to be used across the package.
    measured_metrics = [col for col in data.columns.drop('startOfDate') if col not in vitality_metrics]

    data_df = data.copy()

    # Add vitality to the data (column 1 of the summary holds the values, column 0 the index).
    data_df["vitality"] = summarise_vitality(data, vitality_metrics)[:, 1]

    # Zeros are placeholders for "not recorded": blank them out BEFORE interpolating, otherwise
    # NaN gaps next to a 0 are interpolated toward 0 and survive as bogus non-zero weights.
    weight = data_df["weight"]
    data_df["weight"] = weight.mask(weight == 0).interpolate(limit_direction='both')

    # Drop all NaN rows.
    data_df = data_df.dropna()

    # Remove rows which have not been enriched with added health data (e.g. Apple Watch was not worn that day).
    data_df = data_df.loc[data_df["basalEnergyBurned"] != 0]

    return data_df.reset_index(drop=True)


# # Detecting anomalies.

# Detects anomalies in the data by detecting values which fall outside of the
# mu +/- sigma range, returning the indices of the corresponding data frame
# rows where the desired metric falls outside of this range.
def detect_anomaly_indeces(data, desired_metric, std_deviations=1):
    """Return the row positions where ``desired_metric`` is an outlier.

    A value is an outlier when it is non-zero and lies below
    ``mean - std_deviations * std`` or above ``mean + std_deviations * std``.
    Zero values are never reported. Mean and standard deviation are taken
    over the whole column, zeros included.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing ``desired_metric``.
    desired_metric : str
        Column to scan.
    std_deviations : float
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    np.ndarray
        Positional (0-based) row numbers of the outliers, in ascending order.
    """
    values = data[desired_metric]

    sigma = values.std()
    mu = values.mean()

    upper_bound = mu + sigma * std_deviations
    lower_bound = mu - sigma * std_deviations

    # `&` binds tighter than `|`, so the two range tests are grouped explicitly. Without the
    # grouping the non-zero guard only covered the lower-bound test.
    out_of_range = (values < lower_bound) | (values > upper_bound)
    is_recorded = values != 0

    return np.where(is_recorded & out_of_range)[0]

# For each measured metric preceding an anomaly, find the most correlated features which contribute to the anomaly itself.
# For this, we will need to append 'vitality' as an extra column on the pandas dataframe.
def get_most_correlated_complement(corr_matrix, metric):
    """Find the non-vitality column most strongly correlated with ``metric``.

    Parameters
    ----------
    corr_matrix : pd.DataFrame
        Correlation matrix that has ``metric`` as a column.
    metric : str
        Column whose strongest partner is wanted.

    Returns
    -------
    tuple
        ``(column_name, correlation)`` with the largest absolute correlation,
        skipping ``metric`` itself and anything in ``vitality_metrics``.
        ``("", 0)`` when no candidate has a non-zero, non-NaN correlation.
    """
    most_corr_metric, most_corr_val = ("", 0)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for col_name, value in corr_matrix[metric].items():
        if col_name == metric or col_name in vitality_metrics:
            continue
        # NaN never compares greater, so NaN correlations are skipped automatically.
        if abs(value) > abs(most_corr_val):
            most_corr_metric, most_corr_val = (col_name, value)
    return (most_corr_metric, most_corr_val)

# Given a data frame and a desired metric, returns the correlation of each measured metric with the
# desired metric, sorted in terms of the highest absolute correlation.
def fetch_most_correlated_to(data_df, desired_metric):
    """Rank metrics by the strength of their correlation with ``desired_metric``.

    Parameters
    ----------
    data_df : pd.DataFrame
        Must contain every column in the module-level ``measured_metrics``
        plus ``desired_metric``.
    desired_metric : str
        Column to correlate against.

    Returns
    -------
    pd.DataFrame
        One column named ``desired_metric`` holding the signed correlations,
        indexed by metric name and ordered by absolute correlation, strongest
        first. The row for ``desired_metric`` itself (correlation 1) is
        included; callers drop it when they do not want it.
    """
    # Do not select the desired metric twice when it is itself a measured metric (e.g. "sleepHours"):
    # duplicate column labels would make the correlation lookup and the sort ambiguous.
    metrics_to_select = [metric for metric in measured_metrics if metric != desired_metric]
    metrics_to_select.append(desired_metric)

    correlation = data_df[metrics_to_select].corr()[desired_metric]

    # Order by magnitude, but report the signed values.
    ranked_metrics = correlation.abs().sort_values(ascending=False).index
    return correlation.reindex(ranked_metrics).to_frame()

# Given the full data set and a subset of it, determines the relative change of each numeric column's
# subset mean when compared to its global mean.
def determine_global_percentage_change_for_subset(all_data, subset_data):
    """Return ``subset mean / global mean - 1`` for every numeric column.

    Parameters
    ----------
    all_data : pd.DataFrame
        The complete data set (the baseline).
    subset_data : pd.DataFrame
        The window being compared against the baseline.

    Returns
    -------
    pd.DataFrame
        Single column (labelled ``0``) indexed by column name. A global mean
        of 0 produces inf or NaN for that column.
    """
    # numeric_only=True keeps text columns such as 'startOfDate' out of the mean;
    # pandas >= 2.0 raises instead of silently skipping them.
    subset_mean = subset_data.mean(numeric_only=True)
    global_mean = all_data.mean(numeric_only=True)
    return pd.DataFrame((subset_mean / global_mean) - 1)

# Determines the local percentage change for a subset of the data, by fitting a line of best fit and determining
# the relative change between the first and the last value.
def determine_local_percentage_change_for_subset(subset_data):
    """Relative change along a least-squares line fitted to each numeric column.

    The row index is used as the x axis. For every column a line of best fit
    is computed and the result is ``line(last index) / line(first index) - 1``.

    Parameters
    ----------
    subset_data : pd.DataFrame
        Non-empty frame with a numeric index and a 'startOfDate' column
        (dropped before fitting); the remaining columns are expected to be numeric.

    Returns
    -------
    pd.DataFrame
        Single column (labelled ``0``) indexed by column name. A one-row
        subset has no x variance, so every value is NaN in that case.
    """
    positions = subset_data.index.values
    x_mean = positions.mean()
    x_centered = positions - x_mean

    # numeric_only=True keeps the text 'startOfDate' column out of the mean;
    # pandas >= 2.0 raises instead of silently skipping it.
    y_mean = subset_data.mean(numeric_only=True)
    y_centered = subset_data.drop("startOfDate", axis=1) - y_mean

    # Ordinary least squares: slope = sum(dx * dy) / sum(dx^2), computed per column.
    slope = (x_centered.reshape(-1, 1) * y_centered).sum() / (x_centered ** 2).sum()
    intercept = y_mean - slope * x_mean

    lobf_first_point = slope * positions[0] + intercept
    lobf_last_point = slope * positions[-1] + intercept

    return pd.DataFrame(lobf_last_point / lobf_first_point - 1)

# Obtains correlated metrics along with the % change in each correlated metric, both against the
# global average and along the local trend line, and ranks them by a normalised 'importance' score.
def fetch_most_important_metrics_related_to(all_data, subset_data, desired_metric, importance_threshold=0.4):
    """Rank the metrics related to ``desired_metric`` by importance.

    Importance is |correlation| * |global % change| * |local % change|,
    divided by the largest such product so the top metric scores 1.

    Parameters
    ----------
    all_data : pd.DataFrame
        Full data set, used as the baseline for the global change.
    subset_data : pd.DataFrame
        Window in which correlation and local change are measured.
    desired_metric : str
        Metric the others are compared against; its own row is removed.
    importance_threshold : float
        Accepted but not used here; every metric is returned and filtering
        is left to the caller.

    Returns
    -------
    pd.DataFrame
        Columns 'correlation', 'local_percentage_change',
        'global_percentage_change' and 'importance', sorted by importance,
        highest first.
    """
    ranking = fetch_most_correlated_to(subset_data, desired_metric)
    global_change = determine_global_percentage_change_for_subset(all_data, subset_data)
    local_change = determine_local_percentage_change_for_subset(subset_data)

    # Each change frame has a single column; assignment aligns it on the metric-name index.
    ranking["local_percentage_change"] = local_change
    ranking["global_percentage_change"] = global_change
    ranking.columns = ["correlation", "local_percentage_change", "global_percentage_change"]

    # The desired metric trivially correlates with itself, so remove its row.
    ranking = ranking.drop(desired_metric, axis=0)

    # A correlation only matters here when the metric also moved recently, hence
    # the weighting by both percentage changes.
    raw_importance = (
        ranking["correlation"].abs()
        * ranking["global_percentage_change"].abs()
        * ranking["local_percentage_change"].abs()
    )
    ranking['importance'] = raw_importance / raw_importance.max()

    return ranking.sort_values(by='importance', ascending=False)

# Fetch the most correlated metrics, and display a comparison for each.
def fetch_most_correlated_pairs(data):
    """List each column's most correlated partner, reporting every pair once.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose numeric (and boolean) columns are correlated pairwise.

    Returns
    -------
    list of tuple
        ``(column, partner, correlation)`` entries in column order. A pair
        already reported in either orientation is skipped.
    """
    # Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises on text columns
    # such as 'startOfDate', whereas older versions skipped them silently.
    corr = data.select_dtypes(include=["number", "bool"]).corr()

    correlated_metrics = []
    seen_pairs = set()
    for col in corr.columns:
        pair, corr_val = get_most_correlated_complement(corr, col)
        # Order-insensitive key, so (a, b) and (b, a) count as the same pair. The original
        # `not in ... or not in ...` test was always true and never removed duplicates.
        pair_key = frozenset((col, pair))
        if pair_key in seen_pairs:
            continue
        seen_pairs.add(pair_key)
        correlated_metrics.append((col, pair, corr_val))
    return correlated_metrics

# Compare metrics which are most correlated to the desired metric visually, displaying the values for both
# metrics across time, as well as labelling the correlation between the two metrics.
# def compare_most_correlated_to_visually(data, desired_metric, max_num_correlated_metrics=3):

#     most_correlated_metrics = fetch_most_correlated_to(data, desired_metric)
#     most_correlated_metrics = list(zip(most_correlated_metrics.index, most_correlated_metrics.values.reshape(-1)))[1:]
#     if max_num_correlated_metrics > len(most_correlated_metrics):
#         max_num_correlated_metrics = len(most_correlated_metrics)

#     # Fetch the `max_num_correlated_metrics` top most correlated metrics, skipping the first which is
#     # trivially the desired_metric itself.
#     most_correlated_metrics = most_correlated_metrics[0:max_num_correlated_metrics]

#     plot_idx = 0
#     for correlated_metric, corr_score in most_correlated_metrics:
#         plt.figure(plot_idx)
#         compare_metrics(data, desired_metric, correlated_metric, corr_score=corr_score)
#         plot_idx += 1

# def compare_most_correlated_visually(data):
#     correlated_metrics = fetch_most_correlated_pairs(data)
#     plotIndex = 0
#     for (metricOne, metricTwo, _) in correlated_metrics:
#         plt.figure(plotIndex)
#         plotIndex += 1
#         compare_metrics(data, metricOne, metricTwo)

# Plotting restingHeartRate against the 4 vitality indicators.
# def compare_metrics(df, metricOne, metricTwo, corr_score=None):
#     feelingVHR = df[[metricOne, metricTwo]].dropna()
#     metricOneMax = feelingVHR[metricOne].max()
#     metricTwoMax = feelingVHR[metricTwo].max()
#     plt.plot(feelingVHR[metricOne] / metricOneMax)
#     plt.plot(feelingVHR[metricTwo] / metricTwoMax)
#     plt.xlabel('Day')
#     plt.ylabel('Normalised value')
#     if corr_score is not None:
#         plt.text((feelingVHR.index.min() + feelingVHR.index.max()) / 2, 1, f'Correlation: {corr_score}', horizontalalignment='center', verticalalignment='center')
#     plt.legend()

# Given the data, fetch the anomalies and the preceding data as an array of dictionaries
# {
#     anomaly_idx,
#     anomaly_value,
#     preceding_data,
#     most_important_metrics,
#     most_important_preceding_data
# }
def fetch_anomalies_and_preceding_data(data, desired_metric="vitality", preceding_num_days=7, std_deviations=1, importance_threshold=0.4):
    """Detect anomalies in ``desired_metric`` and describe the days before each one.

    Parameters
    ----------
    data : pd.DataFrame or anything ``prepare_data`` can parse
        Raw aggregated health data.
    desired_metric : str
        Column in which to look for anomalies. ``None`` means "vitality".
    preceding_num_days : int
        Size of the window taken before each anomaly. ``None`` means 7.
    std_deviations : float
        Width of the accepted band around the mean, in standard deviations.
        ``None`` means 1.
    importance_threshold : float
        Minimum importance a metric needs to be reported. ``None`` means 0.4.

    Returns
    -------
    list of dict
        One entry per anomaly with a non-empty preceding window, with keys
        'desired_metric', 'anomaly_index', 'anomaly_value', 'preceding_data',
        'most_important_metrics' and 'most_important_preceding_data'.
    """
    # Callers may pass None explicitly, so fall back to the defaults.
    if desired_metric is None:
        desired_metric = "vitality"

    if preceding_num_days is None:
        preceding_num_days = 7

    if std_deviations is None:
        std_deviations = 1

    if importance_threshold is None:
        importance_threshold = 0.4

    data = prepare_data(data)

    anomalies = []

    anomaly_idxs = detect_anomaly_indeces(data=data, desired_metric=desired_metric, std_deviations=std_deviations)

    for anomaly_index in anomaly_idxs:
        anomaly_value = data[desired_metric].iloc[anomaly_index]
        preceding_data = get_data_preceding_anomaly(data, anomaly_index, preceding_num_days=preceding_num_days)

        # If the preceding data is empty for whatever reason, skip the current anomaly.
        if preceding_data.empty:
            log(f'Preceding data empty for anomaly index {anomaly_index} and value {anomaly_value}.')
            continue

        # TODO: decide whether anomalies with fewer than `preceding_num_days` rows of preceding
        # data should be skipped as well; currently they are kept.

        important_metrics = fetch_most_important_metrics_related_to(
            data, preceding_data, desired_metric, importance_threshold=importance_threshold
        )
        # Honour the caller's threshold; it used to be hard-coded to 0.4 here.
        most_important_metrics = important_metrics.loc[important_metrics["importance"] >= importance_threshold]
        most_important_preceding_data = preceding_data[most_important_metrics.index.values]

        anomalies.append({
            "desired_metric": desired_metric,
            # Cast numpy scalars to built-in types for the returned records.
            "anomaly_index": int(anomaly_index),
            "anomaly_value": float(anomaly_value),
            "preceding_data": preceding_data.to_dict(),
            "most_important_metrics": most_important_metrics.to_dict(),
            "most_important_preceding_data": most_important_preceding_data.to_dict()
        })

    return anomalies

def log(message):
    """Print ``message`` to stdout behind this module's fixed log prefix."""
    print('ANOMLAY | {}'.format(message))

In [63]:
# Row positions in the raw (unprepared) frame where sleepHours lies outside mean +/- 1 standard deviation.
detect_anomaly_indeces(data, "sleepHours")

array([  8,  52,  65,  67,  68,  89,  90, 104, 112, 124, 126, 129, 140])

In [5]:
# Run the full pipeline with its defaults (metric "vitality", 7 preceding rows, 1 standard deviation).
anomalies = fetch_anomalies_and_preceding_data(data)

ANOMLAY | Preceding data empty for anomaly index 0 and value 1.75.


In [6]:
# Display the list of anomaly records.
anomalies

[{'desired_metric': 'vitality',
  'anomaly_index': 2,
  'anomaly_value': 1.5,
  'preceding_data': {'startOfDate': {0: '2020-07-04T00:00:00.000Z',
    1: '2020-07-03T00:00:00.000Z'},
   'generalFeeling': {0: 2.0, 1: 3.0},
   'mood': {0: 2.5, 1: 4.0},
   'energy': {0: 1.0, 1: 2.0},
   'focus': {0: 1.5, 1: 3.0},
   'activeEnergyBurned': {0: 53.88599999999994, 1: 609.430999999999},
   'basalEnergyBurned': {0: 812.7140000000003, 1: 1614.7650000000012},
   'caloricIntake': {0: 851.9059753417969, 1: 1496.7840576171875},
   'dietaryCarbohydrates': {0: 95.11075973510742, 1: 129.95592498779297},
   'dietaryFats': {0: 22.593937873840332, 1: 55.09947109222412},
   'dietaryProtein': {0: 67.05394554138184, 1: 114.36804580688477},
   'hrv': {0: 60.983245849609375, 1: 67.176748752594},
   'lowHeartRateEvents': {0: 1.0, 1: 1.0},
   'restingHeartRate': {0: 40.0, 1: 40.0},
   'sleepHours': {0: 8.88513888888889, 1: 7.151111111111112},
   'weight': {0: 57.300054755995944, 1: 57.300054755995944},
   'vitali

In [61]:
# Row positions where caloricIntake exceeds 3200 (the != 0 test is implied by > 3200).
np.where(((data["caloricIntake"] != 0) & (data["caloricIntake"] > 3200)))

(array([ 74, 108, 136]),)

In [23]:
# For each anomaly detected, return the immediate 'X' day period preceding the occurrence of the anomaly.
# We merge the vitality for each day with all the other metrics observed or measured on a particular day / row.
def get_data_preceding_anomaly(data, anomaly_idx, preceding_num_days=7):
    """Return up to ``preceding_num_days`` rows starting at ``anomaly_idx``.

    The slice is positional (``iloc``), runs forward from ``anomaly_idx`` and
    is capped at the end of ``data``.

    NOTE(review): the window begins at the anomaly row itself, so that row is
    part of the result. It also presumably relies on ``data`` being ordered
    newest-first, so that later rows are earlier days. Confirm both.
    """
    window_stop = anomaly_idx + preceding_num_days
    if window_stop > len(data):
        window_stop = len(data)
    return data.iloc[anomaly_idx:window_stop]
#     return data.iloc[(anomaly_idx-preceding_num_days if anomaly_idx > preceding_num_days else 0):anomaly_idx]


In [24]:
# Take the 7-row window for row position 10 of the raw frame.
preceding_data = get_data_preceding_anomaly(data, 10, preceding_num_days=7)

In [32]:
# Blank out every zero entry, then drop any row left with a missing value, keeping only
# the rows in which no column is zero or missing.
preceding_data.where(preceding_data != 0, None).dropna()

Unnamed: 0,startOfDate,generalFeeling,mood,energy,focus,attributesAndCounts,activeEnergyBurned,basalEnergyBurned,caloricIntake,dietaryCarbohydrates,dietaryFats,dietaryProtein,hrv,lowHeartRateEvents,restingHeartRate,sleepHours,weight
12,2020-06-24T00:00:00.000Z,2,3,2,2,"{'generalFeeling': 1, 'mood': 1, 'energy': 1, ...",265.06,1582.15,1888.46,143.514,80.5268,143.099,56.0543,18,37,9.56806,57.2003
13,2020-06-23T00:00:00.000Z,1,1,1,1,"{'generalFeeling': 1, 'mood': 1, 'energy': 1, ...",317.569,1582.59,1685.14,122.288,80.5868,108.147,54.1322,1,39,7.05458,57.8


In [35]:
# Given a dataframe, creates a dict for each column as arrays rather than objects keyed by the index.
def pandas_to_array_dict(df):
    """Convert a data frame to ``{column name: [values in row order]}``.

    Built from ``df.to_dict()``, whose per-column ``{index: value}`` mappings
    are flattened into plain lists, discarding the index labels.
    """
    return {
        column: list(values_by_index.values())
        for column, values_by_index in df.to_dict().items()
    }

In [36]:
# Show the preceding-data window as plain per-column lists.
pandas_to_array_dict(preceding_data)

{'startOfDate': ['2020-06-26T00:00:00.000Z',
  '2020-06-25T00:00:00.000Z',
  '2020-06-24T00:00:00.000Z',
  '2020-06-23T00:00:00.000Z',
  '2020-06-22T00:00:00.000Z',
  '2020-06-21T00:00:00.000Z',
  '2020-06-20T00:00:00.000Z'],
 'generalFeeling': [2.0, 1.0, 2.0, 1.0, 2.0, 2.5, 1.666666666666666],
 'mood': [2.0, 1.0, 3.0, 1.0, 2.0, 3.0, 2.333333333333333],
 'energy': [2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.666666666666666],
 'focus': [2.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.666666666666666],
 'attributesAndCounts': [{'generalFeeling': 1,
   'mood': 1,
   'energy': 1,
   'focus': 1,
   '_id': 1,
   'startOfDay': 1,
   'activeEnergyBurned': 1,
   'basalEnergyBurned': 1,
   'caloricIntake': 1,
   'date': 1,
   'dietaryCarbohydrates': 1,
   'dietaryFats': 1,
   'dietaryProtein': 1,
   'endDate': 1,
   'endOfDay': 1,
   'hrv': 1,
   'lowHeartRateEvents': 1,
   'restingHeartRate': 1,
   'sleepHours': 1,
   'startDate': 1,
   'weight': 1,
   'workouts': 1},
  {'generalFeeling': 1,
   'mood': 1,
   'energy': 1,