In [None]:
# interactive plotting
# %matplotlib widget
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
from scipy.cluster.hierarchy import linkage, leaves_list

In [None]:
# Path of the agenda summary export that the rest of the notebook analyses.
FILE = ".../Summary_Agenda_Daily.csv"
# Alternative exports that can be swapped in (other time resolutions):
#".../Summary_Agenda__1 hours__2023-01-01 00:00 +0000__2024-06-01 00:00 +0000.csv"
# ".../Summary_Agenda__4 hours__2023-01-01 00:00 +0000__2024-06-01 00:00 +0000.csv"
# ".../Summary_Agenda_Daily.csv"
# NOTE(review): an earlier comment said the first (date) column is skipped, but
# read_csv is called without usecols/index_col, so every column is loaded.
rawDF = pd.read_csv(FILE, sep=",")

In [None]:
def sum_columns_by_tag(raw, columns_dict, split_string = " ", keep_columns = ["from_time", "to_time"]):
    """Collapse the tagged columns of ``raw`` into one summed column per group.

    A raw column belongs to a group when, after splitting its name on
    ``split_string``, at least one of the resulting tokens is one of the
    group's tags.

    Parameters
    ----------
    raw : pandas.DataFrame
        Source table; must contain every column listed in ``keep_columns``.
    columns_dict : dict[str, list[str]]
        Maps the name of each output column to the tags whose raw columns are
        summed (row-wise) into it.
    split_string : str
        Separator used to tokenise the raw column names.
    keep_columns : list[str]
        Columns copied over unchanged. ``from_time`` and ``to_time`` are
        additionally parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        ``keep_columns`` followed by one column per key of ``columns_dict``,
        indexed like ``raw``.

    Raises
    ------
    KeyError
        If a name in ``keep_columns`` is not a column of ``raw``.
    """
    if not all(kept_column in raw.columns for kept_column in keep_columns):
        raise KeyError("keep_columns is defined but some of its elements are not in raw.columns")

    # tokens of every raw column name, indexed by the column name itself
    column_tokens = raw.columns.to_series().str.split(split_string)

    dataframe = pd.DataFrame(index=raw.index)
    for kept_column in keep_columns:
        if kept_column in ("from_time", "to_time"):
            dataframe[kept_column] = pd.to_datetime(raw[kept_column])
        else:
            # previously such columns were declared but never filled (all NaN)
            dataframe[kept_column] = raw[kept_column]

    for key, tags in columns_dict.items():
        selection = column_tokens.map(lambda tokens: any(tag in tokens for tag in tags))
        dataframe[key] = raw.loc[:, selection].sum(axis=1)

    return dataframe
    
    
# Available groupings. Each grouping maps an output column name to the list of
# tags whose raw columns are summed into it by sum_columns_by_tag.
# note that the tags should be exclusive to avoid double counting
tag_tree = {
    # coarse grouping
    "standard": {
        "Sleep": ["SWO", "SFR"],
        "Lessons": ["LES"],
        "Revision": ["REV", "EXM"],
        "Repetitive": ["BUR", "WRK", "TDY", "ORG"],
        "Projects": ["PRJ"],
        "Media": ["MDI"],
        "Social": ["CAL", "OUT", "EVE", "DOG"],
    },
    # grouping restricted to study-related tags
    "study": {
        "Theory": ["R"],
        "Exercise": ["E"],
        "Projects": ["P"],
        "Exams": ["EXM"],
        "Lessons": ["LES"],
    },
    # fine-grained grouping: most tags get their own column
    "extended": {
        "Sleep": ["SWO", "SFR"],
        "Lessons": ["LES"],
        "Revision": ["REV"],
        "Exams": ["EXM"],
        "Repetitive": ["REP"],
        "Bureaucracy": ["BUR"],
        "Work": ["WRK"],
        "Tidying": ["TDY"],
        "Organization": ["ORG"],
        "Projects": ["PRJ"],
        "Media": ["MDI"],
        "Calls": ["CAL"],
        "Going Out": ["OUT", "EVE", "DOG"],
    },
}

In [None]:
# Key of tag_tree that selects how raw columns are merged into activities.
grouping = "standard"

# Working table: from_time, to_time and one summed column per activity.
df = sum_columns_by_tag(rawDF, tag_tree[grouping])

print(df.head().to_string())

In [None]:
def format_timedelta(td):
    """Render a timedelta as text such as ``"1 days, 4 hours"``.

    Only non-zero components are listed, always with the plural unit name
    (the result is reused in output file names, so the wording is kept
    stable). A zero duration yields ``"0 seconds"`` instead of an empty
    string, and a negative duration is rendered as the positive duration
    prefixed with ``"-"``.

    Parameters
    ----------
    td : datetime.timedelta or pandas.Timedelta
        Any object exposing ``days`` and ``seconds`` and supporting negation.

    Returns
    -------
    str
    """
    sign = ""
    if td.days < 0:
        # a negative timedelta is stored as negative days plus positive seconds;
        # flip it so that the components read naturally
        sign = "-"
        td = -td

    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    units = ((td.days, "days"), (hours, "hours"), (minutes, "minutes"), (seconds, "seconds"))
    parts = [f"{amount} {unit}" for amount, unit in units if amount > 0]

    if not parts:
        return "0 seconds"
    return sign + ', '.join(parts)

# extracts the difference in time between the first two rows
# WARNING: assumes all differences are equal (this is not checked) and that
# df has at least two rows labelled 0 and 1
time_step = df['from_time'][1] - df['from_time'][0]
time_step_string = format_timedelta(time_step)
print("Delta", time_step_string)

# Suffix identifying this run (grouping + time resolution); it is embedded in
# the file names of the figures saved by later cells.
selection = grouping + " " + time_step_string

In [None]:
# Named periods of the academic year, as lists of (start, end) "-MM-DD" suffixes
# that get_df_period_mask appends to the year of each row.
# cannot cross the year boundary
# NOTE: get_df_period_mask treats the end as exclusive (from_time < end), so
# ranges ending on "-12-31" do not include December 31st itself.
period_dates = {
    "lessons": [("-09-01", "-12-01"), ("-02-01", "-05-20")],
    "exams": [("-12-01", "-12-31"), ("-01-01", "-02-01"), ("-05-20", "-06-20")],
    "university": [("-09-01", "-12-31"), ("-01-01", "-06-20")],
}

def get_df_period_mask(period_dates_string, dataframe):
    """Flag the rows of ``dataframe`` whose ``from_time`` falls in a named period.

    Parameters
    ----------
    period_dates_string : str
        Key of the module-level ``period_dates`` dictionary.
    dataframe : pandas.DataFrame
        Must contain a datetime ``from_time`` column.

    Returns
    -------
    pandas.Series of bool
        Indexed like ``dataframe``; True where ``start <= from_time < end`` for
        at least one (start, end) pair of the period, the pair being resolved
        in the year of the row itself. All False when the period has no ranges.
    """
    start_times = dataframe["from_time"]
    # Boundaries are built as UTC instants for timezone-aware data (as before);
    # for timezone-naive data they are built naive, because pandas refuses to
    # compare naive with aware timestamps.
    use_utc = start_times.dt.tz is not None
    years = start_times.dt.year.astype(str)

    # start from "nothing selected" so an empty period list is well defined
    total_mask = pd.Series(False, index=start_times.index, name=start_times.name)
    for start_suffix, end_suffix in period_dates[period_dates_string]:
        from_date = pd.to_datetime(years + start_suffix, utc=use_utc)
        to_date = pd.to_datetime(years + end_suffix, utc=use_utc)
        total_mask |= (start_times >= from_date) & (start_times < to_date)

    return total_mask
  
def extract_data_from_df(dataframe, columns = ["from_time", "to_time"]):
    """Return ``dataframe`` without the listed columns (by default the two
    timestamp columns), leaving only the activity data. The input is not modified."""
    data_only = dataframe.drop(labels=columns, axis=1)
    return data_only
    
def to_normalised_df(dataframe_of_only_data):
    """Standardise every column with a freshly fitted StandardScaler and return
    the result as a DataFrame with the same index and column labels."""
    scaler = StandardScaler()
    standardised_values = scaler.fit_transform(dataframe_of_only_data)
    return pd.DataFrame(
        standardised_values,
        columns=dataframe_of_only_data.columns,
        index=dataframe_of_only_data.index,
    )

def extend_dataframe_with_period(dataframe, period_dates_string):
    """Add a boolean column named after the period to ``dataframe``.

    The column is written into the frame that was passed in (the input is
    modified in place) and that same frame is returned.
    """
    period_mask = get_df_period_mask(period_dates_string, dataframe)
    dataframe[period_dates_string] = period_mask
    return dataframe

def get_all_period_masks(dataframe):
    """Return ``{period name: boolean row mask}`` for every period declared in
    the module-level ``period_dates`` dictionary."""
    return {
        period: get_df_period_mask(period, dataframe)
        for period in period_dates
    }

In [None]:
def produce_feature_histogram(dataframe):
    """Plot one histogram per column, ignoring zero entries, and save the figure.

    Zeros are dropped column by column so that the bins are not dominated by
    steps without any activity. The figure is written to
    ``fil/Histogram Columns {selection}.png`` (``selection`` and
    ``time_step_string`` are read from the notebook globals).
    """
    # values equal to zero become NaN, which hist() skips
    non_zero = dataframe.apply(lambda x: x[x != 0])

    # DataFrame.hist opens its own figure, so no plt.figure() beforehand:
    # that call only left an extra, empty figure behind.
    axes = non_zero.hist(figsize=(15, 7))
    fig = axes.flatten()[0].get_figure()
    fig.subplots_adjust(hspace=0.8, wspace=0.2)
    for ax in axes.flatten():
        ax.set_xlabel(f"Total duration in steps of {time_step_string}")
        ax.set_ylabel("Occurrences")
    fig.savefig(f'fil/Histogram Columns {selection}.png')
    
# Activity columns only (timestamps removed); reused by most of the later cells.
data = extract_data_from_df(df)
produce_feature_histogram(data)

In [None]:
# For each activity: fraction of time steps (rows) in which its value is exactly zero.
sparsity = data.apply(lambda x: x[x==0]).count()/len(data)
plt.figure()
sparsity.sort_values(ascending=False).plot(kind='bar', figsize=(7, 4), title= time_step_string + " steps without any activity", grid=True)
plt.savefig(f'fil/Sparsity Chart {selection}.png') 

In [None]:
# Distribution of the row totals, i.e. the summed value of all activities per time step.
plt.figure()
data.sum(axis=1).hist(figsize=(5, 4))
plt.xlabel(f"Total duration per step of {time_step_string}")
plt.ylabel("Number of occurrences")
plt.title("Histogram of the total duration per step")
plt.savefig(f'fil/Histogram Total {selection}.png') 

In [None]:
# Only used to provide the labels ("max" + every column of df) of the rows built below.
maxes = pd.DataFrame(columns=["max"] + list(df.columns))

# One Series per activity: the activity name followed by the full df row where it peaks.
m = []

for col in data.columns:
    # position of the largest value of this activity
    max_idx = np.argmax(data[col]) 

    m.append(pd.Series([col] + df.iloc[max_idx].tolist(), index=maxes.columns))

# Stack the per-activity Series as the rows of one table.
maxes = pd.concat(m, axis=1).T.reset_index(drop=True)

print("Possible outliers in the data")
print(maxes.to_string())

In [None]:
# Earliest and latest step start covered by the data.
print(f"Data in range\n{df['from_time'].min()}\n{df['from_time'].max()}")

# normalises the data (StandardScaler); the result is a plain array, later used for clustering
scaled = StandardScaler().fit_transform(data)

In [None]:
def perform_pca(x, display=False):
    """Project ``x`` onto all of its principal components.

    Parameters
    ----------
    x : pandas.DataFrame
        Observations in rows, features in columns.
    display : bool
        When True, also draw the explained-variance bar chart (with the
        cumulative curve) and save it under ``fil/`` using the global
        ``selection`` in the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``PC1 .. PCk`` with ``k = min(rows, columns)``, indexed like
        ``x`` so that row-aligned masks built from the original data still apply.
    """
    # PCA cannot extract more components than min(n_samples, n_features)
    n_components = min(x.shape)
    pca = PCA(n_components = n_components)

    components = pca.fit_transform(x)
    explained_variance_ratio = pca.explained_variance_ratio_
    rounded_components = np.array([f"{ratio:.2f}" for ratio in explained_variance_ratio.cumsum()])
    print("Cumulative sum of the principal components", rounded_components)

    if display:
        positions = range(1, len(explained_variance_ratio) + 1)
        plt.figure(figsize=(6, 4))
        plt.bar(positions, explained_variance_ratio, alpha=0.7, align='center')
        plt.step(positions, np.cumsum(explained_variance_ratio), where='mid', color='red', label='Cumulative Explained Variance')
        plt.ylabel('Explained Variance Ratio')
        plt.xlabel('Principal Components')
        plt.title('PCA % Explained Variance Ratio')
        # one tick per component (the last one used to be left out)
        plt.xticks(positions)
        plt.legend(loc='best')
        plt.grid(True)
        plt.savefig(f'fil/PCA 2D Barplot-Histogram Total {selection}.png')

    return pd.DataFrame(
        data=components,
        columns=[f"PC{i}" for i in range(1, n_components + 1)],
        index=x.index,
    )

# NOTE(review): the PCA is fitted on the unscaled `data`, whereas the clustering
# in a later cell runs on the standardised `scaled` array — confirm this is intended.
p_comp = perform_pca(data, True)

In [None]:
def get_cluster_association(scaled_data, num_clusters):
    """Cluster the rows of ``scaled_data`` into ``num_clusters`` groups with
    Ward-linkage agglomerative clustering and return the label of each row."""
    ward_model = AgglomerativeClustering(linkage="ward", n_clusters=num_clusters)
    cluster_of_each_row = ward_model.fit_predict(scaled_data)
    return cluster_of_each_row

# Number of clusters requested from the hierarchical clustering.
NUMBER_OF_CLUSTERS = 5
# Copy of df extended with the cluster label of every row (computed on the standardised data).
df_cl = df.copy()
df_cl["cluster"] = get_cluster_association(scaled, NUMBER_OF_CLUSTERS)

# Activity columns grouped by cluster; reused below for the per-cluster means.
cluster_grouping = df_cl.drop(columns=["from_time", "to_time"]).groupby("cluster")
cluster_sizes = cluster_grouping.size()
total_size = len(df_cl)

print("Size of the clusters")
for i, size in cluster_sizes.items():
    # share of all rows that fall in this cluster
    float_value = size / total_size
    print(f"+ Cluster {i} :: {float_value:.0%}, {size} samples")

In [None]:
# Mean of every activity column within each cluster (rows: clusters, columns: activities).
print("Mean daily time expenditure in minutes, per cluster (row) and activity (col)")
means = cluster_grouping.mean()
print(means.to_string())

In [None]:
def identify_notable_values(dataframe):
    """Replace every value by its 0-based rank within its column, largest value
    first; ties share the lowest rank of their group."""
    def _descending_rank(column):
        one_based = column.rank(ascending=False, method='min')
        return one_based.astype(int) - 1

    return dataframe.apply(_descending_rank)

def z_score_columns(dataframe):
    """Standardise each column separately: subtract the column mean and divide
    by the column (sample) standard deviation."""
    def _standardise(column):
        centred = column - column.mean()
        return centred / column.std()

    return dataframe.apply(_standardise)

# Per-activity z-scores of the cluster means (how far each cluster is from the average cluster).
z_normalized = z_score_columns(means)

def assign_significance_value_lambda(x, thresh):
    """Classify a z-score against a symmetric threshold.

    Returns 1 when ``x >= thresh``, -1 when ``x <= -thresh`` and 0 otherwise
    (including NaN). Both boundaries are inclusive; previously ``x == -thresh``
    was reported as 0 while ``x == thresh`` was reported as 1.
    """
    if x >= thresh:
        return 1
    if x <= -thresh:
        return -1
    return 0

def highlight_significant(z_normalized, thresh):
    assert thresh > 0, "Threshold must be positive"
    return z_normalized.map(lambda x : assign_significance_value_lambda(x, thresh))

def print_significant(significant):
    """Summarise a frame of 1 / -1 / 0 flags as one row per input row.

    The ``High`` cell holds the string form of the list of columns flagged 1,
    the ``Low`` cell the string form of the list of columns flagged -1.
    """
    def _columns_flagged(row, flag):
        return [column for column, value in row.items() if value == flag]

    summary = pd.DataFrame(columns=["High", "Low"])
    for label, row in significant.iterrows():
        high_names = _columns_flagged(row, 1)
        low_names = _columns_flagged(row, -1)
        summary.loc[label] = [str(high_names), str(low_names)]
    return summary

def analyze_significance(mean_time_expenditure, thresh):
    """Z-score the frame column by column, flag the entries lying beyond
    ``thresh`` standard deviations on either side, and return the High/Low
    summary table of those flags."""
    z_scores = z_score_columns(mean_time_expenditure)
    flags = highlight_significant(z_scores, thresh)
    return print_significant(flags)

# High/Low activities of every cluster (threshold: 1 standard deviation);
# get_cluster_label reads this table to build the legend texts.
cluster_meaning = analyze_significance(means, 1)
print(cluster_meaning.to_string())

In [None]:
def fill_diagonal_na(dataframe):
    """Return a float copy of ``dataframe`` whose main diagonal is NaN.

    The values are copied into a fresh float array before the diagonal is
    overwritten. Writing through ``DataFrame.values`` is not reliable: it
    fails for integer data (NaN is not representable), silently does nothing
    when the frame has mixed dtypes, and is read-only when pandas runs in
    copy-on-write mode. The input frame is never modified.
    """
    values = dataframe.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(values, np.nan)
    return pd.DataFrame(values, index=dataframe.index, columns=dataframe.columns)

def get_corr_matrix(data_dataframe, method):
    """Correlation matrix of the columns, reordered by hierarchical clustering.

    The correlation (computed with ``method``) is clustered with Ward linkage,
    using each row of the matrix as an observation; rows and columns are then
    rearranged in dendrogram leaf order. The diagonal of the returned matrix
    is NaN.
    """
    correlation = data_dataframe.corr(method=method)

    # leaf order of the dendrogram built on the full correlation matrix
    leaf_order = leaves_list(linkage(correlation, 'ward'))

    without_self = fill_diagonal_na(correlation)
    ordered_columns = without_self.columns[leaf_order]
    return without_self.loc[ordered_columns, ordered_columns]

def show_correlation_matrix_method_comparisons(data_dataframe):
    """Draw the clustered Pearson and Kendall correlation matrices side by side
    and save the figure under ``fil/`` (file name includes the global ``selection``)."""
    # plt.subplots already creates the figure; a preceding plt.figure() only
    # left an empty extra figure behind.
    fig, axes = plt.subplots(1, 2, figsize=(10, 5), gridspec_kw={'width_ratios': [0.75, 1]})

    sns.heatmap(get_corr_matrix(data_dataframe, "pearson"), annot=False, cmap='coolwarm', fmt="", linewidths=1, ax=axes[0], square=True, cbar=False)
    axes[0].set_title('Pearson Correlation')

    # only the right panel carries the colour bar, hence the wider width ratio
    sns.heatmap(get_corr_matrix(data_dataframe, "kendall"), annot=False, cmap='coolwarm', fmt="", linewidths=1, ax=axes[1], square=True, cbar=True)
    axes[1].set_title('Kendall Correlation')

    fig.tight_layout()
    fig.savefig(f'fil/Sideby Correlation Comparisons {selection}.png')
    
# Compare both correlation measures on the activity data.
show_correlation_matrix_method_comparisons(data)

In [None]:
# Correlation method used by this cell and by the next one.
method_name = "pearson"
# The title announces a reordered matrix, so plot the cluster-ordered matrix from
# get_corr_matrix (the plain data.corr() output is in original column order).
sns.heatmap(get_corr_matrix(data, method_name), linewidths=1, annot=False, cmap='coolwarm')
plt.title(f'Reordered {method_name} correlation matrix')
plt.show()

In [None]:
def extend_data_with_next_row(dataframe):
    """Pair every row with its successor.

    The result has the original columns plus one ``next_<column>`` column per
    original column holding the values of the following row; the last row,
    which has no successor, is dropped.
    """
    following = dataframe.shift(-1).add_prefix("next_")
    paired = pd.concat([dataframe, following], axis=1)
    return paired.iloc[:-1]

# NOTE(review): data_extended is built here but is not used by the plot below.
data_extended = extend_data_with_next_row(data)

# NOTE(review): the heatmap shows the correlation between the row-to-row
# differences (data.diff()), although the title mentions the "next step" —
# confirm whether data_extended was meant to be plotted instead.
sns.heatmap(get_corr_matrix(data.diff(), method_name), linewidths=1, annot=False, cmap='coolwarm')
plt.title(f'{method_name} correlation matrix with next step')
plt.show()


In [None]:
# Rank (Spearman) correlation between the activity columns.
correlation = data.corr(method="spearman")

def replace_correlation(value):
    """Return NaN for a value within 0.01 of 1.0, otherwise the value unchanged.

    Intended to blank out self-correlations; note that the test looks only at
    the value, so any other correlation of 0.99 or more is blanked as well.
    """
    is_close_to_one = abs(value - 1.0) <= 0.01
    return np.nan if is_close_to_one else value

def print_significance_report(correlation_matrix, thresh):
    """Print the High/Low significance table computed by ``analyze_significance``
    for the given matrix and threshold."""
    report = analyze_significance(correlation_matrix, thresh)
    report_text = report.to_string()
    print(report_text)
    
def exclude_self_correlated_values(correlation_matrix):
    """Return a float copy of the correlation matrix with its diagonal set to NaN.

    The self-correlations are identified by position (the main diagonal)
    rather than by value: testing for "close to 1" also erased genuine,
    near-perfect correlations between two different columns. The input is
    not modified.
    """
    values = correlation_matrix.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(values, np.nan)
    return pd.DataFrame(values, index=correlation_matrix.index, columns=correlation_matrix.columns)

# Spearman correlations with the self-correlations blanked out, then the
# High/Low report at 1.2 standard deviations.
correlation_matrix_without_self = exclude_self_correlated_values(correlation)
print_significance_report(correlation_matrix_without_self, 1.2)

In [None]:
from matplotlib.patches import Rectangle

def show_correlation_matrix_heatmap(correlation_matrix):
    """Draw the column-wise z-scored correlation matrix as an annotated heatmap
    and save it under ``fil/`` (file name includes the global ``selection``).

    Every column is framed with a thin rectangle, and the whole matrix with a
    thick one, to stress that the normalisation is done per column.
    """
    z_normalised_correlation = z_score_columns(correlation_matrix)

    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(z_normalised_correlation, annot=True, cmap='coolwarm', center=0)

    num_rows, num_columns = z_normalised_correlation.shape
    for i in range(num_columns):
        # frame spanning the full height of column i
        ax.add_patch(Rectangle((i, 0), 1, num_rows, fill=False, edgecolor='black', lw=2))

    ax.add_patch(Rectangle((0, 0), num_columns, num_rows, fill=False, edgecolor='black', lw=5))
    # set the title first so that tight_layout can reserve room for it
    plt.title('Correlation Matrix Heatmap, Normalised over the columns')
    plt.tight_layout()
    plt.savefig(f'fil/Correlation Matrix Heatmap Normalised over the columns {selection}.png')

show_correlation_matrix_heatmap(correlation_matrix_without_self)
# interpretation: each cell relates the row activity to the column activity;
# the number is a z-score computed over the column, i.e. the strength of that
# correlation relative to the other rows of the same column.
# NOTE(review): the original wording spoke of the row "causing an effect" on the
# column — a correlation alone does not establish a direction of causation.

In [None]:
def get_cluster_label(cluster, include_only_first_column = True):
    """Build the legend text of a cluster from the global ``cluster_meaning`` table.

    Parameters
    ----------
    cluster : label
        Row label of ``cluster_meaning``.
    include_only_first_column : bool
        True (default): return the first column of the row with the quote
        characters removed. False: concatenate ``"<column> <value> "`` for
        every column whose list is not empty.

    Returns
    -------
    str
    """
    if include_only_first_column:
        return f"{cluster_meaning.loc[cluster].iloc[0]}".replace("'", "")

    label = ""
    for col in cluster_meaning.columns:
        entry = cluster_meaning.loc[cluster, col]
        # the cells hold the *string* form of a list, so an empty list shows up
        # as "[]"; comparing against [] alone never filtered anything out
        if entry not in ([], "[]"):
            label += f"{col} {entry} "
    return label

In [None]:
def plot_dendrogram(pca):
    """Show the Ward-linkage dendrogram of the rows of ``pca`` (leaf labels hidden)."""
    ward_linkage = linkage(pca, method="ward")

    fig, ax = plt.subplots(figsize=(6, 3))
    dendrogram(ward_linkage, ax=ax)
    ax.set_title("Hierarchical Clustering Dendrogram")
    # one leaf per observation: the x ticks and labels would be unreadable
    ax.tick_params(axis="x", bottom=False, labelbottom=False)
    ax.set_ylabel("Distance")
    plt.show()

# Only draw the dendrogram when the dataset has fewer than 1000 rows.
if rawDF.shape[0] < 1000:
    plot_dendrogram(p_comp)

In [None]:
# Discrete palette shared by the cluster scatter plots (10 colours).
custom_cmap = ListedColormap(["purple", "blue", "brown", "green", "red", "gray", "orange", "black", "yellow", "pink"])

def plot_PCA_2D(pca, labels):
    """Scatter the first two principal components, coloured by cluster.

    Parameters
    ----------
    pca : pandas.DataFrame
        Must contain the columns ``PC1`` and ``PC2``.
    labels : pandas.Series
        Integer cluster label of every row of ``pca``.
    """
    # Pin the colour scale to the palette so that cluster i always gets colour i
    # of custom_cmap (the same colour plot_PCA_3D uses), whatever labels are present.
    scatter = plt.scatter(pca["PC1"],
                          pca["PC2"], c=labels, cmap=custom_cmap,
                          vmin=0, vmax=custom_cmap.N - 1)
    # legend_elements yields one handle per distinct label in ascending order,
    # so the texts must be built in that same order (not in order of appearance)
    handles, _ = scatter.legend_elements(num=None)
    legend_labels = [get_cluster_label(i) for i in sorted(labels.unique())]
    plt.legend(handles=handles, labels=legend_labels, title="Cluster Labels", loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()

# 2D view of the principal components, coloured by the hierarchical clusters.
plot_PCA_2D(p_comp, df_cl["cluster"])

In [None]:
def plot_PCA_3D(pca, labels):
    """Scatter the first three principal components in 3D, one colour per cluster.

    Parameters
    ----------
    pca : pandas.DataFrame
        Must contain the columns ``PC1``, ``PC2`` and ``PC3``.
    labels : pandas.Series
        Integer cluster label of every row, indexed like ``pca``.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection="3d")

    # Iterate over the labels actually present instead of assuming they are
    # exactly 0..k-1, so a subset in which a cluster is missing still plots.
    for cluster in np.unique(labels):
        members = pca[labels == cluster]
        ax.scatter(
            members["PC1"],
            members["PC2"],
            members["PC3"],
            c=[custom_cmap(int(cluster))] * len(members),
            label=get_cluster_label(cluster),
            s=10,
        )

    ax.legend(loc='center left', bbox_to_anchor=(1.15, 0.5))
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")

    plt.show()

# 3D view of the principal components, coloured by the hierarchical clusters.
plot_PCA_3D(p_comp, df_cl["cluster"])

In [None]:
def extract_frequencies(dates_array, lam, columns):
    """Relative frequency of every bin, separately for each group of dates.

    Parameters
    ----------
    dates_array : list of pandas.Series
        One Series of dates per cluster, in cluster order.
    lam : callable
        Maps a parsed date to its bin number, an integer in ``[1, len(columns)]``.
    columns : sequence
        Labels of the bins, in bin order.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``dates_array`` (labelled by its position), one
        column per bin; bins that never occur hold 0.
    """
    bins = len(columns)
    expected_bins = range(1, bins + 1)

    def _relative_frequencies(position, dates):
        bin_of_each_date = pd.to_datetime(dates).apply(lam)
        shares = bin_of_each_date.value_counts() / len(dates)
        # add the bins that never occur, then name the row after the cluster position
        return shares.reindex(expected_bins, fill_value=0).rename(position)

    per_cluster = [
        _relative_frequencies(position, dates)
        for position, dates in enumerate(dates_array)
    ]

    freq_df = pd.concat(per_cluster, axis=1).T
    freq_df.columns = columns
    return freq_df

In [None]:
def plot_histogram_series(x_labels, cfreq, description="Series Histogram"):
    """Draw one bar chart per row of ``cfreq``, stacked vertically with a shared x axis.

    Parameters
    ----------
    x_labels : sequence
        Tick labels, one per column of ``cfreq``.
    cfreq : pandas.DataFrame
        One row of frequencies per cluster; the row label is passed to
        ``get_cluster_label`` to title the corresponding chart.
    description : str
        Title of the whole figure.
    """
    number_of_clusters = len(cfreq)
    # squeeze=False keeps `axs` two-dimensional even for a single cluster
    # (otherwise a lone Axes is returned and cannot be indexed)
    fig, axs = plt.subplots(
        number_of_clusters, 1, figsize=(5, number_of_clusters), sharex=True, squeeze=False
    )
    fig.suptitle(description)
    # pair the axes with the rows by position, so the row labels need not be 0..k-1
    for ax, (cluster, series) in zip(axs[:, 0], cfreq.iterrows()):
        # Define bin edges to align with the ticks
        bin_edges = np.arange(len(series) + 1) - 0.4

        ax.hist(np.arange(len(series)), weights=series, bins=bin_edges, width=0.8)
        ax.set_title(get_cluster_label(cluster), loc="left")
        ax.set_ylabel("Frequency")
        ax.set_xticks(range(len(series)))
        ax.set_xticklabels(x_labels)
        ax.tick_params(axis="x")

    plt.tight_layout()
    plt.show()
    
def plot_histogram(dataframe_with_clusters, bin_extractor_from_date, title_description, bin_titles):
    """Plot, for every cluster, how its rows are distributed over time bins.

    Parameters
    ----------
    dataframe_with_clusters : pandas.DataFrame
        Must contain the columns ``from_time`` and ``cluster``.
    bin_extractor_from_date : callable
        Maps a date to its bin number in ``[1, len(bin_titles)]``.
    title_description : str
        Appended to "Distribution over the " in the figure title.
    bin_titles : sequence
        Labels of the bins.

    Returns
    -------
    pandas.DataFrame
        The frequencies computed by ``extract_frequencies``.
    """
    # Read the start times from the frame that was passed in, selected by
    # cluster membership. The previous version looked the rows up positionally
    # in the global `df`, which is wrong for any filtered or re-indexed frame.
    start_time_by_cluster = [
        start_times
        for _, start_times in dataframe_with_clusters.groupby("cluster")["from_time"]
    ]

    frequencies = extract_frequencies(start_time_by_cluster, bin_extractor_from_date, bin_titles)
    plot_histogram_series(bin_titles, frequencies, "Distribution over the " + title_description)
    return frequencies

In [None]:
def get_hour(x):
    """Return the 1-based index of the 2-hour block containing ``x``.

    00:00-01:59 maps to 1, ..., 22:00-23:59 maps to 12. The numbering starts at
    1 because ``extract_frequencies`` expects bin numbers in ``[1, bins]``; with
    the former 0-based value the first block of the day was silently dropped
    and an always-empty 12th bin was shown instead.
    """
    return int(x.strftime("%H")) // 2 + 1

# The intra-day distribution only makes sense when a step is shorter than a day.
if time_step.total_seconds() // 3600 < 24:
    # twelve bins, labelled 0..11 (one per 2-hour block)
    distribution_day = plot_histogram(df_cl, get_hour, "2h blocks", range(0, 12))
    # print(distribution_day.to_string())

In [None]:
week_labels = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]

def get_weekday(x):
    """Return the ISO weekday number of ``x``: 1 for Monday up to 7 for Sunday."""
    return x.isoweekday()

# The weekday distribution only makes sense when a step is shorter than a week.
if time_step.days < 7:
    distribution_week = plot_histogram(df_cl, get_weekday, "week", week_labels)
    # print(distribution_week.to_string())

In [None]:
month_labels = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]

def get_month(x):
    """Return the month number of ``x``: 1 for January up to 12 for December."""
    month_number = int(x.month)
    return month_number
        
# Share of each cluster's rows falling in every calendar month.
distribution_month = plot_histogram(df_cl, get_month, "month", month_labels)
# print(distribution_month.to_string())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def get_transition_data(dataframe):
    """List the consecutive cluster pairs of ``dataframe``.

    Returns a DataFrame with one row per pair of adjacent rows of the input:
    ``from`` is the cluster of the earlier row, ``to`` the cluster of the row
    right after it (in row order).
    """
    states = dataframe["cluster"].tolist()
    return pd.DataFrame({'from': states[:-1], 'to': states[1:]})

def plot_transition_matrix(dataframe):
    """Show the row-normalised cluster transition matrix as an annotated heatmap.

    Cell (r, c) is the share of transitions leaving cluster r that lead to
    cluster c, counted over consecutive rows of ``dataframe`` (which must have
    a ``cluster`` column).
    """
    transitions = get_transition_data(dataframe)
    transition_counts = transitions.groupby(['from', 'to']).size().unstack(fill_value=0)
    # normalise every row so that it sums to 1
    transition_matrix = transition_counts.div(transition_counts.sum(axis=1), axis=0)

    plt.figure(figsize=(8, 6))
    sns.heatmap(transition_matrix, annot=True, cmap="Blues", fmt=".2f", cbar=True)
    plt.title("Transition Matrix", fontsize=16)

    plt.xlabel("To", fontsize=14)
    plt.ylabel("From", fontsize=14)

    # Rows ("from") and columns ("to") need not list the same clusters, e.g. when
    # a cluster only ever occurs as the last row, so each axis gets its own labels.
    x_labels = [get_cluster_label(i) for i in transition_matrix.columns]
    y_labels = [get_cluster_label(i) for i in transition_matrix.index]

    plt.xticks(ticks=[i + 0.5 for i in range(len(x_labels))], labels=x_labels)
    plt.yticks(ticks=[i + 0.5 for i in range(len(y_labels))], labels=y_labels)

    plt.show()

def restrict_df_to_date_range(dataframe, from_date, to_date):
    """Keep the rows whose ``from_time`` lies in ``[from_date, to_date)``
    (start included, end excluded)."""
    start_times = dataframe["from_time"]
    in_range = start_times.between(from_date, to_date, inclusive="left")
    return dataframe[in_range]

# early year university session
from_date = pd.to_datetime("2024-02-01", utc=True)
to_date = pd.to_datetime("2024-06-01", utc=True)

# Rows of df_cl starting in [from_date, to_date).
restr_df_cl = restrict_df_to_date_range(df_cl, from_date, to_date)
# NOTE(review): the transition matrix is drawn for the full df_cl; the restricted
# restr_df_cl computed just above is not used here — confirm which one is intended.
plot_transition_matrix(df_cl)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def do_time_data_split(dataframe):
    """Build a "predict the next row" dataset and split it into train and test.

    ``X`` holds every row but the last, ``y`` holds the row that follows each
    row of ``X`` (same index labels as ``X``). The pairs are then split
    randomly, 80 % / 20 %, with a fixed seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by ``train_test_split``.
    """
    X = dataframe.iloc[:-1]
    # Drop only the final row, which has no successor. A blanket dropna() would
    # also remove any interior row containing a NaN and leave X and y with
    # different lengths.
    y = dataframe.shift(-1).iloc[:-1]
    return train_test_split(X, y, test_size=0.2, random_state=42)

def try_linear_regression(dataframe):
    """Fit a linear model predicting each row from the previous one.

    Returns the tuple ``(mse, intercept, coefficients, r2)``, where the mean
    squared error and the R² score are measured on the held-out test split.
    """
    train_inputs, test_inputs, train_targets, test_targets = do_time_data_split(dataframe)

    regression = LinearRegression()
    regression.fit(train_inputs, train_targets)
    predictions = regression.predict(test_inputs)

    test_mse = mean_squared_error(test_targets, predictions)
    test_r2 = r2_score(test_targets, predictions)
    return test_mse, regression.intercept_, regression.coef_, test_r2

def print_linear_model(vars):
    """Print the ``(mse, intercept, coef, r2)`` tuple returned by
    ``try_linear_regression``, one labelled line per entry."""
    error, offset, weights, score = vars
    for caption, value in (
        ("Mean Squared Error", error),
        ("Intercept", offset),
        ("Coefficient", weights),
    ):
        print(f'{caption}: {value}')
    print(f"R-squared: {score:.4f}")

def compare_linear_models_datasets(data1, data2):
    """Fit the next-row linear model on both datasets and print a comparison:
    the symmetric relative difference of the two MSEs, the smaller MSE and the
    larger R²."""
    first_fit = try_linear_regression(data1)
    first_mse, first_r2 = first_fit[0], first_fit[3]

    second_fit = try_linear_regression(data2)
    second_mse, second_r2 = second_fit[0], second_fit[3]

    # difference relative to the mean of the two errors
    relative_delta = 2*(second_mse - first_mse) / (first_mse + second_mse)
    print(f"% delta mse {relative_delta:.4f}")

    print(f"min mse {min(first_mse, second_mse):.4f}")
    print("max r2", max(first_r2, second_r2))
    
def create_dataframe_with_cluster_column(dataframe):
    """Return a copy of ``dataframe`` with an added ``cluster`` column taken from
    the global ``df_cl`` (matched on the index). The input is left untouched."""
    return dataframe.assign(cluster=df_cl["cluster"])

# data is first normalised
scaled_data_df = to_normalised_df(data)

# Same data with the cluster label appended.
# NOTE(review): scaled_data_df_cl is not used by the regression below.
scaled_data_df_cl = create_dataframe_with_cluster_column(scaled_data_df)
# l_reg is the tuple (mse, intercept, coef, r2) of the next-row linear model.
l_reg = try_linear_regression(scaled_data_df)
print("R^2 coefficient:", l_reg[3])

In [None]:
def filter_and_normalise(data_dataframe):
    """Discard the columns that are zero in every row, then standardise the
    remaining ones with ``to_normalised_df``."""
    has_nonzero_entry = data_dataframe.ne(0).any(axis=0)
    return to_normalised_df(data_dataframe.loc[:, has_nonzero_entry])

def plot_plausible_energy_distributions(filter_and_normalised_data, verbose=False):
    """Eigen-decompose the correlation matrix of the data and plot the result.

    The left panel is a heatmap of the eigenvectors (one per row, smallest
    eigenvalue first, rows labelled with their eigenvalue). The right panels
    are histograms of the data projected onto each eigenvector, all drawn on
    the same x range.

    Parameters
    ----------
    filter_and_normalised_data : pandas.DataFrame
        Numeric data, one column per feature.
    verbose : bool
        Also print the eigenvalues and the eigenvector of the smallest one.

    Returns
    -------
    tuple
        ``(projection, eigenvectors_df)``: the data projected onto the
        eigenvector with the smallest eigenvalue, and the frame of all
        eigenvectors (one per row, columns named after the features).
    """
    corr_matrix = filter_and_normalised_data.corr()
    # eigh returns the eigenvalues in ascending order; the matching
    # eigenvectors are the COLUMNS of the second result
    eigenvalues, eigenvectors = np.linalg.eigh(corr_matrix)

    if verbose:
        print("\nEigenvalues:")
        print(eigenvalues)

        print("\nMinimum Eigenvalue:")
        print(eigenvalues[0])

        print("\nCorresponding Eigenvector (normalized):")
        # column 0, not row 0: eigenvectors[0] would mix the first component
        # of every eigenvector
        smallest_eigenvector = eigenvectors[:, 0]
        print(smallest_eigenvector / np.linalg.norm(smallest_eigenvector))

    # transpose so that every row of the frame is one eigenvector
    eigenvectors_df = pd.DataFrame(eigenvectors.T, columns=filter_and_normalised_data.columns)
    grid = plt.GridSpec(len(eigenvectors_df), 2, width_ratios=[3, 1])

    ax1 = plt.subplot(grid[:, 0])
    sns.heatmap(eigenvectors_df, cmap="coolwarm", center=0, fmt=".2f", annot=True, 
                cbar=False, 
                ax=ax1)

    explained_variance_labels = [f"{lam:.2f}" for lam in eigenvalues]
    explained_variance_labels[0] = "Var[c*x] = " + explained_variance_labels[0]

    ax1.set_yticklabels(explained_variance_labels, rotation=0, fontsize=8, fontweight='bold')

    # projection of every observation onto every eigenvector
    energy_values = pd.DataFrame({i:filter_and_normalised_data @ eigenvectors_df.iloc[i].to_numpy() for i in range(len(eigenvectors_df))})

    # common x range so that the histograms can be compared with each other
    x_max_value = energy_values.max(axis=None)
    x_min_value = energy_values.min(axis=None)

    for i in range(len(eigenvectors_df)):
        ax2 = plt.subplot(grid[i, 1])

        ax2.hist(energy_values[i], bins=20, orientation='vertical', color='blue')
        # set the axis labels small
        ax2.tick_params(axis='both', which='major', labelsize=5)

        ax2.set_xlim(x_min_value, x_max_value)

        if i == 0:
            ax1.set_title('Heatmap for each interpretation')
            ax2.set_title('Histogram of energies')

    plt.tight_layout()
    plt.show()

    return (filter_and_normalised_data @ eigenvectors_df.iloc[0].to_numpy(), eigenvectors_df)
    
# Per subset of the data: the (projection, eigenvectors) pair returned by
# plot_plausible_energy_distributions. Each call also draws its figure.
eigevectors_dict = {}
# all rows
eigevectors_dict["general"] = plot_plausible_energy_distributions(filter_and_normalise(data))
# rows inside one of the named periods of period_dates
eigevectors_dict["lessons"] = plot_plausible_energy_distributions(filter_and_normalise(extract_data_from_df(df[get_df_period_mask("lessons", df)])))
eigevectors_dict["exams"] = plot_plausible_energy_distributions(filter_and_normalise(extract_data_from_df(df[get_df_period_mask("exams", df)])))
eigevectors_dict["university"] = plot_plausible_energy_distributions(filter_and_normalise(extract_data_from_df(df[get_df_period_mask("university", df)])))
# complement of the "university" period
eigevectors_dict["not_university"] = plot_plausible_energy_distributions(filter_and_normalise(extract_data_from_df(df[~get_df_period_mask("university", df)])))

In [None]:
def plot_heatmap(dataframe):
    """Show the transpose of ``dataframe`` as an annotated heatmap with square
    cells, a colour scale centred on 0 and no colour bar."""
    heatmap_axes = sns.heatmap(
        dataframe.T,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        center=0,
        square=True,
        cbar=False,
    )
    heatmap_axes.set(
        title="Heatmap of the Energy Distribution",
        xlabel="Activity",
        ylabel="Eigenvector",
    )

    # keep the tick labels from being clipped or overlapping
    plt.tight_layout()
    plt.show()

# First eigenvector (the one with the smallest eigenvalue) of every subset.
eigenvectors_list = []
for key, (energy_values, eigenvectors_df) in eigevectors_dict.items():
    # only takes the best estimate
    eigenvectors_list.append(eigenvectors_df.iloc[0])
    
# One column per subset, one row per activity.
merged_eigenvectors = pd.concat(eigenvectors_list, axis=1, keys=[key for key in eigevectors_dict.keys()])

plot_heatmap(merged_eigenvectors)

In [None]:
import umap

# Dimension of the UMAP embedding (2 selects the 2D plots of the next cells).
n_components = 2

# Hyper-parameters found to work for each time resolution. Kept for reference:
# the reducer below does not read this dictionary.
presets = {
    "weekly": {},
    "daily": {
        "UMAP":{
            # NOTE: transformed_data = data.div(1 + data.sum(axis=1), axis=0)
            "n_neighbors": 50,
            "min_dist": 0,
        }
    },
    "4h": {
        "UMAP":{
            "n_neighbors": 70,
            "min_dist": 0.1,
        }
    }
}

# UMAP is stochastic: fix the seed so that Restart & Run All reproduces the
# same embedding (42 is the seed already used for the train/test split).
# NOTE(review): n_neighbors=1000 is hard-coded and differs from every preset — confirm.
reducer = umap.UMAP(
    n_neighbors=1000,
    min_dist=0,
    n_components = n_components,
    random_state=42,
)
# divide each row by (1 + sum over the row); the +1 avoids dividing by zero on empty rows
transformed_data = data.div(1 + data.sum(axis=1), axis=0)
embedding = pd.DataFrame(reducer.fit_transform(transformed_data))

In [None]:
def plot(data_embedding, n_components, labelling_values, title, cbar=True):
    """Scatter a UMAP embedding, coloured by ``labelling_values``.

    Parameters
    ----------
    data_embedding : pandas.DataFrame
        One row per observation, one column per embedding dimension.
    n_components : int
        2 draws a 2D scatter; any other value draws a 3D scatter of the first
        three columns.
    labelling_values : pandas.Series
        Colour value of each point. Its index is used as row *positions* into
        ``data_embedding``, so only the labelled rows are drawn.
    title : str
        Appended to "UMAP Projection " in the plot title.
    cbar : bool
        Whether to add a colour bar (2D only).

    Returns
    -------
    The scatter artist (2D) or the 3D axes.
    """
    # keep only the embedding rows that have a label
    data_embedding = data_embedding.iloc[labelling_values.index]
    if n_components == 2:
        plt.figure()
        scatter = plt.scatter(data_embedding.iloc[:, 0], data_embedding.iloc[:, 1], c=labelling_values, s=5)
        if cbar:
            plt.colorbar(label='Label')
        plt.xlabel('UMAP 1')
        plt.ylabel('UMAP 2')
        plt.title(f'UMAP Projection {title}')
        return scatter
    else:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        # a DataFrame must be sliced by position through .iloc; the former
        # data_embedding[:, 0] raised as soon as this branch was taken
        ax.scatter(data_embedding.iloc[:, 0], data_embedding.iloc[:, 1], data_embedding.iloc[:, 2], c=labelling_values, s=5)
        ax.set_xlabel('UMAP 1')
        ax.set_ylabel('UMAP 2')
        ax.set_zlabel('UMAP 3')
        ax.set_title(f'UMAP Projection {title}')
        plt.show()
        return ax

In [None]:
# Candidate colourings for the embedding. Only the period masks are plotted in
# the loop below; the other series are computed but not used in this cell.
step_of_day = df["from_time"].dt.hour + df["from_time"].dt.minute / 60
month_of_year = df["from_time"].dt.month
day_of_week = df["from_time"].dt.dayofweek
year = df["from_time"].dt.year
# absolute projection onto the first eigenvector of the "lessons" subset
energy = eigevectors_dict["lessons"][0].abs()

# One boolean mask per named period; each one colours its own UMAP scatter.
df_period_extended = get_all_period_masks(df)
for key in df_period_extended.keys():
    plot(embedding, n_components, df_period_extended[key], key)


In [None]:
import umap.plot
# The diagnostic plots are only requested for a 2D embedding.
if n_components==2:
    # umap.plot.diagnostic(reducer, diagnostic_type='local_dim')
    umap.plot.diagnostic(reducer, diagnostic_type='vq')
    umap.plot.diagnostic(reducer, diagnostic_type='pca')

In [None]:
def _average_over_chunks(n, data):
    """Average ``data`` over consecutive chunks of ``n`` rows.

    Returns one row per chunk holding the mean of every column, plus a
    ``from_time`` column with the start time of the chunk, read from the global
    ``df`` (``data`` is assumed to be row-aligned with ``df``).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    # Chunk ids are derived from the length of `data` itself. Using mean()
    # rather than sum()/n keeps the last, possibly shorter, chunk comparable
    # with the others.
    chunk_ids = np.arange(len(data)) // n
    df_grouped = data.groupby(chunk_ids).mean()
    df_grouped['from_time'] = df['from_time'].iloc[::n].reset_index(drop=True)
    return df_grouped


def _plot_column_over_time(df_grouped, column):
    """Draw a single averaged column against time in a figure of its own."""
    plt.figure()
    plt.plot(df_grouped['from_time'], df_grouped[column], label=column)
    plt.xlabel('Time')
    plt.ylabel(column)
    plt.title(f'{column} over Time')
    plt.xticks(rotation=45)
    plt.show()


def plot_all_data_columns_over_time(n, data):
    """Plot every column of ``data``, averaged over chunks of ``n`` rows, each
    in a separate figure."""
    df_grouped = _average_over_chunks(n, data)
    for column in data.columns:
        _plot_column_over_time(df_grouped, column)


def plot_a_data_columns_over_time(n, data, column):
    """Plot one column of ``data``, averaged over chunks of ``n`` rows."""
    _plot_column_over_time(_average_over_chunks(n, data), column)


def display_all_data_columns_over_time(n, data):
    """Plot all columns of ``data``, averaged over chunks of ``n`` rows, as
    separate lines of a single figure with a legend."""
    df_grouped = _average_over_chunks(n, data)

    # every averaged column except the time axis
    columns = df_grouped.columns[df_grouped.columns != 'from_time']

    plt.figure()
    for column in columns:
        plt.plot(df_grouped['from_time'], df_grouped[column], label=column)

    plt.xlabel('Time')
    plt.ylabel('Average value per step')
    plt.title('Columns over Time')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend()
    plt.show()
    
# Activity data without Sleep, with Revision folded into Lessons.
modified_data = df.drop(columns=["from_time", "to_time", "Sleep"])
modified_data["Lessons"] = df["Lessons"] + df["Revision"]
modified_data.drop(columns=["Revision"], inplace=True)

# all remaining activities, averaged over chunks of 30 rows
display_all_data_columns_over_time(30, modified_data)

# the Projects column alone, averaged over chunks of 7 rows
plot_a_data_columns_over_time(7, data, "Projects")