### Task 3 

Compare the embedded approach results with the test set

1. Randomly remove data: 10 points per 24-hour period.
2. Calculate the error between the imputed data and actual data.
3. Do it 10 000 times for each file
4. Calculate summary statistics of the errors for each hour, for each file (https://pandas.pydata.org/pandas-docs/stable/getting_started/intro_tutorials/06_calculate_statistics.html, https://www.geeksforgeeks.org/box-plot-in-python-using-matplotlib/)
5. One box plot for 24h for each sensor.

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
import time

In [None]:
### Get all the paths

# Every file found below the data directory, as a path joined onto its folder.
# topdown=False makes os.walk yield sub-directories before their parents.
paths = []

for root, _dirs, files in os.walk("data/data SPL August 2022 all", topdown=False):
    paths.extend(os.path.join(root, name) for name in files)

In [None]:
def get_data_with_nans(df):
    """Re-index the readings onto a gap-free one-minute grid.

    The grid runs from the earliest to the latest value in ``df["Time"]``.
    Minutes that have no row in ``df`` appear in the result with NaN in
    every data column.

    Args:
        df: DataFrame with a datetime ``Time`` column (unique values) and a
            ``dt_sound_level_dB`` column.

    Returns:
        A new DataFrame with ``Time`` as the first column and a default
        integer index.
    """
    minute_grid = pd.date_range(start=df["Time"].min(), end=df["Time"].max(), freq="min")
    regular = df.set_index("Time").reindex(minute_grid).rename_axis("Time").reset_index()

    # Keep a reading only where the step from the previous row is exactly
    # 60 seconds; the first row has no predecessor and is always kept.
    step_seconds = regular["Time"].diff().dt.total_seconds()
    keep = (step_seconds == 60) | (regular.index == 0)
    regular["dt_sound_level_dB"] = np.where(keep, regular["dt_sound_level_dB"], np.nan)
    return regular

def get_clean_dataframe(path):
    """Load one sensor CSV and return it on a regular per-minute grid.

    Steps: read the CSV, parse ``Time`` as datetimes, discard every row whose
    timestamp occurs more than once, fill the missing minutes with NaN via
    ``get_data_with_nans``, and add ``hour`` / ``minute`` helper columns.

    Args:
        path: Path to a CSV file that has a ``Time`` column.

    Returns:
        The cleaned DataFrame with ``hour`` and ``minute`` columns appended.
    """
    raw = pd.DataFrame(pd.read_csv(path))
    raw["Time"] = pd.to_datetime(raw["Time"])

    # keep=False removes *all* rows sharing a timestamp, not just the repeats.
    unique_rows = raw.drop_duplicates(subset=["Time"], keep=False)

    regular = get_data_with_nans(unique_rows)
    return regular.assign(
        hour=regular["Time"].dt.hour,
        minute=regular["Time"].dt.minute,
    )

In [None]:
def get_samples(df, random_state, n=10):
    """Hold out up to ``n`` random valid readings per calendar day.

    Only rows with a non-NaN ``dt_sound_level_dB`` are candidates. Days with
    at least ``n`` candidates contribute exactly ``n`` randomly chosen rows;
    days with fewer contribute all of their candidates (sampling ``n`` from
    them without replacement would raise ``ValueError``).

    The input frame is not modified.

    Args:
        df: DataFrame with a datetime ``Time`` column and a
            ``dt_sound_level_dB`` column.
        random_state: Seed or random state forwarded to the pandas group-wise
            ``sample`` call.
        n: Number of readings to hold out per day (default 10).

    Returns:
        A ``(sampled_data, remaining_data)`` tuple. ``sampled_data`` holds
        the held-out rows with their original values; ``remaining_data`` is a
        copy of ``df`` with those readings set to NaN. Both frames carry an
        extra ``date`` column.
    """
    # Work on a copy so the caller's frame does not gain a "date" column.
    remaining_data = df.copy()
    remaining_data["date"] = remaining_data["Time"].dt.date

    valid = remaining_data[remaining_data["dt_sound_level_dB"].notna()]
    per_day_count = valid.groupby("date")["dt_sound_level_dB"].transform("count")
    has_enough = per_day_count >= n

    parts = []
    full_days = valid[has_enough]
    if not full_days.empty:
        parts.append(full_days.groupby("date").sample(n=n, random_state=random_state))
    short_days = valid[~has_enough]
    if not short_days.empty:
        # Fewer than n candidates: hold out every one of them.
        parts.append(short_days)

    sampled_data = pd.concat(parts) if parts else valid.iloc[0:0]
    remaining_data.loc[sampled_data.index, "dt_sound_level_dB"] = np.nan
    return sampled_data, remaining_data

In [None]:
def get_median_values(group):
    """Return the median of ``group`` rounded up to the next whole number."""
    middle = group.median()
    return np.ceil(middle)

def get_imputed_data(df):
    """Fill missing sound levels with the rounded-up median for that minute of day.

    For every ``(hour, minute)`` pair the median of the available
    ``dt_sound_level_dB`` values is computed and rounded up with ``np.ceil``.
    Pairs that have no valid value at all get their fill value by linear
    interpolation between neighbouring rows (extended to both ends).

    Note: ``df`` is modified in place and also returned.

    Args:
        df: DataFrame with ``hour``, ``minute`` and ``dt_sound_level_dB``
            columns.

    Returns:
        The same DataFrame object with NaNs in ``dt_sound_level_dB`` filled.
    """
    # The built-in "median" transform gives the same per-group value as a
    # Python callback, without one Python call per (hour, minute) group.
    group_medians = df.groupby(["hour", "minute"])["dt_sound_level_dB"].transform("median")
    imputed_values = np.ceil(group_medians)

    if imputed_values.isna().any():
        imputed_values = imputed_values.interpolate(method="linear", limit_direction="both")

    df["dt_sound_level_dB"] = df["dt_sound_level_dB"].fillna(imputed_values)
    return df

In [None]:
def get_error_stats(path, iterations=10000):
    """Collect absolute imputation errors per hour of day for one sensor file.

    Each iteration holds out random readings with ``get_samples`` (seeded by
    the iteration number), imputes them with ``get_imputed_data`` and records
    the absolute difference between the held-out and the imputed value.
    The duration of every iteration is printed in milliseconds.

    Args:
        path: Path to the sensor CSV, loaded with ``get_clean_dataframe``.
        iterations: Number of hold-out / impute rounds (default 10000).

    Returns:
        A Series indexed by ``hour`` whose values are lists of the absolute
        errors observed in that hour across all iterations.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    dataframe = get_clean_dataframe(path)
    error_data = []

    for iteration in range(iterations):
        start_time = np.floor(time.time() * 1000)
        sampled_data, remaining_data = get_samples(dataframe, random_state=iteration)
        imputed_data = get_imputed_data(remaining_data)
        # Rows of the imputed frame that correspond to the held-out readings.
        imputed_subset = imputed_data.loc[sampled_data.index]
        error = pd.DataFrame()
        error['error_db_level'] = np.abs(sampled_data["dt_sound_level_dB"] - imputed_subset["dt_sound_level_dB"])
        error['hour'] = imputed_subset['hour']
        error['iteration'] = iteration
        error_data.append(error)
        print(str(iteration) + ". duration:", np.floor(time.time() * 1000) - start_time)

    errors = pd.concat(error_data, ignore_index=True)
    grouped_data = errors.groupby('hour')['error_db_level'].agg(list)
    return grouped_data

In [None]:
def create_box_plot(path):
    """Draw and save the per-hour imputation-error box plot for one sensor.

    The errors come from ``get_error_stats(path)``; one box is drawn per hour
    present in that result. The figure is written to
    ``box_plots/<name>.png``, where ``<name>`` is ``path`` without the data
    directory prefix and the ``.csv`` suffix.

    Args:
        path: Path to the sensor CSV file.
    """
    error_stats = get_error_stats(path)
    plt.figure(figsize=(12, 6))
    plt.boxplot(error_stats)
    # Boxes sit at positions 1..N by default; label them with the actual
    # hour values so that hour 0 is not shown as "1".
    plt.xticks(range(1, len(error_stats) + 1), error_stats.index)

    plt.title(path)
    plt.xlabel("Hour of the Day")
    plt.ylabel("Error (dB)")

    filename = path.replace("data/data SPL August 2022 all/", "").replace(".csv", "")
    output_path = f"box_plots/{filename}.png"
    # savefig does not create missing folders (including nested sensor folders).
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path)

In [None]:
def create_plots_for_all_sensors(start_index=13):
    """Create and save a box plot for every sensor file from ``start_index`` on.

    Prints the wall-clock duration of each plot in milliseconds.

    Args:
        start_index: Position in ``paths`` of the first file to process.
            The default of 13 keeps the previously hard-coded ``paths[13:]``
            slice. NOTE(review): presumably left over from resuming an
            interrupted run; pass 0 to process every file.
    """
    for path in paths[start_index:]:
        start_time = np.floor(time.time() * 1000)
        create_box_plot(path)
        print("Total duration for 1 plot: ", np.floor(time.time() * 1000) - start_time)


create_plots_for_all_sensors()

In [None]:
def plot(path, timestamps, db_levels, interval):
    """Show a step plot of sound level over time.

    Args:
        path: Text used as the plot title.
        timestamps: X values (datetimes).
        db_levels: Y values; must provide ``min()`` and ``max()``.
        interval: Spacing, in minutes, between major x-axis ticks.
    """
    plt.figure(figsize=(16, 3))
    plt.step(timestamps, db_levels, label="SPL (dB)")

    axis = plt.gca()
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
    axis.xaxis.set_major_locator(mdates.MinuteLocator(interval=interval))

    plt.title(path)
    plt.xlabel("Time")
    plt.ylabel("Decibels (dB)")
    plt.grid(True)
    plt.xticks(rotation=90)

    y_min = db_levels.min()
    y_max = db_levels.max()
    # When the data has no valid value the bounds are NaN and np.arange
    # would raise; leave the default y ticks in that case.
    if np.isfinite(y_min) and np.isfinite(y_max):
        # One tick per dB over the enclosing integer range, top value included.
        plt.yticks(np.arange(np.floor(y_min), np.ceil(y_max) + 1, 1))

    plt.legend()
    plt.show()

In [None]:
# for check purposes

def get_one_sensor_plot(path, start_time, end_time, interval=1):
    """Plot the cleaned (not imputed) readings of one sensor for a time window.

    Args:
        path: Path to the sensor CSV file.
        start_time: Inclusive lower bound of the window.
        end_time: Exclusive upper bound of the window.
        interval: Spacing, in minutes, between major x-axis ticks.
    """
    readings = get_clean_dataframe(path)
    in_window = (readings["Time"] >= start_time) & (readings["Time"] < end_time)
    window = readings[in_window]
    plot(path, window['Time'], window['dt_sound_level_dB'], interval=interval)

get_one_sensor_plot(path=paths[0], start_time="2022-08-01 02:07:00", end_time="2022-08-01 03:07:00")

In [None]:
def get_one_sensor_imputed_plot(path, start_time, end_time, interval=1):
    """Plot the imputed readings of one sensor for a time window.

    Args:
        path: Path to the sensor CSV file; also used as the plot title.
        start_time: Inclusive lower bound of the window.
        end_time: Exclusive upper bound of the window.
        interval: Spacing, in minutes, between major x-axis ticks.
    """
    df = get_clean_dataframe(path)
    df = get_imputed_data(df)
    df = df[(df["Time"] >= start_time) & (df["Time"] < end_time)]
    # Title the plot with the sensor that was actually loaded, not paths[0].
    plot(path, df['Time'], df['dt_sound_level_dB'], interval=interval)

get_one_sensor_imputed_plot(path=paths[0], start_time="2022-08-01 02:07:00", end_time="2022-08-01 03:07:00")