# Exploration of sensor data stored in the Timestream database


## Initialization


### Imports


In [None]:
from os import makedirs
from itertools import repeat
from typing import Optional, Sequence

import awswrangler as wr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import scipy.signal as signal
from dotenv import load_dotenv
from numpy.polynomial import Polynomial

load_dotenv()


### Data retrieval


In [None]:
# Load the complete sensor table into memory; the query has no WHERE clause,
# so every stored record is fetched.
SENSOR_DATA_QUERY = 'SELECT * FROM "when-to-water"."sensor-data"'
df: pd.DataFrame = wr.timestream.query(SENSOR_DATA_QUERY)
print(f"Retrieved {len(df)} records")


### Constants, classes and functions


In [None]:
# Maps the sensor identifier stored in the database to the name of the plant
# the sensor is attached to. Sensors missing from this mapping end up as NaN
# when the mapping is applied and are dropped during the transformations.
SENSOR_PLANT_MAPPING: dict[str, str] = {
    "PWS_1": "Goldfruchtpalme",
    "PWS_2": "Pilea",
    "PWS_3": "Drachenbaum",
}
# Plant names in mapping order; every per-plant loop iterates over this tuple.
PLANTS: tuple[str, ...] = tuple(SENSOR_PLANT_MAPPING.values())
# Trace colours for the plots, indexed by the plant's position in PLANTS.
# NOTE(review): the values look like the Okabe-Ito colour-blind-safe palette.
COLORS: tuple[str, ...] = (
    "#E69F00",
    "#56B4E9",
    "#009E73",
    "#F0E442",
    "#0072B2",
    "#D55E00",
    "#CC79A7",
)
# Soil moisture threshold (in %) per plant; the watering forecast estimates
# when the moisture will have dropped to this value.
MINIMUM_MOISTURE: dict[str, int] = {
    "Goldfruchtpalme": 10,
    "Pilea": 20,
    "Drachenbaum": 30,
}


def remove_outliers(
    df: pd.DataFrame, columns: Sequence[str], n_std: int
) -> pd.DataFrame:
    """Drop rows lying more than ``n_std`` standard deviations from the mean.

    The columns are processed one after another, so the mean and standard
    deviation of each column are computed on the rows that survived the
    filters of the previous columns. Rows whose value is NaN in a processed
    column are dropped as well. The frame length before and after filtering
    is printed.

    Args:
        df: Frame to filter; it is not modified.
        columns: Names of the numeric columns to check.
        n_std: Allowed distance from the mean, in standard deviations.

    Returns:
        The filtered frame (the input frame itself if ``columns`` is empty).
    """
    print(f"Old length: {len(df)}")
    for col in columns:
        center = df[col].mean()
        tolerance = n_std * df[col].std()
        # ``between`` is inclusive on both ends, matching <= / >= checks.
        within_band = df[col].between(center - tolerance, center + tolerance)
        df = df[within_band]

    print(f"New length: {len(df)}")
    return df


## Transformations


### General


In [None]:
# Normalise the raw query result: datetime timestamps, shorter column names
# and float measurement values.
df["time"] = pd.to_datetime(df["time"])
df = df.rename(columns={"measure_value::double": "value", "sensor_name": "plant"})
df["value"] = df["value"].astype(float)

# Translate sensor ids to plant names. Sensors that are not part of the
# mapping become NaN and are removed together with every other row that
# contains a missing value.
df["plant"] = df["plant"].map(SENSOR_PLANT_MAPPING)
df = df.dropna()

# Drop power readings. The explicit copy detaches the filtered frame from its
# parent, so the column assignment below works on an independent frame and
# does not raise SettingWithCopyWarning.
df = df[df["measure_name"] != "power"].copy()

# add unit to measurement name, e.g. "soil_moisture" -> "soil moisture in %"
df["measure_name"] = df["measure_name"].str.replace("_", " ") + " in " + df["unit"]
measurements = df["measure_name"].unique().tolist()

# drop unit (now part of the measurement name)
df = df.drop(columns=["unit"])

# remove 0 moisture
df = df[~((df["measure_name"] == "soil moisture in %") & (df["value"] == 0))]

# Resample df to hourly measures (mean per plant and measurement)
df = df.set_index("time")
df = df.groupby(["plant", "measure_name"]).resample("H").mean().reset_index()

# Pivot to one row per (time, plant) with one column per measurement, then
# flatten the resulting two-level column index into plain strings.
df = df.set_index(["time", "plant", "measure_name"])
df = df.unstack().reset_index()
df.columns = [" ".join(col).strip().replace("value ", "") for col in df.columns.values]
df = df.reset_index(drop=True)

# Fill gaps left by the hourly resampling, separately for each plant, using
# time-weighted interpolation (requires the datetime index).
df = df.set_index("time")
for plant in PLANTS:
    plant_rows = df["plant"] == plant
    df[plant_rows] = df[plant_rows].interpolate(method="time")

df = df.reset_index()


### Identify valleys/peaks in moisture


In [None]:
# Parameters handed to scipy.signal.find_peaks: minimum spacing between two
# neighbouring extrema (in samples) and the minimum prominence an extremum
# needs in order to be reported.
DISTANCE = 3
PROMINENCE = 2

all_peaks: list = []
all_valleys: list = []

for plant in PLANTS:
    df_plant = df[df["plant"] == plant]
    moisture = df_plant["soil moisture in %"]

    # find_peaks returns (positions, properties); only positions are needed.
    peaks = signal.find_peaks(moisture, distance=DISTANCE, prominence=PROMINENCE)[0]
    # Valleys are the peaks of the mirrored signal.
    valleys = signal.find_peaks(-moisture, distance=DISTANCE, prominence=PROMINENCE)[0]

    # The positions are relative to df_plant; translate them to index labels
    # of the full frame so they can be flagged there.
    all_peaks.extend(df_plant.index[peaks].tolist())
    all_valleys.extend(df_plant.index[valleys].tolist())

# Boolean marker columns on the full frame.
df["peak"] = df.index.isin(all_peaks)
df["valley"] = df.index.isin(all_valleys)


### Pick descents and normalize soil moisture


In [None]:
# Collect, for every plant, the stretches between a moisture peak and the
# following valley. Each stretch gets an "offset" column (days since the
# start of the stretch) and its moisture is shifted so that its maximum is 100.
decending_dfs: list[pd.DataFrame] = []
for plant in PLANTS:
    last_peak = -1
    last_valley = -1
    # Re-number the rows 0..n-1 so that row.Index can be used with iloc.
    df_plant = df[df["plant"] == plant].copy()
    df_plant.reset_index(drop=True, inplace=True)
    for row in df_plant.itertuples():
        if row.peak:
            last_peak = row.Index
        if not row.valley:
            continue

        last_valley = row.Index
        # A segment needs a peak that was seen before this valley.
        if last_peak < 0 or last_peak >= last_valley:
            continue

        # Rows from the peak up to, but not including, the valley row.
        df_candidate = df_plant.iloc[last_peak:last_valley].copy()
        moisture = df_candidate["soil moisture in %"]

        # Skip stretches that end higher than they start: not a descent.
        if moisture.iloc[0] < moisture.iloc[-1]:
            continue

        # offset in days
        segment_start = df_candidate["time"].min()
        df_candidate["offset"] = (
            (df_candidate["time"] - segment_start).dt.total_seconds() / 3600 / 24
        )

        # Optional outlier removal (currently disabled):
        # df_candidate = remove_outliers(df_candidate,["soil moisture in %"],1)
        # The guard below only matters when outlier removal is enabled.
        if df_candidate.empty:
            continue

        # normalize: shift the stretch so that its highest value becomes 100
        df_candidate["soil moisture in %"] = df_candidate["soil moisture in %"] + (
            100 - df_candidate["soil moisture in %"].max()
        )

        decending_dfs.append(df_candidate)

# Concatenate once, after all plants were processed. Doing this inside the
# plant loop fails as soon as the first plant yields no segment, even when
# later plants would provide data.
if not decending_dfs:
    raise ValueError(
        "No descending soil moisture segments found; nothing to concatenate"
    )
all_decending_dfs = pd.concat(decending_dfs)


## Regression


In [None]:
# Least-squares straight line (degree 1) per plant: normalized soil moisture
# as a function of the offset in days within a descending segment.
polyfits: dict = {}
for plant in PLANTS:
    plant_segments = all_decending_dfs[all_decending_dfs["plant"] == plant]
    polyfits[plant] = Polynomial.fit(
        plant_segments["offset"],
        plant_segments["soil moisture in %"],
        deg=1,
    )


## Find next watering time


In [None]:
# For every plant, locate the latest moisture reading on the fitted line and
# report how many days the line needs to fall from there to the plant's
# minimum moisture.
for plant in PLANTS:
    minimum = MINIMUM_MOISTURE[plant]
    plant_rows = df[df["plant"] == plant]
    if plant_rows.empty:
        print(f"{plant}: no measurements available, cannot estimate watering time")
        continue

    newest_time = plant_rows["time"].max()
    # Extract a scalar: subtracting a Series from a Polynomial would treat
    # the Series as a coefficient array instead of a constant.
    newest_moisture = float(
        plant_rows.loc[plant_rows["time"] == newest_time, "soil moisture in %"].iloc[0]
    )
    if pd.isna(newest_moisture):
        print(f"{plant}: latest soil moisture reading is missing, cannot estimate watering time")
        continue
    if newest_moisture <= minimum:
        print(f"{plant} is already at or below {minimum} % moisture")
        continue

    # roots() yields the offset (in days) at which the fitted line crosses
    # the given moisture level; it is empty when the fitted line is flat.
    roots_current = (polyfits[plant] - newest_moisture).roots()
    roots_minimum = (polyfits[plant] - minimum).roots()
    if len(roots_current) == 0 or len(roots_minimum) == 0:
        print(f"{plant}: regression shows no drying trend, cannot estimate watering time")
        continue

    days_left = roots_minimum[0] - roots_current[0]
    if days_left < 0:
        # The fitted line rises over time, so the minimum is never reached.
        print(f"{plant}: regression shows no drying trend, cannot estimate watering time")
        continue

    print(
        f"{plant} will reach {minimum} % moisture in {int(days_left)} days"
    )


## Plots


In [None]:
# Scatter plot of all descending segments per plant together with the fitted
# regression line; the figure is written to ./dist and shown inline.
figure = go.Figure(
    layout=go.Layout(
        title="Moisture Regression",
        width=1280,
        height=720,
        template="plotly_white",
    )
)

for index, plant in enumerate(PLANTS):
    plant_segments = all_decending_dfs[all_decending_dfs["plant"] == plant]
    # Measured points of every descending segment of this plant.
    figure.add_trace(
        go.Scatter(
            x=plant_segments["offset"],
            y=plant_segments["soil moisture in %"],
            mode="markers",
            name=plant,
            opacity=0.5,
            marker=dict(color=COLORS[index], size=3),
        )
    )
    # 100 evenly spaced points of the fitted line across its fit domain.
    poly_x, poly_y = polyfits[plant].linspace(100)
    figure.add_trace(
        go.Scatter(
            x=poly_x,
            y=poly_y,
            mode="lines",
            name=f"regression {plant}",
            opacity=1,
            line=dict(color=COLORS[index], width=2, dash="longdash"),
        )
    )

# Axis titles of a 2D figure live on the layout's xaxis/yaxis; update_scenes
# only touches 3D scene subplots and would leave the titles unset here.
figure.update_layout(
    xaxis_title="Time after last watering in days",
    yaxis_title="soil moisture in %",
)

makedirs("./dist", exist_ok=True)
figure.write_html("./dist/moisture_regression.html")

figure.show()
