In [55]:
import os
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import polars as pl
from hampel import hampel

# Root folder of the dataset, read from the environment so that no local path
# is hardcoded in the notebook.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if DATA_DIRECTORY is None:
    # Fail fast: without this guard the problem only surfaces as an opaque
    # TypeError inside os.path.join further down.
    raise RuntimeError(
        "The DATA_DIRECTORY environment variable must be set to the data root folder."
    )

# Bounds of the analysis window as timezone-aware UTC datetimes; they are used
# to restrict `creation_timestamp` when the frame is filtered.
start_time = datetime(2024, 4, 11, tzinfo=timezone.utc)
end_time = datetime(2024, 7, 11, tzinfo=timezone.utc)

# Load the processed parquet file from <DATA_DIRECTORY>/processed/.
df = pl.read_parquet(os.path.join(DATA_DIRECTORY, "processed", "1m_cal_corr_acropolis.parquet"))

In [87]:
# Keep only the rows of one system inside [start_time, end_time], then reduce
# the frame to the timestamp and the Float32-cast `gmp343_corrected` column.
df_filtered = df.filter(
    pl.col("system_name").is_in(["tum-esm-midcost-raspi-10"])
    & pl.col("creation_timestamp").is_between(start_time, end_time)
).select(
    "creation_timestamp",
    pl.col("gmp343_corrected").cast(pl.Float32),
)

# The hampel function is fed a pandas Series here.
data = df_filtered["gmp343_corrected"].to_pandas()

# Run the Hampel filter over the series (window_size=120, n_sigma=2.0).
result = hampel(data, window_size=120, n_sigma=2.0)

print(result.filtered_data)

0         458.851868
1         462.119598
2         466.166412
3         461.548615
4         462.653809
             ...    
119337    457.288391
119338    458.418182
119339    458.797546
119340    459.016510
119341    458.511230
Length: 119342, dtype: float32


In [88]:
# Fraction of samples that the Hampel filter reported as outliers.
print(len(result.outlier_indices) / data.size)

0.022858675068291128


In [89]:
# Attach the filtered series as `hampel_filter`, then mark every row whose
# original value differs from the filtered one as `Flagged`. Two with_columns
# steps are needed because the second expression reads the column created by
# the first.
df_filtered = (
    df_filtered
    .with_columns(pl.from_pandas(result.filtered_data).alias("hampel_filter"))
    .with_columns(
        (pl.col("gmp343_corrected") != pl.col("hampel_filter")).alias("Flagged")
    )
)

In [90]:
# Scatter plot of the original signal over time, coloured by whether the
# Hampel filter changed the sample. `px` comes from the notebook's top import
# cell, so this cell no longer imports it locally.
fig = px.scatter(
    df_filtered,
    x="creation_timestamp",
    y="gmp343_corrected",
    color="Flagged",
    title="Hampel filter flags on gmp343_corrected",
    labels={
        "creation_timestamp": "Creation timestamp",
        "gmp343_corrected": "gmp343_corrected",
        "Flagged": "Flagged by Hampel filter",
    },
)
fig.show()