# 1. Import Brick Schema

In [77]:
from brickschema import Graph
g = Graph(load_brick=True)

In [78]:
q_all_equips = """
PREFIX brick: <https://brickschema.org/schema/Brick#>
PREFIX rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?equip WHERE {
    ?equip rdfs:subClassOf* brick:Equipment .
}
ORDER BY ?equip
"""
equip_list = [str(r[0]) for r in g.query(q_all_equips)]
print(len(equip_list), "equipment classes found")
print(equip_list[:25])

q_preferred_equips = """
PREFIX brick: <https://brickschema.org/schema/Brick#>
PREFIX rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?equip WHERE {
    ?equip rdfs:subClassOf* brick:Equipment .
    FILTER NOT EXISTS { ?equip brick:aliasOf ?alias }
}
ORDER BY ?equip
"""
pref_equips = [str(r[0]) for r in g.query(q_preferred_equips)]
print(len(pref_equips), "preferred equipment classes")


355 equipment classes found
['https://brickschema.org/schema/Brick#AED', 'https://brickschema.org/schema/Brick#AHU', 'https://brickschema.org/schema/Brick#Absorption_Chiller', 'https://brickschema.org/schema/Brick#Access_Control_Equipment', 'https://brickschema.org/schema/Brick#Access_Reader', 'https://brickschema.org/schema/Brick#Active_Chilled_Beam', 'https://brickschema.org/schema/Brick#Air_Cooled_Chiller', 'https://brickschema.org/schema/Brick#Air_Diffuser', 'https://brickschema.org/schema/Brick#Air_Handler_Unit', 'https://brickschema.org/schema/Brick#Air_Handling_Unit', 'https://brickschema.org/schema/Brick#Air_Plenum', 'https://brickschema.org/schema/Brick#Audio_Visual_Equipment', 'https://brickschema.org/schema/Brick#Automated_External_Defibrillator', 'https://brickschema.org/schema/Brick#Automatic_Switch', 'https://brickschema.org/schema/Brick#Automatic_Tint_Window', 'https://brickschema.org/schema/Brick#Automatic_Transfer_Switch', 'https://brickschema.org/schema/Brick#BACnet_C

In [79]:
import pandas as pd
pd.Series(equip_list, name="equipment classes found").to_csv("brick_equipment_list.csv", index=False)
pd.Series(pref_equips, name="preferred equipment classes").to_csv("brick_prefequipment_list.csv", index=False)

# 2. Data Pipeline
data preprocessing, feature engineering

(a) load building operation data

In [80]:
import numpy as np
from pathlib import Path

INPUT_PATH = "Test_data.csv"
SHEET_NAME = 0
RESAMPLE_RULE = "15T"
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [81]:
def read_alternating(path: str, sheet_name=0):
    if path.lower().endswith((".xlsx", ".xls")):
        df = pd.read_excel(path, sheet_name=sheet_name)
    else:
        df = pd.read_csv(path)
    # 1 pair - 2 column(time, value)
    cols = list(df.columns)
    if len(cols) % 2 != 0:
        raise ValueError("odd column, Please set (time,value)")
    long_parts = []
    for i in range(0, len(cols), 2):
        tcol, vcol = cols[i], cols[i+1]
        sub = df[[tcol, vcol]].copy()
        sub.columns = ["timestamp", "value"]
        # timestamp pacing
        sub["timestamp"] = pd.to_datetime(sub["timestamp"], errors="coerce")
        sub = sub.dropna(subset=["timestamp"]).copy()
        # extract value row
        sub = sub.dropna(subset=["value"], how="all")
        sub["point_id"] = f"p_{(i//2)+1}" if vcol is None else (str(vcol) or f"p_{(i//2)+1}")
        # save point_name to value column header
        sub["point_name"] = str(vcol)
        long_parts.append(sub[["point_id","point_name","timestamp","value"]])
    long_df = pd.concat(long_parts, ignore_index=True)
    return long_df

def detect_type_per_point(g: pd.DataFrame) -> str:
    # {0,1} value = binary
    vals = pd.to_numeric(g["value"], errors="coerce").dropna()
    uniq = set(np.unique(vals.values))
    if len(uniq) <= 2 and uniq.issubset({0,1}):
        return "binary"
    return "ratio"

def preprocess_long(long_df: pd.DataFrame):
    # change time: past to current
    long_df = long_df.sort_values("timestamp").reset_index(drop=True)
    long_df["value"] = pd.to_numeric(long_df["value"], errors="coerce")
    # determine type
    types = long_df.groupby("point_id").apply(detect_type_per_point).rename("data_type").reset_index()
    long_df = long_df.merge(types, on="point_id", how="left")
    return long_df

In [82]:
# ---- Ratio process ----
def build_ratio_features(long_df: pd.DataFrame, rule: str = "15T") -> pd.DataFrame:
    """
    long_df: columns = [point_id, timestamp, value, data_type, ...]
    rule   : resample freq (default '15T')

    처리 순서:
      1) time set by point
      2) same timestamp process
      3) generate uniform grid (date_range) after reindex
      4) ffill/bfill + interpolate
      5) statistical calculate
    """
    def safe_autocorr(series: pd.Series, lag: int) -> float:
        if lag <= 0 or len(series) <= lag + 1 or series.isna().all():
            return 0.0
        try:
            ac = series.autocorr(lag=lag)
            return float(ac if ac is not None else 0.0)
        except Exception:
            return 0.0

    ratio = long_df[long_df["data_type"] == "ratio"].copy()
    feats = []

    if ratio.empty:
        return pd.DataFrame(columns=[
            "point_id","n","missing_ratio","mean","std","min","p25","median","p75","max","iqr",
            "skew","kurt","range","zero_ratio","madiff_mean","max_diff","pct_change_std",
            "daily_cycle_strength","weekly_cycle_strength"
        ])

    daily_lag  = int(pd.Timedelta("24H")  / pd.Timedelta(rule))
    weekly_lag = int(pd.Timedelta("168H") / pd.Timedelta(rule))

    for pid, g in ratio.groupby("point_id"):
        g = g.dropna(subset=["timestamp"]).sort_values("timestamp")
        s = g.set_index("timestamp")["value"]

        s = s.groupby(level=0).mean()

        if s.index.size == 0:
            feats.append({
                "point_id": pid, "n": 0, "missing_ratio": 1.0,
                "mean": 0.0, "std": 0.0, "min": 0.0, "p25": 0.0, "median": 0.0, "p75": 0.0, "max": 0.0,
                "iqr": 0.0, "skew": 0.0, "kurt": 0.0, "range": 0.0, "zero_ratio": 0.0,
                "madiff_mean": 0.0, "max_diff": 0.0, "pct_change_std": 0.0,
                "daily_cycle_strength": 0.0, "weekly_cycle_strength": 0.0
            })
            continue

        full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
        s = s.reindex(full_index)

        s = s.ffill().bfill().interpolate()

        diffs = s.diff().abs()

        # statistical calculate
        std_ = float(s.std() if s.size > 1 else 0.0)
        skew_ = float(0.0 if std_ in [0, np.nan] else s.skew())
        kurt_ = float(0.0 if std_ in [0, np.nan] else s.kurt())

        stats = {
            "point_id": pid,
            "n": int(s.shape[0]),
            "missing_ratio": float(g["value"].isna().mean()),
            "mean": float(s.mean()),
            "std": std_,
            "min": float(s.min()),
            "p25": float(s.quantile(0.25)),
            "median": float(s.median()),
            "p75": float(s.quantile(0.75)),
            "max": float(s.max()),
            "iqr": float(s.quantile(0.75) - s.quantile(0.25)),
            "skew": skew_,
            "kurt": kurt_,
            "range": float(s.max() - s.min()),
            "zero_ratio": float((s == 0).mean()),
            "madiff_mean": float(diffs.mean() if diffs.size else 0.0),
            "max_diff": float(diffs.max() if diffs.size else 0.0),
            "pct_change_std": float(s.pct_change().std() if s.size > 1 else 0.0),
            "daily_cycle_strength": safe_autocorr(s, daily_lag),
            "weekly_cycle_strength": safe_autocorr(s, weekly_lag),
        }
        feats.append(stats)

    return pd.DataFrame(feats)

In [83]:
# ---- Binary process ----
def build_binary_features(long_df: pd.DataFrame, rule: str = "15T"):
    """
    long_df: columns = [point_id, timestamp, value, data_type, ...]
    rule   : resample freq (default '15T')

    output:
      - feat_binary_events: event log based summary
      - feat_binary_ffill : 15m resample+FFill based summary
    """
    binary = long_df[long_df["data_type"] == "binary"].copy()

    ev_rows = []
    if not binary.empty:
        for pid, g in binary.groupby("point_id"):
            # timestamp error solve
            g = g.dropna(subset=["timestamp", "value"]).sort_values("timestamp")
            g = g.groupby("timestamp", as_index=False).last()

            if g.empty:
                ev_rows.append({"point_id": pid, "switch_count": 0,
                                "avg_on_duration_s": 0.0, "on_ratio": 0.0})
                continue

            v = g["value"].astype(int).reset_index(drop=True)
            t = g["timestamp"].astype("int64").reset_index(drop=True) // 10**9  # seconds

            transitions = int((v.diff().fillna(0) != 0).sum())

            # average on time
            if (v == 1).sum() >= 2:
                on_times = t[v == 1].diff().dropna()
                avg_on_s = float(on_times.mean()) if not on_times.empty else 0.0
            else:
                avg_on_s = 0.0

            on_ratio = float((v == 1).mean())
            ev_rows.append({
                "point_id": pid,
                "switch_count": transitions,
                "avg_on_duration_s": avg_on_s,
                "on_ratio": on_ratio
            })
    feat_binary_events = pd.DataFrame(ev_rows)

    # (B) 15m resample + FFill based summary
    ff_rows = []
    if not binary.empty:
        for pid, g in binary.groupby("point_id"):
            g = g.dropna(subset=["timestamp", "value"]).sort_values("timestamp")
            g = g.groupby("timestamp", as_index=False).last()
            if g.empty:
                ff_rows.append({"point_id": pid, "ffill_on_ratio": 0.0, "ffill_switch_count": 0})
                continue

            s = g.set_index("timestamp")["value"].astype(int)
            full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
            s = s.reindex(full_index).ffill().fillna(0).astype(int)
            sdiff = s.diff().fillna(0).abs()

            ff_rows.append({
                "point_id": pid,
                "ffill_on_ratio": float((s == 1).mean()),
                "ffill_switch_count": int((sdiff > 0).sum()),
            })
    feat_binary_ffill = pd.DataFrame(ff_rows)

    return feat_binary_events, feat_binary_ffill

In [84]:
def main():
    long_df = read_alternating(INPUT_PATH, SHEET_NAME)
    long_df = preprocess_long(long_df)
    long_df.to_csv(OUTPUT_DIR/"long_timeseries.csv", index=False)

    feat_ratio = build_ratio_features(long_df)
    feat_bin_ev, feat_bin_ff = build_binary_features(long_df)

    # murge (NaN -> 0)
    features = (feat_ratio.set_index("point_id")
                .join(feat_bin_ev.set_index("point_id"), how="outer")
                .join(feat_bin_ff.set_index("point_id"), how="outer")).reset_index()
    for c in features.columns:
        if c != "point_id":
            features[c] = features[c].fillna(0)

    # save
    feat_ratio.to_csv(OUTPUT_DIR/"features_ratio.csv", index=False)
    feat_bin_ev.to_csv(OUTPUT_DIR/"features_binary_events.csv", index=False)
    feat_bin_ff.to_csv(OUTPUT_DIR/"features_binary_ffill.csv", index=False)
    features.to_csv(OUTPUT_DIR/"features_all.csv", index=False)

    # print summary
    print("=== Summary ===")
    print("points:", long_df["point_id"].nunique())
    print("ratio points:", (long_df.groupby("point_id")["data_type"].first()=="ratio").sum())
    print("binary points:", (long_df.groupby("point_id")["data_type"].first()=="binary").sum())
    print("Saved to:", OUTPUT_DIR.resolve())

if __name__ == "__main__":
    main()

  types = long_df.groupby("point_id").apply(detect_type_per_point).rename("data_type").reset_index()


=== Summary ===
points: 7
ratio points: 4
binary points: 3
Saved to: /Users/kim-yujin/Desktop/ALDA_Proj/outputs


  daily_lag  = int(pd.Timedelta("24H")  / pd.Timedelta(rule))
  daily_lag  = int(pd.Timedelta("24H")  / pd.Timedelta(rule))
  weekly_lag = int(pd.Timedelta("168H") / pd.Timedelta(rule))
  weekly_lag = int(pd.Timedelta("168H") / pd.Timedelta(rule))
  full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
  full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
  full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
  c /= stddev[:, None]
  full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
  full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
  full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)
  full_index = pd.date_range(start=s.index.min(), end=s.index.max(), freq=rule)


# 3. Generate candidate

In [85]:
INP  = "outputs/features_all.csv"
OUT  = "outputs/tag_candidate_sheet.csv"

# unit set: "F",  "C"
TEMP_UNIT = "F"   

def f(r, k, d=np.nan):
    try:
        return float(r[k])
    except Exception:
        return d

def to_c_if_needed(x):
    if np.isnan(x): 
        return x
    if TEMP_UNIT.upper() == "F":
        return (x - 32.0) * 5.0/9.0
    return x

def tag_candidates(row):
    mean  = f(row, "mean")
    std   = f(row, "std", 0.0)
    rng   = f(row, "range", 0.0)
    zero  = f(row, "zero_ratio", 0.0)
    daily = f(row, "daily_cycle_strength", 0.0)

    tags = {}

    # ---- (A) Binary determine: True - activate evidence
    swc  = f(row, "ffill_switch_count", np.nan)
    sw   = f(row, "switch_count", np.nan)
    onrt = f(row, "on_ratio", np.nan)
    is_binary = (
        (not np.isnan(swc) and swc > 0) or
        (not np.isnan(sw)  and sw  > 0) or
        (not np.isnan(onrt) and 0.0 < onrt < 1.0)
    )

    if is_binary:
        switches = int(swc if not np.isnan(swc) else (sw if not np.isnan(sw) else 0))
        base = 0.65 + min(0.35, switches/50.0)
        tags["status"]  = base
        tags["command"] = base*0.85
        tags["mode"]    = base*0.80
        return tags

    # ---- (B) Ratio signal: temperature/CO2/flow rate/pressure
    mean_c = to_c_if_needed(mean)

    # Temperature band(Celcius)
    if not np.isnan(mean_c):
        # CHW ~ 3–14°C, Air ~ 16–32°C, HW ~ 35–75°C (rough guide)
        if 3 <= mean_c <= 14:
            tags["temperature"] = max(tags.get("temperature",0), 0.8 + 0.1*max(0,daily))
        if 16 <= mean_c <= 32:
            tags["temperature"] = max(tags.get("temperature",0), 0.75 + 0.15*max(0,daily))
        if 35 <= mean_c <= 75:
            tags["temperature"] = max(tags.get("temperature",0), 0.7 + 0.05*max(0,daily))

    # CO2-like: 300~2000ppm, large fluctuation, few 0, major week cycle
    if not np.isnan(mean) and 250 <= mean <= 2000 and rng >= 200 and zero < 0.05:
        tags["co2"] = max(tags.get("co2",0), 0.7 + 0.2*max(0,daily))

    # Flow-like: often 0, large variance when on 
    if zero >= 0.05 and (np.isnan(mean) or mean >= 0) and rng > 1 and (std > 0.1 or rng > 5):
        tags["flow"] = max(tags.get("flow",0), 0.6 + 0.2*min(0.5, zero))

    # Pressure-like: + baseline, medium fluctuation, minor week cycle
    if not np.isnan(mean) and mean >= 50 and rng >= 10 and daily < 0.3:
        base = 0.55 + 0.1*(rng>50) + 0.05*(std<10)
        tags["pressure"] = max(tags.get("pressure",0), base)

    # if Ratio suppose generic sensor 
    tags["sensor"] = max(tags.get("sensor",0), 0.5)
    return tags

feat = pd.read_csv(INP)
rows=[]
for _, r in feat.iterrows():
    t = tag_candidates(r)
    top = sorted(t.items(), key=lambda x: -x[1])[:3]
    rows.append({
        "point_id": r["point_id"],
        "tag1": top[0][0] if len(top)>0 else "",
        "tag1_score": round(top[0][1],3) if len(top)>0 else "",
        "tag2": top[1][0] if len(top)>1 else "",
        "tag2_score": round(top[1][1],3) if len(top)>1 else "",
        "tag3": top[2][0] if len(top)>2 else "",
        "tag3_score": round(top[2][1],3) if len(top)>2 else "",
        "gold_tags": ""   # tag confirmed by person(multi): 예) "temperature,sensor"
    })

out = pd.DataFrame(rows)
out.to_csv(OUT, index=False)
print("saved:", OUT)

saved: outputs/tag_candidate_sheet.csv


In [86]:
# now mannually add gold_tag
df = pd.read_csv("outputs/tag_candidate_sheet.csv")

df.loc[df["point_id"] == "AHU-4 SaTmp", "gold_tags"] = "temperature"
df.loc[df["point_id"] == "AHU-4 ChwEnTmp", "gold_tags"] = "temperature"
df.loc[df["point_id"] == "AHU-4 AvgCcoilTmp", "gold_tags"] = "temperature"
df.loc[df["point_id"] == "AHU-4 AvgMaTmp", "gold_tags"] = "temperature"
df.loc[df["point_id"] == "AHU-4 HrwSts", "gold_tags"] = "status"
df.loc[df["point_id"] == "AHU-4 EconMd", "gold_tags"] = "status"

df.to_csv("outputs/tag_candidate_sheet_labeled.csv", index=False)
print("complete add label:", "outputs/tag_candidate_sheet_labeled.csv")

complete add label: outputs/tag_candidate_sheet_labeled.csv


  df.loc[df["point_id"] == "AHU-4 SaTmp", "gold_tags"] = "temperature"


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

df = pd.read_csv("outputs/tag_candidate_sheet_labeled.csv")
df = df[df["gold_tags"].notna() & (df["gold_tags"]!="")].copy()

# X
feat = pd.read_csv("outputs/features_all.csv").set_index("point_id")
X = feat.loc[df["point_id"]].drop(columns=["n"], errors="ignore").values  

# y (multi label)
Y = df["gold_tags"].apply(lambda s: [t.strip() for t in s.split(",") if t.strip()])
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(Y)

clf = OneVsRestClassifier(XGBClassifier(
    n_estimators=400, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, tree_method="hist", random_state=42
))

# simple CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
major = y.argmax(axis=1)

counts = np.bincount(major)
min_class = counts[counts > 0].min() if counts.size else 0

desired_splits = 5
n_splits = max(2, min(desired_splits, int(min_class)))  

clf = OneVsRestClassifier(XGBClassifier(
    n_estimators=400, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    tree_method="hist", random_state=42
))

scores = []

if n_splits >= 2:
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for tr, te in skf.split(X, major):
        clf.fit(X[tr], y[tr])
        pred = (clf.predict_proba(X[te]) > 0.5).astype(int)
        scores.append(f1_score(y[te], pred, average="macro", zero_division=0))
    print(f"Macro F1 (CV mean, {n_splits}-fold):", float(np.mean(scores)))
else:
    X_tr, X_te, y_tr, y_te, major_tr, major_te = train_test_split(
        X, y, major, test_size=0.25, random_state=42,
        stratify=major if len(np.unique(major)) > 1 else None
    )
    clf.fit(X_tr, y_tr)
    pred = (clf.predict_proba(X_te) > 0.5).astype(int)
    print("Macro F1 (holdout):", f1_score(y_te, pred, average="macro", zero_division=0))

Macro F1 (CV mean, 2-fold): 0.4
