In [6]:
import pandas as pd
from pathlib import Path

# ==== CONFIG ====
FILENAME = "btcfuture_agg.csv"   # just the filename; we'll search for it
KEEP     = "last"                # or "first" if you prefer earliest row per day
# Candidate timestamp columns, in priority order.
TIME_COLUMNS = ("open_time", "close_time", "timestamp", "date", "time")


def find_input_file(filename: str, root: "Path | None" = None) -> Path:
    """Return the path of *filename* found anywhere under *root*.

    *root* defaults to the current working directory.  When several files
    share the name, the shallowest one (ties broken alphabetically) is
    returned and the others are listed, so the choice does not depend on
    filesystem traversal order.

    Raises:
        FileNotFoundError: no file called *filename* exists under *root*.
    """
    root = Path.cwd() if root is None else Path(root)
    matches = sorted(root.rglob(filename), key=lambda p: (len(p.parts), str(p)))
    if not matches:
        raise FileNotFoundError(f"Could not find '{filename}' under {root}. "
                                f"Tip: check the exact filename or set FILENAME to the right name.")
    if len(matches) > 1:
        others = ", ".join(str(p) for p in matches[1:])
        print(f"Warning: {len(matches)} files named '{filename}' found; "
              f"using {matches[0]} (ignoring: {others})")
    return matches[0]


def detect_time_column(columns) -> str:
    """Return the first name from TIME_COLUMNS that is present in *columns*.

    Raises:
        ValueError: none of the expected column names is present.
    """
    found = next((c for c in TIME_COLUMNS if c in columns), None)
    if found is None:
        raise ValueError("No time column found. Expected one of: open_time/close_time/timestamp/date/time")
    return found


def to_utc_timestamps(values: pd.Series) -> pd.Series:
    """Parse *values* into timezone-aware UTC timestamps (NaT where unparsable).

    Numeric input is treated as a Unix epoch.  pandas would read bare numbers
    as nanoseconds, which maps second/millisecond epochs onto 1970-01-01, so
    the unit is inferred from the magnitude of the largest value instead.
    This is a heuristic aimed at present-day timestamps.
    """
    is_numeric = pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values)
    if not is_numeric:
        return pd.to_datetime(values, errors="coerce", utc=True)

    magnitude = values.abs().max()
    if magnitude < 1e11:
        unit = "s"
    elif magnitude < 1e14:
        unit = "ms"
    elif magnitude < 1e17:
        unit = "us"
    else:
        unit = "ns"   # also reached when every value is NaN
    return pd.to_datetime(values, unit=unit, errors="coerce", utc=True)


def dedupe_by_day(df: pd.DataFrame, time_col: str, keep: str = "last") -> pd.DataFrame:
    """Return a copy of *df* reduced to one row per UTC calendar day.

    A ``date`` column (timezone-naive midnight of the UTC day) is added, or
    replaced if it already exists.  Rows are ordered by their full parsed
    timestamp with a stable sort before de-duplication, so ``keep="last"``
    keeps the chronologically latest row of each day, ``keep="first"`` the
    earliest, and exact ties fall back to the original row order.

    Rows whose timestamp cannot be parsed are never merged with each other:
    they are all retained, with ``date`` set to NaT, after the dated rows.

    The input frame is not modified.
    """
    work = df.reset_index(drop=True)
    # Parse before touching "date": time_col may itself be "date", and
    # normalizing it first would discard the intra-day ordering.
    stamps = to_utc_timestamps(work[time_col])
    work["date"] = stamps.dt.tz_localize(None).dt.normalize()

    parsed = stamps.notna()
    chronological = stamps[parsed].sort_values(kind="mergesort").index
    kept = work.loc[chronological].drop_duplicates(subset="date", keep=keep).index
    unparsed = work.index[(~parsed).to_numpy()]
    return work.loc[kept.append(unparsed)].reset_index(drop=True)


# Guarded so that importing this code never rewrites a file; in a notebook
# cell or a direct script run __name__ is "__main__" and the pipeline executes.
if __name__ == "__main__":
    # ==== FIND THE FILE ====
    IN_FILE = find_input_file(FILENAME)
    OUT_FILE = IN_FILE               # overwrite in place

    print(f"Using file: {IN_FILE}")

    # ==== LOAD ====
    df = pd.read_csv(IN_FILE)
    time_col = detect_time_column(df.columns)

    # ==== NORMALIZE TO DAILY (UTC) & DEDUPE BY DAY ====
    before = len(df)
    clean = dedupe_by_day(df, time_col, keep=KEEP)

    # ==== CHECK, THEN SAVE ====
    # Validate before writing so a failed check never clobbers the input.
    unparsed_rows = int(clean["date"].isna().sum())
    if unparsed_rows:
        print(f"Warning: {unparsed_rows} row(s) have an unparsable '{time_col}' "
              f"and were kept without de-duplication.")
    if clean["date"].dropna().duplicated().any():
        raise RuntimeError("Still found duplicate dates after dedupe.")

    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot leave a truncated CSV where the original used to be.
    tmp_file = OUT_FILE.with_name(OUT_FILE.name + ".tmp")
    clean.to_csv(tmp_file, index=False)
    tmp_file.replace(OUT_FILE)

    print(f"Rows before: {before} | after: {len(clean)} | removed: {before - len(clean)}")
    print(f"Saved back to: {OUT_FILE}")



Using file: /Users/duncanwan/Desktop/learning/Bitcoin/Clean_data/btcfuture_agg.csv
Rows before: 154 | after: 123 | removed: 31
Saved back to: /Users/duncanwan/Desktop/learning/Bitcoin/Clean_data/btcfuture_agg.csv
