In [1]:
import pandas as pd
from pathlib import Path
import difflib
import json

# Project data root (a machine-specific absolute path) and the folder holding
# processed artefacts; the folder is created up front if it is missing.
BASE = Path('''/Users/corin44/Documents/myofitness/myo_datas''')
PROCESSED = BASE.joinpath('data', 'processed')
PROCESSED.mkdir(exist_ok=True, parents=True)

# Load the cleaned exercise table. low_memory=False turns off pandas' chunked
# parsing, which avoids mixed-dtype inference within a column.
ex_path = PROCESSED.joinpath('exercise_dataset_cleaned.csv')
df_ex = pd.read_csv(ex_path, low_memory=False)

# Collect the distinct exercise names that will be clustered.
if 'Exercise_clean' not in df_ex.columns:
    # No pre-cleaned column available: normalise the raw names here by
    # lower-casing, dropping every character other than a-z, 0-9 and space,
    # and trimming surrounding whitespace.
    names = (
        df_ex['Exercise']
        .dropna()
        .str.lower()
        .str.replace(r"[^a-z0-9 ]", "", regex=True)
        .str.strip()
        .unique()
        .tolist()
    )
else:
    # A cleaned column already exists; use its distinct non-null values as-is.
    names = df_ex['Exercise_clean'].dropna().unique().tolist()

# Greedy clustering of near-duplicate names using difflib.
#
# Names are visited in sorted order. Each name that has not been assigned yet
# seeds a group made of the still-unassigned names whose similarity ratio to
# it is at least 0.8 (capped at 50 per group by the n= argument), and every
# member of the group is mapped to one canonical spelling.
names = sorted(names)
canonical = {}
used = set()
for name in names:
    if name in used:
        continue
    # Match only against names that are still unassigned. Searching the full
    # list would let a later seed re-claim a name an earlier group had already
    # mapped, overwriting that entry and leaving canonical values that do not
    # map to themselves.
    remaining = [other for other in names if other not in used]
    matches = difflib.get_close_matches(name, remaining, n=50, cutoff=0.8)
    group = set(matches)
    # The seed scores 1.0 against itself and is normally returned; adding it
    # explicitly guarantees it is always assigned.
    group.add(name)
    used.update(group)
    # Canonical form: fewest tokens, then fewest characters. The string itself
    # is the final tie-breaker so the choice does not depend on set iteration
    # order, which varies between runs because of string hash randomisation.
    cand = min(group, key=lambda s: (len(s.split()), len(s), s))
    for m in group:
        canonical[m] = cand

# Write a preview CSV holding the first 200 mappings (in dict insertion
# order, not ranked in any way) for quick manual inspection.
items = list(canonical.items())[:200]
preview_df = pd.DataFrame(items, columns=['raw','canonical'])
preview_path = PROCESSED / 'exercises_canonical_preview.csv'
preview_df.to_csv(preview_path, index=False)

# Save the full mapping as JSON. ensure_ascii=False writes non-ASCII
# characters verbatim, so the file is opened with an explicit UTF-8 encoding
# rather than the platform's locale default, which may be unable to encode
# them or may produce a file that is not valid UTF-8 JSON.
with open(PROCESSED / 'exercises_canonical_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(canonical, f, indent=2, ensure_ascii=False)

print('Saved preview to', preview_path)



Saved preview to /Users/corin44/Documents/myofitness/myo_datas/data/processed/exercises_canonical_preview.csv
