In [1]:
import pandas as pd
from pathlib import Path
import json
import re
from collections import Counter, defaultdict

# Root of the local data checkout; derived artefacts are read from and written
# to <BASE>/data/processed, which is created on first run.
BASE = Path('''/Users/corin44/Documents/myofitness/myo_datas''')
PROCESSED = BASE / 'data' / 'processed'
PROCESSED.mkdir(parents=True, exist_ok=True)

# Cleaned exercise dataset that the canonical names are derived from.
ex_path = PROCESSED / 'exercise_dataset_cleaned.csv'
df_ex = pd.read_csv(ex_path, low_memory=False)

# Load the existing mapping if present; otherwise start from an empty one.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
map_path = PROCESSED / 'exercises_canonical_mapping.json'
if map_path.exists():
    with open(map_path, 'r', encoding='utf-8') as f:
        base_map = json.load(f)
else:
    base_map = {}

# Words that are dropped from a name before two names are compared.
STOPWORDS = {
    'exercise', 'rep', 'reps', 'set', 'sets', 'kg', 'lb', 'lbs',
    'bodyweight', 'with', 'and', 'the',
}


def normalize_name(s):
    """Return an order-insensitive token signature for an exercise name.

    The name is lower-cased, every run of characters other than ``a-z``,
    ``0-9`` and space is replaced by a space, stopwords and purely numeric
    tokens are removed, and the remaining tokens are sorted and joined with
    single spaces.  Missing values (as judged by ``pd.isna``) give ``''``.
    """
    if pd.isna(s):
        return ''
    cleaned = re.sub(r"[^a-z0-9 ]+", ' ', str(s).lower())
    kept = []
    for word in cleaned.split():
        # Skip filler words and bare numbers (e.g. rep or weight counts).
        if word in STOPWORDS or word.isdigit():
            continue
        kept.append(word)
    kept.sort()
    return ' '.join(kept)

# Build the distinct candidate names.  `row_keys` holds, per dataframe row, the
# name in the same form as the candidates; `row_labels` holds the value that is
# tallied when picking a group's canonical spelling (the cleaned name when the
# 'Exercise_clean' column exists, otherwise the raw 'Exercise' value).
if 'Exercise_clean' in df_ex.columns:
    row_keys = df_ex['Exercise_clean']
    row_labels = df_ex['Exercise_clean']
    raw_names = df_ex['Exercise_clean'].dropna().astype(str).unique().tolist()
else:
    row_keys = df_ex['Exercise'].str.lower().str.replace(r"[^a-z0-9 ]", "", regex=True).str.strip()
    row_labels = df_ex['Exercise']
    raw_names = df_ex['Exercise'].dropna().astype(str).str.lower().str.replace(r"[^a-z0-9 ]", "", regex=True).str.strip().unique().tolist()

# Group names by normalized token signature.
sig_to_names = defaultdict(list)
for n in raw_names:
    # A name consisting only of stopwords/digits/punctuation normalizes to ''.
    # Group such a name under itself so unrelated names are not merged into
    # one bucket just because nothing informative is left of them.
    sig = normalize_name(n) or n
    sig_to_names[sig].append(n)

# Tally label occurrences per signature in a single pass over the rows,
# instead of re-filtering the whole dataframe once per signature.
sig_of = {name: sig for sig, names in sig_to_names.items() for name in names}
label_counts = defaultdict(Counter)
for key, label in zip(row_keys, row_labels):
    sig = sig_of.get(key)
    if sig is not None:
        label_counts[sig][label] += 1

# Build the mapping, seeded with the existing one.  Entries computed here
# overwrite seeded entries that share the same key.
canonical = dict(base_map)
for sig, names in sig_to_names.items():
    counts = label_counts.get(sig)
    if counts:
        # Most frequent spelling in the dataset wins.
        cand = counts.most_common(1)[0][0]
    else:
        # No matching rows were tallied: fall back to the name with the
        # fewest words, then the fewest characters.
        cand = min(names, key=lambda s: (len(s.split()), len(s)))
    for n in names:
        canonical[n] = cand

# apply mapping to dataframe

def map_to_canonical(x):
    """Return the canonical name for ``x``.

    Missing values are returned unchanged.  Otherwise ``str(x)`` is looked up
    directly in ``canonical``; if it is absent, its normalized signature is
    looked up in ``sig_to_names`` and the canonical entry of the first name in
    that group is used.  When neither lookup succeeds the stringified input is
    returned as-is.
    """
    if pd.isna(x):
        return x
    key = str(x)
    try:
        return canonical[key]
    except KeyError:
        pass
    # Fallback: match on the normalized token signature.
    group = sig_to_names.get(normalize_name(key))
    if not group:
        return key
    return canonical.get(group[0], key)

if 'Exercise_clean' in df_ex.columns:
    df_ex['Exercise_canonical'] = df_ex['Exercise_clean'].apply(map_to_canonical)
else:
    def _clean_then_map(value):
        """Clean a raw 'Exercise' value like the mapping keys, then map it."""
        # Check for missing values before str(): str(nan) would otherwise
        # turn them into the literal text 'nan'.
        if pd.isna(value):
            return value
        # Same cleaning as used when the candidate names were built from the
        # 'Exercise' column (lower-case, drop punctuation, strip), so the
        # lookup key has the same form as the mapping keys.
        cleaned = re.sub(r"[^a-z0-9 ]", "", str(value).lower()).strip()
        return map_to_canonical(cleaned)

    df_ex['Exercise_canonical'] = df_ex['Exercise'].apply(_clean_then_map)

# save updated dataset
out_ds = PROCESSED / 'exercise_dataset_canonical.csv'
df_ex.to_csv(out_ds, index=False)

# save mapping files; ensure_ascii=False emits non-ASCII characters verbatim,
# so the file is written as UTF-8 explicitly rather than in the locale encoding
with open(PROCESSED / 'exercises_canonical_mapping_refined.json', 'w', encoding='utf-8') as f:
    json.dump(canonical, f, indent=2, ensure_ascii=False)

# preview: the first 500 mapping entries in insertion order
items = list(canonical.items())[:500]
preview_df = pd.DataFrame(items, columns=['raw','canonical'])
preview_df.to_csv(PROCESSED / 'exercises_canonical_preview_refined.csv', index=False)

print('Saved refined mapping and merged dataset')



Saved refined mapping and merged dataset
