In [21]:
import json
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature-engineering cell: turns the "ttl_" season totals into per-match
# averages, summarizes every other numeric stat with PCA, and writes the
# result to data/feature_eng_stats.json.

PCA_MAX_COMPONENTS = 4  # upper bound on the number of principal components kept


def average_totals(stats, matches_played):
    """Convert season totals into per-match averages.

    Args:
        stats: mapping of stat name to value (the nested "stats" -> "stats" dict).
        matches_played: positive number each total is divided by.

    Returns:
        A pair ``(averages, source_keys)``. ``averages`` maps ``"avg_<name>"``
        to ``total / matches_played`` for every numeric ``"ttl_<name>"`` entry;
        ``source_keys`` lists the original ``"ttl_"`` keys in input order.
    """
    averages = {}
    source_keys = []
    for key, value in stats.items():
        # Non-numeric totals (e.g. JSON null) cannot be averaged; skip them the
        # same way non-numeric extra features are skipped.
        if key.startswith("ttl_") and isinstance(value, (int, float)):
            averages["avg_" + key[4:]] = value / matches_played
            source_keys.append(key)
    return averages, source_keys


def collect_extra_features(stats_groups):
    """Flatten every numeric value outside the "stats" group into one dict.

    Keys are ``"<group>_<name>"`` so identically named stats from different
    groups do not collide. Groups whose value is not a dict are ignored.
    """
    extra = {}
    for group, sub_dict in stats_groups.items():
        if group == "stats" or not isinstance(sub_dict, dict):
            continue  # "stats" is the manual-features branch
        for key, value in sub_dict.items():
            if isinstance(value, (int, float)):
                extra[f"{group}_{key}"] = value
    return extra


def build_records(data):
    """Build the engineered records and collect each record's extra features.

    A record is reported on stdout and skipped when ``matches_played`` is
    missing, non-numeric, or not positive (dividing by it would fail or be
    meaningless).

    Args:
        data: iterable of raw record dicts with "meta_data", "season_id" and
            a nested "stats" mapping.

    Returns:
        ``(output_records, extra_dicts, manual_keys)`` where ``extra_dicts[i]``
        holds the extra features of ``output_records[i]`` and ``manual_keys``
        is the union, in first-seen order, of the "ttl_" keys that were
        averaged in any record.
    """
    output_records = []
    extra_dicts = []
    seen_manual_keys = {}  # dict used as an insertion-ordered set

    for i, record in enumerate(data):
        meta = record.get("meta_data", {}).copy()
        season_id = record.get("season_id", None)
        stats_groups = record.get("stats", {})

        # --- Extract Manual (Averaged) Features from "stats" subbranch ---
        stats_nested = stats_groups.get("stats", {})
        if "matches_played" not in stats_nested:
            print(f"Record {i} skipped: 'matches_played' not found.")
            continue
        matches_played = stats_nested["matches_played"]
        if not isinstance(matches_played, (int, float)) or matches_played <= 0:
            print(f"Record {i} skipped: 'matches_played' is not a positive number.")
            continue

        manual_features, used_keys = average_totals(stats_nested, matches_played)
        seen_manual_keys.update(dict.fromkeys(used_keys))

        # --- Extract Extra Features from All Subbranches Except "stats" ---
        extra_dicts.append(collect_extra_features(stats_groups))

        # Build the record with manual features, meta data, and season_id.
        output_records.append({
            "meta_data": meta,
            "stats": {
                "stats": manual_features,  # Manual features will be here.
            },
            "season_id": season_id,
        })

    return output_records, extra_dicts, list(seen_manual_keys)


def extra_feature_matrix(extra_dicts):
    """Arrange per-record extra features into a rectangular matrix.

    The column order is the sorted union of keys across *all* records, so a
    feature that first appears in a later record is not dropped. A feature
    missing from a record is filled with 0.

    Returns:
        ``(keys, rows)`` with ``rows[i][j]`` the value of ``keys[j]`` for
        record ``i``.
    """
    keys = sorted(set().union(*extra_dicts))
    rows = [[extra.get(k, 0) for k in keys] for extra in extra_dicts]
    return keys, rows


# In a notebook cell or when run as a script __name__ is "__main__", so the
# pipeline below executes and leaves its results in the top-level namespace.
if __name__ == "__main__":
    # Load the raw JSON data (JSON text is UTF-8; do not rely on the locale).
    with open("data/raw_team_stats.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    output_records, extra_dicts, manual_keys = build_records(data)
    extra_keys, extra_features_list = extra_feature_matrix(extra_dicts)

    # --- Perform PCA on the collected extra features ---
    # The component count cannot exceed the number of samples or of features.
    n_components = min(PCA_MAX_COMPONENTS, len(extra_features_list), len(extra_keys))
    if n_components > 0:
        X_extra = pd.DataFrame(extra_features_list, columns=extra_keys)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_extra)
        pca = PCA(n_components=n_components)
        X_pca = pca.fit_transform(X_scaled)

        # --- Append the PCA Components to Each Record as "pca_features" ---
        for i, record in enumerate(output_records):
            # Plain floats keep the JSON output independent of numpy scalars.
            record["pca_features"] = {
                f"pca_{j+1}": float(X_pca[i, j]) for j in range(n_components)
            }
    else:
        # Nothing to decompose (no records or no extra features): keep the
        # output schema stable with an empty component mapping.
        for record in output_records:
            record["pca_features"] = {}

    # --- Save the final engineered records to a JSON file ---
    with open("data/feature_eng_stats.json", "w", encoding="utf-8") as f:
        json.dump(output_records, f, indent=4)

    print(f"manual features: {manual_keys}")
    print(f"extra features: {extra_keys}")
    print(
        f"Feature engineering complete: PCA on remaining features "
        f"({n_components} components) appended as 'pca_features'."
    )

manual features: ['ttl_gls', 'ttl_ast', 'ttl_non_pen_gls', 'ttl_xg', 'ttl_non_pen_xg', 'ttl_xag', 'ttl_pk_made', 'ttl_pk_att', 'ttl_yellow_cards', 'ttl_red_cards', 'ttl_carries_prog', 'ttl_passes_prog']
extra features: ['defense_pct_tkl_drb_suc', 'defense_ttl_blocks', 'defense_ttl_clearances', 'defense_ttl_def_error', 'defense_ttl_int', 'defense_ttl_sh_blocked', 'defense_ttl_tkl', 'defense_ttl_tkl_att_third', 'defense_ttl_tkl_def_third', 'defense_ttl_tkl_drb', 'defense_ttl_tkl_drb_att', 'defense_ttl_tkl_mid_third', 'defense_ttl_tkl_plus_int', 'defense_ttl_tkl_won', 'gca_avg_gca', 'gca_avg_sca', 'gca_ttl_def_gca', 'gca_ttl_def_sca', 'gca_ttl_fld_gca', 'gca_ttl_fld_sca', 'gca_ttl_gca', 'gca_ttl_pass_dead_gca', 'gca_ttl_pass_live_gca', 'gca_ttl_pass_live_sca', 'gca_ttl_sca', 'gca_ttl_sh_gca', 'gca_ttl_sh_sca', 'gca_ttl_take_on_gca', 'gca_ttl_take_on_sca', 'keepers_avg_gls_ag', 'keepers_clean_sheet_pct', 'keepers_clean_sheets', 'keepers_pk_att_ag', 'keepers_pk_made_ag', 'keepers_pk_miss_ag