## Loading and Inspecting Data

In [3]:
import json
import pandas as pd
import numpy as np
import os, sys
import yaml
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import src.data_management as my_dm
seed=456

In [2]:


with open("../config.yaml", "r") as f:
    config = yaml.safe_load(f)

train_file_path = config["data"]["input_train_path"]
test_file_path = config["data"]["input_test_path"]

train_data = []

print(f"Loading data from '{train_file_path}'...")
try:
    with open(train_file_path, 'r') as f:
        for line in f:
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))

    print(f"Successfully loaded {len(train_data)} battles.")

except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print("Please make sure you have added the competition data to this notebook.")

Loading data from '../data/raw/train.jsonl'...
Successfully loaded 10000 battles.


In [5]:
def create_simple_features(data: list[dict]) -> pd.DataFrame:
    """
    A very basic feature extraction function.
    It only uses the aggregated base stats of the player's team and opponent's lead.
    """
    feature_list = []
    for battle in data:
        features = {}
        features["battle_id"] = battle.get("battle_id", -1)
        # --- Player 1 Team Features ---
        p1_team = battle.get('p1_team_details', [])
        if p1_team:
            stats = ['base_hp', 'base_atk', 'base_def', 'base_spa', 'base_spd', 'base_spe']
            for stat in stats:               ### This helps the model to have a better idea abt the time instead of having only mean
                values = [p.get(stat, 0) for p in p1_team]
                features[f'p1_mean_{stat}'] = np.mean(values)
                features[f'p1_min_{stat}'] = np.min(values)
                features[f'p1_max_{stat}'] = np.max(values)
                features[f'p1_std_{stat}'] = np.std(values)
            features['p1_mean_hp'] = np.mean([p.get('base_hp', 0) for p in p1_team])
            features['p1_mean_spe'] = np.mean([p.get('base_spe', 0) for p in p1_team])
            features['p1_mean_atk'] = np.mean([p.get('base_atk', 0) for p in p1_team])
            features['p1_mean_def'] = np.mean([p.get('base_def', 0) for p in p1_team])
            features['p1_mean_spa'] = np.mean([p.get('base_spa', 0) for p in p1_team])
            features['p1_mean_spd'] = np.mean([p.get('base_spd', 0) for p in p1_team])
            ### We can also build derivated feature like how much is off/def our team
            # team stats
            base_atk = features['p1_mean_base_atk']
            base_spa = features['p1_mean_base_spa']
            base_def = features['p1_mean_base_def']
            base_spd = features['p1_mean_base_spd']
            base_spe = features['p1_mean_base_spe']
            base_hp  = features['p1_mean_base_hp']

            ## constructing new features
            offense = base_atk + base_spa
            defense = base_def + base_spd
            features['p1_offense_mean']    = offense
            features['p1_defense_mean']    = defense
            features['p1_atk_def_ratio']   = offense / (defense + 1e-6)
            # average per-Pokémon total base stats
            p1_totals = [sum(p.get(s, 0) for s in stats) for p in p1_team]
            features['p1_total_base_power'] = float(np.mean(p1_totals))
            features['p1_stat_variety']     = float(np.std(p1_totals))
            features['p1_style_index']      = offense / (offense + defense + 1e-6)
            features['p1_hp_ratio']         = base_hp / (offense + defense + base_spe + 1e-6)
            # fastest member speed (robust comparator vs P2 lead)
            features['p1_max_speed']        = float(np.max([p.get('base_spe', 0) for p in p1_team]))
        
    
            
###########################################AGGIUNTA####################################################
          # Estrazione tipi
            type_counts = {t: 0 for t in all_types}
            for p in p1_team:
                for t in p.get('types', []):
                    type_counts[t] += 1
            team_size = len(p1_team)
            for t in all_types:
                features[f'p1_type_{t}'] = type_counts[t] / team_size if team_size > 0 else 0
##########################################AGGIUNTA#####################################################
        
        # --- Player 2 Lead Features ---
        p2_lead = battle.get('p2_lead_details')
        if p2_lead:
            # Player 2's lead Pokémon's stats
            features['p2_lead_hp'] = p2_lead.get('base_hp', 0)
            features['p2_lead_spe'] = p2_lead.get('base_spe', 0)
            features['p2_lead_atk'] = p2_lead.get('base_atk', 0)
            features['p2_lead_def'] = p2_lead.get('base_def', 0)
            features['p2_lead_spd'] = p2_lead.get('base_spd', 0)
            features['p2_lead_spa'] = p2_lead.get('base_spa', 0)
            ## types of p2 lead
            for t in all_types:
                features[f'p2_lead_type_{t}'] = 0.0
            for t in p2_lead.get('types', []):
                if t in all_types:
                    features[f'p2_lead_type_{t}'] = 1.0
                if battle.get('player_won') is not None:
                    features['player_won'] = int(battle['player_won'])
        ## Extracting battle features
        timeline = battle.get("battle_timeline", [])
        if len(timeline) > 0:
            # Average HP percentage for both players
            p1_hp = [turn["p1_pokemon_state"].get("hp_pct", np.nan) for turn in timeline]
            p2_hp = [turn["p2_pokemon_state"].get("hp_pct", np.nan) for turn in timeline]

            features["p1_mean_hp_pct"] = np.nanmean(p1_hp)
            features["p2_mean_hp_pct"] = np.nanmean(p2_hp)
            features["p1_final_hp"] = p1_hp[-1]
            features["p2_final_hp"] = p2_hp[-1]
            features["p1_total_damage"] = max(p1_hp) - min(p1_hp)
            features["p2_total_damage"] = max(p2_hp) - min(p2_hp)

            # Count total moves used
            p1_moves = [turn.get("p1_move_details", {}).get("name") for turn in timeline if turn.get("p1_move_details")]
            p2_moves = [turn.get("p2_move_details", {}).get("name") for turn in timeline if turn.get("p2_move_details")]
            features["p1_total_moves"] = len(p1_moves)
            features["p2_total_moves"] = len(p2_moves)

            # Count unique move types used
            p1_move_types = [turn["p1_move_details"].get("type") for turn in timeline if turn.get("p1_move_details")]
            p2_move_types = [turn["p2_move_details"].get("type") for turn in timeline if turn.get("p2_move_details")]
            features["p1_unique_move_types"] = len(set(p1_move_types))
            features["p2_unique_move_types"] = len(set(p2_move_types))

            # Count total statuses inflicted
            p1_status = [turn["p1_pokemon_state"].get("status") for turn in timeline]
            p2_status = [turn["p2_pokemon_state"].get("status") for turn in timeline]
            features["p1_status_changes"] = sum(1 for s in p1_status if s not in ["nostatus", "noeffect", None])
            features["p2_status_changes"] = sum(1 for s in p2_status if s not in ["nostatus", "noeffect", None])

            # Count turns with effects like "reflect", "light screen", etc.
            p1_effects = [e for turn in timeline for e in turn["p1_pokemon_state"].get("effects", []) if e not in ["noeffect"]]
            p2_effects = [e for turn in timeline for e in turn["p2_pokemon_state"].get("effects", []) if e not in ["noeffect"]]
            features["p1_total_effects"] = len(p1_effects)
            features["p2_total_effects"] = len(p2_effects)

            # Boosts (attack, defense, etc.)
            boost_keys = ["atk", "def", "spa", "spd", "spe"]
            for key in boost_keys:
                p1_boosts = [turn["p1_pokemon_state"]["boosts"].get(key, 0) for turn in timeline]
                p2_boosts = [turn["p2_pokemon_state"]["boosts"].get(key, 0) for turn in timeline]
                features[f"p1_mean_boost_{key}"] = np.mean(p1_boosts)
                features[f"p2_mean_boost_{key}"] = np.mean(p2_boosts)
            
            # Number of fnt pokemon for each player
            features["p1_fnt_pokemon_number"] = sum([1 for turn in timeline if turn["p1_pokemon_state"]["status"] == "fnt"])
            features["p2_fnt_pokemon_number"] = sum([1 for turn in timeline if turn["p2_pokemon_state"]["status"] == "fnt"])

            # Number of time the player switched pokemon
            features["p1_switch_number"] = sum([1 for turn in timeline if turn["p1_move_details"] == None])
            features["p2_switch_number"] = sum([1 for turn in timeline if turn["p2_move_details"] == None])

            # Number of SPECIAL or PHYSICAL moves and Number of STATUS moves of p1 and p2
            features["p1_attack_moves"] = sum([1 for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
            features["p2_attack_moves"] = sum([1 for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
            features["p1_status_moves"] = sum([1 for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] not in ["SPECIAL", "PHYSICAL"]])
            features["p2_status_moves"] = sum([1 for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"] not in ["SPECIAL", "PHYSICAL"]])

            # Number of same pokemon type moves
            features["p1_same_type_moves_number"] = sum([1 for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["type"] in my_dm.pokemon_type(turn["p1_pokemon_state"]["name"]) and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
            features["p2_same_type_moves_number"] = sum([1 for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["type"] in my_dm.pokemon_type(turn["p2_pokemon_state"]["name"]) and turn["p2_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])

            # Average of multiplier effectivness
            features["p1_effectivness_avg"] = np.mean([my_dm.effectiveness(turn["p1_move_details"]["type"], my_dm.pokemon_type(turn["p2_pokemon_state"]["name"])) for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
            features["p2_effectivness_avg"] = np.mean([my_dm.effectiveness(turn["p2_move_details"]["type"], my_dm.pokemon_type(turn["p1_pokemon_state"]["name"])) for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])

            # Number of supereffective moves
            features["p1_supereffective_moves_count"] = sum([my_dm.is_supereffective(turn["p1_move_details"]["type"], my_dm.pokemon_type(turn["p2_pokemon_state"]["name"])) for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
            features["p2_supereffective_moves_count"] = sum([my_dm.effectiveness(turn["p2_move_details"]["type"], my_dm.pokemon_type(turn["p1_pokemon_state"]["name"])) for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])


        feature_list.append(features)
        
    return pd.DataFrame(feature_list).fillna(0)


all_types = set()
for battle in train_data:
    for p in battle['p1_team_details']:
        all_types.update(p['types'])

all_types = sorted(list(all_types))



def extract_type_features(team):
    type_counts = {t: 0 for t in all_types}
    for p in team:
        for t in p['types']:
            type_counts[t] += 1
    
    team_size = len(team)
    for t in type_counts:
        type_counts[t] /= team_size
    return type_counts

team = train_data[0]['p1_team_details']
type_features = extract_type_features(team)
print(type_features)

        ########################################################################################
        #####################################AGGIUNTA###########################################
        ########################################################################################






# Create feature DataFrames for both training and test sets
print("Processing training data...")
train_df = create_simple_features(train_data)
train_df = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
train_df.to_csv(config["data"]["processed_train_path"], index=False)

print("\nProcessing test data...")
test_data = []
with open(test_file_path, 'r') as f:
    for line in f:
        test_data.append(json.loads(line))
test_df = create_simple_features(test_data)
test_df =test_df.sample(frac=1, random_state=seed).reset_index(drop=True)
test_df.to_csv(config["data"]["processed_test_path"], index=False)

{'dragon': 0.0, 'electric': 0.0, 'fire': 0.0, 'flying': 0.0, 'ghost': 0.0, 'grass': 0.16666666666666666, 'ground': 0.0, 'ice': 0.0, 'normal': 0.5, 'notype': 0.6666666666666666, 'poison': 0.0, 'psychic': 0.5, 'rock': 0.0, 'water': 0.16666666666666666}
Processing training data...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Processing test data...


In [None]:
"""
DEBUG
"""

for battle in train_data:
    timeline = battle["battle_timeline"]
    #print(sum([1 for turn in timeline if turn["p2_pokemon_state"]["status"] == "fnt"]))
    #print([turn for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["type"] in my_dm.pokemon_type(turn["p1_pokemon_state"]["name"]) and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
    #a = [[my_dm.is_supereffective(turn["p1_move_details"]["type"], my_dm.pokemon_type(turn["p2_pokemon_state"]["name"])), turn["p1_move_details"]["type"], my_dm.pokemon_type(turn["p2_pokemon_state"]["name"])] for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]]
    #print(a)

[[True, 'ICE', ['GRASS', 'PSYCHIC']], [True, 'ELECTRIC', ['WATER', 'PSYCHIC']], [True, 'ELECTRIC', ['WATER', 'PSYCHIC']], [True, 'ELECTRIC', ['WATER', 'PSYCHIC']], [False, 'ELECTRIC', ['NORMAL']], [True, 'ELECTRIC', ['WATER', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'NORMAL', ['NORMAL']], [False, 'ICE', ['NORMAL']], [False, 'PSYCHIC', ['NORMAL']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['NORMAL']], [False, 'PSYCHIC', ['NORMAL']]]
[[False, 'ICE', ['NORMAL']], [False, 'ICE', ['NORMAL']], [False, 'ICE', ['NORMAL']], [False, 'NORMAL', ['GRASS', 'PSYCHIC']], [True, 'ICE', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['GRASS', 'PSYCHIC']], [False, 'PSYCHIC', ['NORMAL']], [False, 'NORMAL', ['NORMAL']], [F