## Loading and Inspecting Data

In [23]:
import json
import pandas as pd
import numpy as np
import os, sys
import yaml

# Resolve the parent of the current working directory and make sure it is on
# sys.path, so the `src.*` imports that follow can be resolved from there.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)
import src.data_management as my_dm
from src.feature_creation import create_features, create_essentials_features
# Fixed random seed, passed as `random_state` when rows are shuffled below.
seed = 456

In [24]:


# Parse the project configuration file (safe loader: plain data only).
with open("../config.yaml", mode="r") as config_file:
    config = yaml.safe_load(config_file)

# Locations of the raw train/test files, as declared in the "data" section.
data_paths = config["data"]
train_file_path = data_paths["input_train_path"]
test_file_path = data_paths["input_test_path"]

# Load the training battles from a JSON Lines file: one JSON object per line,
# each parsed into a Python dictionary and collected in `train_data`.
train_data = []

print(f"Loading data from '{train_file_path}'...")
try:
    # JSON text is UTF-8; passing the encoding explicitly avoids depending on
    # the platform's default text encoding.
    with open(train_file_path, 'r', encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file); json.loads() would raise JSONDecodeError on them.
            if not line.strip():
                continue
            # json.loads() parses one line (one JSON object) into a Python dictionary
            train_data.append(json.loads(line))

    print(f"Successfully loaded {len(train_data)} battles.")

except FileNotFoundError:
    print(f"ERROR: Could not find the training file at '{train_file_path}'.")
    print("Please make sure you have added the competition data to this notebook.")
    # Re-raise after the friendly message: the feature-creation step needs
    # `train_data`, so continuing with an empty list would only fail later
    # with a less helpful error.
    raise

Loading data from '../data/train/raw/train.jsonl'...
Successfully loaded 9999 battles.


In [26]:

# Build the training feature table, then shuffle its rows with the fixed seed
# and renumber the index from zero.
print("Processing training data...")
train_df = (
    create_features(train_data)
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

### removing correlations
# Drop redundant features: a column is removed when its absolute correlation
# with any column that precedes it (in column order) exceeds `threshold`.
target_column = 'player_won'
threshold = .8

# The label is excluded from the correlation matrix. Otherwise a feature
# placed after the label in column order would be dropped merely for being
# strongly correlated with the target itself.
feature_df = train_df.drop(columns=[target_column], errors="ignore")
corr_matrix = feature_df.corr().abs()

# Keep only the strict upper triangle (k=1) so each pair is inspected once
# and the diagonal (self-correlation of 1.0) is ignored; the rest becomes NaN.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
)

# NaN > threshold evaluates to False, so masked cells never trigger a drop.
exceeds_threshold = (upper > threshold).any()
to_drop = [column for column in upper.columns if exceeds_threshold[column]]

print("Variabili da rimuovere:", to_drop)
train_df = train_df.drop(columns=to_drop)

## exporting in csv
# Write the training table to the configured path, without the row index.
processed_train_path = config["data"]["processed_train_path"]
train_df.to_csv(processed_train_path, index=False)

print("\nProcessing test data...")
# Load the test battles from a JSON Lines file (one JSON object per line).
test_data = []
# JSON text is UTF-8; passing the encoding explicitly avoids depending on the
# platform's default text encoding.
with open(test_file_path, 'r', encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads() would raise JSONDecodeError on them.
        if line.strip():
            test_data.append(json.loads(line))
test_df = create_features(test_data)
# NOTE(review): shuffling the test rows looks unnecessary for a feature export;
# it is kept because downstream steps may rely on this row order. Confirm
# before removing.
test_df = test_df.sample(frac=1, random_state=seed).reset_index(drop=True)
## removing correlations in test
# Apply to the test set the column removals that were chosen on the training data.
test_df = test_df.drop(columns=to_drop)


# Write the test table to the configured path, without the row index.
test_df.to_csv(config["data"]["processed_test_path"], index=False)

Processing training data...


  features["p1-p2_mean_base_power"] = np.nanmean([turn['p1_move_details']['base_power'] for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]]) - np.nanmean([turn['p2_move_details']['base_power'] for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
  value = np.nanmean([my_dm.move_effectiveness(turn["p2_move_details"]["type"], turn["p1_pokemon_state"]["name"], turn["p2_move_details"]["type"] in my_dm.pokemon_type(turn["p2_pokemon_state"]["name"])) for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
  value = np.nanmean([my_dm.move_effectiveness(turn["p1_move_details"]["type"], turn["p2_pokemon_state"]["name"], turn["p1_move_details"]["type"] in my_dm.pokemon_type(turn["p1_pokemon_state"]["name"])) for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"]

Variabili da rimuovere: ['p1_std_base_hp', 'p1_min_base_atk', 'p1_std_base_atk', 'p1_min_base_def', 'p1_std_base_def', 'p1_min_base_spa', 'p1_min_base_spd', 'p1_max_base_spd', 'p1_std_base_spd', 'p1_mean_hp', 'p1_defense_mean', 'p1_total_base_power', 'p1_style_index', 'p1_hp_ratio', 'p1_max_speed', 'p1_type_ground', 'p1_type_poison', 'p1_type_rock', 'p2_lead_def', 'p2_lead_spa', 'p2_lead_type_poison', 'p2_lead_type_rock', 'p1-p2_noeffect_count', 'p1_mean_boost_spd', 'p2_mean_boost_spd']

Processing test data...


  features["p1-p2_mean_base_power"] = np.nanmean([turn['p1_move_details']['base_power'] for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]]) - np.nanmean([turn['p2_move_details']['base_power'] for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
  value = np.nanmean([my_dm.move_effectiveness(turn["p1_move_details"]["type"], turn["p2_pokemon_state"]["name"], turn["p1_move_details"]["type"] in my_dm.pokemon_type(turn["p1_pokemon_state"]["name"])) for turn in timeline if turn.get("p1_move_details") and turn["p1_move_details"]["category"] in ["SPECIAL", "PHYSICAL"]])
  value = np.nanmean([my_dm.move_effectiveness(turn["p2_move_details"]["type"], turn["p1_pokemon_state"]["name"], turn["p2_move_details"]["type"] in my_dm.pokemon_type(turn["p2_pokemon_state"]["name"])) for turn in timeline if turn.get("p2_move_details") and turn["p2_move_details"]["category"]