This model generates WAR statistics for every player.

We can use a rudimentary approach:
1. Partition our data to train each model for each position + league.
2. Conduct dimensionality reduction using principal component analysis (PCA) to reduce features of each dataset into a singular WAR statistic.
3. Train each model to generate a WAR statistic for every player.

We have adapted the "outfield" dataset to only contain relevant per90 statistics (no counting stats), and removed players with less than 5 games to prevent extreme outliers.

Definitions for basic statistics can be found [here](https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats). For other types of statistics, go to "Player Stats" > _"Statistic"_

In [2]:
# Basic Setup
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Change to your local folder before editing
os.chdir("C:/Users/tobyt/Desktop/Coding/Personal/wins-above-replacement-soccer/WAR-based-soccer-valuation")

In [14]:
# TODO: Need an easier method of generating all WAR values for outfield players with the same methodology

# Master dataframe - DO NOT CHANGE DIRECTLY
master_dataset = pd.read_csv("./model/outfield_final_vF.csv")

# Generating dynamic variable names
outfield_positions = {"centre-forward":"Centre-Forward", "left-back":"Left-Back", "right-back":"Right-Back", 
                      "right-winger":"Right Winger", "central-midfield":"Central Midfield", "attacking-midfield":"Attacking Midfield", 
                      "centre-back": "Centre-Back", "defensive-midfield":"Defensive Midfield", "second-striker": "Second Striker", 
                      "left-winger":"Left Winger", "right-midfield":"Right Midfield", "left-midfield":"Left Midfield"}
league_types = {"":"", "ENG":"Premier League", "ESP":"La Liga", "GER":"Bundesliga", "ITA":"Serie A", "FRA":"Ligue 1"} # first one is overall

for position in outfield_positions:
    weight_string = "".join(["./model/weights/", position, "-stats-weights.csv"])
    weights_df = pd.read_csv(weight_string)

    position_stats = weights_df["stat"].to_list()
    position_weights = weights_df["weight"].to_numpy()
    position_stat_direction = weights_df["direction"].to_numpy()

    position_dataset = master_dataset[(master_dataset.player_position == outfield_positions[position])]

    for league in league_types:
        if league != "":
            copied_dataset = position_dataset[(position_dataset.Comp == league_types[league])]
        else:
            copied_dataset = position_dataset
    
        # Filtering players that only start 5+ games
        copied_dataset_filtered = copied_dataset[(copied_dataset.Starts_Playing >= 5)]

        # Removing NaN observations
        copied_dataset_filtered = copied_dataset_filtered.dropna(subset=position_stats).reset_index()

        if copied_dataset_filtered.shape[0] == 0:
            print(position + " - " + league_types[league] + ": No Entries!")
            continue

        copied_dataset_modified = copied_dataset_filtered[position_stats].copy()

        # Scaling our dataset
        copied_dataset_scaled = StandardScaler().fit_transform(copied_dataset_modified)

        # Weights of each stat - Remove if unnecessary
        copied_dataset_weighted = copied_dataset_scaled * position_weights

        # PCA decomposition
        pca = PCA(n_components=1)
        pca_features = pca.fit_transform(copied_dataset_weighted)

        war = pd.DataFrame(pca_features)
        war.columns = ["calculated_war"]

        copied_dataset_filtered.insert(copied_dataset_filtered.columns.get_loc("unique_ID") + 1, "calculated_war", war["calculated_war"])

        # Percentile Calculations - rough approximation
        sorted_war = np.sort(war["calculated_war"])
        percentile_ranks = (sorted_war < war["calculated_war"].values[:,None]).mean(axis=1)
        percentile_ranks = np.round(percentile_ranks, decimals=4) * 100

        copied_dataset_filtered.insert(copied_dataset_filtered.columns.get_loc("unique_ID") + 2, "percentile", percentile_ranks)

        if league != "":
            output_string = "".join(["./output/PCA/overall_", position, "_war_", league, ".csv"])
        else:
            output_string = "".join(["./output/PCA/overall_", position, "_war.csv"])

        if os.path.exists(output_string):
            os.remove(output_string)

        copied_dataset_filtered.to_csv(output_string)

        print(position + " - " + league_types[league] + ": Completed!")

        # Resetting variables
        vars = [copied_dataset, copied_dataset_filtered, copied_dataset_modified, copied_dataset_scaled, copied_dataset_weighted,
                pca, pca_features, war, sorted_war, percentile_ranks, output_string]
        for var in vars:
            var = None
    
    position_dataset = None


centre-forward - : Completed!
centre-forward - Premier League: Completed!
centre-forward - La Liga: Completed!
centre-forward - Bundesliga: Completed!
centre-forward - Serie A: Completed!
centre-forward - Ligue 1: Completed!
left-back - : Completed!
left-back - Premier League: Completed!
left-back - La Liga: Completed!
left-back - Bundesliga: Completed!
left-back - Serie A: Completed!
left-back - Ligue 1: Completed!
right-back - : Completed!
right-back - Premier League: Completed!
right-back - La Liga: Completed!
right-back - Bundesliga: Completed!
right-back - Serie A: Completed!
right-back - Ligue 1: Completed!
right-winger - : Completed!
right-winger - Premier League: Completed!
right-winger - La Liga: Completed!
right-winger - Bundesliga: Completed!
right-winger - Serie A: Completed!
right-winger - Ligue 1: Completed!
central-midfield - : Completed!
central-midfield - Premier League: Completed!
central-midfield - La Liga: Completed!
central-midfield - Bundesliga: Completed!
central

  explained_variance_ = (S**2) / (n_samples - 1)


second-striker - Bundesliga: Completed!
second-striker - Serie A: Completed!
second-striker - Ligue 1: No Entries!
left-winger - : Completed!
left-winger - Premier League: Completed!
left-winger - La Liga: Completed!
left-winger - Bundesliga: Completed!
left-winger - Serie A: Completed!
left-winger - Ligue 1: Completed!
right-midfield - : Completed!
right-midfield - Premier League: Completed!
right-midfield - La Liga: No Entries!
right-midfield - Bundesliga: Completed!
right-midfield - Serie A: Completed!
right-midfield - Ligue 1: Completed!
left-midfield - : Completed!
left-midfield - Premier League: Completed!
left-midfield - La Liga: No Entries!
left-midfield - Bundesliga: Completed!
left-midfield - Serie A: Completed!
left-midfield - Ligue 1: Completed!
