In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from datetime import datetime
import re

In [2]:
import pandas as pd
import numpy as np
import re

# Personality labels grouped into ordinal tiers (index 0 -> tier 1, ..., index 4 -> tier 5).
# Labels that appear in no tier are mapped to 0 by process_fm_data.
_PERSONALITY_TIERS = (
    ['Slack', 'Casual', 'Temperamental', 'Spineless', 'Low Self-Belief', 'Easily Discouraged', 'Low Determination'],
    ['Fickle', 'Mercenary', 'Unambitious', 'Unsporting', 'Realist'],
    ['Balanced', 'Light-Hearted', 'Jovial', 'Very Loyal', 'Devoted', 'Loyal', 'Fairly Loyal', 'Honest', 'Sporting', 'Fairly Sporting'],
    ['Perfectionist', 'Resolute', 'Professional', 'Fairly Professional', 'Iron Willed', 'Resilient', 'Spirited', 'Driven', 'Determined', 'Fairly Determined', 'Charismatic Leader', 'Born Leader', 'Leader', 'Very Ambitious', 'Fairly Ambitious', 'Ambitious'],
    ['Model Professional'],
)


def _parse_height(height_str):
    """Convert a feet'inches string such as "6'2" to total inches; NaN when it does not match."""
    match = re.match(r"(\d+)'(\d+)", str(height_str))
    if match:
        feet, inches = map(int, match.groups())
        return feet * 12 + inches
    return np.nan


def _to_millions(amount_str):
    """Parse '1.2M', '500K' or a bare number into a float expressed in millions.

    Raises ValueError when the text is not numeric, so callers decide the fallback.
    """
    text = amount_str.strip().replace(',', '')
    if 'M' in text:
        return float(text.replace('M', ''))
    if 'K' in text:
        return float(text.replace('K', '')) / 1000
    # A bare number has no unit suffix, so scale it to millions like the branches above.
    return float(text) / 1_000_000


def _clean_value(value):
    """Convert a transfer-value cell to millions; a 'low - high' range becomes its midpoint.

    Missing cells, 'Not for Sale', empty text and anything non-numeric yield NaN.
    """
    if pd.isna(value) or value == 'Not for Sale':
        return np.nan
    value_str = str(value).replace('Â£', '').replace('£', '').strip()
    if not value_str:
        return np.nan
    try:
        if '-' in value_str:
            # Unpacking raises ValueError for anything other than exactly two parts.
            low, high = value_str.split('-')
            return (_to_millions(low) + _to_millions(high)) / 2.0
        return _to_millions(value_str)
    except ValueError:
        # Non-numeric text (e.g. a placeholder) must not abort the whole pipeline.
        return np.nan


def _clean_fee(fee):
    """Convert a transfer-fee cell to millions.

    Missing or already-numeric cells are returned untouched; empty text, '-',
    '- - -' and 'Free' count as 0.0; unparsable text yields NaN.
    """
    if pd.isna(fee) or isinstance(fee, (int, float)):
        return fee
    fee_str = str(fee).replace('Â£', '').replace('£', '').strip()
    if not fee_str or fee_str in ['-', '- - -', 'Free']:
        return 0.0
    try:
        return _to_millions(fee_str)
    except ValueError:
        return np.nan


def _combine_apps(apps_str):
    """Sum every integer found in an appearances cell (e.g. '10 (5)' -> 15); 0 when missing."""
    if pd.isna(apps_str):
        return 0
    numbers = re.findall(r'\d+', str(apps_str))
    return sum(int(num) for num in numbers) if numbers else 0


def process_fm_data(filepath, league_rep_filepath, game_date_str='1/5/2035'):
    """
    Loads, cleans, and engineers features for a Football Manager player dataset.

    Args:
        filepath (str): Path to the HTML file with player data.
        league_rep_filepath (str): Path to the HTML file whose first table has a
            'Name' column listing leagues in order; the first league gets the
            highest rank.
        game_date_str (str): The current in-game date as a string (e.g., '1/5/2035').

    Returns:
        pandas.DataFrame: A fully cleaned and processed DataFrame, or None when
        the player file cannot be read (an error message is printed).
    """
    # --- 1. Load Player Data ---
    try:
        # The first parsed row is discarded (presumably a non-data row in the
        # export - TODO confirm). Copy AFTER slicing so the column assignments
        # below write to an independent frame instead of a slice of the table.
        df = pd.read_html(filepath)[0].iloc[1:].copy()
    except (IOError, IndexError) as e:
        print(f"Error reading player file {filepath}: {e}")
        return None

    # --- 2. Apply Cleaning and Type Conversions ---
    df['Height'] = df['Height'].apply(_parse_height)
    # Keep only digits and dots; anything that still fails to parse becomes NA.
    wage_digits = df['Wage'].str.replace(r"[^\d.]", "", regex=True)
    df['Wage'] = pd.to_numeric(wage_digits, errors='coerce').astype('Float64')
    df['Transfer Value'] = df['Transfer Value'].apply(_clean_value)
    df['Last Trans. Fee'] = df['Last Trans. Fee'].apply(_clean_fee)
    if 'Transfer Fees Received' in df.columns:
        df['Transfer Fees Received'] = df['Transfer Fees Received'].apply(_clean_fee)
    df['Total Apps'] = df['Apps'].apply(_combine_apps)
    df['Transfer_Status_bool'] = (df['Transfer Status'] != 'Not set').astype(int)
    # 'Based' is split on '(' and the leading part is kept as the country.
    df['Country'] = df['Based'].str.split('(').str[0].str.strip()

    # --- 3. Feature Engineering ---
    # Date-based features
    df['Expires'] = pd.to_datetime(df['Expires'], errors='coerce')
    df['Begins'] = pd.to_datetime(df['Begins'], errors='coerce')
    current_game_date = pd.to_datetime(game_date_str)

    df['Days Until Expiry'] = (df['Expires'] - current_game_date).dt.days
    years_since_signing = (current_game_date - df['Begins']).dt.days / 365.25
    df['Age_at_Signing'] = df['Age'].astype(float) - years_since_signing
    df['Years_at_Club'] = years_since_signing

    # Personality tier mapping (1..5); unmapped personalities fall back to 0.
    personality_map = {p: i + 1 for i, tier in enumerate(_PERSONALITY_TIERS) for p in tier}
    df['Personality_Tier'] = df['Personality'].map(personality_map).fillna(0)

    # Division ranking from the external league-reputation table.
    try:
        league_rep_df = pd.read_html(league_rep_filepath)[0]
        ordered_leagues = league_rep_df['Name'].tolist()
    except (OSError, ValueError, IndexError, KeyError) as e:
        # read_html reports "no tables found" as ValueError, and a table without
        # a 'Name' column raises KeyError; both fall back to the default rank.
        print(f"Warning: Could not load league reputation list from {league_rep_filepath} ({e}). Skipping division ranking.")
        df['Division_Rank'] = 0  # Create column with default value if file fails
    else:
        # Mapping happens outside the try so a problem with the player data
        # (e.g. a missing 'Division' column) is not reported as a league-file issue.
        num_leagues = len(ordered_leagues)
        league_rank_map = {league: num_leagues - i for i, league in enumerate(ordered_leagues)}
        df['Division_Rank'] = df['Division'].map(league_rank_map).fillna(0)

    # One-hot encode Country (including a NaN indicator) to capture market effects
    country_dummies = pd.get_dummies(df['Country'], prefix='Country', dummy_na=True)
    df = pd.concat([df, country_dummies], axis=1)

    return df

In [3]:
# League-reputation table shared by every export below.
league_rep = 'league rankings.html'

# Full player pool. game_date_str is left at its default for every call.
all_players_df = process_fm_data(filepath='all_players_1-5-35.html', league_rep_filepath=league_rep)

# One processed frame per position-group export.
gk_df = process_fm_data(filepath='gk_fixed_1-5-35.html', league_rep_filepath=league_rep)
cb_df = process_fm_data(filepath='cb_1-5-35.html', league_rep_filepath=league_rep)
fb_df = process_fm_data(filepath='fb_1-5-35.html', league_rep_filepath=league_rep)
dm_df = process_fm_data(filepath='dm_1-5-35.html', league_rep_filepath=league_rep)
cm_df = process_fm_data(filepath='cm_1-5-35.html', league_rep_filepath=league_rep)
am_df = process_fm_data(filepath='am_1-5-35.html', league_rep_filepath=league_rep)
wf_df = process_fm_data(filepath='wf_1-5-35.html', league_rep_filepath=league_rep)
st_df = process_fm_data(filepath='st_1-5-35.html', league_rep_filepath=league_rep)

In [4]:
# Write each processed frame to its own CSV (row index omitted), in the same
# order as before.
frames_to_export = {
    'all_players_df.csv': all_players_df,
    'gk_df.csv': gk_df,
    'cb_df.csv': cb_df,
    'fb_df.csv': fb_df,
    'dm_df.csv': dm_df,
    'cm_df.csv': cm_df,
    'am_df.csv': am_df,
    'wf_df.csv': wf_df,
    'st_df.csv': st_df,
}

for csv_name, frame in frames_to_export.items():
    if frame is None:
        # process_fm_data returns None when its player file could not be read;
        # skip that export instead of failing with AttributeError on None.to_csv
        # and losing the remaining exports.
        print(f"Skipping {csv_name}: no data was loaded for this export.")
        continue
    frame.to_csv(csv_name, index=False)