In [1]:
import pandas as pd
from pulp import LpMaximize, LpProblem, LpVariable, lpSum
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Set up logging
import logging
logging.basicConfig(level=logging.INFO)

# Function to clean player names
def clean_name(name):
    return re.split(r'\n', name)[0].strip()

def calculate_fantasy_score(df, match_difficulty=None, difficulty_weight=2.0):
    score = pd.Series(0, index=df.index)
    
    # Calculate total matches from minutes played
    df['matches'] = df['minutes'] / 90
    
    # Average stats per match
    df['avg_goals'] = df['Performance Gls'] / df['matches']
    df['avg_assists'] = df['Performance Ast'] / df['matches']
    df['avg_yellows'] = df['Performance CrdY'] / df['matches']
    df['avg_reds'] = df['Performance CrdR'] / df['matches']
    
    # Goals
    score += df['avg_goals'] * df['pos'].map({
        'FW': 125000, 
        'MF': 150000, 
        'DF': 175000, 
        'GK': 250000
    }).fillna(0)
    
    # Assists
    score += df['avg_assists'] * 60000
    
    # Yellow and Red Cards
    score -= df['avg_yellows'] * 20000
    score -= df['avg_reds'] * 50000

    # Goalkeeper specific statistics
    is_gk = df['pos'] == 'GK'
    score[is_gk] += df['saves'][is_gk] * 5000
    score[is_gk] -= df['goals_against'][is_gk] * 10000
    score[is_gk] += df['save%'][is_gk] * 1000
    
    # Adjust score based on match difficulty if provided
    if match_difficulty:
        score *= (df['nation'].map(match_difficulty).fillna(1) ** difficulty_weight)
    
    # Weight score by the likelihood of starting (total minutes / 860)
    weight = df['minutes'] / 860
    score *= weight
    
    return score

# Function to calculate multipliers from odds
def calculate_multipliers(odds):
    probabilities = {key: 1 / value for key, value in odds.items()}
    total = sum(probabilities.values())
    normalized_probabilities = {key: value / total for key, value in probabilities.items()}
    return {key: 1 / value for key, value in normalized_probabilities.items()}

# Parse and store match odds
match_odds = {
    'GER': {'win': 1.30, 'draw': 6.00, 'lose': 12.50},
    'SCO': {'win': 12.50, 'draw': 6.00, 'lose': 1.30},
    'HUN': {'win': 3.50, 'draw': 3.50, 'lose': 2.20},
    'CHE': {'win': 2.20, 'draw': 3.50, 'lose': 3.50},
    'ESP': {'win': 1.78, 'draw': 3.85, 'lose': 5.00},
    'HRV': {'win': 5.00, 'draw': 3.85, 'lose': 1.78},
    'ITA': {'win': 1.39, 'draw': 4.85, 'lose': 9.70},
    'ALB': {'win': 9.70, 'draw': 4.85, 'lose': 1.39},
    'POL': {'win': 5.20, 'draw': 4.15, 'lose': 1.70},
    'NLD': {'win': 1.70, 'draw': 4.15, 'lose': 5.20},
    'SVN': {'win': 5.00, 'draw': 3.75, 'lose': 1.82},
    'DNK': {'win': 1.82, 'draw': 3.75, 'lose': 5.00},
    'SRB': {'win': 7.00, 'draw': 4.65, 'lose': 1.50},
    'ENG': {'win': 1.50, 'draw': 4.65, 'lose': 7.00},
    'ROU': {'win': 3.70, 'draw': 3.60, 'lose': 2.17},
    'UKR': {'win': 2.17, 'draw': 3.60, 'lose': 3.70},
    'BEL': {'win': 1.51, 'draw': 4.55, 'lose': 6.80},
    'SVK': {'win': 6.80, 'draw': 4.55, 'lose': 1.51},
    'AUT': {'win': 7.00, 'draw': 4.75, 'lose': 1.50},
    'FRA': {'win': 1.50, 'draw': 4.75, 'lose': 7.00},
    'TUR': {'win': 1.80, 'draw': 3.85, 'lose': 4.90},
    'GEO': {'win': 4.90, 'draw': 3.85, 'lose': 1.80},
    'PRT': {'win': 1.57, 'draw': 4.40, 'lose': 7.00},
    'CZE': {'win': 7.00, 'draw': 4.40, 'lose': 1.57}
}

# Calculate match difficulty multipliers
match_difficulty = {team: calculate_multipliers(odds)['win'] for team, odds in match_odds.items()}

# Load the cleaned player prices
df_prices = pd.read_csv(r'data\cleaned_player_prices.csv')
df_prices['name'] = df_prices['name'].apply(clean_name).str.strip().str.title()

# Initialize FBRef data source for the Big 5 European Leagues combined
import soccerdata as sd
fbref = sd.FBref(leagues="Big 5 European Leagues Combined", seasons="2023-2024")

# Get player season statistics
player_season_stats = fbref.read_player_season_stats(stat_type="standard")
df_stats = pd.DataFrame(player_season_stats).reset_index()
df_stats.columns = [' '.join(col).strip() for col in df_stats.columns.values]
df_stats.rename(columns={'player': 'name'}, inplace=True)
df_stats['name'] = df_stats['name'].str.strip().str.title()

# Define the list of nationalities to include
nationalities_to_include = ['GER', 'ESP', 'ITA', 'NLD', 'ENG', 'BEL', 'FRA', 'PRT', 'DEN']
df_stats = df_stats[df_stats['nation'].isin(nationalities_to_include)]

# Load Euro Qualifying data
df_qualifying = pd.read_csv(r'data\aggregated_player_stats.csv')
df_qualifying['name'] = df_qualifying['name'].apply(clean_name).str.strip().str.title()

# Merge dataframes
df_combined = pd.merge(df_stats, df_qualifying[['name', 'minutes', 'saves', 'goals_against', 'save%']], on='name', how='inner')
df_combined = pd.merge(df_combined, df_prices, on='name', how='inner')

# Enforce specific position rules
position_corrections = {
    'Antoine Griezmann': 'FW',
    'Kingsley Coman': 'FW',
    'Yannick Carrasco': 'MF',
    'Cengiz Ünder': '',
    'Oleksandr Zinchenko': 'MF',
    'Pedro Porro': ''
}
df_combined.loc[df_combined['name'].isin(position_corrections.keys()), 'pos'] = df_combined['name'].map(position_corrections)

# Update the fantasy score calculation to use the total minutes
df_combined['fantasy_score_with_difficulty'] = calculate_fantasy_score(df_combined, match_difficulty, difficulty_weight=2.0)
df_combined['score_with_difficulty'] = df_combined['fantasy_score_with_difficulty'] / df_combined['Price']

# Prepare data for linear regression
features = ['minutes', 'Performance Gls', 'Performance Ast', 'Performance CrdY', 'Performance CrdR', 'saves', 'goals_against', 'save%']
target = 'fantasy_score_with_difficulty'

# Drop rows with missing values in the selected features
df_combined = df_combined.dropna(subset=features + [target])

# Split the data into training and testing sets
X = df_combined[features]
y = df_combined[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict fantasy scores using the trained model
df_combined['predicted_fantasy_score'] = lr_model.predict(df_combined[features])

# Calculate and display the mean squared error
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Define formations with their respective position requirements
formations = {
    '5-4-1': {'GK': 1, 'DF': 5, 'MF': 4, 'FW': 1},
    '5-3-2': {'GK': 1, 'DF': 5, 'MF': 3, 'FW': 2},
    '4-5-1': {'GK': 1, 'DF': 4, 'MF': 5, 'FW': 1},
    '4-4-2': {'GK': 1, 'DF': 4, 'MF': 4, 'FW': 2},
    '4-3-3': {'GK': 1, 'DF': 4, 'MF': 3, 'FW': 3},
    '3-5-2': {'GK': 1, 'DF': 3, 'MF': 5, 'FW': 2},
    '3-4-3': {'GK': 1, 'DF': 3, 'MF': 4, 'FW': 3}
}

# Calculate value per million
df_combined['value_per_million'] = df_combined['predicted_fantasy_score'] / (df_combined['Price'] / 1_000_000)

# Optimization using Linear Programming
def optimize_team(df, formation, budget=50_000_000):
    # Define the optimization problem
    prob = LpProblem("Fantasy_Team_Selection", LpMaximize)

    # Define decision variables
    player_vars = LpVariable.dicts("Player", df.index, cat='Binary')

    # Objective function: Maximize total fantasy score
    prob += lpSum([player_vars[i] * df.loc[i, 'predicted_fantasy_score'] for i in df.index])

    # Constraint: Total cost should not exceed the budget
    prob += lpSum([player_vars[i] * df.loc[i, 'Price'] for i in df.index]) <= budget

    # Positional constraints
    for pos, count in formation.items():
        prob += lpSum([player_vars[i] for i in df.index if pos in df.loc[i, 'pos']]) == count

    # Nationality constraint: No more than 4 players from the same nation
    for nation in df['nation'].unique():
        prob += lpSum([player_vars[i] for i in df.index if df.loc[i, 'nation'] == nation]) <= 4

    # Solve the optimization problem
    prob.solve()

    # Extract the selected team
    selected_team = df[[player_vars[i].varValue == 1 for i in df.index]]
    
    return selected_team

# Store optimized teams for each formation
formation_results = []

for formation_name, formation in formations.items():
    optimized_team = optimize_team(df_combined, formation)
    total_score = optimized_team['predicted_fantasy_score'].sum()
    formation_results.append((formation_name, optimized_team, total_score))

# Display the results for each formation
for formation_name, team, score in formation_results:
    print(f"Formation: {formation_name}")
    print(team[['name', 'Price', 'predicted_fantasy_score', 'value_per_million', 'pos']])
    print(f"Total Cost: {team['Price'].sum() / 1_000_000:.2f} million")
    print(f"Total Fantasy Score: {score:.2f}")
    print("="*40)

# Save all formation results to CSV files
for formation_name, team, _ in formation_results:
    team[['name', 'Price', 'predicted_fantasy_score', 'value_per_million', 'pos']].to_csv(f'optimized_team_{formation_name}.csv', index=False)


Mean Squared Error: 2120398700051.0042
Formation: 5-4-1
                   name      Price  predicted_fantasy_score  \
30    Vitaliy Mykolenko  2500000.0             5.500089e+05   
51          Pedro Porro  3000000.0             1.305064e+05   
68      Jude Bellingham  8000000.0             1.645954e+06   
88        Arthur Theate  3500000.0             3.945364e+05   
92           Harry Kane  9000000.0             3.266685e+06   
99        Attila Szalai  2500000.0             3.902884e+05   
126       Etrit Berisha  3000000.0             2.518025e+06   
132     Davide Frattesi  4000000.0             7.051800e+05   
133    Federico Dimarco  4500000.0             5.529153e+05   
134    Francesco Acerbi  3000000.0             5.118186e+05   
153  Lorenzo Pellegrini  3500000.0             5.836142e+05   
162     Lazar Samardzic  2500000.0             6.071421e+05   

     value_per_million pos  
30       220003.549189  DF  
51        43502.133158      
68       205744.217470  MF  
88      