# In [2]:
import pandas as pd
from pulp import LpMaximize, LpProblem, LpVariable, lpSum
import re
import logging

# Configure the root logger so messages at INFO level and above are emitted
logging.basicConfig(level=logging.INFO)


# Normalise a raw player-name cell
def clean_name(name):
    """Return the first line of ``name`` with surrounding whitespace removed.

    Anything after the first newline character is discarded.
    """
    first_line, *_ = re.split(r'\n', name, maxsplit=1)
    return first_line.strip()

def calculate_fantasy_score(df, match_difficulty=None):
    """Compute a minutes-weighted fantasy score for every player row in ``df``.

    Scoring rules:
      * goals: 125000 (FW), 150000 (MF), 175000 (DF) or 250000 (GK) each;
        players whose position is none of these score nothing for goals
      * assists: +60000 each
      * yellow cards: -20000 each, red cards: -50000 each
      * goalkeepers only: +5000 per save, -10000 per goal against,
        +1000 per point of ``save%``

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'pos'``, ``'nation'``, ``'minutes'``,
        ``'Performance Gls'``, ``'Performance Ast'``, ``'Performance CrdY'``,
        ``'Performance CrdR'``, ``'saves'``, ``'goals_against'`` and ``'save%'``.
    match_difficulty : dict, optional
        Maps a ``'nation'`` value to a factor the score is multiplied by.
        Nations missing from the mapping use a factor of 1. Skipped entirely
        when ``None`` or empty.

    Returns
    -------
    pandas.Series
        Float scores indexed like ``df``. The result never contains NaN that
        originates from a missing stat: missing stats count as zero, so a
        player with unknown minutes ends up with a score of 0.
    """
    goal_points = {'FW': 125000, 'MF': 150000, 'DF': 175000, 'GK': 250000}

    def stat(column):
        # A missing stat counts as zero so a single NaN cannot turn the
        # player's whole score into NaN (which would corrupt any later
        # ranking or optimisation built on the score).
        return df[column].astype(float).fillna(0.0)

    # Classify on the first listed position so multi-position labels such as
    # 'FW,MF' still earn goal points instead of silently mapping to zero.
    primary_pos = df['pos'].fillna('').astype(str).str.split(',').str[0].str.strip()

    # Goals
    score = stat('Performance Gls') * primary_pos.map(goal_points).fillna(0)

    # Assists
    score = score + stat('Performance Ast') * 60000

    # Yellow and red cards
    score = score - stat('Performance CrdY') * 20000
    score = score - stat('Performance CrdR') * 50000

    # Goalkeeper-specific statistics (contribute nothing for outfield players)
    is_gk = primary_pos == 'GK'
    score = score + (stat('saves') * 5000).where(is_gk, 0.0)
    score = score - (stat('goals_against') * 10000).where(is_gk, 0.0)
    score = score + (stat('save%') * 1000).where(is_gk, 0.0)

    # Adjust score based on match difficulty if provided
    if match_difficulty:
        score = score * df['nation'].map(match_difficulty).fillna(1)

    # Weight score by the likelihood of starting (total minutes / 720)
    score = score * (stat('minutes') / 720)

    score.name = None
    return score

# Turn a set of outcome odds into per-outcome multipliers
def calculate_multipliers(odds):
    """Return the reciprocal of each outcome's normalised implied probability.

    Each odds value is inverted into an implied probability, the
    probabilities are rescaled so they sum to 1, and the reciprocal of every
    rescaled probability is returned under the same key. Longer odds therefore
    yield a larger multiplier. An empty mapping yields an empty dict.
    """
    implied = {}
    for outcome, price in odds.items():
        implied[outcome] = 1 / price
    book_total = sum(implied.values())

    multipliers = {}
    for outcome, probability in implied.items():
        share = probability / book_total
        multipliers[outcome] = 1 / share
    return multipliers

# Match odds per team code, one 'win' / 'draw' / 'lose' price each.
# Entries are listed in mirrored pairs: within a pair, one team's 'win' price
# equals the other's 'lose' price and both share the same 'draw' price.
# Values are presumably bookmaker decimal odds (calculate_multipliers treats
# 1 / value as a probability) -- TODO confirm the source.
match_odds = {
    'GER': {'win': 1.30, 'draw': 6.00, 'lose': 12.50},
    'SCO': {'win': 12.50, 'draw': 6.00, 'lose': 1.30},
    'HUN': {'win': 3.50, 'draw': 3.50, 'lose': 2.20},
    'CHE': {'win': 2.20, 'draw': 3.50, 'lose': 3.50},
    'ESP': {'win': 1.78, 'draw': 3.85, 'lose': 5.00},
    'HRV': {'win': 5.00, 'draw': 3.85, 'lose': 1.78},
    'ITA': {'win': 1.39, 'draw': 4.85, 'lose': 9.70},
    'ALB': {'win': 9.70, 'draw': 4.85, 'lose': 1.39},
    'POL': {'win': 5.20, 'draw': 4.15, 'lose': 1.70},
    'NLD': {'win': 1.70, 'draw': 4.15, 'lose': 5.20},
    'SVN': {'win': 5.00, 'draw': 3.75, 'lose': 1.82},
    'DNK': {'win': 1.82, 'draw': 3.75, 'lose': 5.00},
    'SRB': {'win': 7.00, 'draw': 4.65, 'lose': 1.50},
    'ENG': {'win': 1.50, 'draw': 4.65, 'lose': 7.00},
    'ROU': {'win': 3.70, 'draw': 3.60, 'lose': 2.17},
    'UKR': {'win': 2.17, 'draw': 3.60, 'lose': 3.70},
    'BEL': {'win': 1.51, 'draw': 4.55, 'lose': 6.80},
    'SVK': {'win': 6.80, 'draw': 4.55, 'lose': 1.51},
    'AUT': {'win': 7.00, 'draw': 4.75, 'lose': 1.50},
    'FRA': {'win': 1.50, 'draw': 4.75, 'lose': 7.00},
    'TUR': {'win': 1.80, 'draw': 3.85, 'lose': 4.90},
    'GEO': {'win': 4.90, 'draw': 3.85, 'lose': 1.80},
    'PRT': {'win': 1.57, 'draw': 4.40, 'lose': 7.00},
    'CZE': {'win': 7.00, 'draw': 4.40, 'lose': 1.57}
}

# Per-team score multiplier taken from the 'win' outcome of each team's odds.
# NOTE(review): calculate_multipliers returns 1 / normalised probability, so a
# team that is LESS likely to win receives the LARGER multiplier -- confirm
# that boosting underdogs is the intended direction.
match_difficulty = {team: calculate_multipliers(odds)['win'] for team, odds in match_odds.items()}

# The odds table keys Denmark as 'DNK', while the nationality filter applied to
# the player stats admits the code 'DEN'. Register the alias so Danish players
# get Denmark's multiplier instead of silently falling back to the neutral 1.
if 'DNK' in match_difficulty:
    match_difficulty.setdefault('DEN', match_difficulty['DNK'])

# Load the cleaned player prices.
# A forward-slash path resolves on Windows, Linux and macOS alike; the previous
# backslash form only worked on Windows.
df_prices = pd.read_csv('data/cleaned_player_prices.csv')
# Keep only the first line of each name and title-case it so it can be joined
# against the other data sets on 'name'.
df_prices['name'] = df_prices['name'].apply(clean_name).str.strip().str.title()

# FBref reader for the combined Big 5 European leagues, 2023-2024 season
import soccerdata as sd
fbref = sd.FBref(leagues="Big 5 European Leagues Combined", seasons="2023-2024")

# Standard per-player season statistics, with the index moved into columns
player_season_stats = fbref.read_player_season_stats(stat_type="standard")
df_stats = pd.DataFrame(player_season_stats).reset_index()

# Collapse each column key into one space-separated label
# (e.g. the pair 'Performance', 'Gls' becomes 'Performance Gls')
df_stats.columns = list(map(lambda col: ' '.join(col).strip(), df_stats.columns.values))

# Expose the player column as 'name', title-cased to match the other data sets
df_stats = df_stats.rename(columns={'player': 'name'})
df_stats['name'] = df_stats['name'].str.strip().str.title()

# Restrict the player pool to these nation codes.
# NOTE(review): the codes mix conventions -- Denmark is 'DEN' here but 'DNK' in
# match_odds, while 'NLD' / 'PRT' / 'CHE' use the same form as match_odds.
# Verify them against the values actually present in df_stats['nation'].
nationalities_to_include = [
    'GER', 'ESP', 'ITA', 'NLD', 'ENG', 'BEL',
    'FRA', 'PRT', 'DEN', 'TUR', 'UKR', 'CHE',
]
df_stats = df_stats.loc[df_stats['nation'].isin(nationalities_to_include)]

# Load Euro Qualifying data.
# A forward-slash path resolves on Windows, Linux and macOS alike; the previous
# backslash form only worked on Windows.
df_qualifying = pd.read_csv('data/aggregated_player_stats.csv')
# Same name normalisation as the other data sets so the join on 'name' lines up
df_qualifying['name'] = df_qualifying['name'].apply(clean_name).str.strip().str.title()

# Join season stats with the qualifying figures and then with the prices.
# Both joins are inner joins on 'name', so only players present in all three
# data sets remain.
df_combined = (
    df_stats
    .merge(df_qualifying[['name', 'minutes', 'saves', 'goals_against', 'save%']], on='name', how='inner')
    .merge(df_prices, on='name', how='inner')
)

# Manual position overrides, keyed by the normalised player name.
# NOTE(review): the blank position for 'Cengiz Ünder' matches none of the
# position codes used for scoring or formations -- presumably meant to
# sideline him; confirm.
position_corrections = {
    'Antoine Griezmann': 'FW',
    'Kingsley Coman': 'FW',
    'Yannick Carrasco': 'MF',
    'Cengiz Ünder': '',
    'Oleksandr Zinchenko': 'MF'
}
# Keep the existing position unless the player has an override
df_combined['pos'] = df_combined['pos'].where(
    ~df_combined['name'].isin(list(position_corrections)),
    df_combined['name'].map(position_corrections),
)

# Difficulty-adjusted fantasy score, and that score per unit of price
df_combined['fantasy_score_with_difficulty'] = calculate_fantasy_score(df_combined, match_difficulty=match_difficulty)
df_combined['score_with_difficulty'] = df_combined['fantasy_score_with_difficulty'].div(df_combined['Price'])

# Supported formations and the number of players each one needs per position.
# A label reads "<DF>-<MF>-<FW>"; every formation fields exactly one GK.
formations = {
    label: dict(zip(('GK', 'DF', 'MF', 'FW'), (1, *(int(part) for part in label.split('-')))))
    for label in ('5-4-1', '5-3-2', '4-5-1', '4-4-2', '4-3-3', '3-5-2', '3-4-3')
}

# Fantasy score obtained per one million of price
df_combined['value_per_million'] = df_combined['fantasy_score_with_difficulty'].div(df_combined['Price'].div(1_000_000))

# Optimization using Linear Programming
def optimize_team(df, budget=50_000_000, positions=None, max_per_nation=4):
    """Pick the highest-scoring line-up within budget via a binary linear program.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate players. Must provide the columns ``'pos'``, ``'nation'``,
        ``'Price'`` and ``'fantasy_score_with_difficulty'``.
    budget : int or float, default 50_000_000
        Upper bound on the summed ``'Price'`` of the selected players.
    positions : dict, optional
        Exact number of players required per position code. Defaults to
        ``{'GK': 1, 'DF': 3, 'MF': 4, 'FW': 3}``.
    max_per_nation : int, default 4
        Maximum number of selected players sharing one ``'nation'`` value.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` in their original order. Empty (same
        columns) when the requested line-up cannot be filled or the solver
        does not report an optimal solution; a warning is logged in that case.

    Notes
    -----
    Every player is classified under exactly one position: the first entry of
    a comma-separated ``'pos'`` label. Players whose position is not requested
    (including blank or missing positions) and players with a missing score or
    price are not eligible. This guarantees the result holds exactly
    ``sum(positions.values())`` players -- matching on substrings would count
    a ``'FW,MF'`` player in two constraints at once and leave position-less
    players unconstrained.
    """
    if positions is None:
        positions = {'GK': 1, 'DF': 3, 'MF': 4, 'FW': 3}

    # Collect the eligible players together with plain-float coefficients
    slot_of = {}
    score_of = {}
    price_of = {}
    nation_members = {}
    for i, raw_pos, score, price, nation in zip(
        df.index, df['pos'], df['fantasy_score_with_difficulty'], df['Price'], df['nation']
    ):
        primary = raw_pos.split(',')[0].strip() if isinstance(raw_pos, str) else ''
        if primary not in positions or pd.isna(score) or pd.isna(price):
            continue
        slot_of[i] = primary
        score_of[i] = float(score)
        price_of[i] = float(price)
        # Players without a nation are not subject to the nationality cap
        if pd.notna(nation):
            nation_members.setdefault(nation, []).append(i)

    # Bail out early when a position cannot be filled at all
    for pos, count in positions.items():
        available = sum(1 for slot in slot_of.values() if slot == pos)
        if available < count:
            logging.warning(
                "Cannot build a team: need %s player(s) at %s but only %s eligible",
                count, pos, available,
            )
            return df.iloc[0:0]
    if not slot_of:
        return df.iloc[0:0]

    # Imported here because the file-level import only brings in the modelling
    # classes; the status constants are needed solely once a model is solved.
    from pulp import LpStatus, LpStatusOptimal

    # Define the optimization problem and one binary decision per eligible player
    prob = LpProblem("Fantasy_Team_Selection", LpMaximize)
    player_vars = LpVariable.dicts("Player", list(slot_of), cat='Binary')

    # Objective function: maximize total fantasy score
    prob += lpSum([player_vars[i] * score_of[i] for i in slot_of])

    # Constraint: total cost should not exceed the budget
    prob += lpSum([player_vars[i] * price_of[i] for i in slot_of]) <= budget

    # Positional constraints: exact head-count per position
    for pos, count in positions.items():
        prob += lpSum([player_vars[i] for i, slot in slot_of.items() if slot == pos]) == count

    # Nationality constraint: cap on players from the same nation
    for members in nation_members.values():
        prob += lpSum([player_vars[i] for i in members]) <= max_per_nation

    # Solve and make sure the result is usable before reading variable values
    prob.solve()
    if prob.status != LpStatusOptimal:
        logging.warning(
            "Team selection did not reach an optimal solution (status: %s)",
            LpStatus[prob.status],
        )
        return df.iloc[0:0]

    # Solver values are floats, so compare against 0.5 rather than exactly 1
    chosen = [i for i in slot_of if (player_vars[i].varValue or 0) > 0.5]
    return df.loc[chosen]

# Run the optimizer over the merged player pool
optimized_team = optimize_team(df_combined)

# Keep only the columns that belong in the report
optimized_team_summary = optimized_team.loc[
    :, ['name', 'Price', 'fantasy_score_with_difficulty', 'value_per_million', 'pos']
]

# Persist the report (row index omitted)
optimized_team_summary.to_csv('optimized_team.csv', index=False)

# Print the report followed by the team totals
print("Optimized Team:")
print(optimized_team_summary)
print(f"Total Cost: {optimized_team['Price'].sum() / 1_000_000:.2f} million")
print(f"Total Fantasy Score: {optimized_team['fantasy_score_with_difficulty'].sum():.2f}")


# Captured notebook output: NameError: name 'logging' is not defined