In [None]:
import requests
import pandas as pd

url = "https://understat.com/getLeagueData/La%20liga/2025"

headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "es,es-ES;q=0.9,en;q=0.8,en-US;q=0.6,es-PE;q=0.5",
    "Referer": "https://understat.com/league/La_liga",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36 Edg/143.0.0.0",
    "X-Requested-With": "XMLHttpRequest"
}

cookies = {
    "PHPSESSID": "4ca0f5lhl3ifmml7bajcokjl6j",
    "UID": "a66ff55c7d0aaa41",
    "_ym_uid": "1765320297494815908",
    "_ym_d": "1765320297",
    "_ym_isad": "2",
    "_ym_visorc": "w",
    "PROMOTIONS": "eyI3Ijp7Im5hbWUiOiJhZHNlbnNlIiwidmlld3MiOlsxNzY1Mzg3MzkwOTM5LDE3NjUzODc0NDc1NTZdLCJjbGlja3MiOltdfX0="
}

# Descargar datos
response = requests.get(url, headers=headers, cookies=cookies)
data = response.json()

# Extraer partidos de todos los equipos
rows = []

for team_id, team_info in data["teams"].items():
    team_name = team_info["title"]

    for match in team_info["history"]:
        row = match.copy()
        row["team"] = team_name
        rows.append(row)

df = pd.DataFrame(rows)

# --- Crear un dataset tipo "read_team_match_stats" ---

# Quiero separar local y visitante
home = df[df["h_a"] == "h"].copy()
away = df[df["h_a"] == "a"].copy()

# Crear ID único de partido
home["match_id"] = home["date"] + "_" + home["scored"].astype(str) + "_" + home["missed"].astype(str)
away["match_id"] = away["date"] + "_" + away["missed"].astype(str) + "_" + away["scored"].astype(str)

# Combinar local y visitante
matches = home.merge(
    away, on="match_id", suffixes=("_home", "_away")
)

# Reordenar columnas a estilo soccerdata
matches = matches[[
    "date_home",
    "team_home",
    "team_away",
    "scored_home", "missed_home",
    "scored_away", "missed_away",
    "xG_home", "xGA_home",
    "xG_away", "xGA_away",
    "npxG_home", "npxGA_home",
    "npxG_away", "npxGA_away",
    "deep_home", "deep_allowed_home",
    "deep_away", "deep_allowed_away",
    "result_home",
    "xpts_home"
]]

matches.head()

Unnamed: 0,date_home,team_home,team_away,scored_home,missed_home,scored_away,missed_away,xG_home,xGA_home,xG_away,...,npxG_home,npxGA_home,npxG_away,npxGA_away,deep_home,deep_allowed_home,deep_away,deep_allowed_away,result_home,xpts_home
0,2025-08-25 19:30:00,Sevilla,Getafe,1,2,2,1,0.833075,1.32944,1.32944,...,0.833075,1.32944,1.32944,0.833075,4,4,4,4,l,0.8265
1,2025-09-12 19:00:00,Sevilla,Elche,2,2,2,2,0.346397,0.781151,0.781151,...,0.346397,0.781151,0.781151,0.346397,5,1,1,5,d,0.7806
2,2025-09-23 19:30:00,Sevilla,Villarreal,1,2,2,1,0.83617,2.40523,2.40523,...,0.83617,2.40523,2.40523,0.83617,9,6,6,9,l,0.3876
3,2025-10-05 14:15:00,Sevilla,Barcelona,4,1,1,4,2.89414,2.4823,2.4823,...,2.15086,1.73902,1.73902,2.15086,5,10,10,5,w,1.6833
4,2025-10-18 12:00:00,Sevilla,Mallorca,1,3,3,1,1.25884,1.95555,1.95555,...,1.25884,1.95555,1.95555,1.25884,4,2,2,4,l,0.8509


In [None]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date_home          157 non-null    object 
 1   team_home          157 non-null    object 
 2   team_away          157 non-null    object 
 3   scored_home        157 non-null    int64  
 4   missed_home        157 non-null    int64  
 5   scored_away        157 non-null    int64  
 6   missed_away        157 non-null    int64  
 7   xG_home            157 non-null    float64
 8   xGA_home           157 non-null    float64
 9   xG_away            157 non-null    float64
 10  xGA_away           157 non-null    float64
 11  npxG_home          157 non-null    float64
 12  npxGA_home         157 non-null    float64
 13  npxG_away          157 non-null    float64
 14  npxGA_away         157 non-null    float64
 15  deep_home          157 non-null    int64  
 16  deep_allowed_home  157 non

In [None]:
import numpy as np

df = matches.copy()

# Probabilidad aproximada usando xG real del local y visitante
df['prob_home_win'] = df['xG_home'] / (df['xG_home'] + df['xG_away'])
df['prob_away_win'] = df['xG_away'] / (df['xG_home'] + df['xG_away'])
df['prob_draw'] = 1 - (df['prob_home_win'] + df['prob_away_win'])

df[['team_home','team_away','xG_home','xG_away',
    'prob_home_win','prob_away_win','prob_draw']].head()

Unnamed: 0,team_home,team_away,xG_home,xG_away,prob_home_win,prob_away_win,prob_draw
0,Sevilla,Getafe,0.833075,1.32944,0.385234,0.614766,0.0
1,Sevilla,Elche,0.346397,0.781151,0.307213,0.692787,0.0
2,Sevilla,Villarreal,0.83617,2.40523,0.257966,0.742034,0.0
3,Sevilla,Barcelona,2.89414,2.4823,0.5383,0.4617,0.0
4,Sevilla,Mallorca,1.25884,1.95555,0.391626,0.608374,0.0


In [None]:
print(df.columns)

Index(['date_home', 'team_home', 'team_away', 'scored_home', 'missed_home',
       'scored_away', 'missed_away', 'xG_home', 'xGA_home', 'xG_away',
       'xGA_away', 'npxG_home', 'npxGA_home', 'npxG_away', 'npxGA_away',
       'deep_home', 'deep_allowed_home', 'deep_away', 'deep_allowed_away',
       'result_home', 'xpts_home', 'prob_home_win', 'prob_away_win',
       'prob_draw'],
      dtype='object')


In [None]:
import numpy as np

def create_betting_target(df, prob_home_col='prob_home_win', prob_away_col='prob_away_win', threshold=0.5):
    """
    Crea la columna target 'apostar / no apostar' basada en un umbral de probabilidad.

    Args:
        df (DataFrame): dataset que contiene las columnas de probabilidades
        prob_home_col (str): nombre de la columna con probabilidad de victoria local
        prob_away_col (str): nombre de la columna con probabilidad de victoria visitante
        threshold (float): probabilidad mínima para considerar apostar

    Returns:
        DataFrame: dataset con columna 'target' agregada
    """
    df = df.copy()
    df['target'] = np.where(
        (df[prob_home_col] > threshold) | (df[prob_away_col] > threshold),
        1,
        0
    )
    return df


In [None]:
#Escalar acciones en zona peligrosa
df['home_deep_scaled'] = df['deep_home'] / (df['deep_home'] + df['deep_away'])
df['away_deep_scaled'] = df['deep_away'] / (df['deep_home'] + df['deep_away'])
#Escalar xG
df['home_xg_scaled'] = df['xG_home'] / (df['xG_home'] + df['xG_away'])
df['away_xg_scaled'] = df['xG_away'] / (df['xG_home'] + df['xG_away'])
#Diferencia NP-XG
df['home_np_diff'] = df['npxG_home'] - df['npxGA_home']
df['away_np_diff'] = df['npxG_away'] - df['npxGA_away']


In [None]:
# Escalado defensivo
df['home_xga_scaled'] = df['xGA_home'] / (df['xGA_home'] + df['xGA_away'])
df['away_xga_scaled'] = df['xGA_away'] / (df['xGA_home'] + df['xGA_away'])

df['home_deep_allowed_scaled'] = df['deep_allowed_home'] / (df['deep_allowed_home'] + df['deep_allowed_away'])
df['away_deep_allowed_scaled'] = df['deep_allowed_away'] / (df['deep_allowed_home'] + df['deep_allowed_away'])

In [None]:
#Escalamos:
df['home_np_diff_scaled'] = df['home_np_diff'] / (abs(df['home_np_diff']) + abs(df['away_np_diff']) + 1e-6)
df['away_np_diff_scaled'] = df['away_np_diff'] / (abs(df['home_np_diff']) + abs(df['away_np_diff']) + 1e-6)

In [None]:
# Ventaja local
home_adv = 0.05

# Ponderaciones
w_xg   = 0.6     # más peso a xG porque es tu mejor métrica
w_deep = 0.25
w_np   = 0.15
draw_factor = 0.5

# Probabilidad de victoria local
df['prob_home_win'] = (
    w_xg * df['home_xg_scaled'] +
    w_deep * df['home_deep_scaled'] +
    w_np * df['home_np_diff_scaled'] +
    home_adv
)

# Probabilidad de victoria visitante
df['prob_away_win'] = (
    w_xg * df['away_xg_scaled'] +
    w_deep * df['away_deep_scaled'] +
    w_np * df['away_np_diff_scaled'] -
    home_adv
)

# Probabilidad bruta de empate
df['draw_raw'] = np.exp(-np.abs(df['xG_home'] - df['xG_away']))

total = df['prob_home_win'] + df['prob_away_win'] + df['draw_raw']

df['prob_home_win'] = df['prob_home_win'] / total
df['prob_away_win'] = df['prob_away_win'] / total
df['prob_draw']     = df['draw_raw'] / total

In [None]:
# --- Crear target apostar / no apostar ---
df = create_betting_target(df, prob_home_col='prob_home_win', prob_away_col='prob_away_win', threshold=0.5)

# --- Revisar resultados (columnas corregidas) ---
df[['team_home','team_away','prob_home_win','prob_away_win','prob_draw','target']].head(10)

Unnamed: 0,team_home,team_away,prob_home_win,prob_away_win,prob_draw,target
0,Sevilla,Getafe,0.227005,0.35569,0.417305,0
1,Sevilla,Elche,0.245529,0.322113,0.432358,0
2,Sevilla,Villarreal,0.264382,0.538838,0.19678,1
3,Sevilla,Barcelona,0.351298,0.210712,0.437991,0
4,Sevilla,Mallorca,0.279362,0.351098,0.36954,0
5,Sevilla,Osasuna,0.260745,0.262448,0.476807,0
6,Sevilla,Real Betis,0.209563,0.360202,0.430234,0
7,Real Sociedad,Espanyol,0.327355,0.24125,0.431395,0
8,Real Sociedad,Real Madrid,0.331922,0.277169,0.390909,0
9,Real Sociedad,Mallorca,0.876423,0.006965,0.116612,1


In [None]:
import pandas as pd
import numpy as np

# Ventana de forma reciente
window = 5

# Columnas disponibles en tu dataset
rolling_cols = [
    'xG_home', 'xG_away',
    'npxG_home', 'npxG_away',
    'deep_home', 'deep_away'
]

# DataFrame vacío para guardar rolling features
rolling_features = pd.DataFrame()

# Lista de equipos (corregido)
teams = pd.concat([df['team_home'], df['team_away']]).unique()

for team in teams:
    # Filtrar partidos del equipo (corregido)
    team_df = df[(df['team_home'] == team) | (df['team_away'] == team)].sort_values('date_home')

    # Crear identificador único del partido
    team_df['match_id'] = team_df.index

    temp = pd.DataFrame()
    temp['match_id'] = team_df['match_id']

    # Rolling para cada columna real
    for col in rolling_cols:
        temp[f'{col}_rolling_mean'] = team_df[col].rolling(window=window, min_periods=1).mean().values

    temp['team'] = team
    rolling_features = pd.concat([rolling_features, temp], axis=0)

rolling_features.head()

Unnamed: 0,match_id,xG_home_rolling_mean,xG_away_rolling_mean,npxG_home_rolling_mean,npxG_away_rolling_mean,deep_home_rolling_mean,deep_away_rolling_mean,team
55,55,2.67586,0.901645,1.93258,0.901645,12.0,4.0,Sevilla
0,0,1.754468,1.115543,1.382827,1.115543,8.0,4.0,Sevilla
134,134,1.839338,1.283802,1.591578,1.283802,8.333333,4.666667,Sevilla
1,1,1.466103,1.158139,1.280283,1.158139,7.5,3.75,Sevilla
119,119,1.354934,1.10491,1.057622,1.10491,6.8,3.8,Sevilla


In [None]:
def resumen_apuesta_rentable(matches, rolling_features, partido_idx, cuotas, n_last=5):
    """
    Genera un resumen de apuesta rentable de un partido con rolling features,
    EV y últimos rivales, adaptado a tus columnas actuales.
    """
    partido = matches.iloc[partido_idx]
    home_team = partido['team_home']
    away_team = partido['team_away']
    match_id = partido['index'] if 'index' in partido else partido.name  # Usa el 'index' si está disponible, sino el nombre del índice

    # --- Rolling del equipo local y visitante ---
    home_rolling = rolling_features[
    (rolling_features['team'] == home_team) &
    (rolling_features['match_id'] == match_id)
    ].iloc[0]

    away_rolling = rolling_features[
        (rolling_features['team'] == away_team) &
        (rolling_features['match_id'] == match_id)
    ].iloc[0]


    # --- Probabilidades implícitas ---
    p_home = partido['prob_home_win']
    p_away = partido['prob_away_win']
    p_draw = partido['prob_draw']

    # --- Valor esperado (EV) ---
    ev_home = p_home * cuotas['home'] - 1
    ev_away = p_away * cuotas['away'] - 1
    ev_draw = p_draw * cuotas['draw'] - 1

    # --- Últimos partidos del local ---
    home_history = matches[
        (matches['team_home'] == home_team) | (matches['team_away'] == home_team)
    ].sort_values('date_home').tail(n_last)

    home_last = []
    for _, row in home_history.iterrows():
        if row['team_home'] == home_team:
            rival = row['team_away']
            xg_for = row['xG_home']
            xg_against = row['xG_away']
            result = "W" if row['scored_home'] > row['scored_away'] else ("L" if row['scored_home'] < row['scored_away'] else "D")
            venue = "Local"
        else:
            rival = row['team_home']
            xg_for = row['xG_away']
            xg_against = row['xG_home']
            result = "W" if row['scored_away'] > row['scored_home'] else ("L" if row['scored_away'] < row['scored_home'] else "D")
            venue = "Visitante"
        home_last.append(f"{venue} vs {rival}: {result} (xG {xg_for:.2f}-{xg_against:.2f})")

    # --- Últimos partidos del visitante ---
    away_history = matches[
        (matches['team_home'] == away_team) | (matches['team_away'] == away_team)
    ].sort_values('date_home').tail(n_last)

    away_last = []
    for _, row in away_history.iterrows():
        if row['team_home'] == away_team:
            rival = row['team_away']
            xg_for = row['xG_home']
            xg_against = row['xG_away']
            result = "W" if row['scored_home'] > row['scored_away'] else ("L" if row['scored_home'] < row['scored_away'] else "D")
            venue = "Local"
        else:
            rival = row['team_home']
            xg_for = row['xG_away']
            xg_against = row['xG_home']
            result = "W" if row['scored_away'] > row['scored_home'] else ("L" if row['scored_away'] < row['scored_home'] else "D")
            venue = "Visitante"
        away_last.append(f"{venue} vs {rival}: {result} (xG {xg_for:.2f}-{xg_against:.2f})")

    # --- Resumen final ---
    resumen = f"""
Partido: {home_team} vs {away_team} ({partido['date_home']})

FORMA RECIENTE - {home_team}
- xG promedio últimos {n_last} partidos: {home_rolling['xG_home_rolling_mean']:.2f}
- NP-xG promedio: {home_rolling['npxG_home_rolling_mean']:.2f}
- Deep completions: {home_rolling['deep_home_rolling_mean']:.2f}

Últimos {n_last} partidos:
{chr(10).join(home_last)}

--------------------------------------------

FORMA RECIENTE - {away_team}
- xG promedio últimos {n_last} partidos: {away_rolling['xG_away_rolling_mean']:.2f}
- NP-xG promedio: {away_rolling['npxG_away_rolling_mean']:.2f}
- Deep completions: {away_rolling['deep_away_rolling_mean']:.2f}

Últimos {n_last} partidos:
{chr(10).join(away_last)}

--------------------------------------------

Probabilidades implícitas:
- Local: {p_home:.2f}
- Empate: {p_draw:.2f}
- Visitante: {p_away:.2f}

Cuotas:
- Local: {cuotas['home']}
- Empate: {cuotas['draw']}
- Visitante: {cuotas['away']}

Valor esperado (EV):
- EV local: {ev_home:.2f}
- EV empate: {ev_draw:.2f}
- EV visitante: {ev_away:.2f}

Apuesta recomendada: {'Sí' if partido['target']==1 else 'No'}
"""
    return resumen

In [None]:
#Colocar cuotas
cuotas = {'home': 2.0, 'draw': 3.5, 'away': 3.0}
print("**************")
print ("No Dones Más")
print("**************")
texto_resumen = resumen_apuesta_rentable(df, rolling_features, 136, cuotas)
print(texto_resumen)

**************
No Dones Más
**************

Partido: Girona vs Espanyol (2025-09-26 19:00:00)

FORMA RECIENTE - Girona
- xG promedio últimos 5 partidos: 1.47
- NP-xG promedio: 1.32
- Deep completions: 8.20

Últimos 5 partidos:
Local vs Alaves: W (xG 1.26-2.01)
Visitante vs Real Betis: D (xG 1.49-1.58)
Local vs Real Madrid: D (xG 1.05-2.21)
Visitante vs Elche: L (xG 0.78-1.60)
Visitante vs Real Sociedad: W (xG 1.17-1.02)

--------------------------------------------

FORMA RECIENTE - Espanyol
- xG promedio últimos 5 partidos: 1.27
- NP-xG promedio: 1.12
- Deep completions: 2.60

Últimos 5 partidos:
Local vs Villarreal: L (xG 0.48-1.47)
Local vs Sevilla: W (xG 1.17-1.80)
Visitante vs Celta Vigo: W (xG 0.60-0.64)
Local vs Rayo Vallecano: W (xG 1.58-0.29)
Visitante vs Getafe: W (xG 0.75-0.31)

--------------------------------------------

Probabilidades implícitas:
- Local: 0.39
- Empate: 0.51
- Visitante: 0.10

Cuotas:
- Local: 2.0
- Empate: 3.5
- Visitante: 3.0

Valor esperado (EV):
- EV

In [None]:
!pip install --upgrade google-genai

Collecting google-genai
  Downloading google_genai-1.55.0-py3-none-any.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.9/47.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading google_genai-1.55.0-py3-none-any.whl (703 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m703.4/703.4 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-genai
  Attempting uninstall: google-genai
    Found existing installation: google-genai 1.54.0
    Uninstalling google-genai-1.54.0:
      Successfully uninstalled google-genai-1.54.0
Successfully installed google-genai-1.55.0


In [None]:
#Ordenar tu DataFrame por Fecha
# Orden descendente por fecha
df_sorted = df.sort_values('date_home', ascending=False).reset_index(drop=False)
df_sorted.head()

Unnamed: 0,index,date_home,team_home,team_away,scored_home,missed_home,scored_away,missed_away,xG_home,xGA_home,...,home_np_diff,away_np_diff,home_xga_scaled,away_xga_scaled,home_deep_allowed_scaled,away_deep_allowed_scaled,home_np_diff_scaled,away_np_diff_scaled,draw_raw,target
0,31,2025-12-13 20:00:00,Getafe,Espanyol,0,1,1,0,0.305925,0.75472,...,-0.448795,0.448795,0.711567,0.288433,0.4,0.6,-0.499999,0.499999,0.638397,0
1,72,2025-12-13 17:30:00,Barcelona,Osasuna,2,0,0,2,2.14468,0.228217,...,1.916463,-1.916463,0.096177,0.903823,0.055556,0.944444,0.5,-0.5,0.147126,1
2,148,2025-12-13 15:15:00,Mallorca,Elche,3,1,1,3,2.581,0.311674,...,2.269326,-2.269326,0.107746,0.892254,0.636364,0.363636,0.5,-0.5,0.103382,1
3,40,2025-12-13 13:00:00,Atletico Madrid,Valencia,2,1,1,2,2.07076,1.26068,...,0.81008,-0.81008,0.378419,0.621581,0.6,0.4,0.5,-0.5,0.444822,0
4,14,2025-12-12 20:00:00,Real Sociedad,Girona,1,2,2,1,1.02349,1.16894,...,-0.14545,0.14545,0.533171,0.466829,0.409091,0.590909,-0.499998,0.499998,0.864633,0


In [None]:
from google import genai

client = genai.Client(api_key="AIzaSyBcR7nyprg_eg65wQpIJ11IEzEYnowR_JY")

def analizar_resumen_gemini(resumen):
    """
    Analiza un resumen de Rolling Features de un partido usando Gemini IA.
    """
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=f"""
Analiza este resumen de Rolling Features de un partido e identifica cómo se contrarrestarían ambos equipos a partir de sus valores.

REGLAS IMPORTANTES (PIENSA COMO APOSTADOR):
- Prioriza siempre el resultado MÁS PROBABLE, no solo el de mayor valor esperado (EV).
- Si hay un favorito claro por xG, forma reciente y localía, NO recomiendes empate como apuesta principal.
- El empate solo puede recomendarse si:
  1) Los equipos tienen niveles similares, o
  2) El favorito muestra debilidades defensivas claras, o
  3) El contexto reciente indica alta probabilidad de igualdad.
- No recomiendes resultados de baja probabilidad únicamente porque tengan EV positivo.

FORMATO:
- Respuesta ordenada y clara, en párrafos.
- Explica como ingeniero analítico, pero con lenguaje de apostador (sin exceso de tecnicismo).
- Analiza cómo se contrarrestan usando xG, NP-xG y Deep Completions.
- Analiza últimos partidos y dificultad de rivales.

APUESTA:
- Recomienda SOLO UNA apuesta principal en categorías clásicas:
  (Gana Local, Gana Visitante, Gana o Empata).
- La apuesta debe ser la más lógica y realista según el análisis global.
- Si el favorito tiene cuota muy baja, sugiere “Gana o Empata” antes que empate puro.

ESCENARIOS:
- Explica brevemente en qué casos ganaría el local, el visitante o empatarían.
- El empate debe describirse solo como escenario posible, NO como apuesta principal salvo que esté claramente justificado.

Resumen del partido:
{resumen}
"""
    )
    return response.text


In [None]:
print(df.columns)
print("************")
print(df_sorted.columns)

Index(['date_home', 'team_home', 'team_away', 'scored_home', 'missed_home',
       'scored_away', 'missed_away', 'xG_home', 'xGA_home', 'xG_away',
       'xGA_away', 'npxG_home', 'npxGA_home', 'npxG_away', 'npxGA_away',
       'deep_home', 'deep_allowed_home', 'deep_away', 'deep_allowed_away',
       'result_home', 'xpts_home', 'prob_home_win', 'prob_away_win',
       'prob_draw', 'home_deep_scaled', 'away_deep_scaled', 'home_xg_scaled',
       'away_xg_scaled', 'home_np_diff', 'away_np_diff', 'home_xga_scaled',
       'away_xga_scaled', 'home_deep_allowed_scaled',
       'away_deep_allowed_scaled', 'home_np_diff_scaled',
       'away_np_diff_scaled', 'draw_raw', 'target'],
      dtype='object')
************
Index(['index', 'date_home', 'team_home', 'team_away', 'scored_home',
       'missed_home', 'scored_away', 'missed_away', 'xG_home', 'xGA_home',
       'xG_away', 'xGA_away', 'npxG_home', 'npxGA_home', 'npxG_away',
       'npxGA_away', 'deep_home', 'deep_allowed_home', 'deep_away',

In [None]:
#Instalar la biblioteca BeautifulSoup
!pip install BeautifulSoup4



In [None]:
#Librerias necesarias
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import csv
from urllib.parse import urljoin

In [None]:
import requests
import pandas as pd
API_KEY = "57f283ddc0f04099a793bc638c82b016"
competicion = "PD"  # Cambiado a 'PD' para La Liga
url = f"https://api.football-data.org/v4/competitions/{competicion}/matches"

headers = {
    "X-Auth-Token": API_KEY
}
response = requests.get(url,headers=headers).json()
response

{'filters': {'season': '2025'},
 'resultSet': {'count': 380,
  'first': '2025-08-15',
  'last': '2026-05-24',
  'played': 157},
 'competition': {'id': 2014,
  'name': 'Primera Division',
  'code': 'PD',
  'type': 'LEAGUE',
  'emblem': 'https://crests.football-data.org/laliga.png'},
 'matches': [{'area': {'id': 2224,
    'name': 'Spain',
    'code': 'ESP',
    'flag': 'https://crests.football-data.org/760.svg'},
   'competition': {'id': 2014,
    'name': 'Primera Division',
    'code': 'PD',
    'type': 'LEAGUE',
    'emblem': 'https://crests.football-data.org/laliga.png'},
   'season': {'id': 2429,
    'startDate': '2025-08-17',
    'endDate': '2026-05-24',
    'currentMatchday': 16,
    'winner': None},
   'id': 544214,
   'utcDate': '2025-08-15T17:00:00Z',
   'status': 'FINISHED',
   'matchday': 1,
   'stage': 'REGULAR_SEASON',
   'group': None,
   'lastUpdated': '2025-12-14T00:20:50Z',
   'homeTeam': {'id': 298,
    'name': 'Girona FC',
    'shortName': 'Girona',
    'tla': 'GIR',
 

In [None]:
df_historico = df_sorted.copy()  # contiene los partidos jugados hasta hoy

In [None]:
df_historico.columns

Index(['index', 'date_home', 'team_home', 'team_away', 'scored_home',
       'missed_home', 'scored_away', 'missed_away', 'xG_home', 'xGA_home',
       'xG_away', 'xGA_away', 'npxG_home', 'npxGA_home', 'npxG_away',
       'npxGA_away', 'deep_home', 'deep_allowed_home', 'deep_away',
       'deep_allowed_away', 'result_home', 'xpts_home', 'prob_home_win',
       'prob_away_win', 'prob_draw', 'home_deep_scaled', 'away_deep_scaled',
       'home_xg_scaled', 'away_xg_scaled', 'home_np_diff', 'away_np_diff',
       'home_xga_scaled', 'away_xga_scaled', 'home_deep_allowed_scaled',
       'away_deep_allowed_scaled', 'home_np_diff_scaled',
       'away_np_diff_scaled', 'draw_raw', 'target'],
      dtype='object')

In [None]:
import pandas as pd
import numpy as np
import requests

# --- 1️⃣ Traer partidos futuros ---
competicion = "PD"  # Cambiado a 'PD' para La Liga
url = f"https://api.football-data.org/v4/competitions/{competicion}/matches"
headers = {
    "X-Auth-Token": API_KEY
}
params = {
    "dateFrom": "2025-12-10",
    "dateTo": "2026-05-24", # Actualizar la fecha para cubrir toda la temporada de La Liga
}

resp = requests.get(url, headers=headers, params=params)
data = resp.json()["matches"]

fixtures = pd.DataFrame([
    {
        "game_id": m["id"],
        "date": m["utcDate"],
        "home_team": m["homeTeam"]["name"],
        "away_team": m["awayTeam"]["name"],
        "status": m["status"]
    } for m in data
])

# --- 2️⃣ Mapear nombres a los usados en tu histórico ---
team_name_mapping = {
    'Girona FC': 'Girona',
    'Rayo Vallecano de Madrid': 'Rayo Vallecano',
    'Villarreal CF': 'Villarreal',
    'Real Oviedo': 'Real Oviedo',
    'RCD Mallorca': 'Mallorca',
    'FC Barcelona': 'Barcelona',
    'Deportivo Alavés': 'Alaves',
    'Levante UD': 'Levante',
    'Valencia CF': 'Valencia',
    'Real Sociedad de Fútbol': 'Real Sociedad',
    'RC Celta de Vigo': 'Celta Vigo',
    'Athletic Club': 'Athletic Club',
    'RCD Espanyol de Barcelona': 'Espanyol',
    'Club Atlético de Madrid': 'Atletico Madrid',
    'Elche CF': 'Elche',
    'Real Madrid CF': 'Real Madrid',
    'CA Osasuna': 'Osasuna',
    'Real Betis Balompié': 'Real Betis',
    'Sevilla FC': 'Sevilla',
    'Getafe CF': 'Getafe' # Añadido Getafe CF
}

fixtures['home_team'] = fixtures['home_team'].replace(team_name_mapping)
fixtures['away_team'] = fixtures['away_team'].replace(team_name_mapping)

# --- 3️⃣ Crear estadísticas históricas agregadas ---

# Estadísticas históricas para locales
team_stats_home = df_historico.groupby("team_home").agg({
    "xG_home": "mean",
    "deep_home": "mean",
    "npxG_home": "mean" , # np-xG para locales
    "xGA_home": "mean",
    "deep_allowed_home": "mean"
}).rename(columns={
    "xG_home": "avg_home_xg",
    "deep_home": "avg_home_deep_completions",
    "npxG_home": "avg_home_np_xg_difference",
    "xGA_home": "home_xga",
    "deep_allowed_home": "home_deep_allowed"
})

# Estadísticas históricas para visitantes
team_stats_away = df_historico.groupby("team_away").agg({
    "xG_away": "mean",
    "deep_away": "mean",
    "npxG_away": "mean",  # np-xG para visitantes
    "xGA_away": "mean",
    "deep_allowed_away": "mean"

}).rename(columns={
    "xG_away": "avg_away_xg",
    "deep_away": "avg_away_deep_completions",
    "npxG_away": "avg_away_np_xg_difference",
    "xGA_away": "away_xga",
    "deep_allowed_away": "away_deep_allowed"
})

# --- 4️⃣ Merge estadísticas históricas con fixtures ---
fixtures = fixtures.merge(team_stats_home, left_on="home_team", right_index=True, how="left")
fixtures = fixtures.merge(team_stats_away, left_on="away_team", right_index=True, how="left")

# Renombrar columnas finales
fixtures = fixtures.rename(columns={
    "avg_home_xg": "home_xg",
    "avg_home_ppda": "home_ppda",
    "avg_home_deep_completions": "home_deep_completions",
    "avg_home_np_xg_difference": "home_np_xg_difference",
    "avg_away_xg": "away_xg",
    "avg_away_ppda": "away_ppda",
    "avg_away_deep_completions": "away_deep_completions",
    "avg_away_np_xg_difference": "away_np_xg_difference"
})

# Llenar posibles NaN
fixtures = fixtures.fillna(fixtures.mean(numeric_only=True))

# --- 5️⃣ Calcular probabilidades aproximadas ---
home_adv = 0.05
draw_factor = 0.5
w_xg = 0.6
w_deep = 0.25
w_np = 0.15

fixtures['prob_home_win'] = (
    w_xg * fixtures['home_xg'] +
    w_deep * fixtures['home_deep_completions'] +
    w_np * fixtures['home_np_xg_difference'] +
    home_adv
)
fixtures['prob_away_win'] = (
    w_xg * fixtures['away_xg'] +
    w_deep * fixtures['away_deep_completions'] +
    w_np * fixtures['away_np_xg_difference'] -
    home_adv
)
fixtures['draw_raw'] = np.exp(-np.abs(fixtures['home_xg'] - fixtures['away_xg']))
total = fixtures['prob_home_win'] + fixtures['prob_away_win'] + fixtures['draw_raw']
fixtures['prob_home_win'] = fixtures['prob_home_win'] / total
fixtures['prob_away_win'] = fixtures['prob_away_win'] / total
fixtures['prob_draw'] = fixtures['draw_raw'] / total

# --- 6️⃣ Crear target apostar/no apostar ---
threshold = 0.5
fixtures['target'] = np.where(
    (fixtures['prob_home_win'] > threshold) | (fixtures['prob_away_win'] > threshold),
    1,
    0
)

# --- Listo ---
fixtures.head()

Unnamed: 0,game_id,date,home_team,away_team,status,home_xg,home_deep_completions,home_np_xg_difference,home_xga,home_deep_allowed,away_xg,away_deep_completions,away_np_xg_difference,away_xga,away_deep_allowed,prob_home_win,prob_away_win,draw_raw,prob_draw,target
0,544369,2025-12-12T20:00:00Z,Real Sociedad,Girona,FINISHED,1.75737,6.75,1.571551,1.23966,5.5,1.13568,5.25,1.04277,1.853109,9.0,0.534448,0.370753,0.537036,0.094799,1
1,544362,2025-12-13T13:00:00Z,Atletico Madrid,Valencia,FINISHED,2.630807,11.111111,2.465636,0.940162,3.666667,0.839884,3.25,0.839884,2.130486,8.125,0.753885,0.219786,0.166806,0.02633,1
2,544367,2025-12-13T15:15:00Z,Mallorca,Elche,FINISHED,1.203491,4.125,1.110582,1.239522,7.25,0.653829,3.5,0.653829,1.810755,6.125,0.510022,0.340554,0.577145,0.149425,1
3,544363,2025-12-13T17:30:00Z,Barcelona,Osasuna,FINISHED,2.789667,12.666667,2.707081,0.917546,3.555556,0.892405,4.555556,0.809819,1.665176,9.0,0.736415,0.242732,0.149979,0.020853,1
4,544368,2025-12-13T20:00:00Z,Getafe,Espanyol,FINISHED,0.81484,2.625,0.81484,1.153737,4.125,1.521465,4.285714,1.415282,1.272151,5.571429,0.3329,0.542442,0.493306,0.124658,1


In [None]:
# --- Variables escaladas a partir de tu histórico ---
# xG escalado
fixtures["home_xg_scaled"] = fixtures["home_xg"] / (fixtures["home_xg"] + fixtures["away_xg"])
fixtures["away_xg_scaled"] = fixtures["away_xg"] / (fixtures["home_xg"] + fixtures["away_xg"])

#Escalado defensivo
fixtures["home_xga_scaled"] = fixtures["home_xga"] / (fixtures["home_xga"] + fixtures["away_xga"])
fixtures["away_xga_scaled"] = fixtures["away_xga"] / (fixtures["home_xga"] + fixtures["away_xga"])

# Deep completions escalado
fixtures["home_deep_scaled"] = fixtures["home_deep_completions"] / (fixtures["home_deep_completions"] + fixtures["away_deep_completions"])
fixtures["away_deep_scaled"] = fixtures["away_deep_completions"] / (fixtures["home_deep_completions"] + fixtures["away_deep_completions"])

# Escalar defensas profundas
fixtures['home_deep_allowed_scaled'] = fixtures['home_deep_allowed'] / (
    fixtures['home_deep_allowed'] + fixtures['away_deep_allowed'] + 1e-6
)
fixtures['away_deep_allowed_scaled'] = fixtures['away_deep_allowed'] / (
    fixtures['home_deep_allowed'] + fixtures['away_deep_allowed'] + 1e-6
)


# NP-xG Difference escalado
fixtures["home_np_diff_scaled"] = fixtures["home_np_xg_difference"] / (
    abs(fixtures["home_np_xg_difference"]) + abs(fixtures["away_np_xg_difference"])
)
fixtures["away_np_diff_scaled"] = fixtures["away_np_xg_difference"] / (
    abs(fixtures["home_np_xg_difference"]) + abs(fixtures["away_np_xg_difference"])
)

In [None]:
home_adv = 0.05
w_xg = 0.5
w_deep = 0.3
w_np = 0.2

# Probabilidades aproximadas
fixtures['prob_home_win'] = (
    w_xg * fixtures['home_xg_scaled'] +
    w_deep * fixtures['home_deep_scaled'] +
    w_np * fixtures['home_np_diff_scaled'] +
    home_adv
)

fixtures['prob_away_win'] = (
    w_xg * fixtures['away_xg_scaled'] +
    w_deep * fixtures['away_deep_scaled'] +
    w_np * fixtures['away_np_diff_scaled'] -
    home_adv
)

# Probabilidad de empate
fixtures['draw_raw'] = np.exp(-np.abs(fixtures['home_xg'] - fixtures['away_xg']))
total = fixtures['prob_home_win'] + fixtures['prob_away_win'] + fixtures['draw_raw']
fixtures['prob_home_win'] /= total
fixtures['prob_away_win'] /= total
fixtures['prob_draw'] = fixtures['draw_raw'] / total


In [None]:
import pandas as pd
import numpy as np

def resumen_fixture_actual(fixture_row, historical_matches_df, n_last=5, cuotas=None):
    """
    Genera un resumen del partido usando las columnas actuales del fixtures.
    - fixture_row: fila del partido a analizar
    - historical_matches_df: dataframe con los partidos históricos para buscar el historial reciente de los equipos.
    - n_last: cuántos partidos anteriores mostrar
    - cuotas: diccionario opcional con cuotas {'home': , 'draw': , 'away': }
    """
    home_team = fixture_row['home_team']
    away_team = fixture_row['away_team']
    # Aseguramos que date_fixture sea timezone-naive
    date_fixture = pd.to_datetime(fixture_row['date']).tz_localize(None)

    # Probabilidades
    p_home = fixture_row['prob_home_win']
    p_away = fixture_row['prob_away_win']
    p_draw = fixture_row['prob_draw']

    # Valor esperado si se pasan cuotas
    ev_home = ev_draw = ev_away = None
    if cuotas is not None:
        ev_home = p_home * cuotas['home'] - 1
        ev_away = p_away * cuotas['away'] - 1
        ev_draw = p_draw * cuotas['draw'] - 1

    # Últimos n partidos del local antes del partido
    home_history = historical_matches_df[
        ((historical_matches_df['team_home'] == home_team) | (historical_matches_df['team_away'] == home_team)) &
        (pd.to_datetime(historical_matches_df['date_home']).dt.tz_localize(None) < date_fixture) # Convert to timezone-naive
    ].sort_values('date_home', ascending=False).head(n_last)

    home_last = []
    for _, row in home_history.iterrows():
        if row['team_home'] == home_team:
            rival = row['team_away']
            xg_for = row['xG_home']
            xg_against = row['xG_away']
            result = "W" if row['scored_home'] > row['scored_away'] else ("L" if row['scored_home'] < row['scored_away'] else "D")
            venue = "Local"
        else:
            rival = row['team_home']
            xg_for = row['xG_away']
            xg_against = row['xG_home']
            result = "W" if row['scored_away'] > row['scored_home'] else ("L" if row['scored_away'] < row['scored_home'] else "D")
            venue = "Visitante"
        home_last.append(f"{venue} vs {rival}: {result} (xG {xg_for:.2f}-{xg_against:.2f})")

    # Últimos n partidos del visitante
    away_history = historical_matches_df[
        ((historical_matches_df['team_home'] == away_team) | (historical_matches_df['team_away'] == away_team)) &
        (pd.to_datetime(historical_matches_df['date_home']).dt.tz_localize(None) < date_fixture) # Convert to timezone-naive
    ].sort_values('date_home', ascending=False).head(n_last)

    away_last = []
    for _, row in away_history.iterrows():
        if row['team_home'] == away_team:
            rival = row['team_away']
            xg_for = row['xG_home']
            xg_against = row['xG_away']
            result = "W" if row['scored_home'] > row['scored_away'] else ("L" if row['scored_home'] < row['scored_away'] else "D")
            venue = "Local"
        else:
            rival = row['team_home']
            xg_for = row['xG_away']
            xg_against = row['xG_home']
            result = "W" if row['scored_away'] > row['scored_home'] else ("L" if row['scored_away'] < row['scored_home'] else "D")
            venue = "Visitante"
        away_last.append(f"{venue} vs {rival}: {result} (xG {xg_for:.2f}-{xg_against:.2f})")

    # Construir resumen
    resumen = f"""
Partido: {home_team} vs {away_team} ({date_fixture.strftime('%Y-%m-%d')})

Estadísticas promedio - {home_team}:
- xG promedio: {fixture_row['home_xg']:.2f}
- NP-xG promedio: {fixture_row['home_np_xg_difference']:.2f}
- Deep completions: {fixture_row['home_deep_completions']:.2f}

Últimos {n_last} partidos del local:
{chr(10).join(home_last)}

--------------------------------------------

Estadísticas promedio - {away_team}:
- xG promedio: {fixture_row['away_xg']:.2f}
- NP-xG promedio: {fixture_row['away_np_xg_difference']:.2f}
- Deep completions: {fixture_row['away_deep_completions']:.2f}

Últimos {n_last} partidos del visitante:
{chr(10).join(away_last)}

--------------------------------------------

Probabilidades implícitas:
- Local: {p_home:.2f}
- Empate: {p_draw:.2f}
- Visitante: {p_away:.2f}
"""
    if cuotas is not None:
        resumen += f"""
Cuotas:
- Local: {cuotas['home']}
- Empate: {cuotas['draw']}
- Visitante: {cuotas['away']}

Valor esperado (EV):
- EV local: {ev_home:.2f}
- EV empate: {ev_draw:.2f}
- EV visitante: {ev_away:.2f}
"""

    return resumen

In [None]:
#Resumen de Gemini
#analisis = analizar_resumen_gemini(resumen)
#print(analisis)

In [None]:
fixtures.columns

Index(['game_id', 'date', 'home_team', 'away_team', 'status', 'home_xg',
       'home_deep_completions', 'home_np_xg_difference', 'home_xga',
       'home_deep_allowed', 'away_xg', 'away_deep_completions',
       'away_np_xg_difference', 'away_xga', 'away_deep_allowed',
       'prob_home_win', 'prob_away_win', 'draw_raw', 'prob_draw', 'target',
       'home_xg_scaled', 'away_xg_scaled', 'home_xga_scaled',
       'away_xga_scaled', 'home_deep_scaled', 'away_deep_scaled',
       'home_deep_allowed_scaled', 'away_deep_allowed_scaled',
       'home_np_diff_scaled', 'away_np_diff_scaled'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# --- 1. Preparar el dataset histórico para el entrenamiento ---

# Columnas a usar como características (X)
features = [
    'home_xg_scaled', 'away_xg_scaled',
    'home_deep_scaled', 'away_deep_scaled',
    'home_np_diff_scaled', 'away_np_diff_scaled',
    #Métricas defensivas
    'home_xga_scaled', 'away_xga_scaled',
    'home_deep_allowed_scaled', 'away_deep_allowed_scaled'
]

X_historical = df_historico[features]

# Variable objetivo (y): resultado del partido desde la perspectiva del local
y_historical = df_historico['result_home']

# Codificar la variable objetivo (result_home) a valores numéricos
# 'w' (victoria local) -> 2, 'd' (empate) -> 1, 'l' (derrota local) -> 0
le = LabelEncoder()
y_historical_encoded = le.fit_transform(y_historical)

# Asegurarse de que no haya NaNs en las características
X_historical = X_historical.fillna(X_historical.mean(numeric_only=True))

# --- 2. Dividir el dataset histórico en entrenamiento y prueba ---
X_train, X_test, y_train, y_test = train_test_split(X_historical, y_historical_encoded, test_size=0.2, random_state=42)

# --- 3. Entrenar el modelo de Regresión Logística ---
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

print("Modelo de Regresión Logística entrenado exitosamente.")

Modelo de Regresión Logística entrenado exitosamente.




In [None]:
# --- 4. Preparar el dataset 'fixtures' para la predicción ---

# Asegurarse de que el DataFrame 'fixtures' tenga las mismas columnas de características
# y manejar NaNs de la misma manera que el entrenamiento
X_fixtures = fixtures[features].fillna(fixtures[features].mean(numeric_only=True))

# --- 5. Realizar predicciones de probabilidad en 'fixtures' ---
# predict_proba devuelve las probabilidades de cada clase (0, 1, 2 para 'l', 'd', 'w')
probabilities = model.predict_proba(X_fixtures)

# Las columnas de probabilidad en 'probabilities' están en el orden de las clases codificadas
# lo.classes_ te da el orden original de las etiquetas ('d', 'l', 'w')
# Necesitamos mapearlas a 'prob_home_win', 'prob_draw', 'prob_away_win'

# Mapear las probabilidades a las columnas correspondientes
prob_map = {label: idx for idx, label in enumerate(le.classes_)}

fixtures['prob_home_win_logreg'] = probabilities[:, prob_map['w']]
fixtures['prob_draw_logreg'] = probabilities[:, prob_map['d']]
fixtures['prob_away_win_logreg'] = probabilities[:, prob_map['l']]

# --- 6. Mostrar las primeras filas de fixtures con las nuevas probabilidades ---
display(fixtures[['home_team', 'away_team', 'prob_home_win_logreg', 'prob_draw_logreg', 'prob_away_win_logreg']].head(20))

Unnamed: 0,home_team,away_team,prob_home_win_logreg,prob_draw_logreg,prob_away_win_logreg
0,Real Sociedad,Girona,0.509235,0.26908,0.221685
1,Atletico Madrid,Valencia,0.63929,0.22964,0.13107
2,Mallorca,Elche,0.573481,0.240949,0.18557
3,Barcelona,Osasuna,0.621794,0.2435,0.134705
4,Getafe,Espanyol,0.284174,0.290299,0.425527
5,Sevilla,Real Oviedo,0.426085,0.29919,0.274725
6,Celta Vigo,Athletic Club,0.349066,0.295306,0.355628
7,Levante,Villarreal,0.332276,0.298577,0.369147
8,Alaves,Real Madrid,0.363262,0.257103,0.379635
9,Rayo Vallecano,Real Betis,0.454838,0.280573,0.264589


In [None]:
# Seleccionar el primer partido del dataframe
partido = fixtures.iloc[8].copy()  # fila 0

# Actualizar las probabilidades del partido con las del modelo de Regresión Logística
partido['prob_home_win'] = partido['prob_home_win_logreg']
partido['prob_away_win'] = partido['prob_away_win_logreg']
partido['prob_draw'] = partido['prob_draw_logreg']

# Llamar a la función pasando esa fila y el df_historico para el historial
cuotas_ejemplo = {'home': 5.65, 'draw': 3.90, 'away': 1.60}
resumen = resumen_fixture_actual(partido, historical_matches_df=df_historico, n_last=5, cuotas=cuotas_ejemplo)

# Ahora lo pasas a Gemini IA
analisis = analizar_resumen_gemini(resumen)
print("=== RESUMEN DEL PARTIDO ===")
print(resumen)
print("\n=== ANÁLISIS DE GEMINI ===")
print(analisis)

=== RESUMEN DEL PARTIDO ===

Partido: Alaves vs Real Madrid (2025-12-14)

Estadísticas promedio - Alaves:
- xG promedio: 1.47
- NP-xG promedio: 1.10
- Deep completions: 3.88

Últimos 5 partidos del local:
Local vs Real Sociedad: W (xG 1.89-1.10)
Visitante vs Barcelona: L (xG 2.09-2.51)
Local vs Celta Vigo: L (xG 0.31-1.32)
Visitante vs Girona: L (xG 2.01-1.26)
Local vs Espanyol: W (xG 2.18-1.04)

--------------------------------------------

Estadísticas promedio - Real Madrid:
- xG promedio: 2.32
- NP-xG promedio: 2.16
- Deep completions: 9.89

Últimos 5 partidos del visitante:
Local vs Celta Vigo: L (xG 2.42-1.61)
Visitante vs Athletic Club: W (xG 2.37-0.92)
Visitante vs Girona: D (xG 2.21-1.05)
Visitante vs Elche: D (xG 3.88-1.60)
Visitante vs Rayo Vallecano: D (xG 0.84-1.61)

--------------------------------------------

Probabilidades implícitas:
- Local: 0.36
- Empate: 0.26
- Visitante: 0.38

Cuotas:
- Local: 5.65
- Empate: 3.9
- Visitante: 1.6

Valor esperado (EV):
- EV local: 1

In [None]:
output_csv_path = 'predicciones_partidos_premier_league.csv'
fixtures.to_csv(output_csv_path, index=False)

print(f"El DataFrame 'fixtures' se ha exportado exitosamente a {output_csv_path}")

El DataFrame 'fixtures' se ha exportado exitosamente a predicciones_partidos_premier_league.csv
