In [4]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_urls(code, league):
    """Scrape player names and profile links from a league's FBref stats page.

    Args:
        code: FBref competition id used in the URL path (e.g. ``9``).
        league: League name. Spaces are replaced with hyphens both in the
            request URL and in the output file name, so ``"Ligue 1"`` is
            saved as ``data/urls-Ligue-1.csv`` (the name the scraping cell
            reads back).

    Returns:
        A DataFrame with the columns ``names`` and ``urls`` (also written to
        ``data/urls-<league>.csv``), or ``None`` when the page could not be
        fetched or no table could be located.
    """
    import os

    # Hyphenate the league name so the URL has no spaces and the CSV name
    # matches what later cells load.
    slug = league.strip().replace(" ", "-")

    try:
        # A timeout keeps an unresponsive server from hanging the notebook.
        response = requests.get(
            f"https://fbref.com/en/comps/{code}/stats/{slug}-Stats", timeout=30
        )
    except requests.RequestException:
        print(f"Failed to fetch data for {league}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch data for {league}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the content between <!-- and -->; the table is looked up inside
    # the HTML comments rather than in the live document.
    comments = re.findall(r"<!--\s*([\s\S]*?)\s*-->", str(soup), re.DOTALL)

    if not comments:
        print(f"No comments found for {league}")
        return None

    table_soup = BeautifulSoup("".join(comments), "html.parser")
    table = table_soup.find("table")

    if not table:
        print(f"No table found for {league}")
        return None

    tbody = table.find("tbody")
    if tbody is None:
        print(f"No table body found for {league}")
        return None

    names = []
    urls = []
    # Rows whose class is "thead" are skipped by the class filter.
    for row in tbody.find_all("tr", class_=lambda x: x != "thead"):
        # Only the first link in each row is recorded.
        link = row.find("a", href=True)
        if link:
            urls.append(link["href"])
            names.append(link.text)

    df = pd.DataFrame({"names": names, "urls": urls})
    # Make sure the output directory exists before writing.
    os.makedirs("data", exist_ok=True)
    df.to_csv(f"data/urls-{slug}.csv", index=False)
    return df

# get_urls("Premier-League")
# League name -> FBref competition id passed to get_urls().
LEAGUES = {
    "Premier League": 9,
    "La Liga": 12,
    "Bundesliga": 20,
    "Serie A": 11,
    "Ligue 1": 13,
}

# Fetch and save the player URL list for every configured league.
for league in LEAGUES:
    code = LEAGUES[league]
    get_urls(code, league)

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

def scrape(player_url):
    """Fetch a player's "All Competitions" page and collect table footer stats.

    Args:
        player_url: Site-relative player path. Its last ``/``-separated
            segment is reused to build the ``all_comps`` URL.

    Returns:
        A dict mapping each table caption to
        ``{"header": [...], "stats": [...]}``, or the tuple ``(-1, -1)`` when
        the request fails or the page contains no tables. Callers should
        treat any non-dict result as a failure.
    """
    parts = player_url.split("/")
    url = f"https://fbref.com{'/'.join(parts[:-1])}/all_comps/{parts[-1]}-Stats---All-Competitions"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.11"
    }
    try:
        # A timeout keeps an unresponsive server from hanging the scrape loop.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {url}. Got {exc!r}")
        return -1, -1

    if response.status_code != 200:
        print(f"Failed to fetch data for {url}. Got {response}")
        return -1, -1

    soup = BeautifulSoup(response.text, "html.parser")
    tables = soup.find_all("table")
    if not tables:
        print(f"No table found for {url}")
        return -1, -1

    def get_stats(table):
        """Read column labels and values from the table's <tfoot> rows."""
        footer_rows = table.find("tfoot").find_all("tr")
        # Labels come from the <th> cells of the second footer row, values
        # from the <td> cells of the first footer row.
        labels = [cell.text for cell in footer_rows[1].find_all("th")]
        values = [cell.text for cell in footer_rows[0].find_all("td")]
        return {"header": labels, "stats": values}

    data = {}
    # The first table on the page is skipped.
    for table in tables[1:]:
        caption = table.caption
        if caption is None:
            # No caption means there is no key to store the table under.
            continue
        title = caption.text
        if title in data:
            # Keep the first table seen for each caption.
            continue
        try:
            data[title] = get_stats(table)
        except (AttributeError, IndexError):
            # Missing <tfoot> or fewer than two footer rows: stop at the first
            # table without the expected footer (as before), but no longer
            # swallow unrelated errors such as KeyboardInterrupt.
            break
    return data

# Scrape every player listed in the saved URL file, keyed by the last segment
# of the player's URL.
all_data = {}
df = pd.read_csv("./data/urls-Ligue-1.csv")
for i, url in enumerate(df["urls"]):
    print(i)
    # Pause between requests.
    time.sleep(1)
    data = scrape(url)
    # scrape() returns a dict on success and a non-dict sentinel on failure
    # (the tuple (-1, -1)), so comparing against -1 never matched and failed
    # scrapes ended up in all_data. Keep only real results.
    if not isinstance(data, dict):
        continue
    all_data[url.split("/")[-1]] = data

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


KeyboardInterrupt: 

In [None]:
# Step 1: Collect every table caption seen across the scraped players.
table_names = set()
for player in all_data.values():
    # Entries that are not dicts (failed scrapes) carry no tables.
    if isinstance(player, dict):
        table_names.update(player.keys())
head = []
# Step 2: Process each table type separately
for table in table_names:
    table_data = []
    column_headers = None

    for player_name, player_tables in all_data.items():
        # Skip failed scrapes and players that do not have this table.
        if not isinstance(player_tables, dict) or table not in player_tables:
            continue

        headers = player_tables[table]["header"]
        stats = player_tables[table]["stats"]

        # Store the first player's headers for reference
        if column_headers is None:
            column_headers = ["Player"] + headers
            head.append(headers)

        # Add player name as the first column
        row = [player_name] + stats
        table_data.append(row)

    # Create DataFrame for the table
    df = pd.DataFrame(table_data, columns=column_headers)

    # Save to CSV, one file per table caption
    file_name = f"{table}.csv"
    df.to_csv(file_name, index=False)
    print(f"Saved {file_name}")

In [None]:
# Show the header list recorded for each exported table (one entry per table).
print(head)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Define CSV file paths
# NOTE(review): "Possession" points at the Standard Stats file, the same file
# as "Standard" — confirm this is intended.
csv_files = {
    "Possession": "/content/Standard Stats Table.csv",
    "Defensive": "/content/Defensive Actions Table.csv",
    "Final_Third": "/content/Goal and Shot Creation Table.csv",
    "Standard": "/content/Standard Stats Table.csv",
    "Passing": "/content/Passing Table.csv",
    "Miscellaneous": "/content/Miscellaneous Stats Table.csv"
}

# Load required tables
df_possession = pd.read_csv(csv_files["Possession"])
df_defensive = pd.read_csv(csv_files["Defensive"])
df_final_third = pd.read_csv(csv_files["Final_Third"])
df_standard = pd.read_csv(csv_files["Standard"])
df_passing = pd.read_csv(csv_files["Passing"])
df_misc = pd.read_csv(csv_files["Miscellaneous"])

# Merge Data on Player Name
df = df_standard[["Player", "MP"]].copy()
df = df.merge(df_possession[["Player", "PrgC", "PrgP", "PrgR"]], on="Player", how="left")
df = df.merge(df_defensive[["Player", "Tkl", "Int", "Blocks"]], on="Player", how="left")
df = df.merge(df_final_third[["Player", "GCA", "SCA"]], on="Player", how="left")
df = df.merge(df_passing[["Player", "xA", "KP", "TotDist", "PrgDist"]], on="Player", how="left")
df = df.merge(df_misc[["Player", "Won%"]], on="Player", how="left")

# Fill missing values with 0
df = df.fillna(0)

# Normalize data using Min-Max Scaling.
# Every column that feeds a component score is scaled to [0, 1]. Previously
# TotDist, PrgDist and Won% were left unscaled, so their raw magnitudes
# dominated the averages below and the final score was not on a 0-100 scale.
feature_cols = ["PrgC", "PrgP", "PrgR", "Tkl", "Int", "Blocks",
                "GCA", "SCA", "xA", "KP", "TotDist", "PrgDist",
                "MP", "Won%"]
scaler = MinMaxScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# Define weights for each component (they sum to 1)
weights = {
    "Possession": 0.25,
    "Defensive": 0.25,
    "Final_Third": 0.4,
    "Adaptability": 0.1
}

# Compute Component Scores (each is the mean of its scaled inputs, so in [0, 1])
df["Possession_Score"] = (df["PrgC"] + df["PrgP"] + df["PrgR"]) / 3
df["Defensive_Score"] = (df["Tkl"] + df["Int"] + df["Blocks"]) / 3
df["Final_Third_Score"] = (df["GCA"] + df["SCA"] + df["xA"] + df["KP"] + df["TotDist"] + df["PrgDist"]) / 6
df["Adaptability_Score"] = (df["MP"] + df["Won%"]) / 2

# Compute final TIS Score
df["TIS"] = (
    weights["Possession"] * df["Possession_Score"] +
    weights["Defensive"] * df["Defensive_Score"] +
    weights["Final_Third"] * df["Final_Third_Score"] +
    weights["Adaptability"] * df["Adaptability_Score"]
) * 100  # Scale to 0-100

# Save TIS scores to CSV
df[["Player", "TIS"]].to_csv("Tactical_Impact_Score.csv", index=False)

# Display top players by TIS
print(df[["Player", "TIS"]].sort_values(by="TIS", ascending=False).head(10))


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Define relevant files and columns for TIS calculation
FILES_COLUMNS = {
    "Possession Table.csv": ["Player", "Comp", "90s", "Touches", "PrgC", "CPA", "PrgR"],
    "Defensive Actions Table.csv": ["Player", "Comp", "90s", "Tkl", "Int", "Tkl+Int", "Blocks"],
    "Goal and Shot Creation Table.csv": ["Player", "Comp", "90s", "SCA", "GCA"]
}

def load_relevant_data(base_path="/content/"):
    """Load the CSV files listed in FILES_COLUMNS and outer-merge them.

    Args:
        base_path: Directory containing the CSV files. A trailing slash is
            optional.

    Returns:
        One DataFrame merged on ``Player`` and ``Comp``. Columns that appear
        in more than one file (such as ``90s``) are coalesced into a single
        column, taking the first non-missing value in file order, instead of
        being split into suffixed duplicates.
    """
    import os

    merge_keys = ["Player", "Comp"]
    dup_suffix = "_dup"

    merged_df = None
    for file, columns in FILES_COLUMNS.items():
        frame = pd.read_csv(os.path.join(base_path, file), usecols=columns)
        if merged_df is None:
            merged_df = frame
            continue

        # Keep the left-hand name for shared non-key columns and tag the
        # right-hand copy so it can be folded back in below.
        merged_df = merged_df.merge(
            frame, on=merge_keys, how="outer", suffixes=("", dup_suffix)
        )
        for dup_col in [c for c in merged_df.columns if c.endswith(dup_suffix)]:
            base_col = dup_col[: -len(dup_suffix)]
            # Rows present only in the right-hand file have NaN on the left;
            # fill them from the duplicate, then drop the duplicate.
            merged_df[base_col] = merged_df[base_col].combine_first(merged_df[dup_col])
            merged_df = merged_df.drop(columns=dup_col)

    return merged_df

def calculate_tis(df):
    """Compute the Tactical Impact Score (TIS) for each Player/Comp row.

    Args:
        df: DataFrame containing ``Player``, ``Comp``, ``90s`` and the stat
            columns ``Touches``, ``PrgC``, ``CPA``, ``PrgR``, ``Tkl``,
            ``Int``, ``Tkl+Int``, ``Blocks``, ``SCA`` and ``GCA``. It is not
            modified.

    Returns:
        A DataFrame with the columns ``Player``, ``Comp`` and ``TIS``, where
        TIS is a weighted sum (0.4 / 0.3 / 0.3) of three per-90 component
        scores, each min-max scaled to the range 0-100.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Treat infinities as missing, then fill every missing value with 0.
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Nothing to scale: return an empty result instead of letting the scaler
    # raise on zero samples.
    if df.empty:
        df["TIS"] = pd.Series(dtype=float)
        return df[["Player", "Comp", "TIS"]]

    # Ensure 90s is non-zero to avoid division errors. Zero entries are
    # replaced with the column mean; if that mean is itself unusable (for
    # example every value is 0), fall back to 1.0 so the division stays finite.
    mean_90s = df["90s"].mean()
    if not mean_90s > 0:
        mean_90s = 1.0
    df["90s"] = df["90s"].replace(0, np.nan).fillna(mean_90s)

    # Convert numeric data to float64 for precision
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].astype(float)

    # Per-90 average of each component's inputs. The original
    # `... / df["90s"]*4` multiplied by the input count because of operator
    # precedence; the divisor is now parenthesised. The min-max scaling below
    # is unaffected by a constant factor, so the final scores do not change.
    df["Possession Influence"] = (
        (df["Touches"] + df["PrgC"] + df["CPA"] + df["PrgR"]) / (df["90s"] * 4)
    )
    df["Defensive Contribution"] = (
        (df["Tkl"] + df["Int"] + df["Tkl+Int"] + df["Blocks"]) / (df["90s"] * 4)
    )
    df["Final Third Impact"] = (df["SCA"] + df["GCA"]) / (df["90s"] * 2)

    # Normalize scores to 0-100
    component_cols = ["Possession Influence", "Defensive Contribution", "Final Third Impact"]
    scaler = MinMaxScaler(feature_range=(0, 100))
    df[component_cols] = scaler.fit_transform(df[component_cols])

    # Compute TIS as weighted sum
    df["TIS"] = (0.4 * df["Possession Influence"] +
                 0.3 * df["Defensive Contribution"] +
                 0.3 * df["Final Third Impact"])
    return df[["Player", "Comp", "TIS"]]

# Run the pipeline: load the merged tables, score them, rank by TIS (highest first).
df = load_relevant_data()
tis_df = calculate_tis(df).sort_values(by="TIS", ascending=False)

# Write the ranked scores to disk and confirm.
tis_df.to_csv("Tactical_Impact_Score.csv", index=False)
print("TIS calculation complete and saved to Tactical_Impact_Score.csv")


In [None]:
# Show the first 20 rows; tis_df was sorted by TIS in descending order in the previous cell.
tis_df.head(20)