### Imports

In [77]:
import os
import pandas as pd
import numpy as np
from functools import reduce


In [78]:
# Folder holding the raw input CSVs, and folder that receives the merged outputs.
# Both are relative to the notebook's working directory.
data_dir, new_data_dir = "base_data", "new_data"

In [79]:
# Per-file whitelist of columns to load for the team-level tables.
# Keys are CSV file names inside `data_dir`; values are the columns passed to
# `pd.read_csv(usecols=...)`. 'teamId' is the join key shared by both files and
# must be listed exactly once per file.
columns_to_keep_teams = {
    'teamRoster.csv': ['teamId', 'teamName'],
    'standings.csv': [
        'teamId', 'form', 'teamRank',
        'gamesPlayed', 'wins', 'ties', 'losses',
        'points', 'gf', 'ga', 'gd',
    ],
}

In [80]:
# Read every team-level CSV from `data_dir`, loading only the columns
# whitelisted for that file. Frames are kept in the dict's insertion order.
dfs_teams = [
    pd.read_csv(os.path.join(data_dir, csv_name), usecols=wanted_cols)
    for csv_name, wanted_cols in columns_to_keep_teams.items()
]
   

In [81]:
# Per-file whitelist of columns to load for the match-level tables.
# Keys are CSV file names inside `data_dir`; values are the columns passed to
# `pd.read_csv(usecols=...)`. 'eventId' appears in both lists.
columns_to_keep_matches = {
    'fixtures.csv': [
        'eventId',
        'homeTeamId', 'awayTeamId',
        'homeTeamWinner', 'awayTeamWinner',
        'homeTeamScore', 'awayTeamScore',
        'date',
    ],
    'teamStats.csv': [
        'eventId', 'teamId',
        'possessionPct', 'totalShots', 'shotsOnTarget', 'saves', 'wonCorners',
    ],
}

In [82]:
# Read every match-level CSV from `data_dir`, loading only the columns
# whitelisted for that file. Frames are kept in the dict's insertion order.
dfs_matches = [
    pd.read_csv(os.path.join(data_dir, csv_name), usecols=wanted_cols)
    for csv_name, wanted_cols in columns_to_keep_matches.items()
]
   

In [83]:
# Fold the team frames into one table via successive inner joins on 'teamId'.
merged_df_teams = reduce(
    lambda acc, nxt: acc.merge(nxt, on='teamId', how='inner'),
    dfs_teams,
)

# Column order = order of first appearance across the keep-lists.
# dict.fromkeys drops repeats while preserving insertion order.
ordered_columns_t = list(dict.fromkeys(
    name
    for keep_list in columns_to_keep_teams.values()
    for name in keep_list
))

# Reorder, silently skipping any requested column the merge did not produce.
present_t = set(merged_df_teams.columns)
merged_df_teams = merged_df_teams[[name for name in ordered_columns_t if name in present_t]]

In [84]:
# Fold the match frames into one table via successive inner joins on 'eventId'.
merged_df_matches = reduce(
    lambda acc, nxt: acc.merge(nxt, on='eventId', how='inner'),
    dfs_matches,
)

# Column order = order of first appearance across the keep-lists.
# dict.fromkeys drops repeats while preserving insertion order.
ordered_columns_m = list(dict.fromkeys(
    name
    for keep_list in columns_to_keep_matches.values()
    for name in keep_list
))

# Reorder, silently skipping any requested column the merge did not produce.
present_m = set(merged_df_matches.columns)
merged_df_matches = merged_df_matches[[name for name in ordered_columns_m if name in present_m]]

In [85]:
# Destination of the merged team table inside the output directory.
output_path_teams = os.path.join(new_data_dir, 'database_teams.csv')

# Create the parent folder if needed; exist_ok makes this a no-op on re-runs.
teams_out_dir = os.path.dirname(output_path_teams)
os.makedirs(teams_out_dir, exist_ok=True)

# Write without the positional index so the CSV holds only real columns.
merged_df_teams.to_csv(output_path_teams, index=False)
print(f"Merged CSV saved to {output_path_teams}")


Merged CSV saved to new_data\database_teams.csv


In [86]:
# Destination of the merged match table inside the output directory.
output_path_matches = os.path.join(new_data_dir, 'database_matches.csv')

# Create the parent folder if needed; exist_ok makes this a no-op on re-runs.
matches_out_dir = os.path.dirname(output_path_matches)
os.makedirs(matches_out_dir, exist_ok=True)

# Write without the positional index so the CSV holds only real columns.
merged_df_matches.to_csv(output_path_matches, index=False)
print(f"Merged CSV saved to {output_path_matches}")


Merged CSV saved to new_data\database_matches.csv


In [87]:
# Spot-check: show the fixture hosted by one team on one calendar day.
# Relies on `os`, `pd` and `data_dir` from the import/config cells at the top.

# Load fixtures.csv from the shared data directory instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
fixtures_path = os.path.join(data_dir, 'fixtures.csv')
fixtures_df = pd.read_csv(fixtures_path)

# Ensure 'date' column is in datetime format so the `.dt` accessor works below
fixtures_df['date'] = pd.to_datetime(fixtures_df['date'])

# Match parameters: calendar day and the id of the home team to look for.
target_date = pd.Timestamp('2025-05-10')
target_home_team_id = 132

# Compare on the calendar date only; the 'date' column also carries a kickoff time.
filtered_match = fixtures_df[
    (fixtures_df['date'].dt.date == target_date.date()) &
    (fixtures_df['homeTeamId'] == target_home_team_id)
]

print(filtered_match)


        Rn  seasonType  leagueId  eventId                date  venueId  \
23179  294       12747       720   711720 2025-05-10 16:30:00     1935   

       attendance  homeTeamId  awayTeamId  homeTeamWinner  awayTeamWinner  \
23179       75000         132         268            True           False   

       homeTeamScore  awayTeamScore  homeTeamShootoutScore  \
23179              2              0                      0   

       awayTeamShootoutScore  statusId           updateTime  
23179                      0        28  2025-05-12 04:22:34  
