### Imports

In [None]:
import os
import pandas as pd
import numpy as np
from functools import reduce


In [None]:
# Directory settings for this notebook: `data_dir` is where the source CSVs
# are read from, `new_data_dir` is where the processed CSVs are written.
data_dir, new_data_dir = "../base_data", "new_data"

In [None]:
# Maps each source CSV file name to the columns that should be loaded from it.
# The order of the names matters: it is reused later to order the columns of
# the merged table.
columns_to_keep_matches = {
    'fixtures.csv': [
        'eventId',
        'homeTeamId',
        'awayTeamId',
        'leagueId',
        'homeTeamWinner',
        'awayTeamWinner',
        'homeTeamScore',
        'awayTeamScore',
        'date',
    ],
}

In [None]:
# Load every configured CSV from `data_dir`, reading only the columns listed
# for it. The frames are kept in the same order as the entries of
# `columns_to_keep_matches`.
dfs_matches = [
    pd.read_csv(os.path.join(data_dir, csv_name), usecols=wanted_columns)
    for csv_name, wanted_columns in columns_to_keep_matches.items()
]
   

In [None]:
def _join_on_event(left, right):
    """Inner-join two frames on their shared 'eventId' column."""
    return pd.merge(left, right, on='eventId', how='inner')


# Fold all loaded frames into a single table; the inner join keeps only
# events that appear in every frame.
merged_df_matches = reduce(_join_on_event, dfs_matches)

# Desired column order: each configured column name once, in order of first
# appearance across the per-file lists (dict keys preserve insertion order).
ordered_columns_m = list(dict.fromkeys(
    name
    for names in columns_to_keep_matches.values()
    for name in names
))

# Select only the configured columns that actually exist after the merge,
# then discard every row that has a missing value in any of them.
available_columns = [
    name for name in ordered_columns_m if name in merged_df_matches.columns
]
merged_df_matches = merged_df_matches[available_columns].dropna()

In [None]:
# League IDs to keep in the final table.
target_leagues = [720, 740, 700, 730]

# Row filters, evaluated on the same frame and combined into one mask.
# NOTE(review): the cutoff is a plain string comparison on the raw 'date'
# column; this is only chronological if the CSV stores ISO-style
# (year-first) dates - confirm against the source data.
after_cutoff = merged_df_matches['date'] > '2024-08-30'
in_target_league = merged_df_matches['leagueId'].isin(target_leagues)

# .copy() makes the filtered result an independent frame. Without it, the
# column assignment below writes into a slice of the previous frame, which
# raises SettingWithCopyWarning on pandas versions without Copy-on-Write.
merged_df_matches = merged_df_matches[after_cutoff & in_target_league].copy()

# Replace the raw date with a display string such as '19:30, 31.08.24'.
# From here on 'date' is text, no longer sortable chronologically as-is.
merged_df_matches['date'] = (
    pd.to_datetime(merged_df_matches['date']).dt.strftime('%H:%M, %d.%m.%y')
)
merged_df_matches

In [None]:
# Inspect a single match: take the first row whose eventId equals `match_id`.
# If no row matches, the positional lookup below fails with an IndexError.
match_id = 705018
is_requested_match = merged_df_matches['eventId'].eq(match_id)
match_row = merged_df_matches.loc[is_requested_match].iloc[0]
match_row

In [None]:
# Collapse the table to one row per eventId, keeping the first value seen for
# every other column.
per_match = merged_df_matches.groupby(['eventId']).agg(
    homeTeamId=pd.NamedAgg(column="homeTeamId", aggfunc="first"),
    awayTeamId=pd.NamedAgg(column="awayTeamId", aggfunc="first"),
    leagueId=pd.NamedAgg(column="leagueId", aggfunc="first"),
    homeTeamScore=pd.NamedAgg(column="homeTeamScore", aggfunc="first"),
    awayTeamScore=pd.NamedAgg(column="awayTeamScore", aggfunc="first"),
    homeTeamWinner=pd.NamedAgg(column="homeTeamWinner", aggfunc="first"),
    awayTeamWinner=pd.NamedAgg(column="awayTeamWinner", aggfunc="first"),
    date=pd.NamedAgg(column="date", aggfunc="first"),
)

# 'date' holds display strings in '%H:%M, %d.%m.%y' form at this point, so
# sorting on the column directly would order matches by time of day rather
# than chronologically. Parse the strings back into timestamps and sort on
# that temporary key instead; the key is dropped again so the output columns
# (and the string 'date') are unchanged.
# NOTE: the format below must stay in sync with the strftime format used when
# the 'date' column was converted to text.
kickoff = pd.to_datetime(per_match['date'], format='%H:%M, %d.%m.%y')

# Sort within each league by kickoff, then turn eventId back into a column.
merged_df_matches = (
    per_match.assign(_kickoff=kickoff)
    .sort_values(by=['leagueId', '_kickoff'])
    .drop(columns='_kickoff')
    .reset_index()
)

In [None]:
# Define custom league name mapping
league_name_map = {
    720: "GER1",
    740: "ESP1",
    700: "ENG1",
    730: "ITA1"
}

# Replace leagueId with custom league names
merged_df_matches['leagueId'] = merged_df_matches['leagueId'].map(league_name_map)

In [None]:
# Write the processed matches table to '<new_data_dir>/database_matches.csv',
# creating the target folder first if it does not exist yet.
output_path_matches = os.path.join(new_data_dir, 'database_matches.csv')
output_folder = os.path.dirname(output_path_matches)
os.makedirs(output_folder, exist_ok=True)

# index=False: the row index is not written as a column in the CSV.
merged_df_matches.to_csv(output_path_matches, index=False)
print(f"Merged CSV saved to {output_path_matches}")
