**Imports**

In [None]:
import os
import pandas as pd
import numpy as np
from functools import reduce

In [None]:
# Directory the source CSV files are read from (relative to the working directory).
data_dir = "../base_data"
# Directory the CSV produced by this notebook is written to.
new_data_dir = "new_data"

**Source files and the columns to load from each**

In [None]:
# Columns to load from each source CSV, keyed by file name.
# 'teamId' is listed once per file because it is the key the frames are merged on.
columns_to_keep_teams = {
    'teamRoster.csv': ['teamId', 'teamName'],
    'standings.csv': [
        'teamId', 'leagueId', 'form', 'teamRank',
        'gamesPlayed', 'wins', 'ties', 'losses', 'points',
        'gf', 'ga', 'gd', 'timeStamp',
    ],
}


In [None]:
# Read every source CSV, loading only the columns listed for that file.
dfs_teams = [
    pd.read_csv(os.path.join(data_dir, csv_name), usecols=wanted_cols)
    for csv_name, wanted_cols in columns_to_keep_teams.items()
]

In [None]:
def _merge_on_team_id(left, right):
    """Inner-join two frames on their shared 'teamId' column."""
    return pd.merge(left, right, on='teamId', how='inner')


# Fold all loaded frames into a single frame, joining pairwise on teamId.
merged_df_teams = reduce(_merge_on_team_id, dfs_teams)

# Requested column names in first-seen order, without repeats
# (dict keys keep insertion order and are unique).
ordered_columns_t = list(dict.fromkeys(
    name
    for names in columns_to_keep_teams.values()
    for name in names
))

# Keep only the requested columns that actually exist after the merge,
# then discard every row that has a missing value in any of them.
available_columns = [name for name in ordered_columns_t if name in merged_df_teams.columns]
merged_df_teams = merged_df_teams[available_columns].dropna()


**Converting float values to integers for better readability**

In [None]:
# Columns cast to int. The cast cannot handle missing values; rows containing
# NaN were already removed by the dropna() applied after the merge.
cols_to_int = ['teamRank', 'gamesPlayed', 'wins', 'ties', 'losses', 'points', 'gf', 'ga', 'gd']
merged_df_teams = merged_df_teams.astype({name: int for name in cols_to_int})


**Keeping only Bundesliga (GER1) standings, i.e. rows with leagueId = 720**

In [None]:
# Keep only the rows whose leagueId equals 720.
is_target_league = merged_df_teams['leagueId'].eq(720)
merged_df_teams = merged_df_teams.loc[is_target_league]

In [None]:
# Output column name -> source column whose first value per team is kept.
_first_value_sources = {
    'League': 'leagueId',
    'Rank': 'teamRank',
    'Team': 'teamName',
    'Form': 'form',
    'Games': 'gamesPlayed',
    'Wins': 'wins',
    'Ties': 'ties',
    'Losses': 'losses',
    'Points': 'points',
    'Goal_for': 'gf',
    'Goal_against': 'ga',
    'Goal_difference': 'gd',
}

# Collapse to one row per team and order the result by league, then rank.
# NOTE(review): 'first' takes the first row per team in the current row order;
# 'timeStamp' is not consulted here - confirm the first row is the intended snapshot.
merged_df_teams = (
    merged_df_teams
    .groupby('teamId', as_index=False)
    .agg(**{out_name: (src_name, 'first') for out_name, src_name in _first_value_sources.items()})
    .sort_values(by=['League', 'Rank'])
)


In [None]:
# Short league codes keyed by numeric leagueId.
league_name_map = {720: "GER1"}

# Swap the numeric ids in 'League' for their codes; ids missing from the map become NaN.
merged_df_teams = merged_df_teams.assign(
    League=merged_df_teams['League'].map(league_name_map)
)


**Calculating average points per match, which is a good indicator of performance**

In [None]:
# Points earned per game played, rounded to two decimals.
points_per_game = merged_df_teams["Points"].div(merged_df_teams["Games"])
merged_df_teams["avg_points_per_match"] = points_per_game.round(2)


**Scoring the form over the last 5 games with recency weights: the more recent the match, the higher its weight**

In [None]:
# Points awarded per result letter; any other character scores 0.
form_points = {'W': 3, 'D': 1, 'L': 0}
# Recency weights, oldest game first: the last entry (1.0) belongs to the most recent game.
weights = [0.5, 0.55, 0.65, 0.8, 1.0]

def weighted_form_score(form_str):
    """Return the recency-weighted points total for a form string.

    The last character of ``form_str`` is treated as the most recent game and
    always receives the highest weight. Characters and weights are paired
    from the right, so a string shorter than ``len(weights)`` still gives its
    latest game the weight 1.0, and a longer string is scored on its last
    ``len(weights)`` characters only.

    Parameters
    ----------
    form_str : str
        Sequence of result letters ('W', 'D', 'L'), oldest first.

    Returns
    -------
    float or int
        Weighted score rounded to 2 decimals (0 for an empty string).
    """
    # Pair from the most recent game backwards so the alignment does not
    # depend on the length of the string.
    newest_first = zip(reversed(form_str), reversed(weights))
    score = sum(form_points.get(ch, 0) * w for ch, w in newest_first)
    # Weights have two decimals, so rounding removes float accumulation error
    # and keeps scores that sit exactly on a bin edge deterministic.
    return round(score, 2)

# Numeric form score per team; kept as a standalone Series because it is only
# needed to derive the class below.
form_scores = merged_df_teams['Form'].apply(weighted_form_score)

# Class edges and names: [0, 3.5] -> low, (3.5, 7.0] -> medium, (7.0, 10.5] -> high.
bins = [0, 3.5, 7.0, 10.5]
labels = ['low', 'medium', 'high']

# Attach the form class and drop the raw form string it was derived from.
merged_df_teams = (
    merged_df_teams
    .assign(
        form_score_bin=pd.cut(
            form_scores,
            bins=bins,
            labels=labels,
            include_lowest=True,
        )
    )
    .drop(columns=['Form'])
)

In [None]:
# Final column name -> current column it is taken from, in output order.
_final_column_sources = {
    'League': 'League',
    'Rank': 'Rank',
    'Team': 'Team',
    'Form': 'form_score_bin',
    'Average_Points': 'avg_points_per_match',
    'Games': 'Games',
    'Wins': 'Wins',
    'Ties': 'Ties',
    'Losses': 'Losses',
    'Points': 'Points',
    'Goal_for': 'Goal_for',
    'Goal_against': 'Goal_against',
    'Goal_difference': 'Goal_difference',
}

# Re-aggregate per team (first value of each column) to rename and reorder
# the columns, then sort by league and rank.
merged_df_teams = (
    merged_df_teams
    .groupby('teamId', as_index=False)
    .agg(**{out_name: (src_name, 'first') for out_name, src_name in _final_column_sources.items()})
    .sort_values(by=['League', 'Rank'])
)


**Output df to csv**

In [None]:
# Make sure the output directory exists, then write the table without the index column.
os.makedirs(new_data_dir, exist_ok=True)
output_path_teams = os.path.join(new_data_dir, 'database_teams.csv')
merged_df_teams.to_csv(output_path_teams, index=False)

print(f"Merged CSV saved to {output_path_teams}")
