In [None]:
import pandas as pd
import os

In [None]:
# Read the team table first and the match table second, both from new_data/.
teams, matches = (
    pd.read_csv(csv_path)
    for csv_path in (
        "new_data/database_teams.csv",
        "new_data/database_matches.csv",
    )
)

**Merging team stats with `home_` and `away_` prefixes, deriving the match result, and dropping unnecessary columns**

In [None]:
# Attach the home side's team stats, then the away side's, to every match.
# Both joins are inner joins on the team id, so a match survives only when
# each of its two teams has a row in `teams`. The result is ordered by date.
matches = (
    matches
    .merge(
        teams.add_prefix("home_"),
        how="inner",
        left_on="homeTeamId",
        right_on="home_teamId",
    )
    .merge(
        teams.add_prefix("away_"),
        how="inner",
        left_on="awayTeamId",
        right_on="away_teamId",
    )
    .sort_values(by=["date"])
)

def get_result(row):
    """Translate a match's winner flags into a single outcome label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            'homeTeamWinner' and 'awayTeamWinner'.

    Returns:
        'HomeWin' if the home flag equals True, otherwise 'AwayWin' if the
        away flag equals True, 'Draw' if both flags equal False, and
        'Unknown' for anything else (for example missing values).
    """
    # `==` rather than `is`: the flags may be numpy booleans, not Python bools.
    home_flag = row['homeTeamWinner']
    if home_flag == True:
        return 'HomeWin'

    away_flag = row['awayTeamWinner']
    if away_flag == True:
        return 'AwayWin'

    both_lost = home_flag == False and away_flag == False
    return 'Draw' if both_lost else 'Unknown'

# Label every match with its outcome while the winner flags are still present.
matches = matches.assign(Result=matches.apply(get_result, axis=1))

# Remove the join keys, the scores and winner flags, and the raw team totals,
# then restore date order with a fresh 0..n-1 index.
matches = (
    matches
    .drop(columns=[
        "homeTeamId", "awayTeamId",
        "home_teamId", "away_teamId",
        "homeTeamScore", "awayTeamScore",
        "homeTeamWinner", "awayTeamWinner",
        "leagueId",
        "home_Games", "away_Games",
        "away_League",
        "home_Goal_for", "home_Goal_against",
        "away_Goal_for", "away_Goal_against",
        "home_Wins", "home_Ties", "home_Losses",
        "away_Wins", "away_Ties", "away_Losses",
        "home_Points", "away_Points",
    ])
    .sort_values(by=["date"])
    .reset_index(drop=True)
)

**Binning ranks, average points and goal difference**

In [None]:
def bin_rank(value):
    """Map a league rank to a categorical tier (a lower rank number is better).

    Args:
        value: Numeric rank, or a missing value (NaN / None / pd.NA).

    Returns:
        'High' for rank <= 3, 'Mid High' for <= 6, 'Mid' for <= 10,
        'Mid Low' for <= 14, 'Low' for anything larger, and None when the
        rank is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # rank would fall through to the final branch and be labelled 'Low'.
    if pd.isna(value):
        return None

    # Inclusive upper bound of each tier, best tier first.
    for upper_bound, label in ((3, 'High'), (6, 'Mid High'), (10, 'Mid'), (14, 'Mid Low')):
        if value <= upper_bound:
            return label
    return 'Low'
    
def bin_avg_points(value):
    """Map an average-points value to a categorical tier.

    Args:
        value: Numeric average points, or a missing value (NaN / None / pd.NA).

    Returns:
        'Low' for value <= 0.9, 'Mid Low' for <= 1.2, 'Mid' for <= 1.6,
        'Mid High' for <= 2.0, 'High' for anything larger, and None when the
        value is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # value would fall through to the final branch and be labelled 'High'.
    if pd.isna(value):
        return None

    # Inclusive upper bound of each tier, worst tier first.
    for upper_bound, label in ((0.9, 'Low'), (1.2, 'Mid Low'), (1.6, 'Mid'), (2.0, 'Mid High')):
        if value <= upper_bound:
            return label
    return 'High'

def bin_goal_diff(value):
    """Map a goal difference to a categorical tier.

    Args:
        value: Numeric goal difference, or a missing value (NaN / None / pd.NA).

    Returns:
        'Low' for value <= -10, 'Mid Low' for <= 0, 'Mid' for <= 10,
        'Mid High' for <= 25, 'High' for anything larger, and None when the
        value is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # value would fall through to the final branch and be labelled 'High'.
    if pd.isna(value):
        return None

    # Inclusive upper bound of each tier, worst tier first.
    for upper_bound, label in ((-10, 'Low'), (0, 'Mid Low'), (10, 'Mid'), (25, 'Mid High')):
        if value <= upper_bound:
            return label
    return 'High'


In [None]:
# Add a categorical companion column for each numeric team stat, home side
# first and away side second for every stat.
matches = matches.assign(
    home_Rank_binned=matches['home_Rank'].apply(bin_rank),
    away_Rank_binned=matches['away_Rank'].apply(bin_rank),
    home_AvgPts_binned=matches['home_Average_Points'].apply(bin_avg_points),
    away_AvgPts_binned=matches['away_Average_Points'].apply(bin_avg_points),
    home_GD_binned=matches['home_Goal_difference'].apply(bin_goal_diff),
    away_GD_binned=matches['away_Goal_difference'].apply(bin_goal_diff),
)

**Computing team strength, based on the rank, form, average points per game and goal difference**

In [None]:
def compute_strength(rank, form, avg_pts, goal_diff):
    """Combine four categorical ratings into one overall strength label.

    Each argument is scored by its label: 'High' = 4, 'Mid High' = 3,
    'Mid' = 2, 'Mid Low' = 1. Any other value (including 'Low') adds 0.
    The four scores are summed (maximum 16) and the total is bucketed.

    Args:
        rank: Category label for the team's rank.
        form: Category label for the team's form.
        avg_pts: Category label for the team's average points.
        goal_diff: Category label for the team's goal difference.

    Returns:
        'Very Strong' for a total >= 14, 'Strong' for >= 10, 'Average' for
        >= 6, 'Weak' for >= 3, otherwise 'Very Weak'.
    """
    # Points awarded per recognised label; everything else contributes nothing.
    label_points = (('High', 4), ('Mid High', 3), ('Mid', 2), ('Mid Low', 1))

    total = 0
    for category in (rank, form, avg_pts, goal_diff):
        for label, points in label_points:
            if category == label:
                total += points
                break

    # Inclusive lower bound of each strength tier, strongest first.
    for floor, tier in ((14, 'Very Strong'), (10, 'Strong'), (6, 'Average'), (3, 'Weak')):
        if total >= floor:
            return tier
    return 'Very Weak'


In [None]:
# Derive one strength label per side. The selected columns are listed in the
# same order as compute_strength's parameters: rank, form, avg_pts, goal_diff.
matches = matches.assign(
    home_Strength=matches[
        ['home_Rank_binned', 'home_Form', 'home_AvgPts_binned', 'home_GD_binned']
    ].apply(lambda side: compute_strength(*side), axis=1),
    away_Strength=matches[
        ['away_Rank_binned', 'away_Form', 'away_AvgPts_binned', 'away_GD_binned']
    ].apply(lambda side: compute_strength(*side), axis=1),
)

# The raw numeric stats are dropped now that their binned versions exist;
# the trailing copy leaves `matches` as a fresh, independent frame.
matches = matches.drop(columns=[
    'home_Rank', 'away_Rank',
    'home_Average_Points', 'away_Average_Points',
    'home_Goal_difference', 'away_Goal_difference',
]).copy()

**Save the merged DataFrame to CSV**

In [None]:
# Output folder; created up front so the write below cannot fail on a
# missing directory.
new_data_dir = "new_data"
os.makedirs(new_data_dir, exist_ok=True)

# Write the merged match table without the index column and report the path.
output_path_matches = os.path.join(new_data_dir, 'merged_matches.csv')
matches.to_csv(path_or_buf=output_path_matches, index=False)
print(f"Merged CSV saved to {output_path_matches}")