In [None]:
import pandas as pd
import os

# Read the raw team table and the raw match table (in that order) from the
# `new_data` folder.
teams, matches = (
    pd.read_csv(csv_path)
    for csv_path in (
        "new_data/database_teams.csv",
        "new_data/database_matches.csv",
    )
)


In [None]:
# Attach team-level stats to every match twice: once for the home side and
# once for the away side. Prefixing the team columns keeps the two copies
# apart. Both joins are inner joins, so a match whose team id has no row in
# `teams` is dropped. The result is then ordered by the `date` column.
matches = (
    matches
    .merge(
        teams.add_prefix("home_"),
        how="inner",
        left_on="homeTeamId",
        right_on="home_teamId",
    )
    .merge(
        teams.add_prefix("away_"),
        how="inner",
        left_on="awayTeamId",
        right_on="away_teamId",
    )
    .sort_values("date")
)

def get_result(row):
    """Translate a match row's winner flags into an outcome label.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'homeTeamWinner'`` and ``'awayTeamWinner'``.

    Returns
    -------
    str
        ``'HomeWin'`` when the home flag equals True, ``'AwayWin'`` when the
        away flag equals True, ``'Draw'`` when both flags equal False, and
        ``'Unknown'`` for anything else (e.g. missing flags).
    """
    # Equality (not identity) is used on purpose: the flags may not be plain
    # Python bools when the row comes from a DataFrame.
    home_flag = row['homeTeamWinner']
    if home_flag == True:
        return 'HomeWin'

    away_flag = row['awayTeamWinner']
    if away_flag == True:
        return 'AwayWin'

    if home_flag == False and away_flag == False:
        return 'Draw'

    return 'Unknown'

# Derive the outcome label first: it needs the winner flags that are dropped
# in the next step.
matches['Result'] = matches.apply(get_result, axis=1)

# Drop identifiers, scores, winner flags and the raw season totals, then parse
# the "HH:MM, dd.mm.yy" date strings and order the matches chronologically.
matches = (
    matches
    .drop(columns=[
        # join keys and league identifiers
        "homeTeamId", "awayTeamId",
        "home_teamId", "away_teamId",
        "leagueId", "away_League",
        # match outcome columns (replaced by `Result`)
        "homeTeamScore", "awayTeamScore",
        "homeTeamWinner", "awayTeamWinner",
        # raw season totals of the home side
        "home_Games", "home_Goal_for", "home_Goal_against",
        "home_Wins", "home_Ties", "home_Losses", "home_Points",
        # raw season totals of the away side
        "away_Games", "away_Goal_for", "away_Goal_against",
        "away_Wins", "away_Ties", "away_Losses", "away_Points",
    ])
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format='%H:%M, %d.%m.%y'))
    .sort_values("date")
    .reset_index(drop=True)
)
matches

In [None]:
def bin_rank(rank):
    """Bucket a numeric rank into a coarse tier.

    Returns ``'Top'`` for ranks up to 4, ``'Mid'`` for ranks up to 10 and
    ``'Bottom'`` for everything else (including values that fail both
    comparisons, such as NaN).
    """
    for upper_bound, tier in ((4, 'Top'), (10, 'Mid')):
        if rank <= upper_bound:
            return tier
    return 'Bottom'
    
def bin_avg_points(points):
    """Bucket an average-points value into ``'High'``, ``'Mid'`` or ``'Low'``.

    ``'High'`` starts at 2.0 and ``'Mid'`` at 1.2; anything below that, or any
    value that fails both comparisons (such as NaN), is ``'Low'``.
    """
    for lower_bound, label in ((2.0, 'High'), (1.2, 'Mid')):
        if points >= lower_bound:
            return label
    return 'Low'

def bin_goal_diff(gd):
    """Bucket a goal difference into ``'High'``, ``'Mid'`` or ``'Low'``.

    ``'High'`` means a difference of at least 15 and ``'Mid'`` any other
    non-negative difference. Negative values, and values that fail both
    comparisons (such as NaN), are ``'Low'``.
    """
    return 'High' if gd >= 15 else ('Mid' if gd >= 0 else 'Low')


In [None]:
# Work on a copy of `matches` and append one categorical column per
# (numeric source column, side) pair. The pairs are generated in the order
# Rank, Average_Points, Goal_difference, each for home and then away.
df = matches.copy().assign(**{
    f"{side}_{binned_name}": matches[f"{side}_{source_name}"].apply(binner)
    for source_name, binned_name, binner in (
        ("Rank", "Rank_binned", bin_rank),
        ("Average_Points", "AvgPts_binned", bin_avg_points),
        ("Goal_difference", "GD_binned", bin_goal_diff),
    )
    for side in ("home", "away")
})

df

In [None]:
def compute_strength(rank, form, avg_pts, goal_diff):
    """Combine four categorical indicators into an overall strength label.

    Each indicator contributes 2 points for its best label (``'Top'`` for
    ``rank``, ``'High'`` for the other three), 1 point for ``'Mid'`` and
    0 points for any other value.

    Returns
    -------
    str
        ``'Strong'`` for a total of 6 or more, ``'Average'`` for 3 to 5,
        otherwise ``'Weak'``.
    """
    def _points(level, best_label):
        # 2 for the indicator's best label, 1 for 'Mid', 0 for anything else.
        if level == best_label:
            return 2
        return 1 if level == 'Mid' else 0

    score = (
        _points(rank, 'Top')
        + _points(form, 'High')
        + _points(avg_pts, 'High')
        + _points(goal_diff, 'High')
    )

    if score >= 6:
        return 'Strong'
    return 'Average' if score >= 3 else 'Weak'


In [None]:
# Score each side of every match from its binned rank, form, binned average
# points and binned goal difference (passed to compute_strength in that order).
for side in ("home", "away"):
    df[f"{side}_Strength"] = df[[
        f"{side}_Rank_binned",
        f"{side}_Form",
        f"{side}_AvgPts_binned",
        f"{side}_GD_binned",
    ]].apply(lambda row: compute_strength(*row), axis=1)

# The raw numeric columns are no longer needed once their binned versions
# exist, so they are removed.
df = df.drop(columns=[
    'home_Rank', 'away_Rank',
    'home_Average_Points', 'away_Average_Points',
    'home_Goal_difference', 'away_Goal_difference',
])

matches = df.copy()
matches

In [None]:
# Persist the engineered match table as a CSV inside `new_data`.
new_data_dir = "new_data"
output_path_matches = os.path.join(new_data_dir, 'merged_matches.csv')

# The output file sits directly in `new_data_dir`, so that is the only folder
# that has to exist before writing.
os.makedirs(new_data_dir, exist_ok=True)
matches.to_csv(output_path_matches, index=False)
print(f"Merged CSV saved to {output_path_matches}")

In [None]:
# Create the graph
import matplotlib.pyplot as plt
import networkx as nx

G = nx.DiGraph()

# Conceptual structure: each side's form, rank and goal difference feed that
# side's strength, and the two strengths jointly determine the result.
edges = [
    ("home_Form", "home_Strength"),
    ("home_Rank", "home_Strength"),
    ("home_Goal_difference", "home_Strength"),

    ("away_Form", "away_Strength"),
    ("away_Rank", "away_Strength"),
    ("away_Goal_difference", "away_Strength"),

    ("home_Strength", "Result"),
    ("away_Strength", "Result"),
]

G.add_edges_from(edges)

# Without explicit positions nx.draw computes a spring layout from a random
# start, so the figure would look different on every run. A seeded layout
# makes the picture reproducible.
pos = nx.spring_layout(G, seed=42)

# Draw the graph
plt.figure(figsize=(12, 8))
nx.draw(
    G, pos, with_labels=True, node_size=3000, node_color='skyblue',
    arrowsize=20, font_size=11, font_weight='bold', edge_color='gray'
)
plt.title("DAG", fontsize=16)
plt.axis("off")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination

# Load the engineered match table from disk. Everything below works on this
# frame (not on the in-memory `matches` variable), so the cell does not depend
# on state left behind by earlier cells.
df = pd.read_csv("new_data/merged_matches.csv")
df['date'] = pd.to_datetime(df['date'])

# Train on every match played up to and including 15.05.2025. `date` carries a
# kick-off time, so the comparison is done on calendar days: a plain `<=`
# against the midnight timestamp would exclude matches played later that day.
cutoff_date = pd.to_datetime("2025-05-15")
train_df = df[df['date'].dt.normalize() <= cutoff_date]

# Held-out fixture to predict: Borussia Dortmund vs Holstein Kiel on 17.05.2025.
test_date = pd.to_datetime("2025-05-17")
test_home_team = "Borussia Dortmund"
test_away_team = "Holstein Kiel"
test_label = f"{test_home_team} vs {test_away_team} on {test_date.strftime('%d.%m.%Y')}"
test_match = df[
    (df['date'].dt.date == test_date.date()) &
    (df['home_Team'] == test_home_team) &
    (df['away_Team'] == test_away_team)
]

if test_match.empty:
    # Fail with a clear error for this cell instead of calling exit(), which
    # would end the whole interpreter session.
    raise ValueError(f"Test match not found: {test_label}.")

# Select key features (use binned columns)
features = ['home_Strength', 'home_Form', 'away_Form', 'away_Strength', 'Result',
            'home_GD_binned', 'away_GD_binned', 'home_Rank_binned', 'away_Rank_binned']
train_data = train_df[features].copy()

# Define Bayesian Network structure: form, binned rank and binned goal
# difference feed each side's strength; both strengths feed the result.
model = DiscreteBayesianNetwork([
    ("home_Form", "home_Strength"),
    ("home_Rank_binned", "home_Strength"),
    ("home_GD_binned", "home_Strength"),

    ("away_Form", "away_Strength"),
    ("away_Rank_binned", "away_Strength"),
    ("away_GD_binned", "away_Strength"),

    ("home_Strength", "Result"),
    ("away_Strength", "Result")
])

# Fit the conditional probability tables with a Bayesian estimator using a
# BDeu prior (equivalent sample size 5) rather than plain maximum likelihood.
model.fit(train_data, estimator=BayesianEstimator, prior_type='BDeu', equivalent_sample_size=5)

# Create inference object
inference = VariableElimination(model)

# Build the evidence from the held-out match. In this structure `Result` has
# only the two strength nodes as parents, so once both strengths are observed
# the form values cannot change the predicted distribution.
test_instance = test_match.iloc[0]
evidence = {
    'home_Form': test_instance['home_Form'],
    'away_Form': test_instance['away_Form'],
    'home_Strength': test_instance['home_Strength'],
    'away_Strength': test_instance['away_Strength'],
}

# Query the model
result_prediction = inference.query(variables=['Result'], evidence=evidence)

# Show the result
print(f"Predicted probabilities for match outcome ({test_label}):")
print(result_prediction)
