In [None]:
import json

import pandas as pd

# Load Serie A and Premier League data from JSON
serie_a_path = './football.json-master/2023-24/it.1.json'
premier_league_path = './football.json-master/2023-24/en.1.json'


def load_league_matches(path, league):
    """Load one season file into a DataFrame with one row per match.

    Assumes the openfootball ``football.json`` layout -- TODO confirm against
    the downloaded files::

        {"name": ..., "matches": [{"team1": ..., "team2": ...,
                                    "score": {"ft": [goals1, goals2]}}, ...]}

    With that layout ``pd.read_json(path)`` would keep every match as a nested
    dict inside a single ``matches`` column, so the flat ``ft_team1`` /
    ``ft_team2`` columns the later cells index would not exist. The file is
    therefore parsed explicitly and flattened here.

    Parameters
    ----------
    path : str
        Location of the season JSON file.
    league : str
        Label stored in the ``league`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Every top-level key of each match except ``score``, plus numeric
        ``ft_team1`` / ``ft_team2`` (NaN when the match has no full-time
        score) and ``league``.

    Raises
    ------
    KeyError
        If the file has no top-level ``matches`` key.
    """
    with open(path, encoding='utf-8') as season_file:
        season = json.load(season_file)

    rows = []
    for match in season['matches']:
        # A match without a (full-time) score keeps both goal counts missing.
        goals1, goals2 = (match.get('score') or {}).get('ft') or (None, None)
        row = {key: value for key, value in match.items() if key != 'score'}
        row['ft_team1'] = goals1
        row['ft_team2'] = goals2
        rows.append(row)

    matches = pd.DataFrame(rows)
    # Force a numeric dtype even if every score in the file is missing, so the
    # later `>` comparison never runs on an object column of None values.
    for column in ('ft_team1', 'ft_team2'):
        matches[column] = pd.to_numeric(matches[column])

    # Add league column to distinguish between Serie A and Premier League
    matches['league'] = league
    return matches


serie_a = load_league_matches(serie_a_path, 'Serie A')
premier_league = load_league_matches(premier_league_path, 'Premier League')

# Combine both datasets
data = pd.concat([serie_a, premier_league], ignore_index=True)


In [None]:
# The steps below need these columns: 'ft_team1' / 'ft_team2' (full-time scores,
# team1 being the home team) and 'league'. Verify them up front so a mismatch
# with the loaded data is reported with the list of columns actually available.
required_columns = ['ft_team1', 'ft_team2', 'league']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"`data` is missing expected column(s) {missing_columns}; "
        f"available columns: {list(data.columns)}"
    )

# Create a 'home_win' column: 1 if home team wins (ft_team1 > ft_team2), else 0
data['home_win'] = (data['ft_team1'] > data['ft_team2']).astype(int)

# Comparing a missing score yields False, so a match without a result would be
# counted as "home team did not win". Keep only rows where both scores exist.
has_result = data[['ft_team1', 'ft_team2']].notna().all(axis=1)

# Filter data for Serie A and Premier League (matches with a result only)
serie_a_data = data[has_result & (data['league'] == 'Serie A')]
premier_league_data = data[has_result & (data['league'] == 'Premier League')]


In [None]:
# Home-win probability per league: the mean of the 0/1 'home_win' flag is the
# share of that league's matches won by the home team.
serie_a_home_win_prob = serie_a_data.loc[:, 'home_win'].mean()
premier_league_home_win_prob = premier_league_data.loc[:, 'home_win'].mean()

# Positive difference => home teams win more often in Serie A
prob_diff = serie_a_home_win_prob - premier_league_home_win_prob

# Report all three figures, one per line, rounded to two decimals
print(
    f"Serie A Home Win Probability: {serie_a_home_win_prob:.2f}",
    f"Premier League Home Win Probability: {premier_league_home_win_prob:.2f}",
    f"Difference in Home Win Probability: {prob_diff:.2f}",
    sep="\n",
)


In [None]:
from scipy.stats import ttest_ind

# Welch's t-test (equal_var=False: no equal-variance assumption) comparing the
# 0/1 home-win flags of the two leagues.
t_stat, p_value = ttest_ind(
    serie_a_data['home_win'],
    premier_league_data['home_win'],
    equal_var=False,
)

print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_value:.4f}")

# Conventional 5% threshold: below it the difference is called significant
significance_message = (
    "The difference in home win probability is statistically significant."
    if p_value < 0.05
    else "The difference in home win probability is not statistically significant."
)
print(significance_message)
