# European Big 5 Football League Match Results for 3 Years and its Analysis

This file presents match results from Europe's big five football leagues and an analysis of them. First, we use the preprocessed dataset from the previous step. The preprocessed files are separated by year, so we combine them by league. There are also player performance index files, based on rankings, for forwards and midfielders. These files are grouped by team; each team's value is the average performance index of its players.

Secondly, we find the top 10 teams in each league and draw a heatmap for them that represents the match results between them over 3 years. To get more meaningful information, we apply linear regression to understand the linear correlation between the difference in performance index and the match result. Finally, we draw a final heatmap for the top 10 teams. Let's get started.

# Combine Separated .csv Files by League

In [None]:
# Import pandas and os module to handle csv files and dataset
import pandas as pd
import os

# Directory path where CSV files are saved
directory_path = '/content/'

# Full path of every CSV file in the directory. os.listdir() returns names in
# arbitrary order, so sort them to keep the combined row order reproducible.
csv_files = sorted(
    os.path.join(directory_path, file)
    for file in os.listdir(directory_path)
    if file.endswith('.csv')
)


def _league_files(prefix):
    """Return the paths in csv_files whose file name starts with prefix.

    The test is made on the file name only, so it keeps working when
    directory_path is changed.
    """
    return [path for path in csv_files if os.path.basename(path).startswith(prefix)]


def _combine_csv_files(paths):
    """Read every CSV in paths and stack them into a single DataFrame.

    Returns an empty DataFrame when paths is empty, because pd.concat()
    raises ValueError when given nothing to concatenate.
    """
    if not paths:
        return pd.DataFrame()
    # One concat over all frames avoids re-copying the accumulated rows
    # for every file that is added.
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


# Sort files by league (the file-name prefix identifies the league)
pl_files = _league_files('pl')
laliga_files = _league_files('laliga')
seria_files = _league_files('seria')
bundes_files = _league_files('bundes')
ligue1_files = _league_files('ligue1')

# Read each league's CSV files into one DataFrame per league
pl_df = _combine_csv_files(pl_files)          # for PL
laliga_df = _combine_csv_files(laliga_files)  # for LaLiga
seria_df = _combine_csv_files(seria_files)    # for Serie A
bundes_df = _combine_csv_files(bundes_files)  # for Bundesliga
ligue1_df = _combine_csv_files(ligue1_files)  # for Ligue1

# Save DataFrame for each league in csv file to backup
pl_df.to_csv('n_pl_output.csv', index=False)
laliga_df.to_csv('n_laliga_output.csv', index=False)
seria_df.to_csv('n_seria_output.csv', index=False)
bundes_df.to_csv('n_bundes_output.csv', index=False)
ligue1_df.to_csv('n_ligue1_output.csv', index=False)

# Using Heatmap

In [None]:
# Import seaborn and matplotlib modules to draw the heatmap
import seaborn as sns
import matplotlib.pyplot as plt

# Function to create heatmap
def create_heatmap(team_results, title='Team Results Heatmap', center=0):
    """Draw a Team x Opponent heatmap of the 'Result' values.

    'Team' is used for the y axis, 'Opponent' for the x axis and 'Result'
    for the value of each cell. When a (Team, Opponent) pair occurs more
    than once, pivot_table aggregates the values with its default (mean).

    Args:
        team_results: DataFrame (or any data accepted by pd.DataFrame) that
            provides 'Team', 'Opponent' and 'Result'; other columns are
            ignored.
        title: Text shown above the plot. Pass the league name to tell the
            heatmaps of different leagues apart.
        center: Value mapped to the neutral middle of the 'coolwarm'
            colormap. With the default of 0, negative cells are blue and
            positive cells are red. Pass None to let seaborn pick the colour
            range from the data alone.
    """
    df = pd.DataFrame(team_results, columns=['Team', 'Opponent', 'Result'])
    df_pivot = df.pivot_table(index='Team', columns='Opponent', values='Result')
    plt.figure(figsize=(12, 8))
    sns.heatmap(df_pivot, annot=True, cmap='coolwarm', center=center,
                cbar=True, linewidths=.5)
    plt.title(title)
    plt.show()


# Finding Top 10 Teams

After finding the top 10 teams, we first create a heatmap of the results between them. Although this heatmap can show the correlation between teams, or which of the 10 teams is the strongest, we still need a more definite visualization of the relationship between them, so we adopt linear regression. The purpose of this regression is to understand the correlation between the difference in performance index and the match result. We start from the hypothesis that the two are proportional; in other words, the larger the difference in performance index, the larger the result.

Since the methodology is the same for every league, the detailed explanation of the implementation is given only in the Premier League section, as code comments.

# Premier League

First step: Find top 10 teams in PL

In [None]:
# Find the top 10 teams of the Premier League.
# pl_total maps each team (key) to the sum of its results over 3 years (value).
pl_total = {}

# Walk the match table one row at a time and keep a running sum per team.
for _, match in pl_df.iterrows():
    name = match['Team']
    points = match['Result']

    if name in pl_total:
        # team already seen: add this match to its running sum
        pl_total[name] += points
    else:
        # first time this team is seen: start its sum with this match
        pl_total[name] = points

# Every team that appears in the data, in order of first appearance.
# Membership can differ between seasons because of promotion & relegation.
pl_teams = list(pl_total)

# Turn the dictionary into a (Team, Total) table sorted from best to worst
pl_total_df = (
    pd.DataFrame(list(pl_total.items()), columns=['Team', 'Total'])
    .sort_values(by='Total', ascending=False)
)
# The first 10 rows are the top 10 teams
pl_top_10 = pl_total_df.head(10)
# Keep only their names for the membership tests in later cells
pl_top_10_teams = list(pl_top_10['Team'])

print(pl_top_10)

Step two: Extract matches between top 10 teams and generate initial heatmap

The initial heatmap shows the average of 'goals for - goals against' for each team. If the value is negative, or the block is colored blue, the team has performed poorly against that opponent; that is, the team's expected win probability against that opponent is low.

In [None]:
# Collect every match in which both sides belong to the top 10.
pl_top_10_match = []
for _, match in pl_df.iterrows():
    side = match['Team']
    rival = match['Opponent']
    outcome = match['Result']
    # skip the match as soon as one of the two sides is outside the top 10
    if side not in pl_top_10_teams or rival not in pl_top_10_teams:
        continue
    # one dictionary per kept match
    pl_top_10_match.append({'Team': side, 'Opponent': rival, 'Result': outcome})

# Build a table from the collected dictionaries
pl_top_10_match_df = pd.DataFrame(pl_top_10_match)
print(pl_top_10_match_df)

# Draw the initial heatmap of the top-10 matches
create_heatmap(pl_top_10_match_df)

For more reliable statistics, we apply linear regression based on each team's performance index. We provide 'FW.csv' and 'MF.csv', which contain each team's average performance index. Simply sum them to get the team's FW+MF average performance index.

In [None]:
# Read the performance index files (forwards and midfielders)
fw_df = pd.read_csv('FW.csv')
mf_df = pd.read_csv('MF.csv')

# To handle both tables as one, stack the MF rows under the FW rows.
# fw_df is reused by the cells of the other leagues.
fw_df = pd.concat([fw_df, mf_df], ignore_index=True)

# Save in a new file to back up the data
fw_df.to_csv('n_fw_df.csv', index=False)

# Abbreviated team name used in the performance index files -> full team name
# used in the match results (and therefore in pl_teams).
PL_INDEX_NAME_TO_MATCH_NAME = {
    'Brighton': 'Brighton & Hove Albion',
    'Manchester Utd': 'Manchester United',
    'Newcastle Utd': 'Newcastle United',
    'Wolves': 'Wolverhampton Wanderers',
    'Tottenham': 'Tottenham Hotspur',
    'West Ham': 'West Ham United',
    'Sheffield': 'Sheffield United',
}

# pl_teams is only read here (it is not rewritten with the abbreviations),
# so it keeps matching the names stored in pl_df.
pl_team_names = set(pl_teams)

# pl_scores maps a team name, spelled as in the match results, to the sum of
# its 'Total_score' over all of its rows in fw_df (FW rows + MF rows).
pl_scores = {}
for row in fw_df.itertuples(index=False):
    # Translate the abbreviation BEFORE looking the team up, so the FW row
    # and the MF row of one club land on the same key and are summed.
    team = PL_INDEX_NAME_TO_MATCH_NAME.get(row.Team, row.Team)
    if team not in pl_team_names:
        # fall back to the untranslated name in case the match results
        # themselves use the short spelling
        team = row.Team
    if team not in pl_team_names:
        continue  # not a Premier League team
    pl_scores[team] = pl_scores.get(team, 0) + row.Total_score

print(pl_scores)

In [None]:
# Add a 'Difference' attribute to every match of the original dataframe:
# the team's performance index minus the opponent's.
# A team without a performance index is given a score of 0; dict.get() does
# this without a bare `except`, which would also hide unrelated errors.
pl_diff = [
    {
        'Team': row.Team,
        'Opponent': row.Opponent,
        'Result': row.Result,
        'Difference': pl_scores.get(row.Team, 0) - pl_scores.get(row.Opponent, 0),
    }
    for row in pl_df.itertuples(index=False)
]

# Convert the list of dictionaries to a dataframe. Naming the columns keeps
# them present even when there are no matches.
pl_diff_df = pd.DataFrame(pl_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(pl_diff_df)

# Adopting Linear Regression

For more reliable and accurate prediction, we decided to adopt linear regression. Starting from the initial hypothesis that there may be a proportional relationship between average performance index and result, applying a linear regression model lets us analyze and interpret the results based only on each team's average performance index.

In [None]:
# import modules to use linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# x axis: difference of average performance index, as a single-column 2-D array
X = pl_diff_df['Difference'].to_numpy().reshape(-1, 1)
# y axis: result of the match
y = pl_diff_df['Result'].to_numpy()

# Fit the linear regression model on x and y, then predict on the same x
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

print(f'Slope: {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')

# Scatter of the actual results with the fitted line drawn over it
plt.scatter(X, y, color='blue', label='Actual Results')
plt.plot(X, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('Difference')
plt.ylabel('Result')
plt.title('Linear Regression: Difference vs Result')
plt.legend()
plt.show()

# Quality of the fit on the data it was trained on
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2: {r2}')

There is a proportional relationship between the difference in performance index and the result (reliability = 14.9%). We can now generate the heatmap again and analyze it.

In [None]:
# Keep only the matches played between two top-10 teams.
pl_top_10_names = set(pl_top_10_teams)
pl_top_10_diff = [
    {'Team': row.Team, 'Opponent': row.Opponent,
     'Result': row.Result, 'Difference': row.Difference}
    for row in pl_diff_df.itertuples(index=False)
    if row.Team in pl_top_10_names and row.Opponent in pl_top_10_names
]
pl_top_10_diff_df = pd.DataFrame(
    pl_top_10_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])

# Replace each actual result by the value the regression line predicts for
# the match's 'Difference'. The coefficients are read from `model`, which
# must still be the regression fitted on pl_diff_df in the previous cell;
# this keeps them in step with the data instead of relying on numbers
# copied by hand from an earlier run.
pl_slope = model.coef_[0]
pl_intercept = model.intercept_
pl_regression = [
    dict(match, Result=match['Difference'] * pl_slope + pl_intercept)
    for match in pl_top_10_diff
]
pl_regression_df = pd.DataFrame(
    pl_regression, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(pl_regression_df)

# generate heatmap
create_heatmap(pl_regression_df)

How can we interpret the final heatmap? The value inside each block represents the goal difference expected against the opponent according to the regression model. For example, Man City is expected to beat Arsenal by 0.64 goals per match, based on the players' performance. We can draw several insights from this heatmap.

# LaLiga

In [None]:
# Alternate spellings of a team name -> the single spelling used from here on.
LALIGA_NAME_ALIASES = {'Almería': 'Almeria'}


def _laliga_name(name):
    """Return the canonical spelling of a LaLiga team name."""
    return LALIGA_NAME_ALIASES.get(name, name)


# Sum every team's results over the 3 years, under its canonical name.
laliga_total = {}  # Team (key) -> sum of 'Result' (value)
for row in laliga_df.itertuples(index=False):
    team = _laliga_name(row.Team)
    laliga_total[team] = laliga_total.get(team, 0) + row.Result
# Every team seen in the data, in order of first appearance
laliga_teams = list(laliga_total)

laliga_total_df = pd.DataFrame(list(laliga_total.items()), columns=['Team', 'Total'])
laliga_total_df = laliga_total_df.sort_values(by='Total', ascending=False)
laliga_top_10 = laliga_total_df.head(10)
laliga_top_10_teams = laliga_top_10['Team'].tolist()

print(laliga_top_10)

# Matches between two top-10 teams. Team AND Opponent are canonicalised, so a
# match is not dropped just because one side is written with the other spelling.
laliga_top_10_names = set(laliga_top_10_teams)
laliga_top_10_match = []
for row in laliga_df.itertuples(index=False):
    team = _laliga_name(row.Team)
    opponent = _laliga_name(row.Opponent)
    if team in laliga_top_10_names and opponent in laliga_top_10_names:
        laliga_top_10_match.append({'Team': team, 'Opponent': opponent, 'Result': row.Result})

laliga_top_10_match_df = pd.DataFrame(laliga_top_10_match)
print(laliga_top_10_match_df)
laliga_top_10_match_df.to_csv('llg.csv', index=False)

create_heatmap(laliga_top_10_match_df)

In [None]:
# Alternate spellings of a team name -> the single spelling used as score key.
LALIGA_NAME_ALIASES = {'Almería': 'Almeria'}


def _laliga_name(name):
    """Return the canonical spelling of a LaLiga team name."""
    return LALIGA_NAME_ALIASES.get(name, name)


# Sum the FW and MF performance index of every LaLiga team.
laliga_team_names = {_laliga_name(name) for name in laliga_teams}
laliga_scores = {}  # canonical team name -> summed 'Total_score'
for row in fw_df.itertuples(index=False):
    team = _laliga_name(row.Team)
    if team in laliga_team_names:
        laliga_scores[team] = laliga_scores.get(team, 0) + row.Total_score

print(laliga_scores)

# Per match: team score minus opponent score. The lookup goes through the
# canonical name because laliga_scores is keyed by it; a team without a
# performance index counts as 0.
laliga_diff = []
for row in laliga_df.itertuples(index=False):
    t_score = laliga_scores.get(_laliga_name(row.Team), 0)
    o_score = laliga_scores.get(_laliga_name(row.Opponent), 0)
    laliga_diff.append({'Team': row.Team, 'Opponent': row.Opponent,
                        'Result': row.Result, 'Difference': t_score - o_score})

laliga_diff_df = pd.DataFrame(
    laliga_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(laliga_diff_df)

# Linear regression of the match result on the score difference
X = laliga_diff_df['Difference'].values.reshape(-1, 1)
y = laliga_diff_df['Result'].values

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

print(f'Slope: {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')

plt.scatter(X, y, color='blue', label='Actual Results')
plt.plot(X, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('Difference')
plt.ylabel('Result')
plt.title('Linear Regression: Difference vs Result')
plt.legend()
plt.show()

mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2: {r2}')

In [None]:
# Alternate spellings of a team name -> the single spelling used in the heatmap.
LALIGA_NAME_ALIASES = {'Almería': 'Almeria'}


def _laliga_name(name):
    """Return the canonical spelling of a LaLiga team name."""
    return LALIGA_NAME_ALIASES.get(name, name)


# Matches between two top-10 teams, with Team and Opponent both canonicalised.
laliga_top_10_names = {_laliga_name(name) for name in laliga_top_10_teams}
laliga_top_10_diff = []
for row in laliga_diff_df.itertuples(index=False):
    team = _laliga_name(row.Team)
    opponent = _laliga_name(row.Opponent)
    if team in laliga_top_10_names and opponent in laliga_top_10_names:
        laliga_top_10_diff.append({'Team': team, 'Opponent': opponent,
                                   'Result': row.Result, 'Difference': row.Difference})
laliga_top_10_diff_df = pd.DataFrame(
    laliga_top_10_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])

# Replace each actual result by the value the regression line predicts for
# the match's 'Difference'. The coefficients are read from `model`, which
# must still be the regression fitted on laliga_diff_df in the previous cell.
laliga_slope = model.coef_[0]
laliga_intercept = model.intercept_
laliga_regression = [
    dict(match, Result=match['Difference'] * laliga_slope + laliga_intercept)
    for match in laliga_top_10_diff
]
laliga_regression_df = pd.DataFrame(
    laliga_regression, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(laliga_regression_df)
create_heatmap(laliga_regression_df)

# Serie A

In [None]:
# Running sum of results per Serie A team over the 3 years
seria_total = {}
for _, match in seria_df.iterrows():
    name = match['Team']
    points = match['Result']
    if name in seria_total:
        seria_total[name] += points
    else:
        seria_total[name] = points
# Every team seen in the data, in order of first appearance
seria_teams = list(seria_total)

# (Team, Total) table sorted from best to worst; its first 10 rows are the top 10
seria_total_df = (
    pd.DataFrame(list(seria_total.items()), columns=['Team', 'Total'])
    .sort_values(by='Total', ascending=False)
)
seria_top_10 = seria_total_df.head(10)
seria_top_10_teams = list(seria_top_10['Team'])

print(seria_top_10)

# Keep the matches in which both sides belong to the top 10
seria_top_10_match = []
for _, match in seria_df.iterrows():
    side = match['Team']
    rival = match['Opponent']
    outcome = match['Result']
    if side not in seria_top_10_teams or rival not in seria_top_10_teams:
        continue
    seria_top_10_match.append({'Team': side, 'Opponent': rival, 'Result': outcome})

seria_top_10_match_df = pd.DataFrame(seria_top_10_match)
print(seria_top_10_match_df)

create_heatmap(seria_top_10_match_df)

In [None]:
# Sum the FW and MF performance index of every Serie A team.
seria_scores = {}  # team name -> summed 'Total_score'
print(seria_teams)
seria_team_names = set(seria_teams)
for row in fw_df.itertuples(index=False):
    if row.Team in seria_team_names:
        seria_scores[row.Team] = seria_scores.get(row.Team, 0) + row.Total_score

print(seria_scores)

# Per match: team score minus opponent score. A team without a performance
# index counts as 0; dict.get() does this without a bare `except`, which
# would also hide unrelated errors.
seria_diff = [
    {
        'Team': row.Team,
        'Opponent': row.Opponent,
        'Result': row.Result,
        'Difference': seria_scores.get(row.Team, 0) - seria_scores.get(row.Opponent, 0),
    }
    for row in seria_df.itertuples(index=False)
]

seria_diff_df = pd.DataFrame(
    seria_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(seria_diff_df)

# Linear regression of the match result on the score difference
X = seria_diff_df['Difference'].values.reshape(-1, 1)
y = seria_diff_df['Result'].values

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

print(f'Slope: {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')

plt.scatter(X, y, color='blue', label='Actual Results')
plt.plot(X, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('Difference')
plt.ylabel('Result')
plt.title('Linear Regression: Difference vs Result')
plt.legend()
plt.show()

mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2: {r2}')

In [None]:
# Keep the matches in which both sides belong to the Serie A top 10
seria_top_10_diff = []
for _, match in seria_diff_df.iterrows():
    side = match['Team']
    rival = match['Opponent']
    outcome = match['Result']
    gap = match['Difference']
    if side not in seria_top_10_teams or rival not in seria_top_10_teams:
        continue
    seria_top_10_diff.append({'Team': side, 'Opponent': rival, 'Result': outcome, 'Difference': gap})
seria_top_10_diff_df = pd.DataFrame(seria_top_10_diff)

# Unlike the other leagues, the regression is fitted again here, this time
# on the top-10 matches only.
X = seria_top_10_diff_df['Difference'].to_numpy().reshape(-1, 1)
y = seria_top_10_diff_df['Result'].to_numpy()

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

print(f'회귀 계수 (Slope): {model.coef_[0]}')
print(f'절편 (Intercept): {model.intercept_}')

plt.scatter(X, y, color='blue', label='Actual Results')
plt.plot(X, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('Difference')
plt.ylabel('Result')
plt.title('Linear Regression: Difference vs Result')
plt.legend()
plt.show()

mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)
print(f'평균 제곱 오차 (MSE): {mse}')
print(f'결정 계수 (R^2): {r2}')

# Replace each actual result by Difference times a fixed slope.
# NOTE(review): the slope is a literal and no intercept is added, unlike the
# other leagues; presumably it was copied from a printed fit - confirm which
# fit it belongs to and whether the intercept was dropped on purpose.
seria_regression = [
    {
        'Team': match['Team'],
        'Opponent': match['Opponent'],
        'Result': match['Difference'] * -0.03496405196203219,
        'Difference': match['Difference'],
    }
    for _, match in seria_top_10_diff_df.iterrows()
]
seria_regression_df = pd.DataFrame(seria_regression)
print(seria_regression_df)
create_heatmap(seria_regression_df)

# Bundesliga

In [None]:
# Alternate spellings of a team name -> the single spelling used from here on.
BUNDES_NAME_ALIASES = {
    '1. FC Union Berlin': 'Union Berlin',
    '1. Union Berlin': 'Union Berlin',
}


def _bundes_name(name):
    """Return the canonical spelling of a Bundesliga team name."""
    return BUNDES_NAME_ALIASES.get(name, name)


# Sum every team's results over the 3 years, under its canonical name, so a
# club written in two ways is not split into two totals.
bundes_total = {}  # Team (key) -> sum of 'Result' (value)
for row in bundes_df.itertuples(index=False):
    team = _bundes_name(row.Team)
    bundes_total[team] = bundes_total.get(team, 0) + row.Result
# Every team seen in the data, in order of first appearance
bundes_teams = list(bundes_total)

bundes_total_df = pd.DataFrame(list(bundes_total.items()), columns=['Team', 'Total'])
bundes_total_df = bundes_total_df.sort_values(by='Total', ascending=False)
bundes_top_10 = bundes_total_df.head(10)
bundes_top_10_teams = bundes_top_10['Team'].tolist()
print(bundes_top_10)

# Matches between two top-10 teams, with Team AND Opponent canonicalised.
bundes_top_10_names = set(bundes_top_10_teams)
bundes_top_10_match = []
for row in bundes_df.itertuples(index=False):
    team = _bundes_name(row.Team)
    opponent = _bundes_name(row.Opponent)
    if team in bundes_top_10_names and opponent in bundes_top_10_names:
        bundes_top_10_match.append({'Team': team, 'Opponent': opponent, 'Result': row.Result})

bundes_top_10_match_df = pd.DataFrame(bundes_top_10_match)
print(bundes_top_10_match_df)
bundes_top_10_match_df.to_csv('bnd.csv', index=False)

create_heatmap(bundes_top_10_match_df)

In [None]:
# Alternate spellings of a team name -> the single spelling used as score key.
BUNDES_NAME_ALIASES = {
    '1. FC Union Berlin': 'Union Berlin',
    '1. Union Berlin': 'Union Berlin',
}

# Team name used in the performance index files -> name used in the match results.
BUNDES_INDEX_NAME_TO_MATCH_NAME = {
    'Augsburg': 'FC Augsburg',
    'Dortmund': 'Borussia Dortmund',
    'Eintracht': 'Eintracht Frankfurt',
    'Freiburg': 'SC Freiburg',
    'Hoffenheim': 'TSG Hoffenheim',
    'Leverkusen': 'Bayer Leverkusen',
    'Mainz 04': 'Mainz',
    'Stuttgart': 'VfB Stuttgart',
    'Wolfsburg': 'VfL Wolfsburg',
    'Bochum': 'VfL Bochum',
    'Cologne': 'FC Cologne',
    'Monchengladbach': 'Borussia Monchengladbach',
    'Darmstadt': 'SV Darmstadt 98',
    '1. Union Berlin': 'Union Berlin',
}


def _bundes_name(name):
    """Return the canonical spelling of a Bundesliga team name."""
    return BUNDES_NAME_ALIASES.get(name, name)


# Sum the FW and MF performance index of every Bundesliga team.
bundes_scores = {}  # canonical team name -> summed 'Total_score'
print(bundes_teams)
bundes_team_names = {_bundes_name(name) for name in bundes_teams}
for row in fw_df.itertuples(index=False):
    team = _bundes_name(BUNDES_INDEX_NAME_TO_MATCH_NAME.get(row.Team, row.Team))
    if team in bundes_team_names:
        bundes_scores[team] = bundes_scores.get(team, 0) + row.Total_score

print(bundes_scores)

# Per match: team score minus opponent score. The lookup goes through the
# canonical name because bundes_scores is keyed by it; a team without a
# performance index counts as 0.
bundes_diff = []
for row in bundes_df.itertuples(index=False):
    t_score = bundes_scores.get(_bundes_name(row.Team), 0)
    o_score = bundes_scores.get(_bundes_name(row.Opponent), 0)
    bundes_diff.append({'Team': row.Team, 'Opponent': row.Opponent,
                        'Result': row.Result, 'Difference': t_score - o_score})

bundes_diff_df = pd.DataFrame(
    bundes_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(bundes_diff_df)

# Linear regression of the match result on the score difference
X = bundes_diff_df['Difference'].values.reshape(-1, 1)
y = bundes_diff_df['Result'].values

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

print(f'Slope: {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')

plt.scatter(X, y, color='blue', label='Actual Results')
plt.plot(X, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('Difference')
plt.ylabel('Result')
plt.title('Linear Regression: Difference vs Result')
plt.legend()
plt.show()

mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f'Mean Sqaured Error: {mse}')
print(f'R^2: {r2}')

In [None]:
# Alternate spellings of a team name -> the single spelling used in the heatmap.
BUNDES_NAME_ALIASES = {
    '1. FC Union Berlin': 'Union Berlin',
    '1. Union Berlin': 'Union Berlin',
}


def _bundes_name(name):
    """Return the canonical spelling of a Bundesliga team name."""
    return BUNDES_NAME_ALIASES.get(name, name)


# Matches between two top-10 teams, with Team and Opponent both canonicalised.
bundes_top_10_names = {_bundes_name(name) for name in bundes_top_10_teams}
bundes_top_10_diff = []
for row in bundes_diff_df.itertuples(index=False):
    team = _bundes_name(row.Team)
    opponent = _bundes_name(row.Opponent)
    if team in bundes_top_10_names and opponent in bundes_top_10_names:
        bundes_top_10_diff.append({'Team': team, 'Opponent': opponent,
                                   'Result': row.Result, 'Difference': row.Difference})
bundes_top_10_diff_df = pd.DataFrame(
    bundes_top_10_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])

# Replace each actual result by the value the regression line predicts for
# the match's 'Difference'. The coefficients are read from `model`, which
# must still be the regression fitted on bundes_diff_df in the previous cell.
bundes_slope = model.coef_[0]
bundes_intercept = model.intercept_
bundes_regression = [
    dict(match, Result=match['Difference'] * bundes_slope + bundes_intercept)
    for match in bundes_top_10_diff
]
bundes_regression_df = pd.DataFrame(
    bundes_regression, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(bundes_regression_df)
create_heatmap(bundes_regression_df)

# Ligue 1

In [None]:
# Alternate spellings of a team name -> the single spelling used from here on.
LIGUE1_NAME_ALIASES = {'Monaco': 'AS Monaco'}


def _ligue1_name(name):
    """Return the canonical spelling of a Ligue 1 team name."""
    return LIGUE1_NAME_ALIASES.get(name, name)


# Sum every team's results over the 3 years, under its canonical name, so a
# club written in two ways is not split into two totals.
ligue1_total = {}  # Team (key) -> sum of 'Result' (value)
for row in ligue1_df.itertuples(index=False):
    team = _ligue1_name(row.Team)
    ligue1_total[team] = ligue1_total.get(team, 0) + row.Result
# Every team seen in the data, in order of first appearance
ligue1_teams = list(ligue1_total)

ligue1_total_df = pd.DataFrame(list(ligue1_total.items()), columns=['Team', 'Total'])
ligue1_total_df = ligue1_total_df.sort_values(by='Total', ascending=False)
ligue1_top_10 = ligue1_total_df.head(10)
ligue1_top_10_teams = ligue1_top_10['Team'].tolist()

print(ligue1_top_10)

# Matches between two top-10 teams, with Team AND Opponent canonicalised.
ligue1_top_10_names = set(ligue1_top_10_teams)
ligue1_top_10_match = []
for row in ligue1_df.itertuples(index=False):
    team = _ligue1_name(row.Team)
    opponent = _ligue1_name(row.Opponent)
    if team in ligue1_top_10_names and opponent in ligue1_top_10_names:
        ligue1_top_10_match.append({'Team': team, 'Opponent': opponent, 'Result': row.Result})

ligue1_top_10_match_df = pd.DataFrame(ligue1_top_10_match)
print(ligue1_top_10_match_df)

create_heatmap(ligue1_top_10_match_df)

In [None]:
# Alternate spellings of a team name -> the single spelling used as score key.
LIGUE1_NAME_ALIASES = {'Monaco': 'AS Monaco'}

# Team name used in the performance index files -> name used in the match results.
LIGUE1_INDEX_NAME_TO_MATCH_NAME = {
    'Monaco': 'AS Monaco',
    'Paris S-G': 'Paris Saint-Germain',
    'Montpellier HSC': 'Montpellier',
}


def _ligue1_name(name):
    """Return the canonical spelling of a Ligue 1 team name."""
    return LIGUE1_NAME_ALIASES.get(name, name)


# Sum the FW and MF performance index of every Ligue 1 team.
ligue1_scores = {}  # canonical team name -> summed 'Total_score'
print(ligue1_teams)
ligue1_team_names = {_ligue1_name(name) for name in ligue1_teams}
for row in fw_df.itertuples(index=False):
    team = _ligue1_name(LIGUE1_INDEX_NAME_TO_MATCH_NAME.get(row.Team, row.Team))
    if team in ligue1_team_names:
        ligue1_scores[team] = ligue1_scores.get(team, 0) + row.Total_score

print(ligue1_scores)

# Per match: team score minus opponent score. The lookup goes through the
# canonical name because ligue1_scores is keyed by it; a team without a
# performance index counts as 0.
ligue1_diff = []
for row in ligue1_df.itertuples(index=False):
    t_score = ligue1_scores.get(_ligue1_name(row.Team), 0)
    o_score = ligue1_scores.get(_ligue1_name(row.Opponent), 0)
    ligue1_diff.append({'Team': row.Team, 'Opponent': row.Opponent,
                        'Result': row.Result, 'Difference': t_score - o_score})

ligue1_diff_df = pd.DataFrame(
    ligue1_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(ligue1_diff_df)

# Linear regression of the match result on the score difference
X = ligue1_diff_df['Difference'].values.reshape(-1, 1)
y = ligue1_diff_df['Result'].values

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

print(f'Slope: {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')

plt.scatter(X, y, color='blue', label='Actual Results')
plt.plot(X, y_pred, color='red', linewidth=2, label='Regression Line')
plt.xlabel('Difference')
plt.ylabel('Result')
plt.title('Linear Regression: Difference vs Result')
plt.legend()
plt.show()

mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f'Mean Sqaured Error: {mse}')
print(f'R^2: {r2}')

In [None]:
# Alternate spellings of a team name -> the single spelling used in the heatmap.
LIGUE1_NAME_ALIASES = {'Monaco': 'AS Monaco'}


def _ligue1_name(name):
    """Return the canonical spelling of a Ligue 1 team name."""
    return LIGUE1_NAME_ALIASES.get(name, name)


# Matches between two top-10 teams, with Team and Opponent both canonicalised.
ligue1_top_10_names = {_ligue1_name(name) for name in ligue1_top_10_teams}
ligue1_top_10_diff = []
for row in ligue1_diff_df.itertuples(index=False):
    team = _ligue1_name(row.Team)
    opponent = _ligue1_name(row.Opponent)
    if team in ligue1_top_10_names and opponent in ligue1_top_10_names:
        ligue1_top_10_diff.append({'Team': team, 'Opponent': opponent,
                                   'Result': row.Result, 'Difference': row.Difference})
ligue1_top_10_diff_df = pd.DataFrame(
    ligue1_top_10_diff, columns=['Team', 'Opponent', 'Result', 'Difference'])

# Replace each actual result by the value the regression line predicts for
# the match's 'Difference'. The coefficients are read from `model`, which
# must still be the regression fitted on ligue1_diff_df in the previous cell.
ligue1_slope = model.coef_[0]
ligue1_intercept = model.intercept_
ligue1_regression = [
    dict(match, Result=match['Difference'] * ligue1_slope + ligue1_intercept)
    for match in ligue1_top_10_diff
]
ligue1_regression_df = pd.DataFrame(
    ligue1_regression, columns=['Team', 'Opponent', 'Result', 'Difference'])
print(ligue1_regression_df)
create_heatmap(ligue1_regression_df)