In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import football as fb
import os
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the workbook's second worksheet (sheet_name is a zero-based position,
# so 1 selects sheet 2) and display it.
df = pd.read_excel(
    'Copy of Accuyield_Test_Package_Data_Scientist_V3(1).xlsx',
    sheet_name=1,
)
df


In [None]:
# Head-to-head fixtures: keep only the matches played between these two clubs,
# whichever of them was the home side.
team_1 = 'Arsenal'
team_2 = 'Sheffield United'

home_side, away_side = df['HomeTeam'], df['AwayTeam']
team_1_hosts = home_side.eq(team_1) & away_side.eq(team_2)
team_2_hosts = home_side.eq(team_2) & away_side.eq(team_1)
dfx = df[team_1_hosts | team_2_hosts]

# Show only the season, the two sides and the score columns for those fixtures.
dfx[['Season','HomeTeam', 'AwayTeam', 'HomeScore', 'AwayScore', 'TotalGoals']]

### Check team stats

In [None]:

# Summarise a single season: the last one in order of first appearance in
# `df`. Selecting it directly avoids re-filtering the whole frame once per
# season only to keep the final iteration's result.
seasons = df['Season'].unique()
if len(seasons) == 0:
    raise ValueError("df has no 'Season' values to summarise")
current_season = seasons[-1]
# .copy() gives later cells an independent frame, so adding or overwriting
# columns on season_df never writes into a slice of `df`.
season_df = df[df['Season'] == current_season].copy()
print(f'Season: {current_season}')

# Every club that appears as either the home or the away side in that season.
teams = pd.concat([season_df['HomeTeam'], season_df['AwayTeam']]).unique()

# team name -> dict of season totals (one DataFrame row per team below).
team_stats = {}
for team in teams:
    # Split the season into the fixtures this team hosted and the ones it visited.
    home_matches = season_df[season_df['HomeTeam'] == team]
    away_matches = season_df[season_df['AwayTeam'] == team]

    home_goals_scored = home_matches['HomeScore'].sum()
    away_goals_scored = away_matches['AwayScore'].sum()
    # Goals conceded are the opponent's score in each fixture.
    home_goals_conceded = home_matches['AwayScore'].sum()
    away_goals_conceded = away_matches['HomeScore'].sum()

    total_goals_scored = home_goals_scored + away_goals_scored
    total_goals_conceded = home_goals_conceded + away_goals_conceded

    team_stats[team] = {
        'MatchesPlayed': len(home_matches) + len(away_matches),
        'GoalsScored': total_goals_scored,
        'GoalsConceded': total_goals_conceded,
        'GoalDifference': total_goals_scored - total_goals_conceded,
        'HomeGoalsScored': home_goals_scored,
        'AwayGoalsScored': away_goals_scored,
        'HomeCornerKicks': home_matches['HomeCornerKicks'].sum(),
        'AwayCornerKicks': away_matches['AwayCornerKicks'].sum(),
        'HomeYellowCards': home_matches['HomeYellowCards'].sum(),
        'AwayYellowCards': away_matches['AwayYellowCards'].sum(),
        'HomeShots': home_matches['HomeShots'].sum(),
        'AwayShots': away_matches['AwayShots'].sum(),
    }

# One row per team (index = team name), one column per statistic.
team_stats_season_df = pd.DataFrame.from_dict(team_stats, orient='index')
team_stats_season_df

In [None]:
# Compare each team's home vs. away total for one statistic as grouped bars.
# `target` is the statistic suffix shared by the Home*/Away* column pair.
target = 'GoalsScored'
home_away_columns = [f'Home{target}', f'Away{target}']
team_stats_season_df[home_away_columns].plot.bar(figsize=(15, 6), title=f'{target} by Team')

### Derive Team Stats (up till the current date)

In [None]:
# Display the match DataFrame `df` as this cell's output.
df

In [14]:


# Parse the match dates and order the frame chronologically; the snapshots
# written below are cumulative in time.
df = df.copy()
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')
calendar_dates = df['Date'].unique()
teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

saving_path = 'relationship_matrix'
# CSV file-name prefix -> match column passed to fb.create_relationship_matrix.
matrix_metrics = {
    'possession_matrix': 'Possession',
    'fouls_matrix': 'Fouls',
    'successful_passes_matrix': 'SuccessfulPassesPct',
}

# The cache is checked per file rather than per directory, so a run that was
# interrupted half-way is completed on the next execution instead of being
# skipped because the folder already exists.
os.makedirs(saving_path, exist_ok=True)
files_written = 0
for current_date in calendar_dates:
    # Depending on the pandas version, unique() yields numpy.datetime64 values,
    # which have no .strftime(); pd.Timestamp accepts either form.
    date_str = pd.Timestamp(current_date).strftime('%Y-%m-%d')
    missing = {
        prefix: metric
        for prefix, metric in matrix_metrics.items()
        if not os.path.exists(os.path.join(saving_path, f'{prefix}_{date_str}.csv'))
    }
    if not missing:
        continue

    # The snapshot for a date INCLUDES the matches played on that date. When
    # forecasting a fixture, load the snapshot of an earlier date (at least one
    # day before) so the result being predicted is not part of the input.
    matches_up_to_date = df[df['Date'] <= current_date]
    for prefix, metric in missing.items():
        # NOTE(review): create_relationship_matrix is assumed to return a
        # DataFrame (it must support .fillna and .to_csv) — confirm in football.py.
        matrix = fb.create_relationship_matrix(matches_up_to_date, metric).fillna(0)
        matrix.to_csv(os.path.join(saving_path, f'{prefix}_{date_str}.csv'))
        files_written += 1

if files_written == 0:
    print(f'{saving_path} already exists')
else:
    print(f'wrote {files_written} matrix file(s) to {saving_path}')






    

In [None]:
# Parse the dates and sort chronologically. .assign() returns a new frame, so
# nothing is written into season_df while it may still be a filtered slice of
# `df` (in-place column assignment there can raise SettingWithCopyWarning).
season_df = (
    season_df
    .assign(Date=pd.to_datetime(season_df['Date']))
    .sort_values('Date')
)
calendar_dates = season_df['Date'].unique()
teams = pd.concat([season_df['HomeTeam'], season_df['AwayTeam']]).unique()
team_stats_daily = []

# Build one record per (match date, team) holding that team's running totals
# over every season match dated on or before that date.
# NOTE(review): the cut-off is inclusive, so the record for date D already
# contains the results of matches played on D. If these rows feed a model that
# predicts matches on D, use the previous date's record instead — confirm
# against the forecasting code.
for current_date in calendar_dates:
    matches_up_to_date = season_df[season_df['Date'] <= current_date]
    for team in teams:
        # Fixtures so far in which this team was the host / the visitor.
        home_matches = matches_up_to_date[matches_up_to_date['HomeTeam'] == team]
        away_matches = matches_up_to_date[matches_up_to_date['AwayTeam'] == team]

        home_goals_scored = home_matches['HomeScore'].sum()
        away_goals_scored = away_matches['AwayScore'].sum()
        # Goals conceded are the opponent's score in each fixture.
        home_goals_conceded = home_matches['AwayScore'].sum()
        away_goals_conceded = away_matches['HomeScore'].sum()

        total_goals_scored = home_goals_scored + away_goals_scored
        total_goals_conceded = home_goals_conceded + away_goals_conceded

        team_stats_daily.append({
            'Team': team,
            'Date': current_date,
            'MatchesPlayed': len(home_matches) + len(away_matches),
            'GoalsScored': total_goals_scored,
            'GoalsConceded': total_goals_conceded,
            'GoalDifference': total_goals_scored - total_goals_conceded,
            'HomeGoalsScored': home_goals_scored,
            'AwayGoalsScored': away_goals_scored,
            'HomeCornerKicks': home_matches['HomeCornerKicks'].sum(),
            'AwayCornerKicks': away_matches['AwayCornerKicks'].sum(),
            'HomeYellowCards': home_matches['HomeYellowCards'].sum(),
            'AwayYellowCards': away_matches['AwayYellowCards'].sum(),
            'HomeShots': home_matches['HomeShots'].sum(),
            'AwayShots': away_matches['AwayShots'].sum(),
        })

# Build the frame once from the list of records (no row-by-row concatenation).
team_stats_daily_df = pd.DataFrame(team_stats_daily)
team_stats_daily_df

### Add label encoding for team names

In [None]:
# Fit ONE encoder on the union of home and away team names. Calling
# fit_transform separately on each column refits the encoder each time, so the
# same club could receive a different integer in HomeTeamLE and AwayTeamLE
# whenever the two columns do not contain exactly the same set of names.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([df['HomeTeam'], df['AwayTeam']]))

df['HomeTeamLE'] = label_encoder.transform(df['HomeTeam'])
df['AwayTeamLE'] = label_encoder.transform(df['AwayTeam'])

# Combine the encoded team names with the PCA-transformed features. The columns
# selected here are the ones created above ('...LE'); selecting names that were
# never created raises KeyError.
# NOTE(review): relationship_matrix_pca_df is not defined in the cells visible
# here — it must be created by an earlier cell. pd.concat(axis=1) aligns rows
# on index labels, so confirm its index matches df's index.
X_combined = pd.concat([relationship_matrix_pca_df, df[['HomeTeamLE', 'AwayTeamLE']]], axis=1)


In [None]:
# trainingcol = ['HomeTeam', 'AwayTeam', '   N 