# Imports

In [28]:
import pandas as pd # pip install pandas
from difflib import SequenceMatcher
from dateutil.parser import parse

# Helper Functions

In [29]:
def levenshtein_distance(s1, s2):
    return SequenceMatcher(None, s1, s2).ratio()

def convert_date_column(df, column_name):
    df[column_name] = df[column_name].apply(lambda x: parse(x, dayfirst=True))
    df[column_name] = df[column_name].dt.strftime('%d-%m-%Y')
    df[column_name] = pd.to_datetime(df[column_name], format='%d-%m-%Y')

# Load Data

Football Matches of Spanish League
https://www.kaggle.com/datasets/ricardomoya/football-matches-of-spanish-league

Premier League Matches 1993-2023
https://www.kaggle.com/datasets/evangower/premier-league-matches-19922022

Turkish Super League Matches (1959-2021)
https://www.kaggle.com/datasets/faruky/turkish-super-league-matches-19592020

France Football from 1950 to 2022
https://www.kaggle.com/datasets/simonherv/france-football-from-1950-to-2022

—
Merge Both

Football | Bundesliga Seasons 2005/06 - 2022/23
https://www.kaggle.com/datasets/oles04/bundesliga-seasons

Bundesliga Results 1993-2018
https://www.kaggle.com/datasets/thefc17/bundesliga-results-19932018

## Import from Github

In [30]:
Bundesliga_1993 = pd.read_csv('https://raw.githubusercontent.com/yasser-sulaiman/info-Int/main/data/Bundesliga%20Results%201993-2018.csv')
Bundesliga_2005 = pd.read_csv('https://raw.githubusercontent.com/yasser-sulaiman/info-Int/main/data/Football%20%7C%20Bundesliga%20Seasons%202005%3A06%20-%202022%3A23.csv')
France_League = pd.read_csv('https://raw.githubusercontent.com/yasser-sulaiman/info-Int/main/data/France%20Football%20from%201950%20to%202022.csv')
Turkish_League = pd.read_csv('https://raw.githubusercontent.com/yasser-sulaiman/info-Int/main/data/Turkish%20Super%20League%20Matches%20(1959-2021).csv')
Premier_League = pd.read_csv('https://raw.githubusercontent.com/yasser-sulaiman/info-Int/main/data/Premier%20League%20Matches%201993-2023.csv')
Spanish_League = pd.read_csv('https://raw.githubusercontent.com/yasser-sulaiman/info-Int/main/data/Spanish_League%201970-2017.csv')
Spanish_League_1995 = pd.read_csv('https://raw.githubusercontent.com/yasser-sulaiman/info-Int/main/data/LaLiga_1995-2023.csv')

## Import from Local Directory

In [31]:
#Spanish_League = pd.read_csv('Spanish_League 1970-2017.csv')
#Premier_League = pd.read_csv('Premier League Matches 1993-2023.csv')
#Turkish_League = pd.read_csv('Turkish Super League Matches (1959-2021).csv')
#France_League = pd.read_csv('France Football from 1950 to 2022.csv')
##Bundesliga_2005 = pd.read_csv('Football | Bundesliga Seasons 2005/06 - 2022/23.csv')
#Bundesliga_1993 = pd.read_csv('Bundesliga Results 1993-2018.csv')

# Global Schema

* Our goal is to define a global schema for all datasets
* We instigate the data and drop the unused columns
* We apply some standardization and cleaning on the individual data sources if needed
* We rename the columns to match the names in our suggested global schema
* Suggested Global schema: ["Date", "HomeTeam", "AwayTeam", "HomeGoals", "AwayGoals", "Winner", "League", "Year"]
* Finally we apply some standardization and cleaning on the integrated data

# Prepare Sources for Merging

## Bundesliga_1993

In [32]:
Bundesliga_1993.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'Season'],
      dtype='object')

In [33]:
# Dropping unused columns ('Div', 'Season', 'HTHG', 'HTAG', 'HTR') from the 'Bundesliga_1993' DataFrame
Bundesliga_1993.drop(columns=['Div', 'Season', 'HTHG', 'HTAG', 'HTR'], axis=1, inplace=True)

# Renaming columns to match global schema
Bundesliga_1993.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals', 'FTR': 'Winner'}, inplace=True)

# Adding a 'League' column with the value 'Bundesliga' for each row to indicate the league name
Bundesliga_1993['League'] = 'Bundesliga'

Bundesliga_1993.head(2)

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Winner,League
0,7/8/1993,Bayern Munich,Freiburg,3,1,H,Bundesliga
1,7/8/1993,Dortmund,Karlsruhe,2,1,H,Bundesliga


## Bundesliga_2005

In [34]:
Bundesliga_2005.columns

Index(['Unnamed: 0', 'MATCH_DATE', 'LEAGUE_NAME', 'SEASON', 'LEAGUE',
       'FINISHED', 'LOCATION', 'VIEWER', 'MATCHDAY', 'MATCHDAY_NR',
       'HOME_TEAM_ID', 'HOME_TEAM_NAME', 'HOME_TEAM', 'HOME_ICON',
       'AWAY_TEAM_ID', 'AWAY_TEAM_NAME', 'AWAY_TEAM', 'AWAY_ICON',
       'GOALS_HOME', 'GOALS_AWAY', 'DRAW', 'WIN_HOME', 'WIN_AWAY'],
      dtype='object')

In [35]:
Bundesliga_2005 = Bundesliga_2005.loc[:, ['MATCH_DATE', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'GOALS_HOME', 'GOALS_AWAY']]
Bundesliga_2005.rename(
    columns={
        'MATCH_DATE': 'Date', 
        'HOME_TEAM_NAME': 'HomeTeam', 
        'AWAY_TEAM_NAME': 'AwayTeam', 
        'GOALS_HOME': 'HomeGoals', 
        'GOALS_AWAY': 'AwayGoals'}, inplace=True)

Bundesliga_2005['Winner'] = ['H' if x > y else 'A' if x < y else 'D' for x, y in zip(Bundesliga_2005['HomeGoals'], Bundesliga_2005['AwayGoals'])]
#Bundesliga_2005_new['Date'] = pd.to_datetime(Bundesliga_2005_new['Date'], format='mixed').dt.strftime('%Y-%m-%d')
Bundesliga_2005['League'] = 'Bundesliga'
Bundesliga_2005.head(2)

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Winner,League
0,2005-08-05 20:30:00,FC Bayern München,Borussia Mönchengladbach,3,0,H,Bundesliga
1,2005-08-06 15:30:00,1. FC Köln,1. FSV Mainz 05,1,0,H,Bundesliga


### Matching Team Names in Bundesliga Datasets

In [36]:
# find the best match for each team in Bundesliga_1993
teams_2005 = [team for team in Bundesliga_2005['HomeTeam'].unique() if len(team) > 1]
for t1 in Bundesliga_1993['HomeTeam'].unique():
    best_match = ''
    best_match_score = 0
    for t2 in teams_2005:
        score = levenshtein_distance(t1.lower(), t2.lower())
        if score > best_match_score:
            best_match = t2
            best_match_score = score

    if best_match != '':
        # change each team name in Bundesliga_1993 to the best match from Bundesliga_2005
        Bundesliga_1993['HomeTeam'] = Bundesliga_1993['HomeTeam'].replace(t1, best_match)
        Bundesliga_1993['AwayTeam'] = Bundesliga_1993['AwayTeam'].replace(t1, best_match)

        # remove the best match from Bundesliga_2005 to avoid matching it again
        teams_2005.remove(best_match)

In [37]:
Bundesliga_1993.HomeTeam.unique(), Bundesliga_2005.HomeTeam.unique()

(array(['FC Bayern München', 'Borussia Dortmund', 'MSV Duisburg',
        '1. FC Köln', 'Hamburger SV', 'RB Leipzig',
        'Borussia Mönchengladbach', 'Eintracht Braunschweig',
        'Werder Bremen', 'Bayer Leverkusen', 'Eintracht Frankfurt',
        'SC Freiburg', '1. FC Kaiserslautern', 'Karlsruher SC',
        'FC Energie Cottbus', '1. FC Nürnberg', 'FC Schalke 04',
        'VfB Stuttgart', '1. FC Union Berlin', 'VfL Bochum',
        'Alemannia Aachen', 'FC Ingolstadt 04', 'FC Hansa Rostock',
        'FC St. Pauli', 'Fortuna Düsseldorf', 'Arminia Bielefeld',
        'Hertha BSC', 'VfL Wolfsburg', 'FC Augsburg',
        'SpVgg Greuther Fürth', 'SC Paderborn 07', 'Hannover 96',
        '1. FSV Mainz 05', 'SV Darmstadt 98', 'TSG 1899 Hoffenheim',
        'Augsburg', 'Greuther Furth', 'Fortuna Dusseldorf', 'Braunschweig',
        'Paderborn', 'Darmstadt', 'Ingolstadt'], dtype=object),
 array(['FC Bayern München', '1. FC Köln', 'MSV Duisburg', 'Hamburger SV',
        'VfL Wolfsburg'

## France_League

In [38]:
France_League.columns

Index(['id', 'date', 'local', 'visiteur', 'ligue', 'saison', 'local_score',
       'visiteur_score', 'resultat'],
      dtype='object')

In [39]:
France_League = France_League.loc[:, ['date', 'local', 'visiteur', 'local_score', 'visiteur_score']]
France_League.rename(
    columns={
        'date': 'Date', 
        'local': 'HomeTeam', 
        'visiteur': 'AwayTeam', 
        'local_score': 'HomeGoals', 
        'visiteur_score': 'AwayGoals'}, inplace=True)

France_League['Winner'] = ['H' if x > y else 'A' if x < y else 'D' for x, y in zip(France_League['HomeGoals'], France_League['AwayGoals'])]
France_League['League'] = 'Ligue 1'
France_League.head(2)

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Winner,League
0,2023-01-29,OGC Nice,LOSC Lille,1,0,H,Ligue 1
1,2023-01-29,AJ Auxerre,Montpellier HSC,0,2,A,Ligue 1


## Premier_League

In [40]:
Premier_League.columns

Index(['Season_End_Year', 'Wk', 'Date', 'Home', 'HomeGoals', 'AwayGoals',
       'Away', 'FTR'],
      dtype='object')

In [41]:
Premier_League = Premier_League.loc[:, ['Date', 'Home', 'Away', 'HomeGoals', 'AwayGoals', 'FTR']]
Premier_League.rename(
    columns={
        'Home': 'HomeTeam', 
        'Away': 'AwayTeam', 
        'FTR': 'Winner'}, inplace=True)
Premier_League['League'] = 'Premier League'
Premier_League.head(2)

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Winner,League
0,1992-08-15,Coventry City,Middlesbrough,2,1,H,Premier League
1,1992-08-15,Leeds United,Wimbledon,2,1,H,Premier League


## Spanish_League

In [42]:
Spanish_League.columns

Index(['id', 'season', 'division', 'round', 'localTeam', 'visitorTeam',
       'localGoals', 'visitorGoals', 'date', 'timestamp'],
      dtype='object')

In [43]:
Spanish_League = Spanish_League.loc[:, ['date', 'localTeam', 'visitorTeam', 'localGoals', 'visitorGoals']]
Spanish_League.rename(
    columns={
        'date': 'Date', 
        'localTeam': 'HomeTeam', 
        'visitorTeam': 'AwayTeam', 
        'localGoals': 'HomeGoals', 
        'visitorGoals': 'AwayGoals'}, inplace=True)

Spanish_League['Winner'] = ['H' if x > y else 'A' if x < y else 'D' for x, y in zip(Spanish_League['HomeGoals'], Spanish_League['AwayGoals'])]
Spanish_League['League'] = 'La Liga'
Spanish_League.head(2)

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Winner,League
0,12/09/1970,Atletico de Bilbao,Barcelona,1,1,D,La Liga
1,12/09/1970,Las Palmas,Atletico de Madrid,1,1,D,La Liga


## Spanish League 1995-2023

In [44]:
Spanish_League_1995.columns

Index(['Season', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR'],
      dtype='object')

In [45]:
Spanish_League_1995 = Spanish_League_1995.loc[:, ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']]
Spanish_League_1995.rename(
    columns={
        'FTHG': 'HomeGoals', 
        'FTAG': 'AwayGoals'}, inplace=True)
Spanish_League_1995['Winner'] = ['H' if x > y else 'A' if x < y else 'D' for x, y in zip(Spanish_League_1995['HomeGoals'], Spanish_League_1995['AwayGoals'])]
Spanish_League_1995['League'] = 'La Liga'
Spanish_League_1995.head(2)

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Winner,League
0,02-09-1995,La Coruna,Valencia,3,0,H,La Liga
1,02-09-1995,Sp Gijon,Albacete,3,0,H,La Liga


### Matching Team Names in Spanish League Datasets

In [46]:
# find the best match for each team in Spanish_League
teams_1995 = [team for team in Spanish_League_1995['HomeTeam'].unique() if len(team) > 1]
for t1 in Spanish_League['HomeTeam'].unique():
    best_match = ''
    best_match_score = 0
    for t2 in teams_1995:
        score = levenshtein_distance(t1.lower(), t2.lower())
        if score > best_match_score:
            best_match = t2
            best_match_score = score

    if best_match != '':
        # change each team name in Spanish_League to the best match from Spanish_League_1995
        Spanish_League['HomeTeam'] = Spanish_League['HomeTeam'].replace(t1, best_match)
        Spanish_League['AwayTeam'] = Spanish_League['AwayTeam'].replace(t1, best_match)

        # remove the best match from teams_1995 to avoid matching it again
        teams_1995.remove(best_match)

## Turkish_League

In [47]:
Turkish_League.columns

Index(['Date', 'Season', 'Week', 'home', 'visitor', 'FT', 'hgoal', 'vgoal',
       'division', 'tier', 'totgoal', 'goaldiff', 'result', 'HT', 'hgoal_half',
       'vgoal_half', 'half_totgoal', 'half_goaldiff', 'result_half', 'fans',
       'neutral', 'home_red_card', 'visitor_red_card'],
      dtype='object')

In [48]:
Turkish_League = Turkish_League.loc[:, ['Date', 'home', 'visitor', 'hgoal', 'vgoal', 'result']]
Turkish_League.rename(
    columns={
        'home': 'HomeTeam', 
        'visitor': 'AwayTeam', 
        'hgoal': 'HomeGoals', 
        'vgoal': 'AwayGoals', 
        'result': 'Winner'}, inplace=True)
Turkish_League['League'] = 'Turkish Super League'
Turkish_League.head(2)

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Winner,League
0,1959-02-21,Genclerbirligi,Adalet,1,1,D,Turkish Super League
1,1959-02-21,Izmirspor,Beykoz 1908,2,1,H,Turkish Super League


# Data Integration

In [49]:
#concatenate DataFrames (Bundesliga_1993_new, Bundesliga_2005_new, France_League_new, Premier_League_new, Spanish_League_new, Turkish_League_new)
data = pd.concat([Bundesliga_1993, Bundesliga_2005, France_League, Premier_League, Spanish_League, Spanish_League_1995, Turkish_League], ignore_index=True)

#checks whether the length of the concatenated DataFrame data is equal to the sum of the lengths of all the individual DataFrames
assert len(data) == len(Bundesliga_1993) + len(Bundesliga_2005) + len(France_League) + len(Premier_League) + len(Spanish_League) + len(Turkish_League) + len(Spanish_League_1995)

In [50]:
# standardize the date format to 'dd-mm-yyyy'
convert_date_column(data, 'Date')

In [51]:
# add year column for easier queries
data['Year'] = data.Date.dt.year

In [52]:
data.isna().sum() #display the count of missing values

Date         0
HomeTeam     0
AwayTeam     0
HomeGoals    0
AwayGoals    0
Winner       0
League       0
Year         0
dtype: int64

# **Remove Duplicates**

In [54]:
# Checking for duplicates in the updated data
duplicates_check = data[data.duplicated(subset=["Date", "HomeTeam"], keep=False)]

# Printing the duplicates (if any)
if len(duplicates_check) == 0:
    print("No duplicates found")
else:
    print(f"There are {len(duplicates_check)} duplicates")


# Remove duplicates 
data_no_duplicates = data.drop_duplicates(subset=["Date", "HomeTeam"], keep='first')

# Checking for duplicates in the updated data
duplicates_check = data_no_duplicates[data_no_duplicates.duplicated()]

# Printing the duplicates (if any)
if len(duplicates_check) == 0:
    print("No duplicates found after removing duplicates")
else:
    print("Duplicates still exist after removing duplicates")
    print(duplicates_check)

There are 20879 duplicates
No duplicates found after removing duplicates


# **Download the CSV File**

In [27]:
# Assuming 'data_no_duplicates' is the DataFrame you want to save
# Replace 'file_path' with the path where you want to save the CSV file along with the desired file name

file_path = 'data/data_no_duplicates.csv'  # Modify 'path_to_save' and file name as needed

data_no_duplicates.to_csv(file_path, index=False)
