In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from requests_html import HTMLSession

import pandas as pd

In [2]:
df = pd.read_csv("All Matches.csv", index_col=False)
# Fill NaN values with integer 0
#df = df.dropna()
df

Unnamed: 0,Date,HomeTeamName,AwayTeamName,HomeTeamGoals,AwayTeamGoals,Stage,SpecialWinConditions,Stadium,City,Attendance,Year
0,1960-07-06,France,Yugoslavia,4,5,Semi-finals,,Parc des Princes,Paris,26370,1960
1,1960-07-06,Czechoslovakia,Soviet Union,0,3,Semi-finals,,Stade Vélodrome,Marseille,25184,1960
2,1960-07-09,Czechoslovakia,France,2,0,Third place play-off,,Stade Vélodrome,Marseille,9438,1960
3,1960-07-10,Soviet Union,Yugoslavia,2,1,Final,Soviet Union win after extra time,Parc des Princes,Paris,17966,1960
4,1964-06-17,Spain,Hungary,2,1,Semi-finals,Spain win after extra time,Santiago Bernabéu,Madrid,34713,1964
...,...,...,...,...,...,...,...,...,...,...,...
281,2016-07-02,Germany,Italy,1,1,Quarter-finals,Germany win on Penalities 6–5,Nouveau Stade de Bordeaux,Bordeaux,38764,2016
282,2016-07-03,France,Iceland,5,2,Quarter-finals,,Stade de France,Saint-Denis,76833,2016
283,2016-07-06,Portugal,Wales,2,0,Semi-finals,,Parc Olympique Lyonnais,Décines-Charpieu,55679,2016
284,2016-07-07,Germany,France,0,2,Semi-finals,,Stade Vélodrome,Marseille,64078,2016


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

In [4]:
# Function to preprocess data and return X, y
def preprocess_data(df):
    # Define match outcome based on HomeTeamGoals and AwayTeamGoals
    def get_match_outcome(row):
        if row['HomeTeamGoals'] > row['AwayTeamGoals']:
            return 'Home Team Wins'
        elif row['AwayTeamGoals'] > row['HomeTeamGoals']:
            return 'Away Team Wins'
        else:
            return 'Draw'

    df['Match Outcome'] = df.apply(get_match_outcome, axis=1)
    
    # Encode categorical variables
    label_encoder = LabelEncoder()
    categorical_cols = ['HomeTeamName', 'AwayTeamName', 'Stage', 'SpecialWinConditions', 'Stadium', 'City']
    
    for col in categorical_cols:
        df[col] = label_encoder.fit_transform(df[col])
    
    # Encode Match Outcome as requested (0: Away Team Wins, 1: Draw, 2: Home Team Wins)
    df['Match Outcome'] = df['Match Outcome'].map({'Away Team Wins': 0, 'Draw': 1, 'Home Team Wins': 2})
    
    # Separate features (X) and target (y)
    X = df[['HomeTeamName', 'AwayTeamName', 'HomeTeamGoals', 'AwayTeamGoals', 'Stage', 
            'SpecialWinConditions', 'Stadium', 'City', 'Attendance']]
    y = df['Match Outcome']
    
    return X, y, label_encoder

# Preprocess data
X, y, label_encoder = preprocess_data(df)

In [5]:
df.tail()

Unnamed: 0,Date,HomeTeamName,AwayTeamName,HomeTeamGoals,AwayTeamGoals,Stage,SpecialWinConditions,Stadium,City,Attendance,Year,Match Outcome
281,2016-07-02,12,16,1,1,9,4,34,9,38764,2016,1
282,2016-07-03,11,15,5,2,9,30,56,60,76833,2016,2
283,2016-07-06,22,34,2,0,11,30,39,17,55679,2016,2
284,2016-07-07,12,11,0,2,11,30,55,46,64078,2016,0
285,2016-07-10,22,11,1,0,0,9,56,60,75868,2016,2


In [6]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((228, 9), (58, 9), (228,), (58,))

In [7]:
# Initialize and train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate model
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9827586206896551


In [9]:
# Define function to predict match outcome probabilities
def predict_match(home_team, away_team, model, label_encoder):
    # Encode team names using the fitted LabelEncoder
    home_team_encoded = label_encoder.transform([home_team])[0]
    away_team_encoded = label_encoder.transform([away_team])[0]
    
    # Example: Assuming placeholder values for other features
    home_goals = 0
    away_goals = 0
    stage = 0
    special_conditions = label_encoder.transform(['NaN'])[0]  # Encode 'NaN' using LabelEncoder
    stadium = 0
    city = 0
    attendance = 0
    
    # Create DataFrame for the new match
    new_match = pd.DataFrame({
        'HomeTeamName': [home_team_encoded],
        'AwayTeamName': [away_team_encoded],
        'HomeTeamGoals': [home_goals],
        'AwayTeamGoals': [away_goals],
        'Stage': [stage],
        'SpecialWinConditions': [special_conditions],
        'Stadium': [stadium],
        'City': [city],
        'Attendance': [attendance]
    })
    
    # Predict probabilities for the new match using the trained model
    probabilities = model.predict_proba(new_match)[0]
    
    # Decode predicted outcome
    predicted_outcome = model.predict(new_match)[0]
    if predicted_outcome == 0:
        outcome = 'Away Team Wins'
    elif predicted_outcome == 1:
        outcome = 'Draw'
    else:
        outcome = 'Home Team Wins'
    
    return {
        'Outcome': outcome,
        'Probabilities': {
            'Away Team Wins': probabilities[0],
            'Draw': probabilities[1],
            'Home Team Wins': probabilities[2]
        }
    }

# Example usage
home_team = 'Germany'
away_team = 'Scotland'
predictions = predict_match(home_team, away_team, model, label_encoder)

print(f"Predicted outcome for {home_team} vs {away_team}:")
print(f"- {predictions['Outcome']}")
print(f"- Probabilities:")
print(f"  - Away Team Wins: {predictions['Probabilities']['Away Team Wins']:.2f}")
print(f"  - Draw: {predictions['Probabilities']['Draw']:.2f}")
print(f"  - Home Team Wins: {predictions['Probabilities']['Home Team Wins']:.2f}")

ValueError: y contains previously unseen labels: 'Germany'