In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("international_matches 3.csv")

# Display the first few rows
df.head()
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23921 entries, 0 to 23920
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   date                           23921 non-null  object 
 1   home_team                      23921 non-null  object 
 2   away_team                      23921 non-null  object 
 3   home_team_continent            23921 non-null  object 
 4   away_team_continent            23921 non-null  object 
 5   home_team_fifa_rank            23921 non-null  int64  
 6   away_team_fifa_rank            23921 non-null  int64  
 7   home_team_total_fifa_points    23921 non-null  int64  
 8   away_team_total_fifa_points    23921 non-null  int64  
 9   home_team_score                23921 non-null  int64  
 10  away_team_score                23921 non-null  int64  
 11  tournament                     23921 non-null  object 
 12  city                           23921 non-null 

In [3]:
# list of the groups in world cup tornament
groups = {
    "Group A": ["Netherlands", "Senegal", "Ecuador", "Qatar"],
    "Group B": ["England", "USA", "Iran", "Wales"],
    "Group C": ["Argentina", "Poland", "Mexico", "Saudi Arabia"],
    "Group D": ["France", "Australia", "Tunisia", "Denmark"],
    "Group E": ["Japan", "Spain", "Germany", "Costa Rica"],
    "Group F": ["Morocco", "Croatia", "Belgium", "Canada"],
    "Group G": ["Brazil", "Switzerland", "Cameroon", "Serbia"],
    "Group H": ["Portugal", "South Korea", "Uruguay", "Ghana"]
}

# list of qualified teams
qualified_teams = [
    "Netherlands","Senegal","England","USA","Argentina","Poland",
    "France","Australia","Japan","Spain","Morocco","Croatia",
    "Brazil","Switzerland","Portugal","South Korea"
]

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# if home team loses, the away team gets the point
def encode_result(row):
    if row['home_team_result'] == 'Win':
        return 1, 0  #1 for home, 0 for away
    else:
        return 0, 1  # 0 for home, 1 for away

# function to encode results
df[['home_result', 'away_result']] = df.apply(encode_result, axis=1, result_type="expand")

# new feature for rank difference
df['rank_diff'] = df['home_team_fifa_rank'] - df['away_team_fifa_rank']

# new feature for point difference
df['points_diff'] = df['home_team_total_fifa_points'] - df['away_team_total_fifa_points']

# new feature for score difference
df['score_diff'] = df['home_team_score'] - df['away_team_score']

# the features will be used for the the model
X = df[['home_team_fifa_rank', 'away_team_fifa_rank',
        'home_team_total_fifa_points', 'away_team_total_fifa_points',
        'home_team_score', 'away_team_score', 'rank_diff', 'points_diff', 'score_diff']]

# variable for home and away result
y_home = df['home_result']  # Target for home team win (1) or loss (0)
y_away = df['away_result']  # Target for away team win (1) or loss (0)

# split the data into training and testing sets
X_train, X_test, y_home_train, y_home_test, y_away_train, y_away_test = train_test_split(
    X, y_home, y_away, test_size=0.2, random_state=42)

# Normalize the features to make sure all the features are on the same scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Decision Tree model for home team prediction
clf_home = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=10, min_samples_leaf=5)
clf_home.fit(X_train_scaled, y_home_train)

# Initialize and train the Decision Tree model for away team prediction
clf_away = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=10, min_samples_leaf=5)
clf_away.fit(X_train_scaled, y_away_train)

# Predict the outcomes for the test set
y_home_pred = clf_home.predict(X_test_scaled)
y_away_pred = clf_away.predict(X_test_scaled)

# Calculate the accuracy of the models
accuracy_home = accuracy_score(y_home_test, y_home_pred)
accuracy_away = accuracy_score(y_away_test, y_away_pred)

print(f'Home team prediction accuracy: {accuracy_home * 100:.2f}%')
print(f'Away team prediction accuracy: {accuracy_away * 100:.2f}%')

# list of qualified teams for the 2022 World Cup
qualified_teams = [
    "Netherlands", "Senegal", "England", "USA", "Argentina", "Poland",
    "France", "Australia", "Japan", "Spain", "Morocco", "Croatia",
    "Brazil", "Switzerland", "Portugal", "South Korea"
]

# Function to get the team statistics
def get_team_statistics(team_name):
    team_data_home = df[df['home_team'] == team_name]
    team_data_away = df[df['away_team'] == team_name]

    # Aggregate the team's historical performance
    total_fifa_points = team_data_home['home_team_total_fifa_points'].sum() + team_data_away['away_team_total_fifa_points'].sum()
    average_fifa_rank = (team_data_home['home_team_fifa_rank'].mean() + team_data_away['away_team_fifa_rank'].mean()) / 2

    # Calculate the number of home and away wins
    home_wins = team_data_home[team_data_home['home_result'] == 1].shape[0]
    away_wins = team_data_away[team_data_away['away_result'] == 1].shape[0]

    return total_fifa_points, average_fifa_rank, home_wins, away_wins

# Now let's get the statistics for each team and predict which one is most likely to win
team_performance = {}
for team in qualified_teams:
    total_fifa_points, average_fifa_rank, home_wins, away_wins = get_team_statistics(team)
    team_performance[team] = {
        'total_fifa_points': total_fifa_points,
        'average_fifa_rank': average_fifa_rank,
        'home_wins': home_wins,
        'away_wins': away_wins
    }

# Rank the teams based on their total FIFA points and average FIFA rank
sorted_teams = sorted(team_performance.items(), key=lambda x: (x[1]['total_fifa_points'], -x[1]['average_fifa_rank'], x[1]['home_wins'] + x[1]['away_wins']), reverse=True)

# Display the predicted top 3 teams
print("Predicted top teams:")
for rank, (team, performance) in enumerate(sorted_teams[:3], 1):
    print(f"{rank}. {team} (Total FIFA Points: {performance['total_fifa_points']}, Average FIFA Rank: {performance['average_fifa_rank']:.2f}, Wins: {performance['home_wins'] + performance['away_wins']})")


Home team prediction accuracy: 99.12%
Away team prediction accuracy: 99.12%
Predicted top teams:
1. Spain (Total FIFA Points: 206508, Average FIFA Rank: 5.42, Wins: 283)
2. Brazil (Total FIFA Points: 203947, Average FIFA Rank: 3.12, Wins: 346)
3. Argentina (Total FIFA Points: 194845, Average FIFA Rank: 6.30, Wins: 259)
