<a href="https://colab.research.google.com/github/tao8max/JavaScript-final-project/blob/master/MLS_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Retrieve data from SportRadar and save the information about each team in the current MLS season to files in a folder. Each XML file includes information about the previous and upcoming games for one team. It also includes detailed statistics such as yellow cards, shots on target per player, referee country of birth, ball possession percentage, etc.

In [None]:
import requests
import os

# SportRadar competitor IDs, one per team whose summaries are downloaded
competitor_ids = [
    "sr:competitor:5133",
    "sr:competitor:7080",
    "sr:competitor:21825",
    "sr:competitor:22006",
    "sr:competitor:22007",
    "sr:competitor:22009",
    "sr:competitor:22010",
    "sr:competitor:39833",
    "sr:competitor:41618",
    "sr:competitor:52237",
    "sr:competitor:167510",
    "sr:competitor:245305",
    "sr:competitor:305920",
    "sr:competitor:402227",
    "sr:competitor:659691",
    "sr:competitor:668063",
    "sr:competitor:772256",
    "sr:competitor:863473",
    "sr:competitor:874725",
]

# API key: prefer the SPORTRADAR_API_KEY environment variable so the secret
# does not have to live in the notebook. The literal is kept only as a
# backward-compatible fallback.
# NOTE(review): this key is committed in plain text -- rotate it and remove
# the fallback once the environment variable is set everywhere.
api_key = os.environ.get("SPORTRADAR_API_KEY") or "tznstwwhjksc7wk84wxaazcs"

# Base URL for the API; the placeholder receives the competitor ID
base_url = "https://api.sportradar.us/soccer/trial/v4/en/competitors/{}/summaries.xml"

# Directory to save the files
output_directory = "competitor_data"

# Seconds to wait for the server before giving up on one request, so a stalled
# connection cannot hang the whole cell
request_timeout = 30

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Download one summaries file per competitor
for competitor_id in competitor_ids:
    url = base_url.format(competitor_id)
    try:
        response = requests.get(
            url, params={"api_key": api_key}, timeout=request_timeout
        )
    except requests.RequestException as error:
        # Network-level failure (DNS, timeout, reset): report it and move on
        # so the remaining teams are still downloaded.
        print(f"Failed to retrieve data for {competitor_id}. Error: {error}")
        continue

    if response.status_code == 200:
        # Colons are not valid in file names on every platform
        file_name = f"{competitor_id.replace(':', '_')}.xml"
        file_path = os.path.join(output_directory, file_name)
        # Write the raw bytes so the file keeps the encoding the server sent,
        # instead of re-encoding the text with the platform default.
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"Data for {competitor_id} saved to {file_path}")
    else:
        print(f"Failed to retrieve data for {competitor_id}. Status code: {response.status_code}")

print("Completed.")


Data for sr:competitor:5133 saved to competitor_data/sr_competitor_5133.xml
Data for sr:competitor:7080 saved to competitor_data/sr_competitor_7080.xml
Data for sr:competitor:21825 saved to competitor_data/sr_competitor_21825.xml
Data for sr:competitor:22006 saved to competitor_data/sr_competitor_22006.xml
Data for sr:competitor:22007 saved to competitor_data/sr_competitor_22007.xml
Data for sr:competitor:22009 saved to competitor_data/sr_competitor_22009.xml
Data for sr:competitor:22010 saved to competitor_data/sr_competitor_22010.xml
Data for sr:competitor:39833 saved to competitor_data/sr_competitor_39833.xml
Data for sr:competitor:41618 saved to competitor_data/sr_competitor_41618.xml
Data for sr:competitor:52237 saved to competitor_data/sr_competitor_52237.xml
Data for sr:competitor:167510 saved to competitor_data/sr_competitor_167510.xml
Data for sr:competitor:245305 saved to competitor_data/sr_competitor_245305.xml
Data for sr:competitor:305920 saved to competitor_data/sr_compet

Creating a csv with all game data

Create a list of the MLS games already played in the current season (the code below keeps only games whose start time is in the past; these are the training data). I had to map each team name to an ID because I had a hard time training the model on team-name strings. It worked once the team names were mapped to integers (team_mappings.csv).

In [None]:
import csv
import xml.etree.ElementTree as ET
from datetime import datetime
import os

# XML namespace used in the element lookups below
namespace = {'ns': 'http://schemas.sportradar.com/sportsapi/soccer/v4'}

# Folder holding the per-team XML files and the CSV produced from them
folder_path = 'competitor_data'
csv_file_path = 'game_data.csv'

# Column order of the output CSV
fieldnames = ["Start Time", "Home Team", "Away Team", "Home Score", "Away Score", "Winner"]


def parse_score(raw_score):
    """Return a score attribute as an int, or None if missing or non-numeric."""
    try:
        return int(raw_score)
    except (TypeError, ValueError):
        return None


def determine_winner(home_score, away_score):
    """Return "Home Team", "Away Team", "Tie", or None when a score is missing.

    Scores arrive as XML attribute strings, so they are converted to integers
    before comparing; comparing the raw strings would rank "9" above "10".
    """
    home = parse_score(home_score)
    away = parse_score(away_score)
    if home is None or away is None:
        return None
    if home > away:
        return "Home Team"
    if home < away:
        return "Away Team"
    return "Tie"


def collect_team_mappings(roots):
    """Build a {team name: numeric id} dict from every competitor element.

    The numeric id is the last colon-separated part of the ``id`` attribute.
    Competitors lacking a name or a usable id are skipped.
    """
    mappings = {}
    for root in roots:
        for competitor_element in root.findall(".//ns:competitor", namespace):
            team_id = competitor_element.attrib.get('id')
            team_name = competitor_element.attrib.get('name')
            if team_id is None or team_name is None:
                continue
            try:
                mappings[team_name] = int(team_id.split(':')[-1])
            except ValueError:
                continue
    return mappings


def collect_played_games(roots, team_mappings, now=None):
    """Return one row dict per game that started before ``now``.

    roots: parsed XML root elements, one per team file.
    team_mappings: {team name: numeric id} used to encode the two teams.
    now: timezone-aware datetime to compare start times against; defaults to
        the current time in each game's own timezone.

    The same game can appear in more than one root (the test data lists it in
    both teams' documents), so games are de-duplicated by the sport_event
    ``id`` attribute, falling back to (start time, home, away).
    """
    games = []
    seen = set()
    for root in roots:
        for summary_element in root.findall(".//ns:summary", namespace):
            sport_event_element = summary_element.find("ns:sport_event", namespace)
            if sport_event_element is None:
                continue
            start_time = sport_event_element.attrib.get('start_time')
            if start_time is None:
                continue
            start_datetime = datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%S%z")

            # Only games that have already started can have a result
            reference = now if now is not None else datetime.now(start_datetime.tzinfo)
            if start_datetime >= reference:
                continue

            competitors_element = sport_event_element.find("ns:competitors", namespace)
            if competitors_element is None:
                continue
            home_element = competitors_element.find("./ns:competitor[@qualifier='home']", namespace)
            away_element = competitors_element.find("./ns:competitor[@qualifier='away']", namespace)
            if home_element is None or away_element is None:
                continue
            home_team = home_element.attrib.get('name')
            away_team = away_element.attrib.get('name')

            game_key = sport_event_element.attrib.get('id') or (start_time, home_team, away_team)
            if game_key in seen:
                continue
            seen.add(game_key)

            # A missing status element simply means no scores are known
            status_element = summary_element.find("ns:sport_event_status", namespace)
            status_attributes = status_element.attrib if status_element is not None else {}
            home_score = status_attributes.get('home_score')
            away_score = status_attributes.get('away_score')

            games.append({
                "Start Time": start_datetime,
                "Home Team": team_mappings.get(home_team),
                "Away Team": team_mappings.get(away_team),
                "Home Score": parse_score(home_score),
                "Away Score": parse_score(away_score),
                "Winner": determine_winner(home_score, away_score),
            })
    return games


# Run the pipeline when executed as a script or notebook cell (both set
# __name__ to "__main__"); importing the helpers does not touch the disk.
if __name__ == "__main__":
    # Sorted so the row order, and any later random split, is reproducible
    xml_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.xml'))

    # Parse every file once and reuse the trees for both passes
    roots = [ET.parse(os.path.join(folder_path, file)).getroot() for file in xml_files]

    team_mappings = collect_team_mappings(roots)
    data = collect_played_games(roots, team_mappings)

    # Write the data to the CSV file
    with open(csv_file_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print("Data has been written to", csv_file_path)



Data has been written to game_data.csv


Let's train the model. I used logistic regression: I asked ChatGPT for the simplest way to train the model, and it suggested logistic regression.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Extra scikit-learn pieces needed for one-hot encoding the team IDs
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the game data from the CSV file
data = pd.read_csv('game_data.csv')

# Drop games with an unknown team or no recorded winner; otherwise a game
# without a result would silently be counted as "home team did not win".
data = data.dropna(subset=['Home Team', 'Away Team', 'Winner'])

# Feature matrix: the two team IDs. Target: 1 if the home team won, else 0
# (0 therefore covers both away wins and ties).
X = data[['Home Team', 'Away Team']]
y = (data['Winner'] == 'Home Team').astype(int)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Team IDs are arbitrary labels, not quantities, so one-hot encode them rather
# than feeding the raw numbers to a linear model. The encoder is part of the
# saved pipeline, so callers still pass the same two raw ID columns to
# predict(); teams unseen during training are encoded as all zeros.
model = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    LogisticRegression(max_iter=1000),
)
model.fit(X_train, y_train)

# Save the trained model (encoder + classifier)
joblib.dump(model, 'logistic_regression_model.pkl')

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.53      0.55      0.54        64
           1       0.40      0.38      0.39        50

    accuracy                           0.47       114
   macro avg       0.46      0.46      0.46       114
weighted avg       0.47      0.47      0.47       114



use the model to predict future games

In [None]:
import pandas as pd
import joblib

# Load the trained model
model = joblib.load('logistic_regression_model.pkl')

# Load the future games data
future_games = pd.read_csv('future_games.csv')

# Select the two team-ID columns, passed to the model as they are
# (no encoding is done in this cell)
team_features = future_games[['Home Team', 'Away Team']]

# Make predictions on the future games
predictions = model.predict(team_features)

# Create a new DataFrame with the predicted outcomes.
# The model is binary: 1 means "home team wins", 0 means "home team does not
# win", which includes ties, so 0 is not labelled as an away win.
predicted_games = future_games.copy()
predicted_games['Winner'] = [
    'Home Team' if prediction == 1 else 'Away Team or Tie'
    for prediction in predictions
]

# Write the predicted games to a CSV file
predicted_games.to_csv('predicted_games.csv', index=False)

print("Predicted games have been written to predicted_games.csv")


Predicted games have been written to predicted_games.csv


replace the numbers with the team names

In [None]:
import pandas as pd

# Load the predictions and the number/name lookup table
predicted_games = pd.read_csv('predicted_games.csv')
team_mappings = pd.read_csv('team_mappings.csv')

# Lookup from team number to team name
team_mapping_dict = {
    number: name
    for number, name in zip(team_mappings['Team Number'], team_mappings['Team Name'])
}

# Translate both team columns from numbers to names
for team_column in ('Home Team', 'Away Team'):
    predicted_games[team_column] = predicted_games[team_column].map(team_mapping_dict)

# Save the result under a new file name
predicted_games.to_csv('predicted_games_with_names.csv', index=False)

print("Predicted games with team names have been written to predicted_games_with_names.csv")


Predicted games with team names have been written to predicted_games_with_names.csv


In [None]:
import pandas as pd

# Read the CSV file from the working directory, using the same relative path
# the previous cell wrote it to (the old absolute '/content/...' path only
# worked when the working directory was /content).
df = pd.read_csv('predicted_games_with_names.csv')

# Sort the DataFrame by the "Date" column
df_sorted = df.sort_values(by='Date')

# Display the sorted DataFrame (display is provided by the notebook runtime)
display(df_sorted)



Unnamed: 0,Date,Home Team,Away Team,Winner
283,2023-07-01,Orlando City SC,Chicago Fire,Home Team
150,2023-07-01,Toronto FC,Real Salt Lake,Away Team
250,2023-07-01,Inter Miami CF,Austin FC,Away Team
84,2023-07-01,FC Cincinnati,New England Revolution,Home Team
233,2023-07-01,Inter Miami CF,Austin FC,Away Team
...,...,...,...,...
135,2023-10-21,Toronto FC,Orlando City SC,Away Team
183,2023-10-21,San Jose Earthquakes,Austin FC,Away Team
151,2023-10-21,Saint Louis City SC,Seattle Sounders,Home Team
32,2023-10-21,Saint Louis City SC,Seattle Sounders,Home Team
