In [16]:
import requests
import os
from dotenv import load_dotenv
import json
import time
import pandas as pd
from fuzzywuzzy import fuzz, process
import re

## Fetch Team Names from the Odds API

In [17]:
# Read variables from a local .env file into the process environment, then
# pick up the Odds API key from there so it never appears in the notebook.
load_dotenv()
api_key = os.environ.get("API_KEY")

# Root endpoint of The Odds API (v4); request URLs are built on top of it.
base_url = "https://api.the-odds-api.com/v4"

# Gender label -> Odds API sport key. Only men's NCAA basketball is
# configured here; further entries can be added to this mapping.
sport_keys = dict(men="basketball_ncaab")

def fetch_events(sport_key, gender, timeout=10):
    """Fetch the list of events for one sport from The Odds API.

    Args:
        sport_key: Sport identifier inserted into the request URL path.
        gender: Label that is handed back unchanged alongside the events.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A ``(events, gender)`` tuple. ``events`` is the decoded JSON body when
        the server answers with HTTP 200, otherwise an empty list.
    """
    url = f"{base_url}/sports/{sport_key}/events"
    params = {"apiKey": api_key}

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception class is printed: the exception message can embed
        # the full request URL, which carries the API key as a query parameter.
        print(f"Error fetching events for {sport_key}: {type(exc).__name__}")
        return [], gender

    if response.status_code == 200:
        return response.json(), gender

    print(f"Error fetching events for {sport_key}: {response.status_code} - {response.text}")
    return [], gender

# Main script
if __name__ == "__main__":
    # One {"team", "gender"} record is collected for each named side of every
    # fetched event; duplicates are removed further down.
    all_teams = []

    for gender, sport_key in sport_keys.items():
        print(f"Fetching events for {gender}'s NCAA basketball")
        events, gender = fetch_events(sport_key, gender)

        for event in events:
            for side in ("home_team", "away_team"):
                team_name = event.get(side)
                if team_name:
                    all_teams.append({"team": team_name, "gender": gender})

        # Respect API rate limits
        time.sleep(1)

    # Load existing team names if file exists
    teams_file = 'ncaa_basketball_teams.json'
    existing_teams = []
    if os.path.exists(teams_file):
        with open(teams_file, 'r') as f:
            existing_teams = json.load(f)

    # Combine new and existing teams
    all_teams.extend(existing_teams)

    # Remove duplicates on the (team, gender) pair. Keying on the two values
    # rather than on the dict's items makes the check independent of key
    # order, and the first record seen for each pair is the one kept.
    seen_keys = set()
    unique_teams = []
    for entry in all_teams:
        key = (entry.get("team"), entry.get("gender"))
        if key not in seen_keys:
            seen_keys.add(key)
            unique_teams.append(entry)

    # Sort so the saved file has the same order on every run; iterating a set
    # of strings would otherwise reshuffle it between interpreter sessions.
    unique_teams.sort(key=lambda entry: (str(entry.get("gender")), str(entry.get("team"))))

    # Save the list of teams to a JSON file
    with open(teams_file, 'w') as f:
        json.dump(unique_teams, f, indent=4)

    print(f"Total unique NCAA basketball teams fetched: {len(unique_teams)}")


Fetching events for men's NCAA basketball
Total unique NCAA basketball teams fetched: 195


## Match fetched names to my prediction database

In [18]:
import json
import os
import re
import pandas as pd
from fuzzywuzzy import fuzz, process

# Load unique teams and prediction dataset
with open('ncaa_basketball_teams.json', 'r') as f:
    unique_teams = json.load(f)

# The parquet location defaults to the original absolute path but can be
# overridden through the NCAA_PREDICTION_FILE environment variable, so the
# notebook can run on another machine without editing this cell.
prediction_file = os.environ.get(
    "NCAA_PREDICTION_FILE",
    "/Users/zacharias/Dropbox/Python/Project Jupiter/Project Zoccer/ncaa_basketball/project_ncaa_live/ncaa_team_scores_working_file.parquet",
)

# Only the column labels of the dataset are kept; they are the candidate
# names that fetched team names are matched against.
prediction_teams = pd.read_parquet(prediction_file).columns

# Function to normalize team names by removing punctuation and converting to lowercase
def normalize_name(name):
    """Return a lowercase, underscore-separated version of ``name``.

    Every character other than ASCII letters, digits and spaces is dropped,
    the result is lowercased, and each space becomes one underscore.
    """
    kept_chars = re.sub(r"[^a-zA-Z0-9 ]", "", name)
    lowered = kept_chars.lower()
    # Splitting on a literal space keeps empty pieces, so consecutive spaces
    # still turn into consecutive underscores.
    return "_".join(lowered.split(" "))

# Function to create name variations by removing the last one or two words
def create_name_variations(name):
    """Return ``name`` followed by shortened forms of it.

    The first element is always ``name`` itself, untouched. If it has more
    than one whitespace-separated word, a copy without the last word follows;
    if it has more than two, a copy without the last two words follows.
    """
    tokens = name.split()
    shortened = [
        " ".join(tokens[:-dropped])
        for dropped in (1, 2)
        if len(tokens) > dropped
    ]
    return [name] + shortened

# Load existing matched teams to avoid adding them to unmatched
master_file = "matched_teams_master.csv"
if os.path.exists(master_file):
    master_df = pd.read_csv(master_file)
    already_matched_teams = set(master_df["original_name"].values)
else:
    master_df = pd.DataFrame(columns=["original_name", "expected_name", "best_match", "match_score"])
    already_matched_teams = set()

# Minimum fuzzy score the best candidate must reach to be accepted
# automatically; anything below goes to the manual-review file.
match_threshold = 85

# Lists to store new matched and unmatched teams
matched_teams = []
unmatched_teams = []

# Try to match each team in unique_teams to prediction_teams
for team_entry in unique_teams:
    original_name = team_entry['team']

    # Skip if already matched
    if original_name in already_matched_teams:
        continue

    normalized_variations = [normalize_name(var) for var in create_name_variations(original_name)]
    gender_suffix = "_M" if team_entry['gender'] == 'men' else "_F"

    # Collect matches across all variations, keeping only the best score per
    # candidate. De-duplicating on (candidate, score) pairs would let the same
    # candidate occupy several of the top-5 slots with different scores.
    best_score_by_candidate = {}
    for name_variation in normalized_variations:
        full_team_name = f"{name_variation}{gender_suffix}"
        matches_for_variation = process.extract(full_team_name, prediction_teams, limit=5, scorer=fuzz.token_sort_ratio)
        for match in matches_for_variation:
            candidate, score = match[0], match[1]
            if score > best_score_by_candidate.get(candidate, -1):
                best_score_by_candidate[candidate] = score

    # Highest score first; ties are broken by candidate name so the chosen
    # best match is the same on every run.
    unique_matches = sorted(
        best_score_by_candidate.items(),
        key=lambda item: (-item[1], str(item[0])),
    )[:5]

    # Check if the best match is above the threshold
    if unique_matches and unique_matches[0][1] >= match_threshold:
        matched_teams.append({
            "original_name": original_name,
            "expected_name": f"{normalize_name(original_name)}{gender_suffix}",
            "best_match": unique_matches[0][0],
            "match_score": unique_matches[0][1]
        })
    else:
        # If no good match is found, save the original name with the top 5 unique matches for manual review
        unmatched_teams.append({
            "original_name": original_name,
            "expected_name": f"{normalize_name(original_name)}{gender_suffix}",
            "top_matches": unique_matches  # Up to 5 (candidate, score) pairs, best first
        })

# Append new matches to the master file without overwriting existing entries.
# Empty frames are left out of the concat: recent pandas versions deprecate
# concatenating empty entries, and they contribute no rows anyway.
new_matches_df = pd.DataFrame(matched_teams)
frames_to_merge = [frame for frame in (master_df, new_matches_df) if not frame.empty]
if frames_to_merge:
    master_df = pd.concat(frames_to_merge, ignore_index=True).drop_duplicates(subset="original_name")
master_df.to_csv(master_file, index=False)

# Save unmatched teams with suggestions for manual review
with open("unmatched_teams_with_suggestions.json", "w") as f:
    json.dump(unmatched_teams, f, indent=4)

# Calculate success rate based on newly processed teams
newly_matched_count = len(matched_teams)
unmatched_count = len(unmatched_teams)

# Calculate success rate
if newly_matched_count + unmatched_count > 0:
    match_success_rate = (newly_matched_count / (newly_matched_count + unmatched_count)) * 100
else:
    match_success_rate = 100  # If no teams processed, assume full success

print(f"\nMatch Success Rate: {match_success_rate:.2f}%")

print("Matched teams saved to 'matched_teams_master.csv'")
print("Unmatched teams with top suggestions saved to 'unmatched_teams_with_suggestions.json'")


Match Success Rate: 74.70%
Matched teams saved to 'matched_teams_master.csv'
Unmatched teams with top suggestions saved to 'unmatched_teams_with_suggestions.json'
