In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from datetime import datetime
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup


### Utilize Massey Files to get total Wins and Losses data

In [3]:
def parse_ncaa_games(input_file):
    """Load a games text file into a DataFrame of integer-typed game rows.

    The file is read as a single string and scanned for runs of eight
    comma-separated integers (the fourth and seventh fields may carry a
    leading minus sign). Every run found becomes one row.

    Parameters:
    - input_file: Path to the games data file

    Returns:
    - DataFrame with columns GameID, Date, TeamID1, Location1, Score1,
      TeamID2, Location2, Score2, each cast to int
    """
    column_names = [
        'GameID', 'Date', 'TeamID1', 'Location1', 'Score1',
        'TeamID2', 'Location2', 'Score2'
    ]
    game_pattern = re.compile(
        r'(\d+,\d+,\s*\d+,\s*-?\d+,\s*\d+,\s*\d+,\s*-?\d+,\s*\d+)'
    )

    with open(input_file, 'r') as handle:
        raw_text = handle.read()

    # One list of whitespace-trimmed field strings per matched game entry
    rows = [
        [field.strip() for field in match.group(1).split(',')]
        for match in game_pattern.finditer(raw_text)
    ]

    games = pd.DataFrame(rows, columns=column_names)

    # Fields were captured as text; convert every column to int in one pass
    return games.astype({name: int for name in column_names})

def determine_win_loss(row):
    """Attach Win1/Loss1/Win2/Loss2 flags to a game row and return it.

    Team 1 gets the win only when Score1 is strictly greater than Score2;
    every other outcome (equal scores included) is credited to team 2.
    The row passed in is modified in place.
    """
    team1_won = row['Score1'] > row['Score2']

    # Flags are written in the order Win1, Loss1, Win2, Loss2
    row['Win1'], row['Loss1'] = (1, 0) if team1_won else (0, 1)
    row['Win2'], row['Loss2'] = (0, 1) if team1_won else (1, 0)
    return row

def prepare_team_game_records(games_df):
    """Convert one-row-per-game data into one-row-per-team-per-game records.

    Every game yields two records: one from team 1's point of view and one
    from team 2's. Win/loss flags are computed column-wise, so an empty
    input produces an empty result with the full column set instead of
    failing on missing Win/Loss columns.

    Parameters:
    - games_df: DataFrame with columns GameID, Date, TeamID1, Location1,
      Score1, TeamID2, Location2, Score2

    Returns:
    - DataFrame with columns GameID, Date, TeamID, Location, Score,
      OpponentID, Win, Loss. All team-1 records come first, followed by
      all team-2 records; the original row index is kept, so each index
      label appears twice.
    """
    # Team 1 wins only on a strictly higher score; anything else goes to team 2
    team1_won = games_df['Score1'] > games_df['Score2']
    team2_won = ~team1_won

    # assign() returns a new frame, so the caller's DataFrame is not modified
    flagged = games_df.assign(
        Win1=team1_won.astype(int),
        Loss1=team2_won.astype(int),
        Win2=team2_won.astype(int),
        Loss2=team1_won.astype(int),
    )

    def _records_for(side, other):
        """Return the records seen from team `side` (1 or 2) against `other`."""
        selected = flagged[[
            'GameID', 'Date', f'TeamID{side}', f'Location{side}', f'Score{side}',
            f'TeamID{other}', f'Win{side}', f'Loss{side}'
        ]]
        return selected.rename(columns={
            f'TeamID{side}': 'TeamID',
            f'Location{side}': 'Location',
            f'Score{side}': 'Score',
            f'TeamID{other}': 'OpponentID',
            f'Win{side}': 'Win',
            f'Loss{side}': 'Loss'
        })

    return pd.concat([_records_for(1, 2), _records_for(2, 1)])

def _win_loss_by_location(records, prefix=''):
    """Sum Win/Loss in `records` per location code (1, -1, 0) plus overall totals."""
    totals = {}
    total_wins = 0
    total_losses = 0
    for code, label in ((1, 'Home'), (-1, 'Away'), (0, 'Neutral')):
        at_location = records[records['Location'] == code]
        wins = at_location['Win'].sum()
        losses = at_location['Loss'].sum()
        totals[f'{prefix}{label}Wins'] = wins
        totals[f'{prefix}{label}Losses'] = losses
        total_wins += wins
        total_losses += losses
    totals[f'{prefix}TotalWins'] = total_wins
    totals[f'{prefix}TotalLosses'] = total_losses
    return totals


def generate_team_stats(games_df, cutoff_date=None):
    """
    Generate stats for each team including their performance by location
    and their opponents' performance.

    Parameters:
    - games_df: DataFrame with game records
    - cutoff_date: Date in YYYYMMDD format to include games up to (inclusive).
      None (the default) disables the date filter.

    Returns:
    - DataFrame with one row per team: TeamID, the team's own Home/Away/
      Neutral/Total wins and losses, and the same eight columns prefixed
      with "Opponent". When there are no team records the frame is empty
      but still carries every column.

    Opponent figures pool all games played by the team's distinct opponents,
    excluding games against the team itself. An opponent faced more than once
    is counted once.
    """
    # Compare against None explicitly so a falsy cutoff is not silently ignored
    if cutoff_date is not None:
        games_df = games_df[games_df['Date'] <= cutoff_date]

    team_records = prepare_team_game_records(games_df)

    # Fixed column order; also gives an empty result a usable schema
    stat_names = [
        f'{location}{outcome}'
        for location in ('Home', 'Away', 'Neutral', 'Total')
        for outcome in ('Wins', 'Losses')
    ]
    columns = ['TeamID'] + stat_names + [f'Opponent{name}' for name in stat_names]

    rows = []
    for team_id in team_records['TeamID'].unique():
        team_games = team_records[team_records['TeamID'] == team_id]

        # Distinct opponents, then every game they played against anyone else
        opponent_ids = team_games['OpponentID'].unique()
        opponent_games = team_records[
            (team_records['TeamID'].isin(opponent_ids)) &
            (team_records['OpponentID'] != team_id)
        ]

        row = {'TeamID': team_id}
        row.update(_win_loss_by_location(team_games))
        row.update(_win_loss_by_location(opponent_games, prefix='Opponent'))
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

def merge_team_names(stats_df, teams_file):
    """Attach a TeamName column to the stats table.

    The teams file is scanned for "<digits>, <name>" pairs, where a name is
    a run of letters, underscores, hyphens, periods, ampersands and
    apostrophes. The pairs are left-joined onto stats_df by TeamID, so teams
    with no match keep their row and get a missing TeamName.

    Parameters:
    - stats_df: DataFrame with a TeamID column
    - teams_file: Path to the teams data file

    Returns:
    - stats_df with an added TeamName column
    """
    name_pattern = re.compile(r'(\d+),\s*([A-Za-z_\-\.&\']+)')

    with open(teams_file, 'r') as handle:
        id_name_pairs = name_pattern.findall(handle.read())

    # IDs are captured as text; cast them so the join key matches stats_df
    lookup = pd.DataFrame(
        id_name_pairs, columns=['TeamID', 'TeamName']
    ).astype({'TeamID': int})

    return stats_df.merge(lookup, how='left', on='TeamID')

def analyze_ncaa_basketball(games_file, teams_file, cutoff_date=None):
    """
    Run the full pipeline: parse games, aggregate per-team stats, add names.

    Parameters:
    - games_file: Path to games data file
    - teams_file: Path to teams data file
    - cutoff_date: Date in YYYYMMDD format passed through to
      generate_team_stats (default None)

    Returns:
    - DataFrame with team stats and names, ordered by TotalWins descending
    """
    stats = generate_team_stats(parse_ncaa_games(games_file), cutoff_date)
    named_stats = merge_team_names(stats, teams_file)

    # Teams with the most wins come first
    return named_stats.sort_values(by='TotalWins', ascending=False)



In [15]:
games_file = "WBB_Data/2025 NCAA Women's Basketball Games.txt" 
teams_file = "WBB_Data/2025 NCAA Women's Basketball Teams.txt" 

# Build per-team stats from every game dated on or before the cutoff (YYYYMMDD).
# NOTE(review): the cutoff used here is 2025-03-18 while the printed label says
# "Selection Sunday" — confirm that this is the intended date.
date_stats = analyze_ncaa_basketball(games_file, teams_file, cutoff_date=20250318)
print("\nStats for games up to Selection Sunday:")
date_stats
# Earlier export call, left disabled:
#date_stats.to_csv("ncaa_team_stats_20241115.csv", index=False)


Stats for games up to Selection Sunday:


Unnamed: 0,TeamID,HomeWins,HomeLosses,AwayWins,AwayLosses,NeutralWins,NeutralLosses,TotalWins,TotalLosses,OpponentHomeWins,OpponentHomeLosses,OpponentAwayWins,OpponentAwayLosses,OpponentNeutralWins,OpponentNeutralLosses,OpponentTotalWins,OpponentTotalLosses,TeamName
203,299,15,0,10,2,6,1,31,3,318,117,177,161,71,55,566,333,Texas
111,294,19,0,7,2,5,1,31,3,283,125,131,183,51,47,465,355,TCU
139,57,13,1,10,2,8,0,31,3,239,88,158,113,54,37,451,238,Connecticut
2,317,12,1,10,1,8,0,30,2,298,122,162,146,62,42,522,310,UCLA
87,271,14,1,8,2,8,0,30,3,331,106,162,143,88,49,581,298,South_Carolina
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,320,0,12,1,14,1,1,2,27,102,111,82,180,18,27,202,318,UNC_Asheville
361,204,1,11,1,14,0,0,2,25,151,100,93,164,22,20,266,284,Niagara
357,8,1,13,0,14,0,2,1,29,151,115,123,157,17,17,291,289,American_Univ
358,13,1,9,0,16,0,0,1,25,128,84,81,190,18,32,227,306,Ark_Pine_Bluff


In [16]:
# Write the team stats table to CSV without the DataFrame index column.
date_stats.to_csv("select_sun_W_stats2025.csv", index=False)

In [6]:
# Per-column flag: True where the column holds at least one missing value
has_na_col = date_stats.isna().any()

# Overall flag: True if any column holds a missing value
has_na = has_na_col.any()

print(f"Has NA values: {has_na}")
print(f"Has NA values per column:\n{has_na_col}")

Has NA values: False
Has NA values per column:
TeamID                   False
HomeWins                 False
HomeLosses               False
AwayWins                 False
AwayLosses               False
NeutralWins              False
NeutralLosses            False
TotalWins                False
TotalLosses              False
OpponentHomeWins         False
OpponentHomeLosses       False
OpponentAwayWins         False
OpponentAwayLosses       False
OpponentNeutralWins      False
OpponentNeutralLosses    False
OpponentTotalWins        False
OpponentTotalLosses      False
TeamName                 False
dtype: bool


### Scraping NET ranking from NCAA

When scraping, use archive.org (the Wayback Machine) to find the archived NET rankings page from around Selection Sunday of each season.

In [17]:
# archive.org snapshot URLs of the NCAA women's basketball NET rankings page,
# one per season (2025, 2024, 2023).
url25 = "https://web.archive.org/web/20250320004606/https://www.ncaa.com/rankings/basketball-women/d1/ncaa-womens-basketball-net-rankings"
url24 = "https://web.archive.org/web/20240319105521/https://www.ncaa.com/rankings/basketball-women/d1/ncaa-womens-basketball-net-rankings"
url23 = "https://web.archive.org/web/20230314220429/https://www.ncaa.com/rankings/basketball-women/d1/ncaa-womens-basketball-net-rankings"

# Read all tables from each webpage (pd.read_html returns a list of DataFrames)
tables25 = pd.read_html(url25)
tables24 = pd.read_html(url24)
tables23 = pd.read_html(url23)

# Take the table at index 1 (the second table returned) as the rankings table.
# NOTE(review): presumably index 0 is a non-rankings table on the archived
# page — confirm if the page layout changes.
rankings_table25 = tables25[1]
rankings_table24 = tables24[1]
rankings_table23 = tables23[1]


# Display the first few rows of each season's table
print(rankings_table25.head())
print(rankings_table24.head())
print(rankings_table23.head())



   Rank  Previous          School Conference Record  Road Neutral  Home  \
0     1         1           UConn   Big East   31-3  10-2     8-0  13-1   
1     2         2  South Carolina        SEC   30-3   8-2     8-0  14-1   
2     3         3           Texas        SEC   31-3  10-2     6-1  15-0   
3     4         4            UCLA    Big Ten   30-2  10-1     8-0  12-1   
4     5         5      Notre Dame        ACC   26-5  11-1     1-3  14-1   

  Quad 1 Quad 2 Quad 3 Quad 4  
0    5-3    7-0   10-0    9-0  
1   16-3    4-0    3-0    7-0  
2   14-3    4-0    4-0    9-0  
3   13-2    5-0    3-0    9-0  
4    6-5    9-0    3-0    8-0  
   Rank  Previous          School Conference Record  Road Neutral  Home  \
0     1         1  South Carolina        SEC   32-0  12-0     5-0  15-0   
1     2         2           UConn   Big East   28-5  10-3     5-1  13-1   
2     3         3           Texas     Big 12   30-4   9-2     6-0  15-2   
3     4         4        Stanford     Pac-12   28-5   9-2

In [18]:
# Save to CSV
# Write each season's rankings table to its own CSV file, without the index column.
rankings_table25.to_csv("ncaa_rankings_W_2025.csv", index=False)
rankings_table24.to_csv("ncaa_rankings_W_2024.csv", index=False)
rankings_table23.to_csv("ncaa_rankings_W_2023.csv", index=False)

### Extra For-Fun Functions

In [None]:
def analyze_team_performance_over_time(games_df, team_id, team_name=None):
    """
    Build a date-ordered game log for one team with running totals.

    Parameters:
    - games_df: DataFrame with game records
    - team_id: ID of the team to analyze
    - team_name: Name of the team (optional; only used in a printed header)

    Returns:
    - DataFrame of the team's games sorted by Date, with added columns
      CumulativeWins, CumulativeLosses, TotalGames, WinPercentage,
      FormattedDate and GameDescription
    """
    all_records = prepare_team_game_records(games_df)
    timeline = all_records[all_records['TeamID'] == team_id].sort_values(by='Date')

    # Running totals after each game, in date order
    running_wins = timeline['Win'].cumsum()
    running_losses = timeline['Loss'].cumsum()
    games_played = running_wins + running_losses

    timeline['CumulativeWins'] = running_wins
    timeline['CumulativeLosses'] = running_losses
    timeline['TotalGames'] = games_played
    timeline['WinPercentage'] = (running_wins / games_played * 100).round(2)

    def _pretty_date(raw_date):
        """Turn a YYYYMMDD value into text such as 'Nov 04, 2024'."""
        return datetime.strptime(str(raw_date), '%Y%m%d').strftime('%b %d, %Y')

    timeline['FormattedDate'] = timeline['Date'].apply(_pretty_date)

    # Use the opponent's name when the records carry one, else fall back to its ID
    opponent_is_named = 'OpponentName' in timeline.columns

    def _describe(game):
        opponent = game['OpponentName'] if opponent_is_named else f"Team {game['OpponentID']}"
        return f"{game['FormattedDate']} vs. {opponent}"

    timeline['GameDescription'] = timeline.apply(_describe, axis=1)

    if team_name:
        print(f"Performance Analysis for {team_name} (ID: {team_id})")

    return timeline

def calculate_advanced_metrics(team_stats_df):
    """
    Derive percentage-based metrics from raw win/loss counts.

    Parameters:
    - team_stats_df: DataFrame with team stats (left unmodified)

    Returns:
    - New DataFrame with the added columns HomeWinPct, AwayWinPct,
      NeutralWinPct, TotalWinPct, OpponentWinPct, StrengthOfSchedule
      (equal to OpponentWinPct) and HomeAdvantage (HomeWinPct minus
      AwayWinPct). Percentages are on a 0-100 scale, rounded to 2 decimals.
    """
    def _win_pct(prefix):
        """Win percentage from the '<prefix>Wins' / '<prefix>Losses' columns."""
        wins = team_stats_df[f'{prefix}Wins']
        losses = team_stats_df[f'{prefix}Losses']
        return (wins / (wins + losses) * 100).round(2)

    home_pct = _win_pct('Home')
    away_pct = _win_pct('Away')
    opponent_pct = _win_pct('OpponentTotal')

    # assign() works on a copy, so the caller's frame is not touched
    return team_stats_df.assign(
        HomeWinPct=home_pct,
        AwayWinPct=away_pct,
        NeutralWinPct=_win_pct('Neutral'),
        TotalWinPct=_win_pct('Total'),
        OpponentWinPct=opponent_pct,
        # Simple strength of schedule: the opponents' pooled win percentage
        StrengthOfSchedule=opponent_pct,
        HomeAdvantage=(home_pct - away_pct).round(2),
    )

def compare_teams(advanced_stats_df, team_ids, metric_columns=None):
    """
    Pull a side-by-side view of selected teams and metrics.

    Parameters:
    - advanced_stats_df: DataFrame with team stats including advanced metrics
    - team_ids: List of team IDs to keep
    - metric_columns: List of metric columns to show (default is a standard set)

    Returns:
    - DataFrame restricted to the requested teams, with TeamID (and TeamName
      when that column exists) followed by the metric columns
    """
    if metric_columns is None:
        metric_columns = [
            'TotalWins', 'TotalLosses', 'TotalWinPct',
            'HomeWinPct', 'AwayWinPct', 'NeutralWinPct',
            'StrengthOfSchedule', 'HomeAdvantage'
        ]

    # Identifier columns lead the output; the name is included only if present
    has_names = 'TeamName' in advanced_stats_df.columns
    id_columns = ['TeamID', 'TeamName'] if has_names else ['TeamID']

    is_selected = advanced_stats_df['TeamID'].isin(team_ids)
    return advanced_stats_df.loc[is_selected, id_columns + metric_columns]

def visualize_win_percentages(advanced_stats_df, top_n=25):
    """
    Draw a grouped bar chart of home, overall and away win percentages
    for the teams with the highest overall win percentage.

    Parameters:
    - advanced_stats_df: DataFrame with team stats including advanced metrics
    - top_n: Number of top teams to include

    Returns:
    - None (displays plot)
    """
    ranked = advanced_stats_df.sort_values(by='TotalWinPct', ascending=False)
    leaders = ranked.head(top_n)

    plt.figure(figsize=(12, 8))

    # Tick labels: team names when available, otherwise "Team <id>"
    if 'TeamName' in leaders.columns:
        tick_labels = leaders['TeamName']
    else:
        tick_labels = [f"Team {team_id}" for team_id in leaders['TeamID']]

    positions = np.arange(len(tick_labels))
    bar_width = 0.2

    # (horizontal shift, data column, legend label) for each bar in a group
    bar_series = (
        (-bar_width, 'HomeWinPct', 'Home Win %'),
        (0, 'TotalWinPct', 'Overall Win %'),
        (bar_width, 'AwayWinPct', 'Away Win %'),
    )
    for shift, column, legend_label in bar_series:
        plt.bar(positions + shift, leaders[column], bar_width, label=legend_label)

    plt.xlabel('Teams')
    plt.ylabel('Win Percentage')
    plt.title(f'Win Percentages for Top {top_n} NCAA Basketball Teams')
    plt.xticks(positions, tick_labels, rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()

    plt.show()

def visualize_strength_of_schedule(advanced_stats_df, highlight_teams=None):
    """
    Scatter every team's win percentage against its strength of schedule,
    optionally marking and labelling selected teams.

    Parameters:
    - advanced_stats_df: DataFrame with team stats including advanced metrics
    - highlight_teams: List of team IDs to highlight

    Returns:
    - None (displays plot)
    """
    sos_column = 'StrengthOfSchedule'
    pct_column = 'TotalWinPct'

    plt.figure(figsize=(12, 8))

    # Base layer: all teams
    plt.scatter(advanced_stats_df[sos_column], advanced_stats_df[pct_column], alpha=0.7)

    if highlight_teams:
        is_highlighted = advanced_stats_df['TeamID'].isin(highlight_teams)
        spotlight = advanced_stats_df[is_highlighted]

        # Overlay the selected teams as larger red markers
        plt.scatter(
            spotlight[sos_column],
            spotlight[pct_column],
            color='red',
            s=100,
            alpha=0.8
        )

        # Label each highlighted point with the team name, or "Team <id>" without one
        for _, team in spotlight.iterrows():
            plt.annotate(
                team.get('TeamName', f"Team {team['TeamID']}"),
                (team[sos_column], team[pct_column]),
                xytext=(5, 5),
                textcoords='offset points'
            )

    plt.xlabel('Strength of Schedule (Opponent Win %)')
    plt.ylabel('Team Win Percentage')
    plt.title('NCAA Basketball: Win Percentage vs. Strength of Schedule')
    plt.grid(True, alpha=0.3)

    plt.show()

def analyze_team_performance_by_date_range(games_df, start_date, end_date):
    """
    Compute team stats and advanced metrics from games inside a date window.

    Parameters:
    - games_df: DataFrame with game records
    - start_date: Start date in YYYYMMDD format (inclusive)
    - end_date: End date in YYYYMMDD format (inclusive)

    Returns:
    - DataFrame with team stats and advanced metrics for the window
    """
    # between() includes both endpoints by default
    in_window = games_df['Date'].between(start_date, end_date)
    window_stats = generate_team_stats(games_df[in_window])
    return calculate_advanced_metrics(window_stats)

def identify_hot_and_cold_streaks(games_df, min_streak_length=3):
    """
    Identify teams with notable win or loss streaks.

    Each team's games are sorted by Date and split into runs of consecutive
    wins or consecutive losses. Runs with at least min_streak_length games
    are reported.

    Parameters:
    - games_df: DataFrame with game records
    - min_streak_length: Minimum consecutive wins/losses to count as a streak

    Returns:
    - DataFrame with columns TeamID, StreakType ('Win' or 'Loss'),
      StreakLength, StartDate and EndDate, sorted by StreakLength descending
      and then TeamID ascending. When no run qualifies the frame is empty
      but still has these columns.
    """
    from itertools import groupby

    columns = ['TeamID', 'StreakType', 'StreakLength', 'StartDate', 'EndDate']

    team_records = prepare_team_game_records(games_df)

    streak_rows = []
    for team_id in team_records['TeamID'].unique():
        team_games = team_records[team_records['TeamID'] == team_id].sort_values(by='Date')

        # (date, won?) pairs in chronological order
        outcomes = zip(team_games['Date'].tolist(), (team_games['Win'] == 1).tolist())

        # groupby() yields one group per run of identical consecutive outcomes
        for won, run in groupby(outcomes, key=lambda outcome: outcome[1]):
            run_dates = [date for date, _ in run]
            if len(run_dates) >= min_streak_length:
                streak_rows.append({
                    'TeamID': team_id,
                    'StreakType': 'Win' if won else 'Loss',
                    'StreakLength': len(run_dates),
                    'StartDate': run_dates[0],
                    'EndDate': run_dates[-1]
                })

    # Explicit columns keep the sort below valid even when no streak was found
    streaks_df = pd.DataFrame(streak_rows, columns=columns)

    return streaks_df.sort_values(by=['StreakLength', 'TeamID'], ascending=[False, True])

def predict_game_outcome(team1_id, team2_id, location, advanced_stats_df):
    """
    A simple model to predict game outcomes based on team stats.

    Each team starts from its win percentage for the venue (home, away or
    neutral) and is scaled by its OWN strength of schedule divided by 50, so
    a record earned against tougher opponents counts for more. The two
    ratings are then normalised into win probabilities.

    Parameters:
    - team1_id: ID of first team
    - team2_id: ID of second team
    - location: 1 (team1 home), -1 (team2 home), 0 (neutral)
    - advanced_stats_df: DataFrame with team stats including advanced metrics

    Returns:
    - Dictionary with keys 'Team1' and 'Team2' (each a dict with 'ID', 'Name'
      and 'WinProbability' in percent, rounded to 2 decimals) and
      'PredictedWinner' (team 2's name when the probabilities are equal)

    Raises:
    - IndexError: if either team ID is not present in advanced_stats_df
    """
    def _team_row(team_id):
        """Return the first stats row for team_id, or raise a clear IndexError."""
        matches = advanced_stats_df[advanced_stats_df['TeamID'] == team_id]
        if matches.empty:
            raise IndexError(f"TeamID {team_id} not found in advanced_stats_df")
        return matches.iloc[0]

    def _base_pct(stats, column):
        """Venue win %, falling back to the overall win % when it is missing."""
        value = stats[column]
        # A team with no games at this kind of venue has a NaN percentage
        if pd.isna(value):
            value = stats.get('TotalWinPct', value)
        return value

    def _display_name(stats, team_id):
        """Team name when present and not missing, else 'Team <id>'."""
        name = stats.get('TeamName')
        if name is None or pd.isna(name):
            return f"Team {team_id}"
        return name

    team1_stats = _team_row(team1_id)
    team2_stats = _team_row(team2_id)

    # Pick the win-percentage column for each team based on the venue
    if location == 1:  # Team 1 at home
        team1_column, team2_column = 'HomeWinPct', 'AwayWinPct'
    elif location == -1:  # Team 2 at home
        team1_column, team2_column = 'AwayWinPct', 'HomeWinPct'
    else:  # Neutral location
        team1_column, team2_column = 'NeutralWinPct', 'NeutralWinPct'

    team1_base = _base_pct(team1_stats, team1_column)
    team2_base = _base_pct(team2_stats, team2_column)

    # Scale each team's record by the strength of the schedule it was earned against
    team1_rating = team1_base * (team1_stats['StrengthOfSchedule'] / 50)
    team2_rating = team2_base * (team2_stats['StrengthOfSchedule'] / 50)

    combined = team1_rating + team2_rating
    if pd.isna(combined) or combined == 0:
        # No usable information separates the teams: call it even
        team1_win_prob = 50.0
    else:
        team1_win_prob = float(team1_rating / combined * 100)
    team2_win_prob = 100 - team1_win_prob

    team1_name = _display_name(team1_stats, team1_id)
    team2_name = _display_name(team2_stats, team2_id)

    return {
        'Team1': {
            'ID': team1_id,
            'Name': team1_name,
            'WinProbability': round(team1_win_prob, 2)
        },
        'Team2': {
            'ID': team2_id,
            'Name': team2_name,
            'WinProbability': round(team2_win_prob, 2)
        },
        'PredictedWinner': team1_name if team1_win_prob > team2_win_prob else team2_name
    }

In [None]:
# Demo driver for the helper functions above.
# NOTE: this rebinds games_file / teams_file, which the earlier stats cell also sets.
games_file = "Scraping/2023games.txt"
teams_file = "Scraping/2023teams.txt"

# Parse games data
games_df = parse_ncaa_games(games_file)

# Generate team stats for all data (no cutoff date)
team_stats_df = generate_team_stats(games_df)

# Merge with team names
result_df = merge_team_names(team_stats_df, teams_file)

# Calculate advanced metrics
advanced_stats_df = calculate_advanced_metrics(result_df)

# Example: Analyze performance over time for a specific team
team_id = 115  # Replace with actual team ID
team_performance = analyze_team_performance_over_time(games_df, team_id)

# Example: Compare top teams (the five highest overall win percentages)
top_team_ids = advanced_stats_df.sort_values(by='TotalWinPct', ascending=False).head(5)['TeamID'].tolist()
team_comparison = compare_teams(advanced_stats_df, top_team_ids)

# Example: Visualize win percentages
visualize_win_percentages(advanced_stats_df, top_n=15)

# Example: Find notable streaks (runs of at least 5 consecutive wins or losses)
streaks_df = identify_hot_and_cold_streaks(games_df, min_streak_length=5)

# Example: Predict outcome of a hypothetical game
# (location=1 means the first team listed plays at home)
prediction = predict_game_outcome(top_team_ids[0], top_team_ids[1], 1, advanced_stats_df)
print(f"Predicted winner: {prediction['PredictedWinner']}")