# March Madness Predictions using Random Forest

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict, deque
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import plot_tree
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Load and Explore the Dataset

In [None]:
# Load the KenPom summary table from the local data archive.
# merge_team_data() filters this frame on 'Season' and uses it as the left-hand
# side of the joins on ['Season', 'TeamName'].
kenpom_summary = pd.read_csv("./data/archive/INT _ KenPom _ Summary.csv")
# Preview the first rows as the cell output.
kenpom_summary.head()

In [None]:
# Load the KenPom efficiency table; merge_team_data() joins it to the summary
# table on ['Season', 'TeamName'].
kenpom_efficiency = pd.read_csv("./data/archive/INT _ KenPom _ Efficiency.csv")
# Preview the first rows as the cell output.
kenpom_efficiency.head()

In [None]:
# Load the KenPom defense table; merge_team_data() joins it on
# ['Season', 'TeamName'].
kenpom_defense = pd.read_csv("./data/archive/INT _ KenPom _ Defense.csv")
# Preview the first rows as the cell output.
kenpom_defense.head()

In [None]:
# Load the KenPom offense table; merge_team_data() joins it on
# ['Season', 'TeamName'].
kenpom_offense = pd.read_csv("./data/archive/INT _ KenPom _ Offense.csv")
# Preview the first rows as the cell output.
kenpom_offense.head()

In [None]:
# Load the KenPom height table; merge_team_data() joins it on
# ['Season', 'TeamName'].
kenpom_height = pd.read_csv("./data/archive/INT _ KenPom _ Height.csv")
# Preview the first rows as the cell output.
kenpom_height.head()

In [None]:
# Load the March Madness games table.
# NOTE(review): this frame is not referenced again in the visible notebook; the
# same CSV is read a second time into `tournament_matchups` further down.
march_madness = pd.read_csv("./data/archive/DEV _ March Madness.csv")
# Preview the first rows as the cell output.
march_madness.head()

In [None]:
# Load the post-season tournament teams reference table.
# create_training_data() filters this frame by its 'Season' column.
tournament_teams = pd.read_csv("./data/archive/REF _ Post-Season Tournament Teams.csv")
# Preview the first rows as the cell output.
tournament_teams.head()

## Data Preprocessing

In [None]:
# Merge datasets to create a comprehensive team profile dataset
def merge_team_data(year):
    """Build one wide per-team table for a single season.

    Each of the five module-level KenPom frames (summary, efficiency,
    defense, offense, height) is restricted to rows whose ``Season`` equals
    ``year``; the slices are then inner-joined, in that order, on
    ``['Season', 'TeamName']``.

    Args:
        year: Season value to select.

    Returns:
        The merged DataFrame. Only teams present in all five tables for
        that season survive the inner joins.
    """
    join_keys = ['Season', 'TeamName']
    source_tables = (
        kenpom_summary,
        kenpom_efficiency,
        kenpom_defense,
        kenpom_offense,
        kenpom_height,
    )

    # Slice every table down to the requested season before joining.
    season_slices = [table[table['Season'] == year] for table in source_tables]

    # Fold the remaining slices onto the summary slice, left to right.
    merged = season_slices[0]
    for extra in season_slices[1:]:
        merged = merged.merge(extra, on=join_keys, how='inner')

    return merged

# Build the merged team profile table for the 2023 season (change the year to
# analyse a different season). Later cells pass this frame to the matchup and
# bracket helpers.
team_data_2023 = merge_team_data(2023)
# Preview the first rows as the cell output.
team_data_2023.head()

In [None]:
# Get a list of all team matchups from tournament data.
# NOTE(review): this reads the same CSV that is loaded into `march_madness`
# earlier in the notebook, so the file is parsed twice.
tournament_matchups = pd.read_csv("./data/archive/DEV _ March Madness.csv")
# Preview the first rows as the cell output.
tournament_matchups.head()

## Feature Engineering

In [None]:
# Create a dataset with matchups and feature differences for each game
def create_matchup_features(team1, team2, team_data):
    """Compute team1-minus-team2 stat differentials for one matchup.

    Args:
        team1: Name looked up in ``team_data['TeamName']``.
        team2: Name looked up in ``team_data['TeamName']``.
        team_data: DataFrame with a ``TeamName`` column and stat columns.

    Returns:
        ``None`` if either team has no row in ``team_data``. Otherwise a
        dict with one ``"<stat>_DIFF"`` float per candidate stat that exists
        as a column (stats missing from the frame are silently skipped),
        followed by ``'TEAM1'`` and ``'TEAM2'`` holding the given names.
        When a team matches several rows, the first one is used.
    """
    rows_first = team_data[team_data['TeamName'] == team1]
    rows_second = team_data[team_data['TeamName'] == team2]

    # Without a profile for both sides there is nothing to compare.
    if rows_first.empty or rows_second.empty:
        return None

    candidate_stats = (
        'AdjEM', 'AdjOE', 'AdjDE', 'Tempo', 'eFGPct', 'TOPct', 'ORPct', 'FTRate',
        'BlockPct', 'StlRate', 'Experience', 'AvgHeight', 'Bench',
    )

    # One differential per stat that is actually available in the frame.
    diff_features = {
        f"{stat}_DIFF": float(rows_first[stat].values[0]) - float(rows_second[stat].values[0])
        for stat in candidate_stats
        if stat in rows_first.columns and stat in rows_second.columns
    }

    # Keep the team names alongside the numeric features.
    diff_features.update(TEAM1=team1, TEAM2=team2)
    return diff_features

# Example: feature differentials for a single matchup using the 2023 team table.
# The result is None if either name is absent from team_data_2023['TeamName'].
sample_matchup = create_matchup_features('Duke', 'North Carolina', team_data_2023)
# Show the resulting dict (or None) as the cell output.
sample_matchup

In [None]:
# Load tournament results to create a labeled dataset
def create_training_data(years):
    """Placeholder for building a labeled matchup dataset from real results.

    For every season in ``years`` the merged team table is built and the
    tournament-teams table is filtered to that season, but no matchup rows
    are produced yet: the inner loop is a stub.

    Args:
        years: Iterable of season values.

    Returns:
        A DataFrame built from the collected matchup rows, which is
        currently always empty.
    """
    all_matchups = []

    for season in years:
        # Team profiles for this season (not consumed yet by the stub below).
        season_profiles = merge_team_data(season)

        # Tournament entries for this season.
        season_entries = tournament_teams[tournament_teams['Season'] == season]

        # TODO: turn each real game into a feature row with its result once
        # the matchup/result structure of the dataset is wired in.
        for _label, _entry in season_entries.iterrows():
            continue

    return pd.DataFrame(all_matchups)

# Training data would be created based on historical tournament matchups
# For now, we'll generate synthetic data for demonstration
def generate_synthetic_training_data(n_samples=1000, random_state=None):
    """Generate synthetic matchup differentials with a win/loss label.

    Every feature is drawn independently from a zero-mean normal
    distribution. The label depends only on ``AdjEM_DIFF``: team 1 wins with
    probability ``sigmoid(0.1 * AdjEM_DIFF)``.

    Labels are encoded as ``1`` (team 1 wins) and ``0`` (team 1 loses).
    Recent ``XGBClassifier`` releases reject class labels that are not
    ``0..n_classes-1``, so a ``-1`` loss label makes ``fit`` fail; the code
    in this notebook only ever tests ``== 1``, so ``0`` is a safe loss value.

    Args:
        n_samples: Number of rows to generate; ``0`` yields an empty frame
            that still has all columns.
        random_state: Optional seed for reproducible output. When ``None``
            the global ``np.random`` state is used, so ``np.random.seed``
            continues to be honoured.

    Returns:
        DataFrame with thirteen ``*_DIFF`` float columns followed by an
        integer ``RESULT`` column.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Standard deviation of each synthetic differential.
    feature_scales = {
        'AdjEM_DIFF': 10,
        'AdjOE_DIFF': 5,
        'AdjDE_DIFF': 5,
        'Tempo_DIFF': 3,
        'eFGPct_DIFF': 0.05,
        'TOPct_DIFF': 0.02,
        'ORPct_DIFF': 0.03,
        'FTRate_DIFF': 0.04,
        'BlockPct_DIFF': 0.02,
        'StlRate_DIFF': 0.01,
        'Experience_DIFF': 0.5,
        'AvgHeight_DIFF': 1,
        'Bench_DIFF': 5,
    }

    # Draw whole columns at once instead of building one dict per row.
    synthetic = pd.DataFrame({
        name: rng.normal(0, scale, n_samples)
        for name, scale in feature_scales.items()
    })

    # Team 1 wins more often the larger its efficiency-margin advantage is.
    prob_team1_wins = 1 / (1 + np.exp(-0.1 * synthetic['AdjEM_DIFF'].to_numpy()))
    synthetic['RESULT'] = (rng.random_sample(n_samples) < prob_team1_wins).astype(int)

    return synthetic

# Generate 2000 synthetic matchups; the following cells split this frame into
# features (every column except 'RESULT') and the 'RESULT' label.
training_data = generate_synthetic_training_data(2000)
# Preview the first rows as the cell output.
training_data.head()

In [None]:
# Split data into training and testing sets.
# X holds every column except the label; predict_game() later relies on
# X.columns for the feature order, so X must stay defined at module level.
X = training_data.drop('RESULT', axis=1)
y = training_data['RESULT']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train Models

In [None]:
# Custom Random Forest using our own implementation (project-local package).
from RandomForest.RandomForest import RandomForest

# Create numpy arrays from training data.
# The label is appended as the last column, so train_data_np is
# [features..., RESULT] per row.
X_train_np = X_train.values
y_train_np = y_train.values.reshape(-1, 1)
train_data_np = np.hstack((X_train_np, y_train_np))

# Initialize and train our custom Random Forest.
# n_features passed to the forest is 70% of the feature count, truncated to an int.
# NOTE(review): the meaning of n_features / tree_params is defined by the
# project's RandomForest class, which is not visible here — confirm there.
n_features = X_train.shape[1]
my_rf = RandomForest(n_features=int(n_features * 0.7), n_estimators=100, tree_params=dict(max_depth=10, min_samples_split=5))
my_rf.build_forest(train_data_np)

In [None]:
# Scikit-learn Random Forest: 500 trees, depth capped at 10, seeded for
# repeatable results.
sklearn_rf = RandomForestClassifier(n_estimators=500, max_depth=10, max_features="sqrt", random_state=42)
sklearn_rf.fit(X_train, y_train)

# XGBoost
# NOTE(review): recent xgboost releases require class labels to be
# 0..n_classes-1 — confirm the encoding of y_train before fitting.
xgb_model = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=42)
xgb_model.fit(X_train, y_train)

# Neural Network
# Normalize the data: the scaler is fitted on the training split only and then
# reused to transform the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and train the neural network: three ReLU hidden layers (64/32/16)
# and a single sigmoid output unit.
nn_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# The target is 1 where y_train == 1 and 0 otherwise, matching the sigmoid
# output and binary cross-entropy loss.
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train_scaled, (y_train == 1).astype(int), epochs=30, batch_size=32, validation_split=0.2, verbose=0)

## Evaluate Models

In [None]:
# Prepare test data for custom RF: same [features..., RESULT] layout that was
# used for training.
X_test_np = X_test.values
y_test_np = y_test.values.reshape(-1, 1)
test_data_np = np.hstack((X_test_np, y_test_np))

# Make predictions with custom RF
my_rf_acc = my_rf.calculate_accuracy(test_data_np)
print(f"Custom Random Forest Accuracy: {my_rf_acc:.4f}")

# Make predictions with sklearn RF
sklearn_rf_preds = sklearn_rf.predict(X_test)
sklearn_rf_acc = metrics.accuracy_score(y_test, sklearn_rf_preds)
print(f"Scikit-learn Random Forest Accuracy: {sklearn_rf_acc:.4f}")

# Make predictions with XGBoost
xgb_preds = xgb_model.predict(X_test)
xgb_acc = metrics.accuracy_score(y_test, xgb_preds)
print(f"XGBoost Accuracy: {xgb_acc:.4f}")

# Make predictions with Neural Network: the sigmoid output is thresholded at
# 0.5 and compared against the same 0/1 target used for training
# (1 where y_test == 1).
nn_preds = (nn_model.predict(X_test_scaled) > 0.5).astype(int)
nn_acc = metrics.accuracy_score((y_test == 1).astype(int), nn_preds)
print(f"Neural Network Accuracy: {nn_acc:.4f}")

In [None]:
# Feature importance of the scikit-learn forest, labelled with the training
# column names and plotted from most to least important.
importances = pd.Series(sklearn_rf.feature_importances_, index=X.columns)
plt.figure(figsize=(10, 6))
importances.sort_values(ascending=False).plot(kind='bar')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

## Generate March Madness Bracket

In [None]:
# Function to simulate a game between two teams
def predict_game(team1, team2, model, team_data, scaler=None):
    """Predict the winner of a single game between two teams.

    Args:
        team1: Name of the first team (features are team1 minus team2).
        team2: Name of the second team.
        model: Fitted model. Without ``scaler`` it must offer ``predict``
            (and optionally ``predict_proba``); with ``scaler`` its
            ``predict`` must return a 2-D array whose ``[0, 0]`` entry is
            the probability that ``team1`` wins.
        team_data: Team profile DataFrame passed to
            ``create_matchup_features``.
        scaler: Optional fitted scaler. When given, features are transformed
            with it before being passed to ``model``.

    Returns:
        ``(winner, prob)`` where ``prob`` is the probability attached to the
        returned winner, or ``(None, 0.5)`` when either team has no row in
        ``team_data``. Models without ``predict_proba`` report ``0.5``.

    Raises:
        KeyError: If ``team_data`` lacks columns needed to build every
            feature the models were trained on.
    """
    game_features = create_matchup_features(team1, team2, team_data)

    if game_features is None:
        return None, 0.5

    # The module-level training frame X defines the feature order the models
    # were fitted on. create_matchup_features silently skips stats that are
    # absent from team_data, so report those explicitly.
    missing = [col for col in X.columns if col not in game_features]
    if missing:
        raise KeyError(f"team_data cannot provide the trained features: {missing}")

    feature_values = [game_features[col] for col in X.columns]
    features_df = pd.DataFrame([feature_values], columns=X.columns)

    if scaler is not None:
        # Use the model that was passed in rather than a module-level network.
        features_scaled = scaler.transform(features_df)
        prob = float(model.predict(features_scaled)[0, 0])
        prediction = 1 if prob > 0.5 else -1
    else:
        prediction = model.predict(features_df)[0]
        if hasattr(model, 'predict_proba'):
            # Column 1 holds the probability of the second class, i.e. label 1.
            prob = float(model.predict_proba(features_df)[0, 1])
        else:
            prob = 0.5

    # prob is P(team1 wins); flip it when team2 is the predicted winner.
    if prediction == 1:
        return team1, prob
    return team2, 1 - prob

In [None]:
# Example 2023 March Madness bracket (first round).
# The team table is rebuilt here so this cell does not depend on the earlier one.
team_data_2023 = merge_team_data(2023)
# Each entry is a (team1, team2) first-round pairing.
bracket_2023 = [
    # East region (just a few examples)
    ('Purdue', 'Fairleigh Dickinson'),
    ('Memphis', 'Florida Atlantic'),
    ('Duke', 'Oral Roberts'),
    ('Tennessee', 'Louisiana'),
    
    # Midwest region (just a few examples)
    ('Houston', 'Northern Kentucky'),
    ('Iowa', 'Auburn'), 
    ('Texas A&M', 'Penn State'),
    ('Texas', 'Colgate'),
    
    # Add more matchups as needed
]

# Simulate the first round with the XGBoost model (no scaler).
# predict_game returns (None, 0.5) when a team name is not found in
# team_data_2023, so a printed winner of "None" indicates a name mismatch.
first_round_results = []
for team1, team2 in bracket_2023:
    winner, prob = predict_game(team1, team2, xgb_model, team_data_2023)
    first_round_results.append((winner, prob))
    print(f"{team1} vs {team2} -> {winner} wins with {prob:.2f} probability")

In [None]:
# Function to simulate entire tournament
def simulate_tournament(teams, model, team_data, scaler=None):
    # Start with first round matchups
    current_round = []
    for i in range(0, len(teams), 2):
        current_round.append((teams[i], teams[i+1]))
    
    # Keep track of the bracket
    bracket = [current_round]
    
    # Simulate each round
    while len(current_round) > 0:
        next_round = []
        for team1, team2 in current_round:
            winner, prob = predict_game(team1, team2, model, team_data, scaler)
            next_round.append(winner)
        
        # Match teams for next round
        paired_next_round = []
        for i in range(0, len(next_round), 2):
            if i+1 < len(next_round):
                paired_next_round.append((next_round[i], next_round[i+1]))
        
        current_round = paired_next_round
        if len(current_round) > 0:
            bracket.append(current_round)
    
    return bracket

# Example simulation with a small subset of teams.
# Consecutive names form the first-round pairings.
example_teams = [
    'Gonzaga', 'Grand Canyon',
    'Kansas', 'Howard',
    'UCLA', 'UNC Asheville',
    'Connecticut', 'Iona'
]

example_bracket = simulate_tournament(example_teams, xgb_model, team_data_2023)

# Print bracket results: tuple entries are games, any non-tuple entry is
# printed as the champion.
for i, round_games in enumerate(example_bracket):
    print(f"\nRound {i+1}:")
    for matchup in round_games:
        if isinstance(matchup, tuple):
            print(f"{matchup[0]} vs {matchup[1]}")
        else:
            print(f"Champion: {matchup}")

## Advanced Team Analysis

In [None]:
# Calculate ELO-style ratings for teams
def calculate_team_elos(matchups_data, init_elo=1500, k=32):
    """Replay games in row order and return ELO-style ratings per team.

    Args:
        matchups_data: DataFrame with ``TEAM1``, ``TEAM2`` and ``RESULT``
            columns. ``RESULT == 1`` means TEAM1 won; any other value is
            treated as a TEAM2 win.
        init_elo: Rating given to a team the first time it appears.
        k: Update factor applied to the gap between actual and expected score.

    Returns:
        Plain dict mapping team name to its rating after the last game,
        keyed in order of first appearance.
    """
    ratings = {}

    for _, game in matchups_data.iterrows():
        first = game['TEAM1']
        second = game['TEAM2']
        outcome = game['RESULT']

        # Unseen teams enter at the initial rating.
        rating_first = ratings.setdefault(first, init_elo)
        rating_second = ratings.setdefault(second, init_elo)

        # Logistic expectation on the usual 400-point scale.
        expected_first = 1 / (1 + 10 ** ((rating_second - rating_first) / 400))
        expected_second = 1 - expected_first

        # Actual scores: winner gets 1, loser gets 0.
        score_first, score_second = (1, 0) if outcome == 1 else (0, 1)

        ratings[first] += k * (score_first - expected_first)
        ratings[second] += k * (score_second - expected_second)

    return ratings

# We'd need real historical matchup data for this.
# For demonstration, we'll create a small synthetic dataset; rows are replayed
# in order by calculate_team_elos.
synthetic_matchups = pd.DataFrame({
    'TEAM1': ['Duke', 'North Carolina', 'Kansas', 'Duke', 'Gonzaga'],
    'TEAM2': ['Kentucky', 'Duke', 'North Carolina', 'Gonzaga', 'Kentucky'],
    'RESULT': [1, -1, 1, 1, -1]  # 1 means TEAM1 won, -1 means TEAM2 won
})

# Calculate ELO ratings and list the teams from highest to lowest rating.
team_elos = calculate_team_elos(synthetic_matchups)
print("Team ELO Ratings:")
for team, elo in sorted(team_elos.items(), key=lambda x: x[1], reverse=True):
    print(f"{team}: {elo:.1f}")

In [None]:
# Calculate upset probabilities
def calculate_upset_probability(team1_seed, team2_seed, team1, team2, model, team_data, scaler=None):
    """Return the predicted probability that the worse-seeded team wins.

    The team with the numerically larger seed is the underdog. Matchups
    whose seeds differ by less than 2 are not treated as potential upsets
    and yield ``0.0`` without calling the model.

    Args:
        team1_seed: Seed number of ``team1``.
        team2_seed: Seed number of ``team2``.
        team1: Name of the first team.
        team2: Name of the second team.
        model: Forwarded to ``predict_game``.
        team_data: Forwarded to ``predict_game``.
        scaler: Forwarded to ``predict_game``.

    Returns:
        Probability of the underdog winning, or ``0.0`` for near-equal seeds.
    """
    underdog_seed = max(team1_seed, team2_seed)
    favorite_seed = min(team1_seed, team2_seed)
    underdog = team1 if team1_seed == underdog_seed else team2
    favorite = team1 if team1_seed == favorite_seed else team2

    # Seeds this close are not counted as upsets.
    if underdog_seed - favorite_seed < 2:
        return 0.0

    # The favorite is passed as the first team of the predicted game.
    winner, prob = predict_game(favorite, underdog, model, team_data, scaler)

    # prob belongs to the predicted winner; convert it to the underdog's chance.
    if winner == favorite:
        return 1 - prob
    return prob

# Example upset calculations.
# Each entry is (team1_seed, team2_seed, team1, team2); team1 is the better
# (numerically smaller) seed in every example.
upset_examples = [
    (1, 16, 'Purdue', 'Fairleigh Dickinson'),
    (4, 13, 'Virginia', 'Furman'),
    (5, 12, 'San Diego State', 'Charleston'),
    (2, 15, 'Arizona', 'Princeton')
]

# Report the chance of team2 (the worse seed here) beating team1 according to
# the XGBoost model.
for team1_seed, team2_seed, team1, team2 in upset_examples:
    upset_prob = calculate_upset_probability(team1_seed, team2_seed, team1, team2, xgb_model, team_data_2023)
    print(f"Upset probability for #{team2_seed} {team2} over #{team1_seed} {team1}: {upset_prob:.2f}")

## Visualize Tournament Bracket

In [None]:
# Simple bracket visualization function
def plot_bracket(bracket):
    """Draw a text-only rendering of a simulated bracket.

    Rounds are laid out left to right (two x-units apart) and the entries of
    each round are spread evenly over a vertical span of 8 units.

    Args:
        bracket: List of rounds. A round is a list whose items are either
            ``(team1, team2)`` tuples, drawn as "team1 vs team2", or any
            other value, drawn in bold as the champion.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    rounds = len(bracket)
    fig, ax = plt.subplots(figsize=(12, 8))

    # No frame lines or ticks.
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.set_xticks([])
    ax.set_yticks([])

    # Text artists do not take part in autoscaling, so the axes would keep
    # their default 0-1 range and the labels would land outside the plot
    # area. Set limits that cover every label position used below.
    ax.set_xlim(-0.25, 2 * max(rounds, 1))
    ax.set_ylim(0, 8)

    # Plot each round.
    for r, round_games in enumerate(bracket):
        x = r * 2
        # Vertical spacing depends only on the round, so compute it once.
        spacing = 8 / (len(round_games) + 1)
        for i, matchup in enumerate(round_games):
            y = i * spacing + spacing

            if isinstance(matchup, tuple):
                team1, team2 = matchup
                ax.text(x, y, f"{team1} vs {team2}", fontsize=10)
            else:
                ax.text(x, y, f"Champion: {matchup}", fontsize=12, weight='bold')

    ax.set_title("Tournament Bracket Simulation", fontsize=14)
    plt.tight_layout()
    plt.show()

# Plot the example bracket produced by simulate_tournament above.
plot_bracket(example_bracket)

## Make Final Predictions

In [None]:
# Function to create a full March Madness bracket
def create_march_madness_bracket(year):
    """Return a hard-coded partial bracket, grouped by region.

    This is a demonstration stand-in for real seedings: ``year`` is accepted
    but not used, and only eight teams per region are listed.

    Args:
        year: Ignored.

    Returns:
        Dict mapping region name ('East', 'West', 'South', 'Midwest') to a
        list of ``(seed, team_name)`` tuples in bracket order, so consecutive
        entries are first-round opponents.
    """
    # Every region lists the same seed slots in the same bracket order.
    seed_slots = (1, 16, 8, 9, 5, 12, 4, 13)

    names_by_region = {
        'East': (
            'Purdue', 'Fairleigh Dickinson',
            'Memphis', 'Florida Atlantic',
            'Duke', 'Oral Roberts',
            'Tennessee', 'Louisiana',
        ),
        'West': (
            'Kansas', 'Howard',
            'Arkansas', 'Illinois',
            "Saint Mary's", 'VCU',
            'Connecticut', 'Iona',
        ),
        'South': (
            'Alabama', 'Texas A&M CC',
            'Maryland', 'West Virginia',
            'San Diego State', 'Charleston',
            'Virginia', 'Furman',
        ),
        'Midwest': (
            'Houston', 'Northern Kentucky',
            'Iowa', 'Auburn',
            'Miami', 'Drake',
            'Indiana', 'Kent State',
        ),
    }

    # Attach the seed to each team name, region by region.
    return {
        region: list(zip(seed_slots, names))
        for region, names in names_by_region.items()
    }

# Function to simulate the entire tournament
def simulate_full_tournament(regions, model, team_data, scaler=None):
    """Simulate every region, then the Final Four and the championship.

    Args:
        regions: Dict mapping region name to a list of ``(seed, team_name)``
            tuples in bracket order.
        model: Forwarded to ``simulate_tournament`` / ``predict_game``.
        team_data: Forwarded to ``simulate_tournament`` / ``predict_game``.
        scaler: Forwarded to ``simulate_tournament`` / ``predict_game``.

    Returns:
        Dict with:
            ``region_results``: region name -> bracket from
                ``simulate_tournament``.
            ``final_four``: list of ``(region_name, champion)`` pairs.
            ``semifinal1`` / ``semifinal2``: ``(team_a, team_b, winner)`` for
                East-vs-West and South-vs-Midwest, or ``None`` when that
                pairing could not be formed.
            ``champion``: overall winner, or ``None``.
        The national rounds are only played when exactly four regional
        champions were found.
    """
    region_results = {}
    final_four = []

    # Simulate each region.
    for region_name, teams in regions.items():
        # Extract team names in bracket order (seeds are not used here).
        team_names = [team[1] for team in teams]

        region_bracket = simulate_tournament(team_names, model, team_data, scaler)
        region_results[region_name] = region_bracket

        last_round = region_bracket[-1]
        if len(last_round) != 1:
            continue

        closing_entry = last_round[0]
        if isinstance(closing_entry, tuple):
            # The bracket ends on the regional final itself rather than on a
            # champion entry, so decide that game here. Without this branch
            # no regional champion was ever found for such brackets and the
            # Final Four stayed empty.
            regional_champion, _ = predict_game(
                closing_entry[0], closing_entry[1], model, team_data, scaler
            )
        else:
            regional_champion = closing_entry
        final_four.append((region_name, regional_champion))

    # Explicit defaults replace the former "'name' in locals()" checks.
    semi1_teams, semi2_teams = [], []
    semi1_winner = semi2_winner = champion = None

    if len(final_four) == 4:
        # Semifinal 1: East vs West
        semi1_teams = [team for region, team in final_four if region in ('East', 'West')]
        if len(semi1_teams) == 2:
            semi1_winner, _ = predict_game(semi1_teams[0], semi1_teams[1], model, team_data, scaler)

        # Semifinal 2: South vs Midwest
        semi2_teams = [team for region, team in final_four if region in ('South', 'Midwest')]
        if len(semi2_teams) == 2:
            semi2_winner, _ = predict_game(semi2_teams[0], semi2_teams[1], model, team_data, scaler)

        # Championship is only played when both semifinals produced a winner.
        if semi1_winner and semi2_winner:
            champion, _ = predict_game(semi1_winner, semi2_winner, model, team_data, scaler)

    return {
        'region_results': region_results,
        'final_four': final_four,
        'semifinal1': (semi1_teams[0], semi1_teams[1], semi1_winner) if len(semi1_teams) == 2 else None,
        'semifinal2': (semi2_teams[0], semi2_teams[1], semi2_winner) if len(semi2_teams) == 2 else None,
        'champion': champion
    }

# Simulate the tournament with the demonstration bracket and the XGBoost model.
regions_2023 = create_march_madness_bracket(2023)
tournament_result = simulate_full_tournament(regions_2023, xgb_model, team_data_2023)

# Print the results: one line per regional champion.
print("Final Four Teams:")
for region, team in tournament_result['final_four']:
    print(f"{region}: {team}")

# The championship pairing is the third element (winner) of each semifinal
# tuple; nothing is printed if either semifinal is missing.
print("\nChampionship Game:")
if tournament_result['semifinal1'] and tournament_result['semifinal2']:
    print(f"{tournament_result['semifinal1'][2]} vs {tournament_result['semifinal2'][2]}")
    
# Prints None when no champion could be determined.
print("\nTournament Champion:")
print(tournament_result['champion'])