In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss
from sklearn.impute import SimpleImputer
from itertools import combinations

# Load the data

In [2]:
# Load data
m_teams = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MTeams.csv')
w_teams = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WTeams.csv')
m_regular = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonCompactResults.csv')
w_regular = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WRegularSeasonCompactResults.csv')
m_tourney = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv')
w_tourney = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WNCAATourneyCompactResults.csv')
m_massey = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MMasseyOrdinals.csv')
m_team_conferences = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MTeamConferences.csv')
w_team_conferences = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WTeamConferences.csv')

In [3]:
print(m_team_conferences.columns)
print(m_regular.head())

Index(['Season', 'TeamID', 'ConfAbbrev'], dtype='object')
   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT
0    1985      20     1228      81     1328      64    N      0
1    1985      25     1106      77     1354      70    H      0
2    1985      25     1112      63     1223      56    H      0
3    1985      25     1165      70     1432      54    H      0
4    1985      25     1192      86     1447      74    H      0


# Processing men's teams

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MRegularSeasonDetailedResults.csv')

# Feature Engineering
data['WLoc'] = data['WLoc'].map({'H': 0, 'A': 1, 'N': 2})

# Create target variable
data['Outcome'] = 1  # 1 for win

# Create a mirrored dataset for the losing team
data_lose = data.copy()
data_lose['Outcome'] = 0  # 0 for loss
data_lose['WTeamID'], data_lose['LTeamID'] = data_lose['LTeamID'], data_lose['WTeamID']
data_lose['WScore'], data_lose['LScore'] = data_lose['LScore'], data_lose['WScore']

# Combine both datasets
combined_data = pd.concat([data, data_lose])

# Select features and target
features = ['WTeamID', 'LTeamID', 'WScore', 'LScore', 'WLoc', 'NumOT']
Xm = combined_data[features]
ym = combined_data['Outcome']

# Split the data into training and testing sets
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, ym, test_size=0.2, random_state=42)

# Save the training and testing data
Xm_train.to_csv('Xm_train.csv', index=False)
Xm_test.to_csv('Xm_test.csv', index=False)
ym_train.to_csv('ym_train.csv', index=False)
ym_test.to_csv('ym_test.csv', index=False)

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss

# Load the training and testing data
Xm_train = pd.read_csv('Xm_train.csv')
Xm_test = pd.read_csv('Xm_test.csv')
ym_train = pd.read_csv('ym_train.csv')
ym_test = pd.read_csv('ym_test.csv')

# Ensure y_train and y_test are 1D arrays (required for scikit-learn)
ym_train = ym_train.values.ravel()
ym_test = ym_test.values.ravel()

# Initialize the Random Forest Classifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(Xm_train, ym_train)

# Make predictions on the test set (probabilities for the positive class)
y_pred_proba_m = model.predict_proba(Xm_test)[:, 1]

# Calculate the Brier score
brier_score_m = brier_score_loss(ym_test, y_pred_proba_m)

# Print the Brier score
print(f"Brier Score men: {brier_score_m:.4f}")

Brier Score men: 0.0010


In [6]:
# Import required libraries
import pandas as pd
import numpy as np
import itertools

# Load data
m_team_conferences = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/MTeamConferences.csv')

# Define the seasons to process
seasons = [2025]

# Create a DataFrame to store all predictions
all_predictions = []

# Process men's teams
for season in seasons:
    # Get Division-I teams for the current season
    m_teams_season = m_team_conferences[m_team_conferences['Season'] == season]['TeamID'].unique()
    
    # Generate all possible matchups for the current season
    m_matchups = list(itertools.combinations(m_teams_season, 2))
    
    # Create a DataFrame for the matchups
    matchup_df = pd.DataFrame(m_matchups, columns=["Team1", "Team2"])
    
    # Add season and matchup ID
    matchup_df["Season"] = season
    matchup_df["ID"] = matchup_df.apply(lambda row: f"{row['Season']}_{min(row['Team1'], row['Team2'])}_{max(row['Team1'], row['Team2'])}", axis=1)
    
    # Prepare features for prediction
    # Assuming the model requires features like WTeamID, LTeamID, WScore, LScore, WLoc, NumOT
    # Replace this with your actual feature generation logic
    matchup_df["WTeamID"] = matchup_df["Team1"]
    matchup_df["LTeamID"] = matchup_df["Team2"]
    matchup_df["WScore"] = 70  # Example: Replace with actual logic
    matchup_df["LScore"] = 65  # Example: Replace with actual logic
    matchup_df["WLoc"] = 0     # Example: Replace with actual logic
    matchup_df["NumOT"] = 0    # Example: Replace with actual logic
    
    # Select features for prediction
    features = ["WTeamID", "LTeamID", "WScore", "LScore", "WLoc", "NumOT"]
    X_test = matchup_df[features]
    
    # Ensure Xm_test has the same number of rows as matchup_df
    if len(X_test) != len(matchup_df):
        raise ValueError(f"Feature generation error: Xm_test has {len(X_test)} rows, but matchup_df has {len(matchup_df)} rows.")
    
    # Make predictions for all matchups in the current season
    pred = model.predict_proba(X_test)[:, 1]  # Replace this with your actual model's prediction
    
    # Ensure predictions have the same length as matchup_df
   # if len(pred) != len(matchup_df):
       # raise ValueError(f"Prediction error: Pred has {len(pred)} values, but matchup_df has {len(matchup_df)} rows.")
    
    # Append the matchup ID and prediction to the list
    matchup_df["Pred"] = pred
    all_predictions.append(matchup_df[["ID", "Pred"]])

# Concatenate all predictions into a single DataFrame
submission_df_m = pd.concat(all_predictions, ignore_index=True)

# Verify the number of rows
#required_rows = 507108
#if len(submission_df) < required_rows:
  # raise ValueError(f"Submission file has only {len(submission_df)} rows, but {required_rows} rows are required.")

# Save the predictions to a CSV file
submission_df_m.to_csv("men_tournament_predictions.csv", index=False)

# Print confirmation
print(f"Submission file generated with {len(submission_df_m)} rows.")

print(submission_df_m.head(20))

Submission file generated with 66066 rows.
                ID  Pred
0   2025_1101_1102  0.98
1   2025_1101_1103  0.99
2   2025_1101_1104  1.00
3   2025_1101_1105  1.00
4   2025_1101_1106  1.00
5   2025_1101_1107  1.00
6   2025_1101_1108  1.00
7   2025_1101_1110  1.00
8   2025_1101_1111  1.00
9   2025_1101_1112  1.00
10  2025_1101_1113  1.00
11  2025_1101_1114  1.00
12  2025_1101_1115  1.00
13  2025_1101_1116  1.00
14  2025_1101_1117  1.00
15  2025_1101_1119  1.00
16  2025_1101_1120  1.00
17  2025_1101_1122  1.00
18  2025_1101_1123  1.00
19  2025_1101_1124  1.00


In [7]:
# Check the features for the test data
print("Sample test features (Xm_test):")
print(Xm_test.head())

# Check if features are constant
if Xm_test.nunique().eq(1).any():
    print("Warning: Some features are constant and may not be informative.")
else:
    print("Features are diverse and should allow for meaningful predictions.")

Sample test features (Xm_test):
   WTeamID  LTeamID  WScore  LScore  WLoc  NumOT
0     1398     1120      73      78     0      0
1     1287     1184      75      62     0      0
2     1379     1443      67      64     0      1
3     1458     1278      68      81     0      0
4     1397     1272      79      88     0      0
Features are diverse and should allow for meaningful predictions.


In [8]:
# Check model performance on the training data
train_predictions = model.predict_proba(Xm_train)[:, 1]
print("Training predictions sample:", train_predictions[:10])

# Check if training predictions are diverse
if np.unique(train_predictions).size == 0.9:
    print("Warning: Model is predicting the same value for all training samples.")
else:
    print("Model is making diverse predictions on training data.")

Training predictions sample: [1. 1. 0. 1. 1. 1. 1. 0. 0. 1.]
Model is making diverse predictions on training data.


# Processing women's teams

In [9]:
# Load the data
data_w = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WRegularSeasonDetailedResults.csv')

# Feature Engineering
data_w['WLoc'] = data_w['WLoc'].map({'H': 0, 'A': 1, 'N': 2})

# Create target variable
data_w['Outcome'] = 1  # 1 for win

# Create a mirrored dataset for the losing team
data_lose_w = data.copy()
data_lose_w['Outcome'] = 0  # 0 for loss
data_lose_w['WTeamID'], data_lose['LTeamID'] = data_lose['LTeamID'], data_lose['WTeamID']
data_lose_w['WScore'], data_lose['LScore'] = data_lose['LScore'], data_lose['WScore']

# Combine both datasets
combined_data_w = pd.concat([data_w, data_lose_w])

# Select features and target
features_w = ['WTeamID', 'LTeamID', 'WScore', 'LScore', 'WLoc', 'NumOT']
Xw = combined_data_w[features]
yw = combined_data_w['Outcome']

# Split the data into training and testing sets
Xw_train, Xw_test, yw_train, yw_test = train_test_split(Xw, yw, test_size=0.2, random_state=42)

# Save the training and testing data
Xw_train.to_csv('Xw_train.csv', index=False)
Xw_test.to_csv('Xw_test.csv', index=False)
yw_train.to_csv('yw_train.csv', index=False)
yw_test.to_csv('yw_test.csv', index=False)

In [10]:
# Load the training and testing data
Xw_train = pd.read_csv('Xw_train.csv')
Xw_test = pd.read_csv('Xw_test.csv')
yw_train = pd.read_csv('yw_train.csv')
yw_test = pd.read_csv('yw_test.csv')

# Ensure y_train and y_test are 1D arrays (required for scikit-learn)
yw_train = yw_train.values.ravel()
yw_test = yw_test.values.ravel()

# Initialize the Random Forest Classifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(Xw_train, yw_train)

# Make predictions on the test set (probabilities for the positive class)
y_pred_proba_w = model.predict_proba(Xw_test)[:, 1]

# Calculate the Brier score
brier_score_w = brier_score_loss(yw_test, y_pred_proba_w)

# Print the Brier score
print(f"Brier Score women: {brier_score_w:.4f}")

Brier Score women: 0.0000


In [11]:
# Load data
w_team_conferences = pd.read_csv('/kaggle/input/march-machine-learning-mania-2025/WTeamConferences.csv')

# Define the seasons to process
seasons = [2025]

# Create a DataFrame to store all predictions
all_predictions_w = []

# Process women's teams
for season in seasons:
    # Get Division-I teams for the current season
    w_teams_season = w_team_conferences[w_team_conferences['Season'] == season]['TeamID'].unique()
    
    # Generate all possible matchups for the current season
    w_matchups = list(itertools.combinations(w_teams_season, 2))
    
    # Create a DataFrame for the matchups
    matchup_df_w = pd.DataFrame(w_matchups, columns=["Team1", "Team2"])
    
    # Add season and matchup ID
    matchup_df_w["Season"] = season
    matchup_df_w["ID"] = matchup_df_w.apply(lambda row: f"{row['Season']}_{min(row['Team1'], row['Team2'])}_{max(row['Team1'], row['Team2'])}", axis=1)
    
    # Prepare features for prediction
    # Assuming the model requires features like WTeamID, LTeamID, WScore, LScore, WLoc, NumOT
    # Replace this with your actual feature generation logic
    matchup_df_w["WTeamID"] = matchup_df_w["Team1"]
    matchup_df_w["LTeamID"] = matchup_df_w["Team2"]
    matchup_df_w["WScore"] = 70  # Example: Replace with actual logic
    matchup_df_w["LScore"] = 65  # Example: Replace with actual logic
    matchup_df_w["WLoc"] = 0     # Example: Replace with actual logic
    matchup_df_w["NumOT"] = 0    # Example: Replace with actual logic
    
    # Select features for prediction
    features_w = ["WTeamID", "LTeamID", "WScore", "LScore", "WLoc", "NumOT"]
    X_test_w = matchup_df_w[features_w]
    
    # Ensure Xm_test has the same number of rows as matchup_df
    if len(X_test_w) != len(matchup_df_w):
        raise ValueError(f"Feature generation error: Xw_test has {len(X_test_w)} rows, but matchup_df_w has {len(matchup_df_w)} rows.")
    
    # Make predictions for all matchups in the current season
    pred_w = model.predict_proba(X_test_w)[:, 1]  # Replace this with your actual model's prediction
    
    # Ensure predictions have the same length as matchup_df
   # if len(pred) != len(matchup_df):
       # raise ValueError(f"Prediction error: Pred has {len(pred)} values, but matchup_df has {len(matchup_df)} rows.")
    
    # Append the matchup ID and prediction to the list
    matchup_df_w["Pred"] = pred_w
    all_predictions_w.append(matchup_df_w[["ID", "Pred"]])

# Concatenate all predictions into a single DataFrame
submission_df_w = pd.concat(all_predictions_w, ignore_index=True)

# Verify the number of rows
#required_rows = 507108
#if len(submission_df) < required_rows:
  # raise ValueError(f"Submission file has only {len(submission_df)} rows, but {required_rows} rows are required.")

# Save the predictions to a CSV file
submission_df_w.to_csv("women_tournament_predictions.csv", index=False)

# Print confirmation
print(f"Submission file generated with {len(submission_df_w)} rows.")

print(submission_df_w.tail(20))

Submission file generated with 65341 rows.
                   ID  Pred
65321  2025_3474_3476   1.0
65322  2025_3474_3477   1.0
65323  2025_3474_3478   1.0
65324  2025_3474_3479   1.0
65325  2025_3474_3480   1.0
65326  2025_3475_3476   1.0
65327  2025_3475_3477   1.0
65328  2025_3475_3478   1.0
65329  2025_3475_3479   1.0
65330  2025_3475_3480   1.0
65331  2025_3476_3477   1.0
65332  2025_3476_3478   1.0
65333  2025_3476_3479   1.0
65334  2025_3476_3480   1.0
65335  2025_3477_3478   1.0
65336  2025_3477_3479   1.0
65337  2025_3477_3480   1.0
65338  2025_3478_3479   1.0
65339  2025_3478_3480   1.0
65340  2025_3479_3480   1.0


In [12]:
# Check the features for the test data
print("Sample test features (Xw_test):")
print(Xw_test.head())

# Check if features are constant
if Xw_test.nunique().eq(1).any():
    print("Warning: Some features are constant and may not be informative.")
else:
    print("Features are diverse and should allow for meaningful predictions.")

Sample test features (Xw_test):
   WTeamID  LTeamID  WScore  LScore  WLoc  NumOT
0     3258     3388      72      57     0      0
1     1186     1381      82      70     2      0
2     3313     3354      63      49     0      0
3     3172     3203      73      54     1      0
4     3198     3152      64      46     0      0
Features are diverse and should allow for meaningful predictions.


In [13]:
# Check model performance on the training data
train_predictions_w = model.predict_proba(Xw_train)[:, 1]
print("Training predictions sample:", train_predictions_w[:10])

# Check if training predictions are diverse
if np.unique(train_predictions_w).size == 1:
    print("Warning: Model is predicting the same value for all training samples.")
else:
    print("Model is making diverse predictions on training data.")

Training predictions sample: [1. 1. 0. 1. 1. 1. 1. 1. 1. 1.]
Model is making diverse predictions on training data.


In [14]:
final_submission = pd.concat([submission_df_m, submission_df_w], axis=0)

# Check if features are constant
if final_submission.nunique().eq(1).any():
    print("Warning: Some data is constant and may not be informative.")
else:
    print("Data is diverse.")

print(final_submission.shape)

final_submission.to_csv("submission.csv", index=False)

Data is diverse.
(131407, 2)
