### To start, I will make a quick model for the sake of example. You will want to substitute in your model

In [1]:
import pandas as pd
import zipfile
from sklearn.linear_model import LogisticRegression

In [2]:
with zipfile.ZipFile('march-machine-learning-mania-2024.zip') as zf:
    with zf.open('MNCAATourneySeeds.csv') as f:
        m_seed = pd.read_csv(f)
    with zf.open('MNCAATourneyCompactResults.csv') as f:
        m_results = pd.read_csv(f)
    with zf.open('WNCAATourneySeeds.csv') as f:
        w_seed = pd.read_csv(f)
    with zf.open('WNCAATourneyCompactResults.csv') as f:
        w_results = pd.read_csv(f)

In [3]:
m_seed['Seed'] = m_seed['Seed'].str.extract(r'(\d+)')
w_seed['Seed'] = w_seed['Seed'].str.extract(r'(\d+)')

In [4]:
seeds = pd.concat([m_seed, w_seed])
results = pd.concat([m_results, w_results])

In [5]:
results = results.rename(columns={'WTeamID':'T1_ID','WScore':'T1_Score','LTeamID':'T2_ID','LScore':'T2_Score'})
even_matchups = pd.concat([results, results.rename(columns={'T1_ID':'T2_ID', 'T2_ID':'T1_ID', 'T1_Score':'T2_Score', 'T2_Score':'T1_Score'})])
even_matchups['T1_Win'] = (even_matchups['T1_Score'] > even_matchups['T2_Score']).astype(int)

In [6]:
merged = pd.merge(even_matchups, seeds, left_on=['T1_ID', 'Season'], right_on=['TeamID', 'Season'])
merged.rename(columns={'Seed': 'T1_Seed'}, inplace=True)
merged = pd.merge(merged, seeds, left_on=['T2_ID', 'Season'], right_on=['TeamID', 'Season'])
merged.rename(columns={'Seed': 'T2_Seed'}, inplace=True)
merged[['T1_ID', 'T1_Seed', 'T2_ID', 'T2_Seed', 'T1_Win']]

Unnamed: 0,T1_ID,T1_Seed,T2_ID,T2_Seed,T1_Win
0,1116,09,1234,08,1
1,1116,09,1385,01,0
2,1207,01,1385,01,1
3,1246,12,1385,01,0
4,1301,03,1385,01,0
...,...,...,...,...,...
8063,3417,04,3328,05,1
8064,3339,12,3328,05,0
8065,3437,04,3156,13,1
8066,3437,04,3195,12,1


In [7]:
model = LogisticRegression()

features = ['T1_Seed', 'T2_Seed']

X = merged[features]
y = merged['T1_Win']
model.fit(X, y)

LogisticRegression()

## Now that I have my model ready, I can predict based on this year's tournament teams

In [8]:
# reading in the possible matchups for this year
potential_24 = pd.read_csv('2024_Potential_Matchups.csv')[['T1_Spelling', 'T1_ID', 'T1_Seed', 'T2_Spelling', 'T2_ID', 'T2_Seed']]

## Here is where you will have to add in any features that may be used in the model. For the example model, I just used seed, which is already given to me in the potential_24 dataframe

In [9]:
# If you have a table that has all of your features for each team, you can simply merge those features into the potential_24 dataframe

# features = [...]
# potential_24 = pd.merge(potential_24, feature_df[[features]], left_on = 'T1_ID', right_on = 'TeamID', how = 'left')
# potential_24 = pd.merge(potential_24, feature_df[[features]], left_on = 'T2_ID', right_on = 'TeamID', how = 'left', suffixes=('_T1', '_T2'))

# you may need to switch some of this to match the feature names that you used when you fit your model

In [10]:
# add a new column with predictions for each game
potential_24['T1_Win'] = model.predict_proba(potential_24[features])[:,1]

In [11]:
potential_24.to_csv('submission.csv')