In [5]:
import itertools
import pandas as pd
import math
# Load the modelling table, then split the 'win' column off as the target so
# that X is left holding only the predictor columns.
X = pd.read_csv('final_tourney_data.csv')
y = X.pop('win')

# Predictor column names, in file order; predict() selects model inputs by them.
selected_features = X.columns

# Second table; it is not referenced by the cells shown here.
merged_data = pd.read_csv('merged.csv')

def predict(team1, team2, model, scaler, features=None):
    """Estimate the probability that ``team1`` beats ``team2``.

    Both frames are copied, their columns are suffixed with ``_team`` /
    ``_opp``, and the two are left-joined on ``YEAR`` into a matchup frame.
    The requested feature columns are scaled and passed to the model.

    Parameters
    ----------
    team1, team2 : pandas.DataFrame
        Team frames containing at least ``TEAM``, ``YEAR`` and ``season_elo``
        columns. The first row's ``TEAM`` value names the team in the
        returned ID. The caller's frames are not modified.
    model : object
        Fitted classifier exposing ``predict_proba``; the value at row 0,
        column 1 of its output is used as ``team1``'s win probability.
    scaler : object
        Fitted transformer exposing ``transform``; applied to the feature
        columns before prediction.
    features : sequence of str, optional
        Columns of the merged matchup frame to feed the model, in order.
        Defaults to the module-level ``selected_features``.

    Returns
    -------
    dict
        ``{'ID': '<team1 name>_<team2 name>', 'Pred': <win probability>}``.

    Notes
    -----
    The join uses ``how='left'``, so when ``team2`` has no row for
    ``team1``'s year the ``*_opp`` columns are NaN; no error is raised here.
    Probabilities of exactly 0.0 or 1.0 are returned unchanged.
    """
    if features is None:
        # Fall back to the notebook-level training columns.
        features = selected_features

    team1 = team1.copy()
    team2 = team2.copy()
    team1_name = team1['TEAM'].values[0]
    team2_name = team2['TEAM'].values[0]

    # Suffix every column so the two sides stay distinguishable after the join.
    team1.columns = team1.columns + "_team"
    team2.columns = team2.columns + "_opp"

    # Expose each side's season Elo under the names 'TeamElo' / 'OppElo'.
    team1['TeamElo'] = team1['season_elo_team']
    team2['OppElo'] = team2['season_elo_opp']

    pred_df = pd.merge(team1, team2, left_on=['YEAR_team'], right_on=['YEAR_opp'], how='left')
    pred_df = scaler.transform(pred_df[features])

    # Column 1 of predict_proba is read as the probability that team1 wins.
    win_prob = model.predict_proba(pred_df)[0][1]
    return {'ID': f'{team1_name}_{team2_name}', 'Pred': win_prob}

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Single-step pipelines: the step is named 'clf' so that search parameters can
# be addressed as 'clf__<param>'. Both estimators are seeded so repeated runs
# of the notebook produce the same scores.

# XGBoost pipeline (seed governs row/column subsampling)
xgb_pipeline = Pipeline([
    ('clf', XGBClassifier(random_state=42)),
])

# MLP pipeline (seed governs weight initialisation and batch shuffling)
mlp_pipeline = Pipeline([
    ('clf', MLPClassifier(random_state=42)),
])

# Hyper-parameter search spaces for the two pipelines. Parameter names are
# listed bare and expanded to the 'clf__<param>' form, which targets the
# pipeline step named 'clf'.

# XGBoost: 10 parameters x 3 values each = 59,049 combinations.
xgb_param_grid = {
    f'clf__{param}': values
    for param, values in {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2],
        'reg_alpha': [0, 0.1, 0.2],
        'reg_lambda': [0, 0.1, 0.2],
        'min_child_weight': [1, 3, 5],
        'scale_pos_weight': [1, 2, 3],
    }.items()
}

# MLP: ten 3-valued and three 2-valued parameters = 472,392 combinations.
mlp_param_grid = {
    f'clf__{param}': values
    for param, values in {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate_init': [0.001, 0.01, 0.1],
        'solver': ['adam', 'sgd'],
        'batch_size': [32, 64, 128],
        'max_iter': [100, 200, 300],
        'early_stopping': [True, False],
        'beta_1': [0.9, 0.95, 0.99],
        'beta_2': [0.999, 0.9999, 0.99999],
        'epsilon': [1e-8, 1e-7, 1e-6],
        'validation_fraction': [0.1, 0.2, 0.3],
        'n_iter_no_change': [5, 10, 15],
    }.items()
}


# Hyper-parameter search.
# As defined in this notebook, xgb_param_grid spans 59,049 combinations and
# mlp_param_grid 472,392. With 5-fold CV an exhaustive GridSearchCV would need
# about 295k and 2.36M model fits, which will not finish in practical time.
# Instead, draw a fixed number of candidates at random from the same grids;
# the seed makes the draw repeatable. Raise N_SEARCH_ITER for wider coverage.
# The variable names keep the '_grid_search' suffix so that code using them
# continues to work (best_params_, best_estimator_ and predict are all
# available on RandomizedSearchCV).
N_SEARCH_ITER = 50

# Randomized search for XGBoost
xgb_grid_search = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=xgb_param_grid,
    n_iter=N_SEARCH_ITER,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
xgb_grid_search.fit(X_train_scaled, y_train)

# Randomized search for MLP
mlp_grid_search = RandomizedSearchCV(
    mlp_pipeline,
    param_distributions=mlp_param_grid,
    n_iter=N_SEARCH_ITER,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
mlp_grid_search.fit(X_train_scaled, y_train)

# Report the winning hyper-parameter combination found by each search.
print("XGBoost Best Parameters:", xgb_grid_search.best_params_)
print("MLP Best Parameters:", mlp_grid_search.best_params_)

# Score the held-out rows with each search's refitted best estimator
# (XGBoost first, then MLP).
xgb_preds, mlp_preds = (
    search.predict(X_test_scaled)
    for search in (xgb_grid_search, mlp_grid_search)
)

print("XGBoost Test Accuracy:", accuracy_score(y_true=y_test, y_pred=xgb_preds))
print("MLP Test Accuracy:", accuracy_score(y_true=y_test, y_pred=mlp_preds))

# Pull the fitted XGBoost classifier out of the best pipeline and read its
# per-feature importance scores.
best_xgb_model = xgb_grid_search.best_estimator_.named_steps.clf
feature_importance = best_xgb_model.feature_importances_

# Label each score with its column name and rank from most to least important,
# so weak features are easy to spot at the bottom of the listing.
important_features = (
    pd.Series(data=feature_importance, index=X.columns)
    .sort_values(ascending=False)
)
print("Top Features:")
print(important_features)

# Drop less important features and retrain the models if needed




