In [5]:
import itertools
import pandas as pd
import math
# Read the tournament table and split it into the label column ('win') and
# the remaining feature columns. `pop` returns the label Series and removes
# it from the frame in one step.
X = pd.read_csv('final_tourney_data.csv')
y = X.pop('win')

# Column order the model is trained on; predict() uses it to select and
# order the columns of a matchup row.
selected_features = X.columns

# Second table read from disk; nothing else in this cell uses it.
merged_data = pd.read_csv('merged.csv')

def predict(team1, team2, model,scaler):
    """Return the model's probability that ``team1`` beats ``team2``.

    Parameters
    ----------
    team1, team2 : pandas.DataFrame
        Frames describing each side. The code reads the ``TEAM``, ``YEAR``
        and ``season_elo`` columns; any other column is carried through with
        a ``_team`` / ``_opp`` suffix. Only the first ``TEAM`` value and the
        first prediction row are used, so presumably each frame holds a
        single row (TODO confirm against callers). Neither input is modified.
    model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is taken as the probability that ``team1`` wins.
    scaler : object
        Fitted transformer exposing ``transform``, applied to the feature row
        before it is handed to ``model``.

    Returns
    -------
    dict
        ``{'ID': '<team1 name>_<team2 name>', 'Pred': <win probability>}``.

    Notes
    -----
    Relies on the module-level ``selected_features`` to choose and order the
    feature columns. No betting odds are derived here: only the raw
    probability is returned, so probabilities of exactly 0 or 1 pass through
    without any division being performed on them.
    """
    team1_name = team1['TEAM'].values[0]
    team2_name = team2['TEAM'].values[0]

    # Suffix every column so the two sides stay distinguishable after the
    # merge. add_suffix returns new frames, so the callers' inputs are untouched.
    team1 = team1.add_suffix('_team')
    team2 = team2.add_suffix('_opp')

    # Expose each side's season Elo under the TeamElo / OppElo column names.
    team1['TeamElo'] = team1['season_elo_team']
    team2['OppElo'] = team2['season_elo_opp']

    # Pair the two sides on season; with how='left' team1's rows are always
    # kept, and the opponent columns are NaN when the years do not match.
    pred_df = pd.merge(team1, team2, left_on=['YEAR_team'], right_on=['YEAR_opp'], how='left')

    # Keep exactly the selected feature columns, in that order, then scale.
    pred_df = pred_df[selected_features]
    pred_df = scaler.transform(pred_df)

    # Column 1 of predict_proba is the positive class, i.e. a team1 win.
    win_prob = model.predict_proba(pred_df)[0][1]

    return {'ID': f'{team1_name}_{team2_name}', 'Pred': win_prob}

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from cuml.ensemble import RandomForestClassifier  # Import GPU-accelerated RandomForestClassifier
from xgboost import XGBClassifier

# X (feature frame) and y (labels) come from the earlier data-loading cell.

# One seed drives the split, the candidate hyper-parameter values, the search
# order and the booster, so a fresh-kernel run reproduces the same numbers.
SEED = 42
rng = np.random.default_rng(SEED)

# Hold out 20% of the rows for the final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Standardise using statistics learned from the training rows only. The
# scaler stays outside the pipeline so the fitted `scaler` object remains
# available on its own (predict() takes the model and the scaler separately).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The search space below is made of XGBoost hyper-parameters (subsample,
# colsample_bytree, gamma, reg_alpha, reg_lambda, min_child_weight,
# scale_pos_weight), so the pipeline step has to be an XGBClassifier; these
# are not parameters of a random-forest estimator and could not be set on one.
xgb_pipeline = Pipeline([
    ('clf', XGBClassifier(random_state=SEED))
])

# Candidate values for the random search: ten seeded draws per parameter,
# converted to plain Python numbers so best_params_ prints cleanly.
xgb_param_distributions = {
    'clf__n_estimators': rng.integers(100, 500, 10).tolist(),
    'clf__max_depth': rng.integers(3, 10, 10).tolist(),
    'clf__subsample': rng.uniform(0.6, 1.0, 10).tolist(),
    'clf__colsample_bytree': rng.uniform(0.6, 1.0, 10).tolist(),
    'clf__gamma': rng.uniform(0, 0.2, 10).tolist(),
    'clf__reg_alpha': rng.uniform(0, 0.2, 10).tolist(),
    'clf__reg_lambda': rng.uniform(0, 0.2, 10).tolist(),
    'clf__min_child_weight': rng.integers(1, 6, 10).tolist(),
    'clf__scale_pos_weight': rng.integers(1, 4, 10).tolist()
}

# Random search for XGBoost: 10 sampled configurations, 5-fold CV each.
xgb_random_search = RandomizedSearchCV(xgb_pipeline, param_distributions=xgb_param_distributions, n_iter=10, cv=5,
                                       scoring='accuracy', n_jobs=-1, random_state=SEED)
xgb_random_search.fit(X_train_scaled, y_train)

# Best parameters
print("XGBoost Best Parameters:", xgb_random_search.best_params_)

# Evaluate the refit best configuration on the held-out rows.
xgb_preds = xgb_random_search.predict(X_test_scaled)
print("XGBoost Test Accuracy:", accuracy_score(y_test, xgb_preds))



