Old random forest (RF) model notebook

In [None]:
import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


def create_weighting_matrix(data_path, weights_path):
    """Fit one random forest per weight class and save its feature importances.

    Reads the fight data CSV, maps ``cage_size`` from 'big'/'small' to 1/0,
    and for each distinct ``weight_class`` fits a ``RandomForestClassifier``
    (``random_state=42``) that predicts whether ``result`` equals 'W'. The
    resulting ``feature_importances_`` become that class's weights.

    Args:
        data_path: Path to a CSV containing ``weight_class``, ``result`` and
            the feature columns listed in ``feature_cols`` below.
        weights_path: Path the weight matrix CSV is written to (no index).

    Returns:
        DataFrame with a ``weight_class`` column and one importance column
        per feature, one row per weight class.
    """
    feature_cols = ['eff_diff', 'control_diff', 'fighter_age', 'fighter_age_differential', 'cage_size']

    data = pd.read_csv(data_path)
    # Values other than 'big'/'small' become NaN under this mapping.
    data['cage_size'] = data['cage_size'].map({'big': 1, 'small': 0})

    weights = {}
    for wc in data['weight_class'].unique():
        wc_data = data[data['weight_class'] == wc]
        X = wc_data[feature_cols]
        # Build the target as a new Series instead of assigning a column onto
        # the filtered slice, which triggers pandas' SettingWithCopyWarning.
        y = (wc_data['result'] == 'W').astype(int)

        model = RandomForestClassifier(random_state=42)
        model.fit(X, y)
        weights[wc] = dict(zip(feature_cols, model.feature_importances_))

    weights_df = pd.DataFrame.from_dict(weights, orient='index').reset_index()
    weights_df.rename(columns={'index': 'weight_class'}, inplace=True)
    weights_df.to_csv(weights_path, index=False)

    return weights_df

def load_and_preprocess_data(data_path, weights_path):
    """Load fight data, scale it by the class weights, and encode the target.

    Each row of the CSV at ``data_path`` is passed through ``apply_weights``
    using the weight matrix read from ``weights_path``. ``weight_class`` is
    then replaced by one-hot columns and ``result`` becomes 1 for 'W' and 0
    for anything else.

    Args:
        data_path: Path to the fight data CSV.
        weights_path: Path to the weight matrix CSV.

    Returns:
        Tuple ``(processed_data, one_hot_encoder)``: the prepared DataFrame
        and the fitted ``OneHotEncoder`` for ``weight_class``.
    """
    raw = pd.read_csv(data_path)
    class_weights = pd.read_csv(weights_path)
    weighted_data = raw.apply(apply_weights, axis=1, args=(class_weights,))

    # NOTE(review): cage_size is neither mapped nor encoded in this function;
    # an earlier get_dummies attempt for it was left disabled.
    one_hot_encoder = OneHotEncoder()
    dense = one_hot_encoder.fit_transform(weighted_data[['weight_class']]).toarray()
    dummy_columns = one_hot_encoder.get_feature_names_out(['weight_class'])
    dummies = pd.DataFrame(dense, columns=dummy_columns)

    # join() aligns on the index of both frames.
    processed_data = weighted_data.join(dummies)
    processed_data = processed_data.drop('weight_class', axis=1)
    processed_data['result'] = processed_data['result'].map(
        lambda outcome: 1 if outcome == 'W' else 0
    )
    return processed_data, one_hot_encoder

def apply_weights(row, weights_df):
    """Return a copy of ``row`` with its features scaled by the class weights.

    The first row of ``weights_df`` whose ``weight_class`` equals
    ``row['weight_class']`` supplies the weights. Every weight column that
    also exists in ``row`` and holds a numeric value is multiplied by its
    weight; non-numeric or null values are left untouched.

    Args:
        row: A pandas Series with at least a ``weight_class`` entry.
        weights_df: DataFrame with a ``weight_class`` column plus one weight
            column per feature.

    Returns:
        A new Series with the weighted values, or ``row`` itself (unchanged)
        when ``weights_df`` has no entry for the row's weight class.
    """
    matches = weights_df[weights_df['weight_class'] == row['weight_class']]
    if matches.empty:
        return row

    class_weights = matches.iloc[0]
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the row object they are handed.
    weighted = row.copy()
    for col in class_weights.index:
        if col == 'weight_class' or col not in weighted:
            continue
        value = pd.to_numeric(weighted[col], errors='coerce')
        if pd.notnull(value):
            weighted[col] = value * class_weights[col]
    return weighted

# Define the train_and_evaluate_model function
def train_and_evaluate_model(data):
    """Train a random forest on an 80/20 split and print its hold-out metrics.

    Args:
        data: DataFrame with a ``result`` target column; every other column
            is used as a feature.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    target = data['result']
    features = data.drop('result', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Only the seed is fixed; max_features keeps its default (the square
    # root of the number of features).
    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(X_train, y_train)

    # Score the held-out 20%.
    predicted = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)
    report = classification_report(y_test, predicted)

    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

    return classifier

def _value_or_class_mean(bout, key, class_means, weight_class):
    """Return ``bout[key]``, or the weight-class mean if it is missing or None."""
    value = bout.get(key)
    if value is not None:
        return value
    # Fall back to 0 when the weight class never appears in the history.
    return class_means.get(weight_class, 0)


def predict_upcoming_bouts(model, one_hot_encoder, upcoming_bouts_path, data_path):
    """Predict win probabilities for each upcoming bout.

    Only fighter A's statistics are fed to the model; fighter B's probability
    is reported as the complement. A statistic that is missing or ``None`` is
    replaced by the historical mean for the bout's weight class (0 if that
    class is not in the historical data).

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        one_hot_encoder: Fitted encoder for ``weight_class`` exposing
            ``transform`` and ``get_feature_names_out``.
        upcoming_bouts_path: Path to a JSON file holding a list of bout
            records with ``weight_class``, ``cage_size``, ``fighter_a_name``,
            ``fighter_b_name`` and the optional ``fighter_a_*`` statistics.
        data_path: Path to the historical fight data CSV used for the
            per-class means.

    Returns:
        DataFrame with one row per bout: both fighter names and both win
        probabilities.
    """
    historical_data = pd.read_csv(data_path)
    by_class = historical_data.groupby('weight_class')
    # Model feature name -> (key in the bout record, per-class historical mean).
    feature_sources = {
        'eff_diff': ('fighter_a_eff_diff', by_class['eff_diff'].mean()),
        'control_diff': ('fighter_a_control_rate_diff', by_class['control_diff'].mean()),
        'fighter_age': ('fighter_a_age', by_class['fighter_age'].mean()),
        'fighter_age_differential': (
            'fighter_a_age_differential',
            by_class['fighter_age_differential'].mean(),
        ),
    }

    with open(upcoming_bouts_path, 'r') as f:
        upcoming_bouts = json.load(f)

    encoded_columns = one_hot_encoder.get_feature_names_out(['weight_class'])

    # NOTE(review): load_and_preprocess_data scales the training rows with
    # apply_weights, while the rows built here are not scaled - confirm that
    # this difference between training and prediction inputs is intended.
    predictions = []
    for bout in upcoming_bouts:
        weight_class = bout['weight_class']
        features = {
            name: _value_or_class_mean(bout, key, class_means, weight_class)
            for name, (key, class_means) in feature_sources.items()
        }
        features['weight_class'] = weight_class
        features['cage_size'] = bout['cage_size']

        input_data = pd.DataFrame([features])
        encoded_input = one_hot_encoder.transform(input_data[['weight_class']]).toarray()
        input_data = input_data.join(
            pd.DataFrame(encoded_input, columns=encoded_columns)
        ).drop('weight_class', axis=1)

        # Column 1 of predict_proba is the probability of the positive class.
        fighter_a_prob = model.predict_proba(input_data)[:, 1][0]
        predictions.append({
            'fighter_a_name': bout['fighter_a_name'],
            'fighter_b_name': bout['fighter_b_name'],
            'fighter_a_win_probability': fighter_a_prob,
            'fighter_b_win_probability': 1 - fighter_a_prob
        })

    return pd.DataFrame(predictions)

def save_predictions(predictions, output_path):
    """Write predictions to a CSV, appending to the file if it already exists.

    Args:
        predictions: DataFrame of predictions to store.
        output_path: Destination CSV path. When a file is already there, its
            rows are read back and the new rows are added after them.
    """
    to_write = predictions
    if os.path.isfile(output_path):
        previous = pd.read_csv(output_path)
        to_write = pd.concat([previous, predictions], ignore_index=True)
    to_write.to_csv(output_path, index=False)

def get_external_data(cursor):
    """Placeholder for loading external data through a database cursor.

    The function was left without a body, which is a syntax error for the
    whole cell. It now fails loudly until it is implemented.

    Args:
        cursor: Database cursor the data would be read through.

    Raises:
        NotImplementedError: Always.
    """
    # TODO: implement the query once the external data source is decided.
    raise NotImplementedError("get_external_data is not implemented yet")





# Input and output locations for this run.
data_path = '/Users/daneweickert/Library/CloudStorage/GoogleDrive-weickertdane99@gmail.com/My Drive/Work/Sports Betting/Sports/MMA/ufc_modeling/scraping_UFCstats/scraping_ufc_stats/scraping_ufc_stats/spiders/2019_on_calc.csv'
weights_path = '/Users/daneweickert/Downloads/weight_matrix - Sheet2 (5).csv'
# NOTE(review): predict_upcoming_bouts parses this file with json.load, but
# the path has a .csv extension - confirm the file really contains JSON.
upcoming_bouts_path = '/Users/daneweickert/Downloads/01_13_2024 - Sheet1 (3).csv'
output_path = '/Users/daneweickert/Library/CloudStorage/GoogleDrive-weickertdane99@gmail.com/My Drive/Work/Sports Betting/Sports/MMA/ufc_modeling/scraping_UFCstats/scraping_ufc_stats/scraping_ufc_stats/spiders/predictions_01_13_2024.csv'

# Process, train and evaluate
weights_df = create_weighting_matrix(data_path, weights_path)
processed_data, one_hot_encoder = load_and_preprocess_data(data_path, weights_path)
# train_model was never defined; train_and_evaluate_model fits the forest,
# prints the evaluation report and returns the model. Calling it once here
# replaces the extra call that used to retrain the same seeded model at the
# end of the script.
model = train_and_evaluate_model(processed_data)

# Predict and save
predictions = predict_upcoming_bouts(model, one_hot_encoder, upcoming_bouts_path, data_path)
save_predictions(predictions, output_path)


Training the RF model and outputting projections

In [None]:
import sqlite3
import pandas as pd
import json
import os
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def setup_logging():
    """Send root-logger records at INFO level and above to train_model.log."""
    log_format = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(
        filename='train_model.log',
        level=logging.INFO,
        format=log_format,
    )


def get_db_connection(db_path):
    """Open and return a SQLite connection to the database at ``db_path``."""
    connection = sqlite3.connect(db_path)
    return connection

def load_and_preprocess_data(cursor):
    """Load every row of the ``training_data`` table into a DataFrame.

    The fetched rows used to be discarded, so the function returned None.
    They are now returned with the table's column names. No further
    preprocessing is applied yet.

    Args:
        cursor: An open database cursor (DB-API style, e.g. ``sqlite3``).

    Returns:
        DataFrame with one row per table row and one column per table column.
    """
    # Select all data from the training_data table.
    cursor.execute('SELECT * FROM training_data')
    rows = cursor.fetchall()

    # cursor.description has one entry per result column; its first item is
    # the column name.
    columns = [column[0] for column in cursor.description]
    return pd.DataFrame(rows, columns=columns)






def main():
    """Set up logging, open the historical database and load the training data.

    Database errors are logged with a traceback and re-raised. The
    connection is closed whether or not the run succeeds.
    """
    setup_logging()
    db_path = '/Users/daneweickert/Library/CloudStorage/GoogleDrive-weickertdane99@gmail.com/My Drive/Work/Sports Betting/Sports/MMA/ufc_modeling/prod/database/historical_raw.db'
    conn = None
    try:
        conn = get_db_connection(db_path)
        cursor = conn.cursor()

        # Call functions
        # TODO: add the training and projection steps once they exist.
        load_and_preprocess_data(cursor)
    except sqlite3.Error:
        logging.exception('Database error while loading training data')
        raise
    finally:
        # The original try block had no handler and never closed the
        # connection.
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()
