ALL the code in one organized, annotated space

# Initialization 

In [1]:
import numpy as np 
import pandas as pd 

ufc = pd.read_csv('ufc-master.csv')

In [2]:
print(f"Shape of Data: {ufc.shape}")
ufc.Winner.value_counts()

Shape of Data: (4896, 119)


Winner
Red     2859
Blue    2037
Name: count, dtype: int64

# Handling Nulls, Cat/Num, Fracture

In [5]:
### replace nulls with mode/mean (for categorical/continuous)

def handle_nulls(data):
    for col in data.columns:
        if data[col].isna().sum() >=4890:
            data = data.drop([col], axis=1)
        else:
            if data[col].dtype in ['int64', 'float64']:
                data = data.fillna({col : data[col].mean()})
            else:
                data = data.fillna({col : data[col].mode()[0]})
    
    return data

In [9]:
### security measure for target values bc I keep having issues 

def check_target_vals(target):
    
    red = 0
    
    blue = 0
    
    for val in target:
        if val == 1:
            red += 1
        else:
            blue += 1

    if red != 2859:
        return f"red disconnect - instead of 2859, {red}"
    elif blue != 2037:
        return f"blue disconnect - instead of 2037, {blue}"
    else:
        return "all good"

In [6]:
### break into features and target 

def fracture(data):
    target = [1 if victor == 'Red' else 0 for victor in data['Winner']]
    features = data.drop(['Winner'], axis=1)
    return features, target

In [10]:
### categorical and continuous values

def num_and_cat(features):
    num_col = [col for col in features.columns if features[col].dtype in ['int64', 'float64']]
    cat_col = [col for col in features.columns if col not in num_col]
    return num_col, cat_col 

In [9]:
D, target = fracture(ufc)
print(f"Shape of D: {D.shape}")
print(f"target? {check_target_vals(target)}")

Shape of D: (4896, 118)
target? all good


In [11]:
num, cat = num_and_cat(D)
print(f"Categorical columns - {len(cat)}")
print(f"Continous columns - {len(num)}")

Categorical columns - 14
Continous columns - 104


# Approach 1 - Dummy

In [11]:
def dummy_approach(data):
    features, target = fracture(data)
    return features, features.columns, target

In [13]:
D, selected_features, target = dummy_approach(ufc)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target? {check_target_vals(target)}")

Shape of D: (4896, 118)
Number of features: 118
target? all good


# Approach 2 - Num Corrs

In [12]:
def highest_correlating_num_cols(data, squared_thresh):
    features, target = fracture(data)
    desired_cols = []
    
    to_mess_with = features.copy()
    to_mess_with['target'] = target
    
    num_col, _ = num_and_cat(features)
    
    for col in num_col:
        test_val = (to_mess_with[col].corr(to_mess_with['target']))**2

        if test_val >= squared_thresh:
            desired_cols.append(col)
        
    return features, desired_cols, target

In [15]:
D, selected_features, target = highest_correlating_num_cols(ufc, squared_thresh=0.0225)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target vals? {check_target_vals(target)}")

Shape of D: (4896, 118)
Number of features: 16
target vals? all good


# Approach 3 - Num and Cat

In [13]:
from sklearn.preprocessing import LabelEncoder

def highest_correlating_num_and_cat(data, cat_thresh, squared_thresh, just_cats = False):
    
    features, target = fracture(data)
    
    enc = LabelEncoder()
    desired_cols = []
    
    num_col, cat_col = num_and_cat(features)
    
    for col in cat_col:
        features[col] = enc.fit_transform(features[col])
        
    to_mess_with = features.copy()
    to_mess_with['target'] = target
    
    for col in cat_col:
        test_val = (to_mess_with[col].corr(to_mess_with['target']))**2
        test = test_val >= cat_thresh
        
        if test:
            desired_cols.append(col)
    
    if not just_cats:
        _, nums, unimp = highest_correlating_num_cols(data, squared_thresh)
        for num in nums:
            desired_cols.append(num)
            
    return features, desired_cols, target

In [20]:
D, selected_features, target = highest_correlating_num_and_cat(ufc, cat_thresh=0.001, squared_thresh=0.0225)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target vals? {check_target_vals(target)}")

Shape of D: (4896, 118)
Number of features: 18
target vals? all good


# Approach 4 - Differences 

In [14]:
def create_diffs(data):
    
    data['draw_diff'] = data['R_draw'] - data['B_draw'] ## draw difference
    data['SigStr_pct_dff'] = data['R_avg_SIG_STR_pct'] - data['B_avg_SIG_STR_pct'] ## mean sig strikes percent 
    data['SigStr_land_diff'] = data['R_avg_SIG_STR_landed'] - data['B_avg_SIG_STR_landed'] ## mean sig strikes 
    data['maj_dec_diff'] = data['R_win_by_Decision_Majority'] - data['B_win_by_Decision_Majority'] ## nean wins 
    data['split_dec_diff'] = data['R_win_by_Decision_Split'] - data['B_win_by_Decision_Split'] ## ^^
    data['unan_dec_diff'] = data['R_win_by_Decision_Unanimous'] - data['B_win_by_Decision_Unanimous'] ## ^^
    data['doc_stop_diff'] = data['R_win_by_TKO_Doctor_Stoppage'] - data['B_win_by_TKO_Doctor_Stoppage'] ##^^
    data['odds_diff'] = data['R_odds'] - data['B_odds'] ## odds
    data['ev_diff'] = data['R_ev'] - data['B_ev'] ## expected value 
    data['subs_diff'] = data['R_avg_SUB_ATT'] - data['B_avg_SUB_ATT'] ## subs attempted
    data['td_landed_diff'] = data['R_avg_TD_landed'] - data['B_avg_TD_landed'] ## takedowns 
    data['td_pct_diff'] = data['R_avg_TD_pct'] - data['B_avg_TD_pct'] ## takedowns percent 
    data['ko_diff'] = data['r_ko_odds'] - data['b_ko_odds'] ## knockouts 
    data['ko_win_diff'] = data['R_win_by_KO/TKO'] - data['B_win_by_KO/TKO'] ## wins by KO/TKO 
    
    return data 


In [15]:
def cols_of_differences(data):
    features, target = fracture(data)
    features = create_diffs(features)
    desired_cols = ['draw_diff', 'SigStr_pct_dff', 'SigStr_land_diff', 'maj_dec_diff', 
          'split_dec_diff', 'unan_dec_diff', 'doc_stop_diff', 'odds_diff', 
          'ev_diff', 'subs_diff', 'td_landed_diff', 'td_pct_diff', 'ko_diff', 
          'ko_win_diff']
    
    return features, desired_cols, target

In [23]:
D, selected_features, target = cols_of_differences(ufc)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target vals? {check_target_vals(target)}")

Shape of D: (4896, 132)
Number of features: 14
target vals? all good


# Approach 5 - Combined Differences and Categorical Variables

In [16]:
def differences_and_cat(data, cat_thresh, squared_thresh, just_cats=True):
    
    features, desired_cols, target = cols_of_differences(data)
    _, cats, not_imp = highest_correlating_num_and_cat(data, cat_thresh=cat_thresh, squared_thresh=squared_thresh, just_cats=just_cats)
    for cat in cats:
        desired_cols.append(cat)
    
    return features, desired_cols, target

In [25]:
D, selected_features, target = differences_and_cat(ufc, cat_thresh=0.001, squared_thresh=0.0225)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target vals? {check_target_vals(target)}")

Shape of D: (4896, 132)
Number of features: 16
target vals? all good


# Approach 6 - Discussion Comment Betting Variables

In [17]:
def discussion_comment_betting_variables(data):
    features, target = fracture(data)
    desired_cols = ['B_current_win_streak', 'R_win_by_Submission', 'B_win_by_Decision_Unanimous', 'R_win_by_Decision_Unanimous', 'R_current_lose_streak', 'B_win_by_TKO_Doctor_Stoppage', 
                    'win_dif', 'B_win_by_Decision_Split', 'B_wins', 'R_Stance', 'B_age', 'B_Weight_lbs', 'R_ev', 'B_total_rounds_fought', 'location', 
                    'R_odds', 'R_Reach_cms', 'R_Weight_lbs', 'R_current_win_streak', 'R_age', 'empty_arena', 'R_win_by_Decision_Split', 'R_draw', 'lose_streak_dif', 'B_draw']
    
    enc = LabelEncoder()
    
    for col in desired_cols:
        if features[col].dtype == 'object':
            features[col] = enc.fit_transform(features[col])
    
    return features, desired_cols, target

In [30]:
D, selected_features, target = discussion_comment_betting_variables(ufc)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target vals? {check_target_vals(target)}")

Shape of D: (4896, 118)
Number of features: 25
target vals? all good


# Approach 7 - Discussion Comment Betting Variables with Differences 

In [18]:
def disc_cols_with_differences(data):
    
    features, target = fracture(data)
    
    features = create_diffs(features)
    
    desired_cols = ['B_current_win_streak', 'R_current_lose_streak', 'win_dif', 
                       'R_Stance', 'B_age', 'B_Weight_lbs', 'location', 'R_Reach_cms', 
                       'R_Weight_lbs', 'R_current_win_streak', 'R_age', 'empty_arena', 'lose_streak_dif']
    
    return features, desired_cols, target

In [32]:
D, selected_features, target = disc_cols_with_differences(ufc)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target vals? {check_target_vals(target)}")

Shape of D: (4896, 132)
Number of features: 13
target vals? all good


# Encoding variable

In [19]:
def encode_if_needed(D):
    enc = LabelEncoder()
    num, cat = num_and_cat(D)
    
    if len(cat) == 0:
        return D
    else:
        for c in cat:
            D[c] = enc.fit_transform(D[c])

# Running Model Execution

In [27]:
def data_prep_and_feat_engineering(data, cat_thresh, squared_thresh):
    
    ### handle nulls
    data = handle_nulls(data)
    
    ### dictionary of approaches 
    approach_dict = {1 : dummy_approach,
                     2 : highest_correlating_num_cols,
                     3 : highest_correlating_num_and_cat,
                     4 : cols_of_differences,
                     5 : differences_and_cat,
                     6 : discussion_comment_betting_variables,
                     7 : disc_cols_with_differences}
    
    results_dict = {}
    
    for i in range(1,8):
        
        if i == 2:
            features, desired_cols, target = approach_dict[i](data, squared_thresh)
        elif i in [3, 5]:
            features, desired_cols, target = approach_dict[i](data, cat_thresh, squared_thresh)
        else:
            features, desired_cols, target = approach_dict[i](data)
        
        approach_name = f"approach {i}"
        
        results_dict[approach_name] = (features, desired_cols, target)
    
    return results_dict

In [24]:
import numpy as np 
import pandas as pd 

ufc = pd.read_csv('ufc-master.csv')
ufc = handle_nulls(ufc)

In [28]:
test = data_prep_and_feat_engineering(ufc, cat_thresh=0.001, squared_thresh=0.0225)

In [33]:
test['approach 1'][2]

[1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def trying_it_out(D, selected_features, target, model, ts=0.2):
    
    feats = D[selected_features]
    
    feats = encode_if_needed(feats)
    
    X_train, X_test, y_train, y_test = train_test_split(feats, target, random_state=0, test_size=ts)
    
    model.fit(X_train, y_train)
    
    y_preds = model.predixt(X_test)
    
    return accuracy_score(y_preds, y_test)