In [230]:
import numpy as np 
import pandas as pd 

ufc = pd.read_csv('ufc-master.csv')

In [231]:
ufc.Winner.value_counts()

Winner
Red     2859
Blue    2037
Name: count, dtype: int64

In [232]:
### filling nulls with mode (if categorical) or mean (if numerical)

def handle_nulls(data):
    for col in data.columns:
        if data[col].isna().sum() >=4890:
            data = data.drop([col], axis=1)
        else:
            if data[col].dtype in ['int64', 'float64']:
                data = data.fillna({col : data[col].mean()})
            else:
                data = data.fillna({col : data[col].mode()[0]})
    
    return data

In [233]:
def check_target_vals(target):
    
    red = 0
    
    blue = 0
    
    for val in target:
        if val == 1:
            red += 1
        else:
            blue += 1

    if red != 2859:
        return f"red disconnect - instead of 2859, {red}"
    elif blue != 2037:
        return f"blue disconnect - instead of 2037, {blue}"
    else:
        return "all good"

In [234]:
ufc = handle_nulls(ufc)

In [235]:
def num_and_cat(features):
    num_col = [col for col in features.columns if features[col].dtype in ['int64', 'float64']]
    cat_col = [col for col in features.columns if col not in num_col]
    return num_col, cat_col 

In [236]:
def fracture(data):
    target = [1 if victor == 'Red' else 0 for victor in data['Winner']]
    features = data.drop(['Winner'], axis=1)
    return features, target

In [237]:
X, y = fracture(ufc)

In [238]:
check_target_vals(y)

'all good'

In [239]:
def dummy_approach(data):
    features, target = fracture(data)
    return features, features.columns, target

In [240]:
D, selected_features, target = dummy_approach(ufc)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target? {check_target_vals(target)}")

Shape of D: (4896, 117)
Number of features: 117
target? all good


In [241]:
def highest_correlating_num_cols(data, squared_thresh):
    features, target = fracture(data)
    desired_cols = []
    
    to_mess_with = features.copy()
    to_mess_with['target'] = target
    
    num_col, _ = num_and_cat(features)
    
    for col in num_col:
        test_val = (to_mess_with[col].corr(to_mess_with['target']))**2

        if test_val >= squared_thresh:
            desired_cols.append(col)
        
    return features, desired_cols, target

In [242]:
D, selected_features, target = highest_correlating_num_cols(ufc, squared_thresh=0.0225)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"target vals? {check_target_vals(target)}")

Shape of D: (4896, 117)
Number of features: 7
target vals? all good


In [153]:
from sklearn.preprocessing import LabelEncoder

def highest_correlating_num_and_cat(data, cat_thresh, squared_thresh, just_cats = False):
    
    features, target = fracture(data)
    
    enc = LabelEncoder()
    desired_cols = []
    
    num_col = [col for col in features.columns if features[col].dtype == 'int64' or features[col].dtype == 'float64']
    cat_col = [col for col in features.columns if col not in num_col]
    
    for col in cat_col:
        features[col] = enc.fit_transform(features[col])
    
    for col in cat_col:
        test_val = (features[col].corr(target))**2
        test = test_val >= cat_thresh
        
        if test:
            desired_cols.append(col)
    
    if not just_cats:
        _, nums, unimp = highest_correlating_num_cols(data, squared_thresh)
        for num in nums:
            desired_cols.append(num)
            
    return features, desired_cols, target

In [156]:
D, selected_features, target = highest_correlating_num_and_cat(ufc, cat_thresh=0.001, squared_thresh=0.0225)
print(f"Shape of D: {D.shape}")
print(f"Number of features: {len(selected_features)}")
print(f"Shape of target: {target.shape}")

Shape of D: (4896, 117)
Number of features: 0
Shape of target: (4896,)


In [40]:
def create_diffs(data):
    
    data['draw_diff'] = data['R_draw'] - data['B_draw'] ## draw difference
    data['SigStr_pct_dff'] = data['R_avg_SIG_STR_pct'] - data['B_avg_SIG_STR_pct'] ## mean sig strikes percent 
    data['SigStr_land_diff'] = data['R_avg_SIG_STR_landed'] - data['B_avg_SIG_STR_landed'] ## mean sig strikes 
    data['maj_dec_diff'] = data['R_win_by_Decision_Majority'] - data['B_win_by_Decision_Majority'] ## nean wins 
    data['split_dec_diff'] = data['R_win_by_Decision_Split'] - data['B_win_by_Decision_Split'] ## ^^
    data['unan_dec_diff'] = data['R_win_by_Decision_Unanimous'] - data['B_win_by_Decision_Unanimous'] ## ^^
    data['doc_stop_diff'] = data['R_win_by_TKO_Doctor_Stoppage'] - data['B_win_by_TKO_Doctor_Stoppage'] ##^^
    data['odds_diff'] = data['R_odds'] - data['B_odds'] ## odds
    data['ev_diff'] = data['R_ev'] - data['B_ev'] ## expected value 
    data['subs_diff'] = data['R_avg_SUB_ATT'] - data['B_avg_SUB_ATT'] ## subs attempted
    data['td_landed_diff'] = data['R_avg_TD_landed'] - data['B_avg_TD_landed'] ## takedowns 
    data['td_pct_diff'] = data['R_avg_TD_pct'] - data['B_avg_TD_pct'] ## takedowns percent 
    data['ko_diff'] = data['r_ko_odds'] - data['b_ko_odds'] ## knockouts 
    data['ko_win_diff'] = data['R_win_by_KO/TKO'] - data['B_win_by_KO/TKO'] ## wins by KO/TKO 
    
    return data 


In [77]:
def cols_of_differences(data):
    features, target = fracture(data)
    features = create_diffs(features)
    desired_cols = ['draw_diff', 'SigStr_pct_dff', 'SigStr_land_diff', 'maj_dec_diff', 
          'split_dec_diff', 'unan_dec_diff', 'doc_stop_diff', 'odds_diff', 
          'ev_diff', 'subs_diff', 'td_landed_diff', 'td_pct_diff', 'ko_diff', 
          'ko_win_diff']
    
    return features, desired_cols, target

In [79]:
D, selected_features, target = cols_of_differences(ufc)

In [89]:
def differences_and_cat(data, cat_thresh, squared_thresh, just_cats=True):
    
    features, desired_cols, target = cols_of_differences(data)
    _, cats, not_imp = highest_correlating_num_and_cat(data, cat_thresh=cat_thresh, squared_thresh=squared_thresh, just_cats=just_cats)
    for cat in cats:
        desired_cols.append(cat)
    
    return features, desired_cols, target

In [85]:
D, selected_features, target = differences_and_cat(ufc, cat_thresh=0.001, squared_thresh=0.0225)

In [87]:
selected_features

['draw_diff',
 'SigStr_pct_dff',
 'SigStr_land_diff',
 'maj_dec_diff',
 'split_dec_diff',
 'unan_dec_diff',
 'doc_stop_diff',
 'odds_diff',
 'ev_diff',
 'subs_diff',
 'td_landed_diff',
 'td_pct_diff',
 'ko_diff',
 'ko_win_diff']

In [103]:
def discussion_comment_betting_variables(data):
    features, target = fracture(data)
    desired_cols = ['B_current_win_streak', 'R_win_by_Submission', 'B_win_by_Decision_Unanimous', 'R_win_by_Decision_Unanimous', 'R_current_lose_streak', 'B_win_by_TKO_Doctor_Stoppage', 
                    'win_dif', 'B_win_by_Decision_Split', 'B_wins', 'R_Stance', 'B_age', 'B_Weight_lbs', 'R_ev', 'B_total_rounds_fought', 'location', 
                    'R_odds', 'R_Reach_cms', 'R_Weight_lbs', 'R_current_win_streak', 'R_age', 'empty_arena', 'R_win_by_Decision_Split', 'R_draw', 'lose_streak_dif', 'B_draw']
    
    enc = LabelEncoder()
    
    for col in desired_cols:
        if features[col].dtype == 'object':
            features[col] = enc.fit_transform(features[col])
    
    return features, desired_cols, target


In [92]:
D, selected_features, target = discussion_comment_betting_variables(ufc)

cat
cat


In [93]:
def disc_cols_with_differences(data):
    
    features, target = fracture(data)
    
    features = create_diffs(features)
    
    desired_cols = ['B_current_win_streak', 'R_current_lose_streak', 'win_dif', 
                       'R_Stance', 'B_age', 'B_Weight_lbs', 'location', 'R_Reach_cms', 
                       'R_Weight_lbs', 'R_current_win_streak', 'R_age', 'empty_arena', 'lose_streak_dif']
    
    return features, desired_cols, target

In [96]:
def data_prep_and_feat_engineering(data, cat_thresh, squared_thresh):
    
    ### handle nulls
    data = handle_nulls(data)
    
    ### dictionary of approaches 
    approach_dict = {1 : dummy_approach,
                     2 : highest_correlating_num_cols,
                     3 : highest_correlating_num_and_cat,
                     4 : cols_of_differences,
                     5 : differences_and_cat,
                     6 : discussion_comment_betting_variables,
                     7 : disc_cols_with_differences}
    
    for i in range(1,8):
        
        if i == 2:
            features, desired_cols, target = approach_dict[i](data, squared_thresh)
        elif i in [3, 5]:
            features, desired_cols, target = approach_dict[i](data, cat_thresh, squared_thresh)
        else:
            features, desired_cols, target = approach_dict[i](data)
        
        print(f"--- Approach {i} --- ")
        print(f"Shape of D: {features.shape}")
        print(f"Number of features: {len(desired_cols)}")
        print(f"Shape of target: {target.shape}")
    
    return i

In [219]:
i_val = data_prep_and_feat_engineering(ufc, cat_thresh=0.001, squared_thresh=0.0225)

--- Approach 1 --- 
Shape of D: (4896, 117)
Number of features: 117


AttributeError: 'list' object has no attribute 'shape'