In [1]:
import pandas as pd

# Import Data

In [2]:
# Load the two raw inputs: fighter profiles into `fighter`, per-fight totals into `fight`.
# The fight totals file is semicolon-separated, so the separator is given explicitly.
fighter = pd.read_csv('../raw_data/raw_fighter_details.csv')
fight = pd.read_csv('../raw_data/raw_total_fight_data.csv', sep=';')

# Merge Datasets

In [3]:
# Build one copy of the fighter table per corner, prefixing every column with
# "R_" (red) or "B_" (blue) so the two sets stay distinguishable after merging.
R_fighter, B_fighter = (fighter.add_prefix(prefix) for prefix in ("R_", "B_"))

# Left-join each corner's fighter details onto the fights, matching on the fighter's name.
data = (
    fight
    .merge(R_fighter, how='left', left_on='R_fighter', right_on='R_fighter_name')
    .merge(B_fighter, how='left', left_on='B_fighter', right_on='B_fighter_name')
)

# Clean Data

In [4]:
# Strip the unit text from the Weight and Reach columns so the values become floats.
# The Height columns are only declared here.

weight_cols = ['R_Weight', 'B_Weight']
reach_cols = ['R_Reach', 'B_Reach']
height_cols = ['R_Height', 'B_Height']

# Pair each group of columns with the text to remove: ' lbs.' for weights, '"' for reaches.
# Values that are already floats are left untouched.
for cols, unit in ((weight_cols, ' lbs.'), (reach_cols, '"')):
    for col in cols:
        data[col] = data[col].apply(
            lambda raw: raw if type(raw) == float else float(raw.replace(unit, ''))
        )
    
# Convert height from feet' inch" to cm using a function applied to the column via lambda
def height_convert(height):
    """Convert a height string in feet/inches notation to centimetres.

    Parameters
    ----------
    height : str
        Text such as ``5' 11"``, ``6'`` or ``6``. The part before the ``'``
        is read as feet and the part after it as inches.

    Returns
    -------
    float
        The height in centimetres (1 ft = 30.48 cm, 1 in = 2.54 cm).

    Raises
    ------
    ValueError
        If the feet part, or a non-empty inches part, is not a number.
    """
    # Split on the feet marker, then drop the inch marker and padding from each piece.
    parts = [piece.replace('"', '').strip() for piece in height.split("'")]

    feet = float(parts[0])
    # A value such as "6'" leaves an empty inches piece; treat that as zero inches
    # rather than passing the empty string to float().
    if len(parts) == 2 and parts[1]:
        inches = float(parts[1])
    else:
        inches = 0.0

    return feet * 30.48 + inches * 2.54

# Convert every height column to centimetres; entries that are already floats are kept as they are.
for column in height_cols:
    heights = data[column]
    data[column] = heights.apply(
        lambda value: value if type(value) == float else height_convert(value)
    )

In [5]:
# Parse both date-of-birth columns as datetimes, then derive each fighter's age in years as of today.
data[['R_DOB','B_DOB']] = data[['R_DOB','B_DOB']].apply(pd.to_datetime)

# Seconds in an average year: 365.25 days (the .25 accounts for leap years) x 24 h x 60 min x 60 s.
seconds_per_year = 365.25*24*60*60

for corner in ('R', 'B'):
    # Today's date at midnight, rebuilt from its formatted date string.
    today = pd.to_datetime(pd.Timestamp("today").strftime("%Y-%m-%d"))
    elapsed = today - data[f'{corner}_DOB']
    # Express the elapsed time since birth as a fractional number of years.
    data[f'{corner}_current_age'] = elapsed.apply(
        lambda delta: delta.total_seconds() / seconds_per_year
    )

In [6]:
# Remove the percentage sign from the columns.

def remove_pct(pct):
    """Return ``pct`` as a float, stripping any '%' characters first when it is a string.

    Non-string values are passed to ``float()`` unchanged.
    """
    cleaned = pct.replace('%', '') if type(pct) is str else pct
    return float(cleaned)

# Columns whose values carry a trailing percent sign.
pct_col = [
    'R_Str_Acc', 'B_Str_Acc',
    'R_Str_Def', 'B_Str_Def',
    'R_TD_Acc', 'B_TD_Acc',
    'R_TD_Def', 'B_TD_Def',
]

# Convert each of them to plain floats, element by element.
for col in pct_col:
    data[col] = data[col].apply(remove_pct)

In [7]:
# R_CTRL and B_CTRL and last_round_time to seconds.

def get_sec(time):
    """Convert a ``mm:ss`` time string to a number of seconds.

    Parameters
    ----------
    time : str or any
        A string such as ``'4:35'``. Non-string values (for example NaN, None,
        or a number produced by an earlier run of this conversion) are
        returned unchanged so the conversion can be applied repeatedly.

    Returns
    -------
    float or None
        Minutes * 60 + seconds for a ``mm:ss`` string; ``None`` for any other
        string, including the ``'--'`` placeholder. Non-string input is
        returned as-is.

    Raises
    ------
    ValueError
        If the string has two ':'-separated parts that are not numeric.
    """
    # Nothing to parse: calling .split on a non-string would raise AttributeError.
    if not isinstance(time, str):
        return time

    parts = time.split(':')
    # Anything that is not exactly "minutes:seconds" (this includes '--') is treated as missing.
    if len(parts) != 2:
        return None

    minutes, seconds = parts
    return int(minutes) * 60 + float(seconds)

# Columns holding mm:ss durations that should be expressed in seconds.
time_col = ['R_CTRL', 'B_CTRL', 'last_round_time']

for duration_col in time_col:
    data[duration_col] = data[duration_col].apply(get_sec)

In [8]:
# Generate percentages for all cols holding 'X of Y' values.
def generate_pct(string):
    """Turn an ``'X of Y'`` string into the percentage ``X / Y * 100``.

    Parameters
    ----------
    string : str or any
        Text such as ``'10 of 20'``. Non-string values (for example NaN or
        None) are treated as missing.

    Returns
    -------
    float or None
        The percentage, or ``None`` when the input is not a string, does not
        contain ``'of'``, or has a zero denominator.

    Raises
    ------
    ValueError
        If the text around ``'of'`` is not numeric, or ``'of'`` occurs more
        than once.
    """
    # The membership test below would raise TypeError on a non-string such as NaN.
    if not isinstance(string, str) or 'of' not in string:
        return None

    num, den = string.split("of")
    denominator = float(den)
    # Avoid dividing by zero when nothing was attempted.
    if denominator == 0:
        return None
    return (float(num) / denominator) * 100

# Columns stored as 'X of Y' text; each is replaced by a "<name>_pct" percentage column.
pct_conv_cols = [
    'R_SIG_STR.', 'B_SIG_STR.', 'R_TOTAL_STR.', 'B_TOTAL_STR.',
    'R_TD', 'B_TD', 'R_HEAD', 'B_HEAD', 'R_BODY', 'B_BODY',
    'R_LEG', 'B_LEG', 'R_DISTANCE', 'B_DISTANCE',
    'R_CLINCH', 'B_CLINCH', 'R_GROUND', 'B_GROUND',
]

# Create all percentage columns first ...
for source_col in pct_conv_cols:
    data[f"{source_col}_pct"] = data[source_col].apply(generate_pct)

# ... then remove the original text columns in a single call.
data.drop(columns=pct_conv_cols, inplace=True)

In [9]:
# Encode the result from the red corner's side instead of by name:
# True where the red fighter is the named winner, False otherwise.
# Rows whose Winner holds anything else (including a missing value) therefore come out as False.
red_won = data['Winner'] == data['R_fighter']
data['R_Win'] = red_won

In [10]:
# Drop unnecessary columns

unneeded_cols = [
    'R_DOB', 'B_DOB',
    'R_SIG_STR_pct', 'B_SIG_STR_pct',
    'Referee', 'location', 'date', 'Winner',
]
data.drop(columns=unneeded_cols, inplace=True)

# Preprocess

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

In [12]:
# Column selectors: numeric dtypes on one side, object/bool dtypes on the other.
num_col = make_column_selector(dtype_include=['number'])
cat_col = make_column_selector(dtype_include=['object', 'bool'])

# Numeric columns: fill gaps with the median, then scale with RobustScaler.
num_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    RobustScaler(),
)

# Object/bool columns: fill gaps with the most frequent value only.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
)

# Combine both pipelines; columns matched by neither selector are passed through unchanged.
data_preproc = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough',
)

# Output

In [13]:
# Fit the preprocessor on the data and transform it, then label the result
# with the feature names reported by the fitted transformer.
transformed = data_preproc.fit_transform(data)
feature_names = data_preproc.get_feature_names_out()
out = pd.DataFrame(transformed, columns=feature_names)

In [14]:
# Print each output column with the value from the row labelled 1 as a quick sanity check.
out_cols = list(out.columns)
for column_name in out_cols:
    sample_value = out[column_name][1]
    print(f"{column_name}: {sample_value}")

pipeline-1__R_KD: 0.0
pipeline-1__B_KD: 0.0
pipeline-1__R_TD_pct: 0.0
pipeline-1__B_TD_pct: 0.1046153846153844
pipeline-1__R_SUB_ATT: 1.0
pipeline-1__B_SUB_ATT: 2.0
pipeline-1__R_REV: 0.0
pipeline-1__B_REV: 1.0
pipeline-1__R_CTRL: -0.03111111111111111
pipeline-1__B_CTRL: 1.3397435897435896
pipeline-1__last_round: 0.0
pipeline-1__last_round_time: 0.09868421052631579
pipeline-1__R_Height: 0.19999999999999954
pipeline-1__R_Weight: 0.375
pipeline-1__R_Reach: 0.4
pipeline-1__R_SLpM: 0.08552631578947362
pipeline-1__R_Str_Acc: 1.2
pipeline-1__R_SApM: -0.7801418439716312
pipeline-1__R_Str_Def: 0.5555555555555556
pipeline-1__R_TD_Avg: 0.0
pipeline-1__R_TD_Acc: 1.9047619047619047
pipeline-1__R_TD_Def: 0.68
pipeline-1__R_Sub_Avg: -0.22222222222222224
pipeline-1__B_Height: 0.6000000000000009
pipeline-1__B_Weight: 0.875
pipeline-1__B_Reach: 1.0
pipeline-1__B_SLpM: -0.03468208092485552
pipeline-1__B_Str_Acc: 0.7272727272727273
pipeline-1__B_SApM: -0.9103448275862069
pipeline-1__B_Str_Def: 0.54545454

In [17]:
# Show the (rows, columns) dimensions of the preprocessed frame.
out.shape

(6012, 62)

In [18]:
# Export merged data file.
# The path is passed straight to to_csv instead of a handle from open(..., 'w'):
# pandas then opens the file itself, so line endings are not doubled on Windows
# and the encoding is pandas' default rather than the platform locale.
out.to_csv('../raw_data/merged_out.csv', index=True, header=True)
# NOTE: the message shows only the file name; the file is written to ../raw_data/.
print("Merged data saved as '/merged_out.csv'")

Merged data saved as '/merged_out.csv'
