In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import random
import math

In [None]:
# Read the raw matches and wrestlers tables from CSV and discard the
# "Unnamed: 0" column that each file carries.
full_matches = pd.read_csv('MATCHES.csv')
full_matches = full_matches.drop(columns=["Unnamed: 0"])
full_wrestlers = pd.read_csv('WRESTLERS.csv')
full_wrestlers = full_wrestlers.drop(columns=["Unnamed: 0"])

In [None]:
full_matches.shape # (rows, columns) of the raw matches table: number of matches, recorded match variables

In [None]:
# Column names available in the raw matches table
full_matches.columns

In [None]:
# Preview the first rows only; rendering the whole frame bloats the notebook output
full_matches.head()

In [None]:
full_wrestlers.shape # (rows, columns) of the raw wrestlers table: number of wrestlers, recorded wrestler variables

In [None]:
# Column names available in the raw wrestlers table
full_wrestlers.columns

In [None]:
# Preview the first rows only; rendering the whole frame bloats the notebook output
full_wrestlers.head()

In [None]:
# Exclude matches whose victory type carries no information about wrestler strength
# (ASSUMPTION: forfeits, byes, etc. should have no effect on ranking the wrestlers).
# Rows with a missing victory type are kept.
bad_wins = ['Forfeit','Injury Default','Medical Forfeit','Bye','Disqualified','Default','No Contest']
win_filter = (~full_matches["Victory Type (L)"].isin(bad_wins)).tolist()
MATCHES = (
    full_matches
    .loc[win_filter]
    .drop_duplicates()        # dedupe seems to work now
    .reset_index(drop=True)
)
# Go back and check if dedupe is removing more than it should -- maybe not distinguishing multiple bouts on same day
MATCHES.shape

In [None]:
# Copy infoscrape function from Wrestling Tables notebook

def infoscrape(fullname,df):
    '''Collect summary info for one wrestler from a matches dataframe.

    Parameters
    ----------
    fullname : str
        Wrestler's full name, matched exactly against the 'Winner Full Name'
        and 'Loser Full Name' columns.
    df : pandas.DataFrame
        Matches dataframe. Columns read: 'Winner Full Name', 'Loser Full Name',
        'Weight Class', 'Winner School (L)', 'Winner School (S)',
        'Loser School (L)', 'Loser School (S)'.

    Returns
    -------
    dict
        Keys 'First Name', 'Last Name', 'Full Name', 'School Name',
        'School Code', 'Weight Class', 'Wins', 'Losses', 'Matches'.
        A one-word name yields an empty 'Last Name'.

    Raises
    ------
    IndexError
        If fullname appears in neither name column of df.
    '''

    # Find observations corresponding to wrestler name
    win_id = df['Winner Full Name'] == fullname
    loss_id = df['Loser Full Name'] == fullname
    winning_matches = df.loc[win_id,:]
    losing_matches = df.loc[loss_id,:]

    # Split on the first space only; partition (unlike a two-way unpack of
    # split) also copes with names that contain no space at all.
    first_name, _, last_name = fullname.partition(' ')

    # Counting stats (should check if names show in correct columns for forfeits, byes, etc.)
    wins = int(win_id.sum())
    losses = int(loss_id.sum())
    matches = wins + losses

    # Weight class: taken from the first winning match if there is one,
    # otherwise from the first losing match.
    # !!!Still need to add consideration for multiple weight classes!!!
    win_weight = winning_matches['Weight Class'].unique()
    loss_weight = losing_matches['Weight Class'].unique()

    if win_weight.size > 0: # Avoiding 'if win_weight:' because it gives truth ambiguity warning
        weight_class = int(win_weight[0])
    else:
        weight_class = int(loss_weight[0])

    # School name/code: same preference order (winning matches first)
    win_school = winning_matches['Winner School (L)'].unique()
    win_school_code = winning_matches['Winner School (S)'].unique()
    loss_school = losing_matches['Loser School (L)'].unique()
    loss_school_code = losing_matches['Loser School (S)'].unique()

    if win_school.size > 0: # Avoiding 'if win_school:' because it gives truth ambiguity warning
        school = win_school[0]
        school_code = win_school_code[0]
    else:
        school = loss_school[0]
        school_code = loss_school_code[0]

    # Return dict of extracted data
    return({'First Name':first_name,'Last Name':last_name,'Full Name':fullname,
            'School Name':school,'School Code':school_code,
            'Weight Class':weight_class,'Wins':wins,'Losses':losses,'Matches':matches})

In [None]:
# Remake wrestlers df
# Note: union of winner/loser full names is set of all wrestlers in dataset
# Names are collected in order of first appearance (not via a Python set) so the
# row order of WRESTLERS is identical on every run; dropna removes missing names.
wrestlers = pd.concat([MATCHES['Winner Full Name'], MATCHES['Loser Full Name']]).dropna().unique().tolist()
wrestler_data = [infoscrape(wrestler,MATCHES) for wrestler in wrestlers]
WRESTLERS = pd.DataFrame(wrestler_data)

In [None]:
# (rows, columns) of the rebuilt wrestlers table: one row per name passed to infoscrape
WRESTLERS.shape

In [None]:
# Function to create train and test data split by date of wrestling events
# Note: research paper trained on one weight class and tested on all the rest.
# Why this is a big separate function: have to remake wrestlers dataframe from filtered matches dataframe

def _wrestler_table(matches):
    '''Build one infoscrape row per distinct, non-missing wrestler name in matches.'''
    names = pd.concat([matches['Winner Full Name'], matches['Loser Full Name']]).dropna().unique()
    return pd.DataFrame([infoscrape(name, matches) for name in names])


def train_test_split(match_data, wrestler_data=None, split_method='date',
                    earliest=None, latest=None, train_size=0.75):
    '''Split match data into train and test sets and rebuild wrestler tables for each.

    Parameters
    ----------
    match_data : pandas.DataFrame
        Matches dataframe; must contain 'Event Date', 'Winner Full Name',
        'Loser Full Name' and the columns infoscrape reads.
    wrestler_data : optional
        Unused; kept so existing callers do not break.
    split_method : {'date', 'size'}
        'date': rows with earliest <= 'Event Date' <= latest form the train set,
        all other rows form the test set.
        'size': rows are split by position; the trailing block of rows is the
        train set and the leading block is the test set.
    earliest, latest : optional
        Date bounds (YYYYMMDD). Default to the min / max 'Event Date'.
        They are validated for both split methods.
    train_size : float
        Fraction between 0 and 1 used to locate the cut for the 'size' split.

    Returns
    -------
    dict
        Keys 'match_train', 'match_test', 'wrestler_train', 'wrestler_test'.

    Raises
    ------
    ValueError
        Unknown split_method, empty match_data, latest after the most recent
        event, earliest >= latest, or train_size outside [0, 1].
    '''

    if split_method not in ('size', 'date'):
        raise ValueError("Invalid split_method ({!r}): expected 'size' or 'date'".format(split_method))

    event_dates = match_data["Event Date"]
    if len(event_dates) == 0:
        raise ValueError('Invalid input: match_data contains no matches to split')

    # Default dates
    most_recent = event_dates.max()
    if earliest is None:
        earliest = event_dates.min()
    if latest is None:
        latest = most_recent

    # Handle input exceptions
    if latest > most_recent:
        raise ValueError('Invalid indexing: latest ({}) cannot be after most recent event ({})'\
                         .format(latest,most_recent))
    if earliest >= latest:
        raise ValueError('Invalid indexing: earliest ({}) must be less than latest ({})'\
                         .format(earliest,latest))

    # Train-Test Split
    if split_method == 'size': # split by train_size

        if not 0 <= train_size <= 1:
            raise ValueError('Invalid train_size ({}): must be between 0 and 1'.format(train_size))

        # Work with row positions, not index labels: the cut is applied with
        # iloc, so labels from a filtered / non-default index must not leak in.
        n = len(match_data)
        train_start = int(np.quantile(np.arange(n),q=1-train_size))
        match_train = match_data.iloc[train_start:]
        match_test = match_data.iloc[:train_start]

    else: # split by date range (inclusive on both ends)

        in_range = event_dates.between(earliest,latest)
        match_train = match_data.loc[in_range]
        match_test = match_data.loc[~in_range]

    # Wrestler tables are rebuilt from each split separately, so counting stats
    # (wins/losses/matches) only reflect the matches inside that split.
    # Not sure if making wrestler test set like this makes total sense but I'll do it for now
    # Maybe because of cumulative stats, wrestler test set is always up-to-date full wrestler data?
    wrestler_train = _wrestler_table(match_train)
    wrestler_test = _wrestler_table(match_test)

    # Store train/test splits in dict
    train_test_dict = {"match_train":match_train,"match_test":match_test,
                      "wrestler_train":wrestler_train,"wrestler_test":wrestler_test}

    return(train_test_dict)

In [None]:
def closest(arr, K): 
    '''Return the element of arr whose value is nearest to K.

    arr may be any numeric array-like (ndarray, list, tuple). On a tie the
    element that occurs first is returned. An empty arr raises ValueError
    (from argmin).
    '''
    values = np.asarray(arr)
    idx = np.abs(values - K).argmin()
    return(values[idx])

In [None]:
def _estimate_win_perc(name, wrestler_train, wrestler_test):
    '''Estimate one wrestler's win percentage from the training wrestler table.

    Order of preference:
      1. the wrestler's own Wins / Matches, if present in wrestler_train;
      2. mean Wins / mean Matches over training wrestlers from the same school
         (school looked up in wrestler_test);
      3. mean Wins of training wrestlers whose match count is closest to this
         wrestler's test match count, divided by that match count.
    '''
    in_train = wrestler_train["Full Name"] == name # should add school clarification step for same names

    if in_train.any(): # wrestler in train set, has their own WP
        own_record = wrestler_train.loc[in_train]
        return (own_record["Wins"] / own_record["Matches"]).values[0]

    # Wrestler not in train set: fall back on what the test table says about them
    test_record = wrestler_test.loc[wrestler_test["Full Name"] == name]
    school = test_record["School Name"].values[0]

    same_school = wrestler_train["School Name"] == school
    if same_school.any(): # School WP
        teammates = wrestler_train.loc[same_school]
        return teammates["Wins"].mean() / teammates["Matches"].mean()

    # No other wrestlers from school in data :(
    # Use average wins of all wrestlers with same/closest number of matches (experience counts!)
    test_match_num = test_record["Matches"].values[0]
    wins_by_match = wrestler_train.groupby("Matches")["Wins"].mean()
    train_match_num = closest(wins_by_match.index.values,test_match_num)
    return wins_by_match[train_match_num] / train_match_num


def win_perc_pred(wrestler1,wrestler2,wrestler_train,wrestler_test,match_train,match_test):
    '''Predict that the wrestler with the higher (estimated) win percentage wins.

    Parameters
    ----------
    wrestler1, wrestler2 : str
        Full names of the two wrestlers.
    wrestler_train, wrestler_test : pandas.DataFrame
        Wrestler tables with 'Full Name', 'School Name', 'Wins', 'Matches'.
    match_train, match_test
        Not used by this predictor; accepted so all algorithms share one
        signature for evaluation.

    Returns
    -------
    dict
        {"Winner": name or None, "WP_diff": absolute difference of the two win
        percentages}. "Winner" is None when the win percentages are equal.

    Raises
    ------
    IndexError
        If a wrestler is found in neither wrestler_train nor wrestler_test.
    '''

    # log_dict to track when defaulting to school WP or matches WP, or other info tidbits

    # add confidence measure based on difference in WPs

    win_perc_1 = _estimate_win_perc(wrestler1, wrestler_train, wrestler_test)
    win_perc_2 = _estimate_win_perc(wrestler2, wrestler_train, wrestler_test)

    # Do we want a minimal difference to declare a decision?
    wp_diff = abs(win_perc_1 - win_perc_2) 
    # Reformat this into a more general confidence measure -> e.g. 0.0-0.25 == "*", 0.25-0.5 == "**", etc.

    if win_perc_1 > win_perc_2:
        return({"Winner":wrestler1,"WP_diff":wp_diff})
    elif win_perc_1 == win_perc_2:
        return({"Winner":None,"WP_diff":wp_diff}) # TODO: Track these in validation
    else:
        return({"Winner":wrestler2,"WP_diff":wp_diff})

In [None]:
def matchmaker(match_test):
    '''Return the list of (winner, loser) full-name pairs for the given matches.

    Rows are read by position, so match_test may carry any index (e.g. the
    non-contiguous labels left behind by a date-based split).

    If one of the two names is missing (NaN), the pair becomes the present
    wrestler against himself. If both are missing the pair is (NaN, NaN).
    '''

    test_matchups = []

    for w1, w2 in zip(match_test["Winner Full Name"], match_test["Loser Full Name"]):

        # nan entry -> just have wrestler go against himself for now (should result in no winner)
        # no option for both nan, but that is a datapoint I don't even want
        if w1!=w1:
            w1 = w2
        elif w2!=w2:
            w2 = w1

        test_matchups.append((w1,w2))

    return(test_matchups)

In [None]:
def test_algorithm(algorithm,match_train,match_test,wrestler_train,wrestler_test):
    '''Run a prediction algorithm over every test matchup and score it.

    algorithm is called as
    algorithm(w1, w2, wrestler_train, wrestler_test, match_train, match_test)
    and must return a dict with keys "Winner" and "WP_diff".

    Returns a dict with "Accuracy", "NumCorrect", "NumIncorrect", "N",
    "WrongPreds" (the rows of match_test that were predicted incorrectly) and
    "PredConfidences" (the "WP_diff" of every prediction, in matchup order).
    '''

    # One prediction per (winner, loser) pair extracted from the test matches
    outputs = []
    for first, second in matchmaker(match_test):
        outputs.append(algorithm(first,second,wrestler_train,wrestler_test,match_train,match_test))

    predicted = [result["Winner"] for result in outputs]
    confidences = [result["WP_diff"] for result in outputs] # make this more general for new algs

    # Compare against the recorded winners, keep the misses for diagnostics
    actual = match_test["Winner Full Name"]
    hits = actual == predicted
    misses = match_test.loc[actual != predicted,:]

    total = len(hits)
    num_right = sum(hits)
    num_wrong = total - num_right

    return({"Accuracy":hits.mean(),"NumCorrect":num_right,"NumIncorrect":num_wrong,"N":total,
           "WrongPreds":misses, "PredConfidences":confidences})

In [None]:
# See if functions do their job

In [None]:
# Build the train/test split used by the cells below (80% of matches for training)
# Slow: infoscrape is called once per wrestler in each split
train_test_dict = train_test_split(MATCHES, train_size=0.8, split_method='size')
match_train, wrestler_train = (train_test_dict[key] for key in ("match_train", "wrestler_train"))
match_test, wrestler_test = (train_test_dict[key] for key in ("match_test", "wrestler_test"))

In [None]:
# Score the win-percentage predictor on the held-out matches
# Note: create algorithm_args dict argument for algorithms and algorithm tester
WP_pred_results = test_algorithm(
    algorithm=win_perc_pred,
    match_train=match_train,
    match_test=match_test,
    wrestler_train=wrestler_train,
    wrestler_test=wrestler_test,
)

In [None]:
# Fraction of test matches whose recorded winner matches the predicted winner
WP_pred_results['Accuracy']

In [None]:
# Algorithm diagnostics


In [None]:
# Histogram of weight classes over the matches that were predicted incorrectly
# Misleading: should scale by num wrestler/matches in weight class
# Note: make this a bar chart instead if possible
WP_pred_results['WrongPreds'].hist(column="Weight Class")
plt.gca().set(xlabel="Weight Class",
              ylabel="Number of Incorrect Predictions",
              title="Incorrect WP Preds by Weight Class")

# Save fig
plt.savefig('./Plots/incorrect_preds_weight_class.png')

In [None]:
# See predictive power of WP on this dataset by shrinking train data size
train_sizes = np.linspace(start=0.5,stop=1.0,num=6)
accuracy_dict = {}

for size in train_sizes:
    
    train_test_dict = train_test_split(MATCHES,split_method='size',train_size=size)
    match_train = train_test_dict["match_train"]
    wrestler_train = train_test_dict["wrestler_train"]
    match_test = train_test_dict["match_test"]
    wrestler_test = train_test_dict["wrestler_test"]
    
    # A train size of 1.0 leaves no held-out matches; scoring an empty test set
    # would only record a NaN accuracy, so skip it.
    if match_test.empty:
        continue
    
    WP_pred_results = test_algorithm(win_perc_pred,match_train,match_test,wrestler_train,wrestler_test)
    
    # Round the key so float noise from linspace does not end up in the plot's tick labels
    accuracy_dict[round(float(size), 2)] = WP_pred_results['Accuracy']

In [None]:
# Leftover loop variable from the sweep above: shows the last train size that was iterated
size

In [None]:
# Bar chart of prediction accuracy, one bar per train-set size
plt.bar(range(len(accuracy_dict)), [*accuracy_dict.values()], align='center')
plt.xticks(range(len(accuracy_dict)), [*accuracy_dict.keys()])
plt.gca().set(title='Prediction Accuracy',
              xlabel='Train Set Size',
              ylabel='Accuracy')

# Save fig
plt.savefig('./Plots/win_perc_pred_accuracy.png')

plt.show()

In [None]:
# Histogram of how many matches each wrestler has
WRESTLERS.hist(column="Matches")
plt.gca().set(xlabel="Number of Matches",
              ylabel="Number of Wrestlers",
              title="Distribution of Wrestlers by Number of Matches")

# Save fig
plt.savefig('./Plots/wrestler_match_dist.png')

# Vast majority of wrestlers have less than 15 matches

In [None]:
# Show distribution of matches by weight class
wrestlers_by_weight = WRESTLERS.groupby('Weight Class')
# Sum only the plotted column; summing the whole frame would also aggregate
# the name/school string columns for no benefit.
wrestlers_by_weight[["Matches"]].sum().plot.bar(y="Matches")
plt.title("Distribution of Matches by Weight Class")

# Save fig
plt.savefig('./Plots/matches_by_weight_class.png')

# Fairly balanced weight classes; flattened bell curve

In [None]:
# Check count of matches by victory type 
# There are nan entries for Victory Type (L)
#matches_by_wintype = MATCHES.groupby('Victory Type (L)')
#matches_by_wintype.describe()['Match ID'].plot.bar(y='count')
#plt.title("Distribution of Matches by Victory Type")

# Save fig
#plt.savefig('./Plots/matches_by_win_type.png')

# Practically all victory types are fall or decision

In [None]:
# Show distribution of avg win perc by matches
wrestlers_by_matches = WRESTLERS.groupby('Matches')
# Select 'Wins' before averaging: mean() over the whole frame would try to
# average the name/school string columns, which raises on recent pandas.
avg_wins = wrestlers_by_matches['Wins'].mean()
num_matches = avg_wins.index.values
win_percs = avg_wins / num_matches
win_percs.plot.bar()
plt.title("Average Win Percentage by Number of Matches")

# Save fig
plt.savefig('./Plots/win_percs_by_matches.png')

# Does win percentage increase with number of matches?
# More matches means more experience, but also consider that you may only get more matches if you're already winning
# Some positive correlation, as expected