# Teddy Einsidler's Additions and Continuations May 21 2021

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the 2019 NBA data from a local, machine-specific path into df
# (the sections below refer to this as the rotoguru data).
# There was an error message that I believe showed up for Vala as well
df = pd.read_csv('/Users/TeddyEinsidler/Desktop/QBS-DFS-main/2019NBA')

# Looking at the NaN values in df

In [3]:
# Report the number of NaN values in every column before cleaning
print(df.isna().sum())
# df[df['Date'].isna()]
# df[df['Stats'].isna()]

# Drop columns for non-FanDuel or non-DraftKings platforms
df = df.drop(columns = ['DD Sal', 'DD Change', 'YH Sal', 'YH Change', 'DD pos', 'YH pos'])

# Drop the rows of filler/legend info (they have no 'Date')
df = df[df['Date'].notna()]

# Report the NaN counts again to confirm the effect of the cleaning
print(df.isna().sum())

GID                0
Last               1
 First             4
First  Last        4
Date               4
Team               4
Opp                4
H/A                4
GameID             4
GTime(ET)          4
Team pts           4
Opp pts            4
Start              4
Minutes            4
GP                 4
active             4
FDP                4
DKP                4
DDP                4
YHP                4
Stats          14988
DoubleD            4
TripleD            4
FD Sal           825
FD Change       1378
DK Sal          1588
DK Change       2144
DD Sal         42866
DD Change      42866
YH Sal           448
YH Change       1004
FD pos           825
DK pos          1588
DD pos         42866
YH pos           448
ADI              124
VMI              124
Unnamed: 37    42866
dtype: int64
GID                0
Last               0
 First             0
First  Last        0
Date               0
Team               0
Opp                0
H/A                0
GameID             0


# Utilize the RSI function Vala previously presented

In [4]:
def computeRSI(data, time_window):
    """
    Compute the Relative Strength Index (RSI) of ``data``.

    ``data`` is a pandas object supporting ``diff`` and ``ewm`` (a Series in
    this notebook) and ``time_window`` is the smoothing window. The result
    has one entry per consecutive difference of ``data`` (so the first
    observation is not represented) and is NaN until ``time_window``
    differences are available.
    """
    # One-step change between consecutive observations, without the leading NaN
    delta = data.diff(1).dropna()

    # Zero-valued template that keeps the shape and index of the changes
    zeros = delta * 0

    # Gains hold the positive changes and losses hold the negative changes;
    # every other position takes its value from the zero template
    gains = delta.where(delta > 0, zeros)
    losses = delta.where(delta < 0, zeros)

    # Exponentially weighted means, see
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
    # com = time_window - 1 corresponds to a decay of alpha = 1 / time_window
    smoothing = {'com': time_window - 1, 'min_periods': time_window}
    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    # Relative strength is the magnitude of average gain over average loss
    relative_strength = (avg_gain / avg_loss).abs()
    return 100 - 100 / (relative_strength + 1)

# Create a function that cleans the given rotoguru data and enables toggling of RSI values, outputting a nested dictionary of statistics.

In [5]:
def nba_eval_RSI(GID, num_games, threshold = 50, fanduel = True, dataframe = df):
    '''
    Returns a dictionary of basic statistics, including a confusion matrix
    (a pandas DataFrame), describing an RSI-based buy signal for one NBA player.

    Parameters
    ----------
    GID : the numeric 'GID' value of the player in the rotoguru data.
    num_games : window length, in games, of the Salary/Point moving average.
        The RSI is computed with a time window of num_games - 1
        game-to-game changes.
    threshold : RSI value at or above which a buy signal is raised.
        The default is 50.
    fanduel : True (the default) uses the FanDuel columns ('FDP', 'FD Sal');
        False uses the DraftKings columns ('DKP', 'DK Sal').
    dataframe : the DataFrame instantiated from the rotoguru data.
        It is defaulted to df.

    Returns
    -------
    dict with the keys 'name', 'number of games', 'threshold',
    'confusion matrix', 'accuracy', 'signal frequency', 'bought mean dfs',
    'not mean dfs', 'bought st dev dfs' and 'not st dev dfs'.

    Raises
    ------
    IndexError if no row of 'dataframe' has the requested GID.

    This utilizes computeRSI(data, time_window).

    Pandas (as pd) and numpy (as np) must be imported
    '''

    # Subset based on GID, using the frame that was passed in
    player_df = dataframe[dataframe['GID'] == GID]
    if player_df.empty:
        raise IndexError(f'no rows found for GID {GID!r}')

    # Get the player's name
    player_name = player_df["First  Last"].iloc[0]

    # Conditionally pick the FanDuel or DraftKings points and salary columns
    if fanduel:
        dfs_points = 'FDP'
        dfs_sal = 'FD Sal'
    else:
        dfs_points = 'DKP'
        dfs_sal = 'DK Sal'

    # Subset the player dataframe to more relevant columns
    player_df = player_df[["First  Last", "Date", "Team pts", "Minutes", dfs_points, dfs_sal]]

    # Sort by earliest date
    player_df = player_df.sort_values('Date', ascending = True)

    # Drop games where the player did not play for one minute, then renumber
    # the rows so the index follows the chronological order of the games kept
    player_df = player_df[~(player_df['Minutes'] < 1)].reset_index(drop=True)

    # Create a column to evaluate a player's price per fantasy point (salary/points)
    # NOTE(review): a game with zero fantasy points makes this ratio infinite,
    # which propagates into the RSI; confirm whether such games should be excluded
    player_df["Salary/Point"] = player_df[dfs_sal] / player_df[dfs_points]

    # Create a column of the moving average (not used in the returned statistics)
    player_df['Salary/Point_SMA'] = player_df["Salary/Point"].rolling(window=(num_games)).mean()

    # Create a column that calculates the RSI values for the number of games
    player_df['RSI'] = computeRSI(player_df["Salary/Point"], (num_games-1))

    # Calculate the game-to-game percent change in the player's fantasy points
    # (not used in the returned statistics)
    player_df["Percent_Change"] = player_df[dfs_points].pct_change() * 100

    # Create a binary column that indicates whether a buy signal (1) occurs
    # A.k.a the RSI passes the threshold
    # This will signal to buy the player for the following game in which they play in the dataset
    player_df['Signal'] = np.where(player_df['RSI'] >= threshold, 1, 0)

    # Create a 'Bought' column that shifts 'Signal' down by one game to mark
    # the game for which the RSI indicated to buy the player; the first game
    # has no preceding signal, so the NaN introduced by the shift becomes 0
    player_df['Bought'] = player_df['Signal'].shift(1).fillna(0)

    # Create a 'PosChange' column as a binary indicator of a "good" game.
    # Issue: just being positive relative to the previous game is not enough,
    # so a game counts as good when its fantasy points beat the player's median
    # Other ideas: median salary/points, or being above the moving average
    player_df['PosChange'] = np.where(player_df[dfs_points] > player_df[dfs_points].median(), 1, 0)

    # Create a confusion matrix to determine the accuracy of the RSI
    confusion_matrix = pd.crosstab(player_df['PosChange'], player_df['Bought'],
                                   rownames=['Actual'], colnames=['Predicted'])

    # Calculate accuracy as the share of games where prediction and outcome agree.
    # Comparing the columns directly does not depend on the confusion matrix
    # being 2x2 and yields NaN (without a RuntimeWarning) when no games remain
    accuracy = (player_df['PosChange'] == player_df['Bought']).mean()

    # Find the frequency of signals for the player over the tested games
    freq_signal = player_df['Signal'].mean()

    # Split the fantasy points by whether the player was bought for that game
    bought_points = player_df.loc[player_df['Bought'] == 1, dfs_points]
    not_bought_points = player_df.loc[player_df['Bought'] == 0, dfs_points]

    # Create a dictionary of interesting statistics: mean and standard deviation
    # of fantasy points for bought versus not-bought games, plus the settings used
    player_dict = {'name': player_name,
                   'number of games': num_games,
                   'threshold': threshold,
                   'confusion matrix': confusion_matrix,
                   'accuracy': accuracy,
                   'signal frequency': freq_signal,
                   'bought mean dfs': bought_points.mean(),
                   'not mean dfs': not_bought_points.mean(),
                   'bought st dev dfs': bought_points.std(),
                   'not st dev dfs': not_bought_points.std()}

    return player_dict
    
    

# Testing with Kyrie at different thresholds and numbers of games for RSI

In [6]:
# Grids of RSI thresholds (50, 60, 70) and numbers of games (3 through 12)
thresholds = list(range(50, 71, 10))
games = list(range(3,13,1))

# Collects one result dictionary per (games, threshold) combination
kyrie_data = []

# Evaluate the RSI stats for GID 4379 on every combination of the two grids
for game in games:
    for thresh in thresholds:
        testing_kyrie = nba_eval_RSI(4379, game, threshold = thresh)
        kyrie_data.append(testing_kyrie)

        # Echo each statistic as "key:", its value, then a blank-line gap
        for key, value in testing_kyrie.items():
            print(f'{key}:', value, '\n', sep = '\n')
        print('---------------------------')

name:
Kyrie Irving


number of games:
3


threshold:
50


confusion matrix:
Predicted  0.0  1.0
Actual             
0           20   18
1           22   16


accuracy:
0.47368421052631576


signal frequency:
0.4605263157894737


bought mean dfs:
43.15000000000001


not mean dfs:
43.478571428571435


bought st dev dfs:
12.647870823329766


not st dev dfs:
13.509265758139554


---------------------------
name:
Kyrie Irving


number of games:
3


threshold:
60


confusion matrix:
Predicted  0.0  1.0
Actual             
0           23   15
1           26   12


accuracy:
0.4605263157894737


signal frequency:
0.3684210526315789


bought mean dfs:
43.60370370370371


not mean dfs:
43.181632653061214


bought st dev dfs:
12.603585615624246


not st dev dfs:
13.408869475546583


---------------------------
name:
Kyrie Irving


number of games:
3


threshold:
70


confusion matrix:
Predicted  0.0  1.0
Actual             
0           30    8
1           29    9


accuracy:
0.5131578947368421




name:
Kyrie Irving


number of games:
10


threshold:
60


confusion matrix:
Predicted  0.0  1.0
Actual             
0           35    3
1           36    2


accuracy:
0.4868421052631579


signal frequency:
0.07894736842105263


bought mean dfs:
42.54


not mean dfs:
43.38732394366197


bought st dev dfs:
4.255937029609343


not st dev dfs:
13.463017381780833


---------------------------
name:
Kyrie Irving


number of games:
10


threshold:
70


confusion matrix:
Predicted  0.0  1.0
Actual             
0           37    1
1           38    0


accuracy:
0.4868421052631579


signal frequency:
0.013157894736842105


bought mean dfs:
42.7


not mean dfs:
43.339999999999996


bought st dev dfs:
nan


not st dev dfs:
13.132948043920848


---------------------------
name:
Kyrie Irving


number of games:
11


threshold:
50


confusion matrix:
Predicted  0.0  1.0
Actual             
0           25   13
1           30    8


accuracy:
0.4342105263157895


signal frequency:
0.2894736842105263


In [7]:
# Build a table with one row per result, leaving out the confusion matrices
kyrie = pd.DataFrame(kyrie_data).drop(columns = 'confusion matrix')

kyrie.head()

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
0,Kyrie Irving,3,50,0.473684,0.460526,43.15,43.478571,12.647871,13.509266
1,Kyrie Irving,3,60,0.460526,0.368421,43.603704,43.181633,12.603586,13.408869
2,Kyrie Irving,3,70,0.513158,0.236842,47.047059,42.261017,11.447877,13.367704
3,Kyrie Irving,4,50,0.486842,0.447368,43.506061,43.197674,12.837541,13.352412
4,Kyrie Irving,4,60,0.5,0.25,45.327778,42.712069,10.493404,13.763428


In [8]:
# Keep the row(s) whose accuracy equals the best accuracy found for Kyrie
top_accuracy = kyrie['accuracy'].max()
best_kyrie = kyrie[kyrie['accuracy'] == top_accuracy]
best_kyrie

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
2,Kyrie Irving,3,70,0.513158,0.236842,47.047059,42.261017,11.447877,13.367704


In [9]:
# Order Kyrie's results from most to least accurate and show the first rows
kyrie = kyrie.sort_values(by = 'accuracy', ascending = False)
kyrie.head()

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
2,Kyrie Irving,3,70,0.513158,0.236842,47.047059,42.261017,11.447877,13.367704
29,Kyrie Irving,12,70,0.5,0.0,,43.331579,,13.045308
4,Kyrie Irving,4,60,0.5,0.25,45.327778,42.712069,10.493404,13.763428
26,Kyrie Irving,11,70,0.5,0.0,,43.331579,,13.045308
9,Kyrie Irving,6,50,0.5,0.355263,44.419231,42.766,13.161885,13.082058


In [10]:
# Order Kyrie's results by mean fantasy points when bought, highest first
kyrie = kyrie.sort_values(by = 'bought mean dfs', ascending = False)
kyrie.head()

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
2,Kyrie Irving,3,70,0.513158,0.236842,47.047059,42.261017,11.447877,13.367704
4,Kyrie Irving,4,60,0.5,0.25,45.327778,42.712069,10.493404,13.763428
9,Kyrie Irving,6,50,0.5,0.355263,44.419231,42.766,13.161885,13.082058
6,Kyrie Irving,5,50,0.486842,0.394737,43.989655,42.925532,13.037401,13.174357
19,Kyrie Irving,9,60,0.5,0.092105,43.966667,43.277143,5.167462,13.527944


# Now, let's see if LeBron James is any different

In [11]:
lebron_GID = 3541
# Grids of RSI thresholds (50, 60, 70) and numbers of games (3 through 12)
thresholds = list(range(50, 71, 10))
games = list(range(3,13,1))

# One result dictionary per (games, threshold) combination,
# with the number of games varying slowest
lebron_data = [nba_eval_RSI(lebron_GID, game, threshold = thresh)
               for game in games
               for thresh in thresholds]

# Build a table of the results without the confusion matrices
lebron = pd.DataFrame(lebron_data).drop(columns = 'confusion matrix')

# Keep the row(s) whose accuracy equals the best accuracy found for LeBron
best_lebron = lebron[lebron['accuracy'] == lebron['accuracy'].max()]
best_lebron

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
24,LeBron James,11,50,0.654545,0.327273,57.647059,49.423684,11.995787,10.561491
27,LeBron James,12,50,0.654545,0.327273,57.647059,49.423684,11.995787,10.561491


In [12]:
# Order LeBron's results from most to least accurate and show the first rows
lebron = lebron.sort_values(by = 'accuracy', ascending = False)
lebron.head()

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
27,LeBron James,12,50,0.654545,0.327273,57.647059,49.423684,11.995787,10.561491
24,LeBron James,11,50,0.654545,0.327273,57.647059,49.423684,11.995787,10.561491
21,LeBron James,10,50,0.636364,0.345455,57.361111,49.340541,11.700686,10.694559
0,LeBron James,3,50,0.618182,0.472727,54.244,50.066667,12.077289,10.962643
18,LeBron James,9,50,0.618182,0.363636,55.889474,49.894444,13.055603,10.293936


In [13]:
# Order LeBron's results by mean fantasy points when bought, highest first
lebron = lebron.sort_values(by = 'bought mean dfs', ascending = False)
lebron.head()

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
25,LeBron James,11,60,0.545455,0.018182,64.6,51.731481,,11.542568
28,LeBron James,12,60,0.545455,0.018182,64.6,51.731481,,11.542568
22,LeBron James,10,60,0.527273,0.036364,58.55,51.716981,8.555992,11.652529
8,LeBron James,5,70,0.527273,0.054545,58.55,51.716981,8.555992,11.652529
19,LeBron James,9,60,0.527273,0.054545,58.55,51.716981,8.555992,11.652529


# Trying well-known veteran Vince Carter

In [14]:
vince_GID = 2448
# Grids of RSI thresholds (50, 60, 70) and numbers of games (3 through 12)
thresholds = list(range(50, 71, 10))
games = list(range(3,13,1))

# One result dictionary per (games, threshold) combination,
# with the number of games varying slowest
vince_data = [nba_eval_RSI(vince_GID, game, threshold = thresh)
              for game in games
              for thresh in thresholds]

# Build a table of the results without the confusion matrices
vince = pd.DataFrame(vince_data).drop(columns = 'confusion matrix')

# Keep the row(s) whose accuracy equals the best accuracy found for Vince
best_vince = vince[vince['accuracy'] == vince['accuracy'].max()]
best_vince

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
0,Vince Carter,3,50,0.539474,0.315789,15.869565,13.684906,7.782056,7.335488


In [15]:
# Order Vince's results from most to least accurate and show the first rows
vince = vince.sort_values(by = 'accuracy', ascending = False)
vince.head()

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
0,Vince Carter,3,50,0.539474,0.315789,15.869565,13.684906,7.782056,7.335488
3,Vince Carter,4,50,0.513158,0.289474,15.747619,13.810909,7.497974,7.485261
1,Vince Carter,3,60,0.513158,0.236842,15.547059,14.0,7.885994,7.404984
11,Vince Carter,6,70,0.513158,0.052632,14.633333,14.334247,6.127261,7.575201
8,Vince Carter,5,70,0.5,0.092105,12.566667,14.498571,7.139374,7.548385


In [16]:
# Order Vince's results by mean fantasy points when bought, highest first
vince = vince.sort_values(by = 'bought mean dfs', ascending = False)
vince.head()

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
0,Vince Carter,3,50,0.539474,0.315789,15.869565,13.684906,7.782056,7.335488
3,Vince Carter,4,50,0.513158,0.289474,15.747619,13.810909,7.497974,7.485261
1,Vince Carter,3,60,0.513158,0.236842,15.547059,14.0,7.885994,7.404984
27,Vince Carter,12,50,0.486842,0.210526,15.34,14.101639,7.645522,7.494609
12,Vince Carter,7,50,0.486842,0.263158,15.184211,14.066667,7.421236,7.556722


In [17]:
# Should we look at the difference in means?
def top_RSI_iterator(GID, num_top = 5, accuracy = True, min_g = 3, max_g = 10, step_g = 1, min_t = 50, max_t = 70, step_t = 5, fanduel = True, dataframe = df):
    '''
    Runs nba_eval_RSI() for one player (identified by GID) over every pairing
    of a number of games from range(min_g, max_g+1, step_g) with an RSI
    threshold from range(min_t, max_t+1, step_t).

    Returns a pandas DataFrame holding the num_top best rows, ranked from
    highest to lowest by accuracy (accuracy = True) or by bought mean dfs
    (accuracy = False). The confusion matrices are not included.

    fanduel and dataframe are forwarded to nba_eval_RSI() and share its defaults.
    '''
    game_grid = range(min_g, max_g + 1, step_g)
    threshold_grid = range(min_t, max_t + 1, step_t)

    # One result dictionary per combination, the number of games varying slowest
    results = [
        nba_eval_RSI(GID, n_games, threshold = cutoff, fanduel = fanduel, dataframe = dataframe)
        for n_games in game_grid
        for cutoff in threshold_grid
    ]

    # Tabulate the results without the confusion matrices
    summary = pd.DataFrame(results).drop(columns = 'confusion matrix')

    # Rank by the requested statistic and keep the leading rows
    rank_by = 'accuracy' if accuracy else 'bought mean dfs'
    return summary.sort_values(rank_by, ascending = False).head(num_top)
    
    

In [18]:
# LeBron's most accurate settings using the iterator's defaults
# (top 5 rows, 3-10 games, thresholds 50-70 in steps of 5)
top_RSI_iterator(lebron_GID)

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
35,LeBron James,10,50,0.636364,0.345455,57.361111,49.340541,11.700686,10.694559
0,LeBron James,3,50,0.618182,0.472727,54.244,50.066667,12.077289,10.962643
30,LeBron James,9,50,0.618182,0.363636,55.889474,49.894444,13.055603,10.293936
25,LeBron James,8,50,0.6,0.381818,55.47,49.962857,12.845114,10.435913
10,LeBron James,5,50,0.581818,0.436364,54.269565,50.309375,13.230043,10.101433


# Now, let's iterate through all of the players in the dataset and see the most frequent indicator values for the top accuracy score

## There are RuntimeWarning messages that have since been ignored:

`/Users/TeddyEinsidler/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:81: RuntimeWarning: invalid value encountered in double_scalars
/Users/TeddyEinsidler/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:84: RuntimeWarning: invalid value encountered in long_scalars`

In [19]:
top = 1
player_ids = list(df.GID.unique())
# Gather each player's most accurate row(s) and concatenate them once;
# concatenating inside the loop would re-copy the growing table every iteration
scores = pd.concat([top_RSI_iterator(GID = GID, num_top = top, accuracy = True)
                    for GID in player_ids])

scores = scores.sort_values('accuracy', ascending = False)

# Clearly, there is some handling of null values that I may want to fix in the future (Edit: some cleaning was retroactively performed)

In [20]:
# Display the per-player results table
scores

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
0,Jordan Sibert,3,50,1.0,0.0,,3.0,,
0,D.J. Stephens,3,50,1.0,0.0,,4.0,,
0,George King,3,50,1.0,0.0,,1.2,,
0,Joe Chealey,3,50,1.0,0.0,,3.5,,
0,Donte Grantham,3,50,1.0,0.0,,0.0,,
...,...,...,...,...,...,...,...,...,...
0,Jonathan Gibson,3,50,,,,,,
0,Keenan Evans,3,50,,,,,,
0,Scotty Hopson,3,50,,,,,,
0,Isaiah Whitehead,3,50,,,,,,


In [21]:
# Summarize the column dtypes and non-null counts of the scores table
scores.info()
# Count the NaN entries in each column
scores.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558 entries, 0 to 0
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               558 non-null    object 
 1   number of games    558 non-null    int64  
 2   threshold          558 non-null    int64  
 3   accuracy           526 non-null    float64
 4   signal frequency   526 non-null    float64
 5   bought mean dfs    465 non-null    float64
 6   not mean dfs       526 non-null    float64
 7   bought st dev dfs  414 non-null    float64
 8   not st dev dfs     516 non-null    float64
dtypes: float64(6), int64(2), object(1)
memory usage: 43.6+ KB


name                   0
number of games        0
threshold              0
accuracy              32
signal frequency      32
bought mean dfs       93
not mean dfs          32
bought st dev dfs    144
not st dev dfs        42
dtype: int64

In [22]:
# Remove every row that has a NaN in any column, then display what remains
scores = scores.dropna()
scores

Unnamed: 0,name,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
9,J.R. Smith,4,70,1.000000,0.545455,24.380,6.033333,8.391484,5.041693
0,Daryl Macon,3,50,0.857143,0.714286,11.225,2.000000,9.243511,3.000000
0,Amile Jefferson,3,50,0.833333,0.250000,9.900,4.888889,6.934695,4.314349
30,Jordan McRae,9,50,0.807692,0.346154,18.975,7.955556,8.092986,10.430454
5,Dwight Howard,4,50,0.777778,0.333333,41.050,22.214286,13.930004,10.837962
...,...,...,...,...,...,...,...,...,...
0,Antonio Blakeney,3,50,0.472727,0.363636,10.645,12.094286,6.535770,7.398527
35,Zaza Pachulia,10,50,0.471429,0.485714,12.200,11.319444,8.534138,7.685453
0,Andrew Bogut,3,50,0.466667,0.333333,10.910,11.635000,9.353959,9.296534
0,Courtney Lee,3,50,0.441176,0.588235,8.520,10.278571,10.312006,8.562726


# It appears that the most common best-accuracy settings across players were 3 games (standard seasons average 3.5 games per week for a team) and a threshold of 50, although neither accounts for a majority of players (about 28% and 43% of them, respectively)

In [23]:
# Fraction of rows in scores for each 'number of games' value
scores['number of games'].value_counts(normalize = True)

3     0.282609
10    0.135266
5     0.115942
4     0.113527
6     0.103865
7     0.103865
8     0.077295
9     0.067633
Name: number of games, dtype: float64

In [24]:
# Fraction of rows in scores for each 'threshold' value
scores['threshold'].value_counts(normalize = True)

50    0.425121
60    0.154589
70    0.154589
55    0.140097
65    0.125604
Name: threshold, dtype: float64

In [25]:
# Summary statistics for the columns of scores
scores.describe()

Unnamed: 0,number of games,threshold,accuracy,signal frequency,bought mean dfs,not mean dfs,bought st dev dfs,not st dev dfs
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,5.811594,57.222222,0.568449,0.241955,22.240439,19.275119,8.97106,9.14022
std,2.505831,7.552168,0.060197,0.138826,11.539497,10.771218,3.431252,2.390589
min,3.0,50.0,0.365854,0.022989,1.8,1.175,0.141421,0.8544
25%,3.0,50.0,0.53038,0.121314,13.392028,11.219931,7.003015,7.713655
50%,5.0,55.0,0.555556,0.233589,20.39,17.091071,8.931401,9.021043
75%,8.0,65.0,0.596305,0.350798,28.71141,25.331234,11.1736,10.617463
max,10.0,70.0,1.0,0.714286,67.55,58.043836,22.934203,17.991861


In [26]:
# Finding the top 5 accuracy scores
top = 5
player_ids = list(df.GID.unique())
# Gather each player's most accurate rows and concatenate them once;
# concatenating inside the loop would re-copy the growing table every iteration
scores = pd.concat([top_RSI_iterator(GID = GID, num_top = top, accuracy = True)
                    for GID in player_ids])

scores = scores.sort_values('accuracy', ascending = False)
# Remove every row that has a NaN in any column
scores = scores.dropna()
# Fraction of the remaining rows for each number of games and each threshold
print(scores['number of games'].value_counts(normalize = True))
print(scores['threshold'].value_counts(normalize = True))

3     0.201871
4     0.130970
5     0.123584
6     0.122600
7     0.108321
10    0.108321
9     0.104382
8     0.099951
Name: number of games, dtype: float64
50    0.304284
55    0.183161
60    0.175775
65    0.169867
70    0.166913
Name: threshold, dtype: float64


# Let's instead focus on the signals that produce the highest mean bought scores for each player

In [27]:
# Finding the highest mean bought scores
top = 1
player_ids = list(df.GID.unique())
# Gather each player's row with the highest bought mean and concatenate once;
# concatenating inside the loop would re-copy the growing table every iteration
scores = pd.concat([top_RSI_iterator(GID = GID, num_top = top, accuracy = False)
                    for GID in player_ids])

# NOTE(review): rows were selected by bought mean dfs but are ordered by
# accuracy here; confirm that this ordering is intended
scores = scores.sort_values('accuracy', ascending = False)
# Remove every row that has a NaN in any column
scores = scores.dropna()
# Fraction of the remaining rows for each number of games and each threshold
print(scores['number of games'].value_counts(normalize = True))
print(scores['threshold'].value_counts(normalize = True))

10    0.236559
3     0.206989
6     0.107527
5     0.094086
8     0.091398
9     0.091398
4     0.088710
7     0.083333
Name: number of games, dtype: float64
70    0.279570
50    0.228495
60    0.185484
65    0.169355
55    0.137097
Name: threshold, dtype: float64


In [28]:
# Finding the top 5 mean bought scores
top = 5
player_ids = list(df.GID.unique())
# Gather each player's rows with the highest bought means and concatenate once;
# concatenating inside the loop would re-copy the growing table every iteration
scores = pd.concat([top_RSI_iterator(GID = GID, num_top = top, accuracy = False)
                    for GID in player_ids])

# NOTE(review): rows were selected by bought mean dfs but are ordered by
# accuracy here; confirm that this ordering is intended
scores = scores.sort_values('accuracy', ascending = False)
# Remove every row that has a NaN in any column
scores = scores.dropna()
# Fraction of the remaining rows for each number of games and each threshold
print(scores['number of games'].value_counts(normalize = True))
print(scores['threshold'].value_counts(normalize = True))

3     0.164487
9     0.131791
10    0.129276
7     0.123742
4     0.121227
5     0.110664
6     0.110664
8     0.108149
Name: number of games, dtype: float64
50    0.219316
70    0.209759
65    0.205231
60    0.203722
55    0.161972
Name: threshold, dtype: float64


# From here, I am thinking of tweaking the functions I wrote or subsetting the data by position and/or salary in order to see if there are different patterns for different types of players. It may also be interesting to perform a separate analysis on players that, throughout the season, are listed at different positions to see if the thresholds are different for those players within a single position as well as a whole.

# Let's try this on upcoming games/untested seasons using 3 games and threshold of 50

# Don't wire salary and position too much

# Plot or calc difference between actual and our pick