# Matchup Feature Engineering

This notebook uses the teamsheets data from TeamDataCleaning.ipynb to create features for predicting the probability of victory for NCAA tournament games.

In [1]:
import pandas as pd

In [2]:
teams = pd.read_csv('mydata/teamsheets.csv')
teams.head()

Unnamed: 0,Colley_Rating,Season,TeamID,Teamrank_Rating,Teamrank10_Rating,Trank_OE,Trank_DE,Trank_Rating,EFG%,EFGD%,...,Kenpom_OE,Kenpom_DE,Kenpom_Rating,seed,Weighted_Rating,3ptRate,Ast%,FT%,Opp3ptRate,OppAst%
0,1.0,2008,1314,0.939394,0.903704,120.2,91.8,0.974641,52.4,48.0,...,120.686,92.5415,0.912825,1.0,40.802268,0.221826,0.525392,0.753986,0.347945,0.504329
1,0.967124,2008,1272,0.925253,0.866667,113.3,83.8,0.987633,53.0,42.5,...,114.695,85.3169,0.931297,1.0,40.63683,0.367143,0.55726,0.595561,0.287572,0.490489
2,0.959789,2008,1417,0.915152,0.914815,116.4,86.9,0.984203,52.3,48.0,...,118.735,88.16,0.949233,1.0,40.727208,0.281216,0.542237,0.736091,0.278707,0.506667
3,0.956817,2008,1397,0.864646,0.864815,116.3,92.2,0.951881,52.8,49.2,...,117.787,95.4494,0.82584,2.0,38.524187,0.390556,0.615716,0.654206,0.356119,0.554855
4,0.938298,2008,1242,1.0,0.961111,121.0,85.6,1.0,56.3,44.8,...,121.433,87.4681,1.0,1.0,42.312421,0.291796,0.627572,0.707756,0.380901,0.535411


In [3]:
ratings = teams[['Season', 'TeamID', 'Weighted_Rating']]
ratings.head()

Unnamed: 0,Season,TeamID,Weighted_Rating
0,2008,1314,40.802268
1,2008,1272,40.63683
2,2008,1417,40.727208
3,2008,1397,38.524187
4,2008,1242,42.312421


In [4]:
tournament_data = pd.read_csv('DataFiles/NCAATourneyCompactResults.csv')
tournament_data.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [5]:
# The rating data is only from 2008 and 2010-2018 so we'll only use results from 2008 and 2010-2018
tournament_data = tournament_data.query('Season > 2009 | Season == 2008').drop(columns = ['DayNum', 'WLoc', 'NumOT'])

In [6]:
game_data = pd.merge(tournament_data, ratings, left_on = ['Season', 'WTeamID'], right_on = ['Season', 'TeamID'])
game_data = pd.merge(game_data, ratings, left_on = ['Season', 'LTeamID'], right_on = ['Season', 'TeamID'])
game_data = game_data.drop(columns = ['WTeamID', 'LTeamID']).rename(index = str, columns = {'WScore': 'Score_x', 'LScore': 'Score_y'})
game_data.head(10)

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,Weighted_Rating_x,TeamID_y,Weighted_Rating_y
0,2008,69,60,1291,19.603241,1164,9.92806
1,2008,71,70,1181,40.358259,1125,23.232983
2,2008,85,61,1242,42.312421,1340,25.040472
3,2008,75,56,1242,42.312421,1424,32.235702
4,2008,72,57,1242,42.312421,1437,31.214883
5,2008,59,57,1242,42.312421,1172,35.145146
6,2008,84,66,1242,42.312421,1314,40.802268
7,2008,75,68,1242,42.312421,1272,40.63683
8,2008,80,67,1243,34.640906,1425,34.980175
9,2008,74,66,1266,37.019513,1246,30.003724


We want to set up the data to have the "better" team as team x and the "worse" team as team y rather than x being the winner and y being the loser because when we make predictions we'll want to predict upsets instead of just picking whether an arbitrary team wins or loses. The code below rearranges team x and team y so team x always has the higher weighted rating. In the data above, the 9th row has team x with a lower weighted rating than team y.

In [7]:
# To make team x be the team with the higher weighted rating and team y be the team with the lower weighted rating
for i in range(len(game_data)):
    # if weighted rating x is less than weighted rating y
    if game_data.iloc[i, 4] < game_data.iloc[i, 6]:
        underdog = game_data.iloc[i, 1]  # "Worse" team's score
        favorite = game_data.iloc[i, 2]  # "Better" team's score
        game_data.iloc[i, 1] = favorite
        game_data.iloc[i, 2] = underdog
        underdog = game_data.iloc[i, 3]  # "Worse" team's ID
        favorite = game_data.iloc[i, 5]  # "Better" team's ID
        game_data.iloc[i, 3] = favorite
        game_data.iloc[i, 5] = underdog
        underdog = game_data.iloc[i, 4]  # "Worse" team's weighted rating
        favorite = game_data.iloc[i, 6]  # "Better" team's weighted rating
        game_data.iloc[i, 4] = favorite
        game_data.iloc[i, 6] = underdog

In [8]:
game_data.head(10)

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,Weighted_Rating_x,TeamID_y,Weighted_Rating_y
0,2008,69,60,1291,19.603241,1164,9.92806
1,2008,71,70,1181,40.358259,1125,23.232983
2,2008,85,61,1242,42.312421,1340,25.040472
3,2008,75,56,1242,42.312421,1424,32.235702
4,2008,72,57,1242,42.312421,1437,31.214883
5,2008,59,57,1242,42.312421,1172,35.145146
6,2008,84,66,1242,42.312421,1314,40.802268
7,2008,75,68,1242,42.312421,1272,40.63683
8,2008,67,80,1425,34.980175,1243,34.640906
9,2008,74,66,1266,37.019513,1246,30.003724


As you can see, the 9th row now has the teams in the correct order, with team x having a higher weighted rating than team y.

In [9]:
game_data = pd.merge(game_data, teams, left_on = ['Season', 'TeamID_x'], right_on = ['Season', 'TeamID'])
game_data = pd.merge(game_data, teams, left_on = ['Season', 'TeamID_y'], right_on = ['Season', 'TeamID'])
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,Weighted_Rating_x,TeamID_y,Weighted_Rating_y,Colley_Rating_x,TeamID_x.1,Teamrank_Rating_x,...,Kenpom_OE_y,Kenpom_DE_y,Kenpom_Rating_y,seed_y,Weighted_Rating_y.1,3ptRate_y,Ast%_y,FT%_y,Opp3ptRate_y,OppAst%_y
0,2008,69,60,1291,19.603241,1164,9.92806,0.484032,1291,0.446465,...,92.4179,106.658,0.277968,16.0,9.92806,0.242117,0.457143,0.717325,0.378745,0.604621
1,2008,71,70,1181,40.358259,1125,23.232983,0.921698,1181,0.921212,...,109.436,106.857,0.529898,15.0,23.232983,0.473819,0.628736,0.698341,0.343105,0.534198
2,2008,67,73,1181,40.358259,1452,36.520674,0.921698,1181,0.921212,...,113.57,93.4302,0.792929,7.0,36.520674,0.349661,0.585534,0.679325,0.321202,0.518569
3,2008,79,75,1462,37.274341,1452,36.520674,0.86592,1462,0.822222,...,113.57,93.4302,0.792929,7.0,36.520674,0.349661,0.585534,0.679325,0.321202,0.518569
4,2008,85,61,1242,42.312421,1340,25.040472,0.938298,1242,1.0,...,108.411,102.574,0.578689,16.0,25.040472,0.411585,0.613784,0.670989,0.332923,0.548209


In [10]:
game_data = game_data.loc[:,~game_data.columns.duplicated()]  # to remove duplicate columns
game_data.head()

Unnamed: 0,Season,Score_x,Score_y,TeamID_x,Weighted_Rating_x,TeamID_y,Weighted_Rating_y,Colley_Rating_x,Teamrank_Rating_x,Teamrank10_Rating_x,...,Kenpom_Tempo_y,Kenpom_OE_y,Kenpom_DE_y,Kenpom_Rating_y,seed_y,3ptRate_y,Ast%_y,FT%_y,Opp3ptRate_y,OppAst%_y
0,2008,69,60,1291,19.603241,1164,9.92806,0.484032,0.446465,0.444444,...,62.2413,92.4179,106.658,0.277968,16.0,0.242117,0.457143,0.717325,0.378745,0.604621
1,2008,71,70,1181,40.358259,1125,23.232983,0.921698,0.921212,1.0,...,68.1091,109.436,106.857,0.529898,15.0,0.473819,0.628736,0.698341,0.343105,0.534198
2,2008,67,73,1181,40.358259,1452,36.520674,0.921698,0.921212,1.0,...,63.0942,113.57,93.4302,0.792929,7.0,0.349661,0.585534,0.679325,0.321202,0.518569
3,2008,79,75,1462,37.274341,1452,36.520674,0.86592,0.822222,0.848148,...,63.0942,113.57,93.4302,0.792929,7.0,0.349661,0.585534,0.679325,0.321202,0.518569
4,2008,85,61,1242,42.312421,1340,25.040472,0.938298,1.0,0.961111,...,65.7791,108.411,102.574,0.578689,16.0,0.411585,0.613784,0.670989,0.332923,0.548209


Now, we can create a new dataset that will be used in the predictions. For each matchup, there will be a column for "Upset", which will be the response variable. And we'll enineer features for each matchup based on each team's statistics that will help predict whether or not the matchup is will be an upset. We'll start with the stats for each team as predictor features. Then, we'll create features by averaging and/or taking the difference of the team's stats.

In [11]:
# Start with the stats for each team
matchups = game_data.drop(columns = ['Season', 'Score_x', 'Score_y', 'TeamID_x', 'TeamID_y'])

# Response variable
matchups['Upset'] = game_data['Score_x'] < game_data['Score_y']  
matchups['Upset'] = matchups['Upset'].astype('int64')

# Predictors

# Difference in NCAA tournament Seeds
matchups['SeedDiff'] = matchups['seed_x'] - matchups['seed_y']

# Difference in efficiency Metrics
matchups['WeightedRatingDiff'] = game_data['Weighted_Rating_x'] - game_data['Weighted_Rating_y']
matchups['ColleyRatingDiff'] = game_data['Colley_Rating_x'] - game_data['Colley_Rating_y']
matchups['TeamrankRatingDiff'] = game_data['Teamrank_Rating_x'] - game_data['Teamrank_Rating_y']
matchups['TrankRatingDiff'] = game_data['Trank_Rating_x'] - game_data['Trank_Rating_y']
matchups['KenpomRatingDiff'] = game_data['Kenpom_Rating_x'] - game_data['Kenpom_Rating_y']

# Average Tempo, tempo difference, and absolute value of tempo difference for Trank and Kenpom tempos
matchups['TrankAvgTempo'] = (game_data['Trank_Tempo_x'] + game_data['Trank_Tempo_y']) / 2
matchups['TrankTempoAbsDiff'] = abs(game_data['Trank_Tempo_x'] - game_data['Trank_Tempo_y'])
matchups['TrankTempoDiff'] = game_data['Trank_Tempo_x'] - game_data['Trank_Tempo_y']
matchups['KenpomAvgTempo'] = (game_data['Kenpom_Tempo_x'] + game_data['Kenpom_Tempo_y']) / 2
matchups['KenpomTempoAbsDiff'] = abs(game_data['Kenpom_Tempo_x'] - game_data['Kenpom_Tempo_y'])
matchups['KenpomTempoDiff'] = game_data['Kenpom_Tempo_x'] - game_data['Kenpom_Tempo_y']

# Offensive vs defensive efficiency averages and differences for Kenpom and Trank efficiencies
matchups['xOffyDefKenpomDiff'] = game_data['Kenpom_OE_x'] - game_data['Kenpom_DE_y']
matchups['yOffxDefKenpomDiff'] = game_data['Kenpom_OE_y'] - game_data['Kenpom_DE_x']
matchups['xOffyDefKenpomAvg'] = (game_data['Kenpom_OE_x'] + game_data['Kenpom_DE_y']) / 2
matchups['yOffxDefKenpomAvg'] = (game_data['Kenpom_OE_y'] + game_data['Kenpom_DE_x']) / 2
matchups['xOffyDefTrankDiff'] = game_data['Trank_OE_x'] - game_data['Trank_DE_y']
matchups['yOffxDefTrankDiff'] = game_data['Trank_OE_y'] - game_data['Trank_DE_x']
matchups['xOffyDefTrankAvg'] = (game_data['Trank_OE_x'] + game_data['Trank_DE_y']) / 2
matchups['yOffxDefTrankAvg'] = (game_data['Trank_OE_y'] + game_data['Trank_DE_x']) / 2

# Offensive vs defensive turnover rate averages and differences
matchups['xOffyDefTODiff'] = game_data['TOR_x'] - game_data['TORD_y']
matchups['yOffxDefTODiff'] = game_data['TOR_y'] - game_data['TORD_x']
matchups['xOffyDefTOAvg'] = (game_data['TOR_x'] + game_data['TORD_y']) / 2
matchups['yOffxDefTOAvg'] = (game_data['TOR_y'] + game_data['TORD_x']) / 2

# Offensive vs defensive rebound rate averages and differences
matchups['xOffyDefRebDiff'] = game_data['ORB_x'] - game_data['DRB_y']
matchups['yOffxDefRebDiff'] = game_data['ORB_y'] - game_data['DRB_x']
matchups['xOffyDefRebAvg'] = (game_data['ORB_x'] + game_data['DRB_y']) / 2
matchups['yOffxDefRebAvg'] = (game_data['ORB_y'] + game_data['DRB_x']) / 2

# Offensive vs defensive 3pt rate averages and differences
matchups['xOffyDef3ptRateDiff'] = game_data['3ptRate_x'] - game_data['Opp3ptRate_y']
matchups['yOffxDef3ptRateDiff'] = game_data['3ptRate_y'] - game_data['Opp3ptRate_x']
matchups['xOffyDef3ptRateAvg'] = (game_data['3ptRate_x'] + game_data['Opp3ptRate_y']) / 2
matchups['yOffxDef3ptRateAvg'] = (game_data['3ptRate_y'] + game_data['Opp3ptRate_x']) / 2

# Offensive vs defensive FT rate averages and differences
matchups['xOffyDefFTRateDiff'] = game_data['FTR_x'] - game_data['FTRD_y']
matchups['yOffxDefFTRateDiff'] = game_data['FTR_y'] - game_data['FTRD_x']
matchups['xOffyDefFTRateAvg'] = (game_data['FTR_x'] + game_data['FTRD_y']) / 2
matchups['yOffxDefFTRateAvg'] = (game_data['FTR_y'] + game_data['FTRD_x']) / 2

# Offensive vs defensive assist rate averages and differences
matchups['xOffyDefAstDiff'] = abs(game_data['Ast%_x'] - game_data['OppAst%_y'])
matchups['yOffxDefAstDiff'] = abs(game_data['Ast%_y'] - game_data['OppAst%_x'])

# Points per fga from 3 pointers
matchups['xOffyDefPoints3'] = 3 * matchups['xOffyDef3ptRateAvg'] * (game_data['3P%_x'] + game_data['3P%D_y']) / 2
matchups['yOffxDefPoints3'] = 3 * matchups['yOffxDef3ptRateAvg'] * (game_data['3P%_y'] + game_data['3P%D_x']) / 2

# Points per fga from 2 pointers
matchups['xOffyDefPoints2'] = (1 - matchups['xOffyDef3ptRateAvg']) * (game_data['2P%_x'] + game_data['2P%D_y']) / 2
matchups['yOffxDefPoints2'] = (1 - matchups['xOffyDef3ptRateAvg']) * (game_data['2P%_y'] + game_data['2P%D_x']) / 2

# Points per fga from FTs
matchups['xOffyDefPoints1'] = matchups['xOffyDefFTRateAvg'] * game_data['FT%_x']
matchups['yOffxDefPoints1'] = matchups['yOffxDefFTRateAvg'] * game_data['FT%_y']

# Points per fga
matchups['xOffyDefPoints'] = matchups['xOffyDefPoints1'] + matchups['xOffyDefPoints2'] + matchups['xOffyDefPoints3']
matchups['yOffxDefPoints'] = matchups['yOffxDefPoints1'] + matchups['yOffxDefPoints2'] + matchups['yOffxDefPoints3']

# Points per fga difference
matchups['PointDiff'] = matchups['xOffyDefPoints'] - matchups['yOffxDefPoints']

In [12]:
matchups.head()

Unnamed: 0,Weighted_Rating_x,Weighted_Rating_y,Colley_Rating_x,Teamrank_Rating_x,Teamrank10_Rating_x,Trank_OE_x,Trank_DE_x,Trank_Rating_x,EFG%_x,EFGD%_x,...,yOffxDefAstDiff,xOffyDefPoints3,yOffxDefPoints3,xOffyDefPoints2,yOffxDefPoints2,xOffyDefPoints1,yOffxDefPoints1,xOffyDefPoints,yOffxDefPoints,PointDiff
0,19.603241,9.92806,0.484032,0.446465,0.444444,97.3,99.1,0.445334,49.8,46.5,...,0.021118,0.363127,0.263351,0.319767,0.290067,0.275786,0.267921,0.958679,0.821339,0.13734
1,40.358259,23.232983,0.921698,0.921212,1.0,117.2,88.8,0.978279,54.1,47.6,...,0.152258,0.403022,0.374303,0.328184,0.314259,0.263955,0.230452,0.995161,0.919015,0.076146
2,40.358259,36.520674,0.921698,0.921212,1.0,117.2,88.8,0.978279,54.1,47.6,...,0.109056,0.394202,0.307715,0.31197,0.314546,0.275416,0.234707,0.981588,0.856967,0.12462
3,37.274341,36.520674,0.86592,0.822222,0.848148,115.2,92.3,0.943463,55.0,46.8,...,0.018604,0.387736,0.377983,0.319887,0.311065,0.302387,0.233008,1.01001,0.922057,0.087953
4,42.312421,25.040472,0.938298,1.0,0.961111,121.0,85.6,1.0,56.3,44.8,...,0.078373,0.347188,0.439235,0.358261,0.316658,0.257623,0.231491,0.963072,0.987385,-0.024313


In [13]:
matchups.to_csv('mydata/matchups.csv', index = False)