<h1 style="text-align:center;">Clustering Routes Attempt 1</h1>

<p style="text-align:center;">Vincent Etherton -- SML 312: Research Projects in Data Science</p>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
trimmed_weeks = []

for i in range(8):
    df = pd.read_csv(f'./data/week_{i}_trimmed.csv')
    trimmed_weeks.append(df)

trimmed_weeks[0].head()

Unnamed: 0.1,Unnamed: 0,gameId,playId,nflId,frameId,time,jerseyNumber,team,playDirection,x,y,s,a,dis,o,dir,event
0,0,2021090900,97,41233.0,6,2021-09-10T00:26:31.600,13.0,TB,right,0.0,0.0,0.0,0.0,0.0,129.28,4.76,ball_snap
1,1,2021090900,97,41233.0,7,2021-09-10T00:26:31.700,13.0,TB,right,0.01,0.01,0.0,0.0,0.01,129.28,23.29,
2,2,2021090900,97,41233.0,8,2021-09-10T00:26:31.800,13.0,TB,right,0.02,0.02,0.02,0.55,0.01,127.86,41.24,
3,3,2021090900,97,41233.0,9,2021-09-10T00:26:31.900,13.0,TB,right,0.04,0.03,0.27,2.93,0.02,125.97,61.85,
4,4,2021090900,97,41233.0,10,2021-09-10T00:26:32.000,13.0,TB,right,0.08,0.05,0.64,3.88,0.04,125.17,61.07,


In [4]:
for idx, df in enumerate(trimmed_weeks):
    df.drop(columns='Unnamed: 0', inplace=True)

trimmed_weeks[0].head()

Unnamed: 0,gameId,playId,nflId,frameId,time,jerseyNumber,team,playDirection,x,y,s,a,dis,o,dir,event
0,2021090900,97,41233.0,6,2021-09-10T00:26:31.600,13.0,TB,right,0.0,0.0,0.0,0.0,0.0,129.28,4.76,ball_snap
1,2021090900,97,41233.0,7,2021-09-10T00:26:31.700,13.0,TB,right,0.01,0.01,0.0,0.0,0.01,129.28,23.29,
2,2021090900,97,41233.0,8,2021-09-10T00:26:31.800,13.0,TB,right,0.02,0.02,0.02,0.55,0.01,127.86,41.24,
3,2021090900,97,41233.0,9,2021-09-10T00:26:31.900,13.0,TB,right,0.04,0.03,0.27,2.93,0.02,125.97,61.85,
4,2021090900,97,41233.0,10,2021-09-10T00:26:32.000,13.0,TB,right,0.08,0.05,0.64,3.88,0.04,125.17,61.07,


In [5]:
combined_df = pd.concat(trimmed_weeks[:-2], axis=0, ignore_index=True)

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 684037 entries, 0 to 684036
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   gameId         684037 non-null  int64  
 1   playId         684037 non-null  int64  
 2   nflId          684037 non-null  float64
 3   frameId        684037 non-null  int64  
 4   time           684037 non-null  object 
 5   jerseyNumber   684037 non-null  float64
 6   team           684037 non-null  object 
 7   playDirection  684037 non-null  object 
 8   x              684037 non-null  float64
 9   y              684037 non-null  float64
 10  s              684037 non-null  float64
 11  a              684037 non-null  float64
 12  dis            684037 non-null  float64
 13  o              684037 non-null  float64
 14  dir            684037 non-null  float64
 15  event          66348 non-null   object 
dtypes: float64(9), int64(3), object(4)
memory usage: 83.5+ MB


We'll use bezier, a Python library to create Bezier curves to equally sample points from our routes. Some routes are shorter/longer than others so we use this as a way to ensure they are all the same dimensional vectors.

In [6]:
import bezier

n_control_points = 12
num_points = 100

def calc_bezier(df):
    x = np.array(df.x)
    y = np.array(df.y)

    indices = np.linspace(0, len(x) - 1, n_control_points, dtype=int)
    x_down = x[indices]
    y_down = y[indices]

    points_down = np.array([x_down, y_down])

    curve = bezier.Curve.from_nodes(points_down)

    s_vals = np.linspace(0.0, 1.0, num_points)
    curve_points = curve.evaluate_multi(s_vals)

    interleaved_points = np.empty(curve_points.size, dtype=curve_points.dtype)
    interleaved_points[0::2] = curve_points[0]  # x-coordinates
    interleaved_points[1::2] = curve_points[1]  # y-coordinates
    
    return interleaved_points

In [10]:
routes_with_plays = []
for (gameId, playId, nflId), play_data in combined_df.groupby(['gameId', 'playId', 'nflId']):
    bezier_curve = calc_bezier(play_data)
    routes_with_plays.append({
        'gameId': gameId,
        'playId': playId,
        'nflId': nflId,
        'bezier_curve': bezier_curve
    })

# Convert to DataFrame
routes_df = pd.DataFrame(routes_with_plays)

routes_df.head()

Unnamed: 0,gameId,playId,nflId,bezier_curve
0,2021090900,97,35481.0,"[0.0, -0.0, 0.04408592462578288, -0.0253990196..."
1,2021090900,97,35634.0,"[0.0, -0.0, 0.03746511921836344, -0.0295724723..."
2,2021090900,97,39985.0,"[0.0, 0.0, 0.006772223731511142, 0.00894146393..."
3,2021090900,97,41233.0,"[0.0, 0.0, 0.005664714794905781, 0.00393140070..."
4,2021090900,97,44896.0,"[0.0, 0.0, 0.03957989532202715, 0.085623435588..."


Now that we have each Bezier curve representation for each gameId, playId, nflId combination representing a player's route during a play in a game, we'll create our k-means clustering model.

In [12]:
from sklearn.model_selection import train_test_split

curves = np.array(routes_df['bezier_curve'].tolist())
train_idx, test_idx = train_test_split(
    np.arange(len(curves)), test_size=0.2, random_state=0
)

routes_df['train_test_split'] = 'train'
routes_df.loc[test_idx, 'train_test_split'] = 'test'

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

train_curves = curves[train_idx]
test_curves = curves[test_idx]

scaler_curves = StandardScaler()

scaled_train_curves = scaler_curves.fit_transform(train_curves)
scaled_test_curves = scaler_curves.transform(test_curves)  

kmeans = KMeans(n_clusters=50, random_state=0).fit(scaler_curves.transform(curves))
routes_df['cluster_label'] = kmeans.predict(scaler_curves.transform(curves))

train_data = routes_df[routes_df['train_test_split'] == 'train']
test_data = routes_df[routes_df['train_test_split'] == 'test']

We used silhouette score outlined in this __[article](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html)__ from scikit-learn to evaluate the quality of our model.

In [14]:
from sklearn.metrics import silhouette_score

silhouette_score(scaled_train_curves, kmeans.predict(scaled_train_curves))

0.1877582564746018

In [15]:
silhouette_score(scaled_test_curves, kmeans.predict(scaled_test_curves))

0.18625498591019737

We see, on the training and testing data, we have a silhouette score around 0.18. The score ranges from -1 to 1, so we'd like to see some more improvement in the CombiningClusters.ipynb notebook.

In [16]:
combined_df = combined_df.merge(
    routes_df[['gameId', 'playId', 'nflId', 'cluster_label', 'train_test_split']],
    on=['gameId', 'playId', 'nflId'],
    how='left'
)

combined_df.head(10)

Unnamed: 0,gameId,playId,nflId,frameId,time,jerseyNumber,team,playDirection,x,y,s,a,dis,o,dir,event,cluster_label,train_test_split
0,2021090900,97,41233.0,6,2021-09-10T00:26:31.600,13.0,TB,right,0.0,0.0,0.0,0.0,0.0,129.28,4.76,ball_snap,39,train
1,2021090900,97,41233.0,7,2021-09-10T00:26:31.700,13.0,TB,right,0.01,0.01,0.0,0.0,0.01,129.28,23.29,,39,train
2,2021090900,97,41233.0,8,2021-09-10T00:26:31.800,13.0,TB,right,0.02,0.02,0.02,0.55,0.01,127.86,41.24,,39,train
3,2021090900,97,41233.0,9,2021-09-10T00:26:31.900,13.0,TB,right,0.04,0.03,0.27,2.93,0.02,125.97,61.85,,39,train
4,2021090900,97,41233.0,10,2021-09-10T00:26:32.000,13.0,TB,right,0.08,0.05,0.64,3.88,0.04,125.17,61.07,,39,train
5,2021090900,97,41233.0,11,2021-09-10T00:26:32.100,13.0,TB,right,0.16,0.09,1.11,4.22,0.09,123.72,61.91,,39,train
6,2021090900,97,41233.0,12,2021-09-10T00:26:32.200,13.0,TB,right,0.29,0.16,1.63,4.35,0.15,122.6,63.15,,39,train
7,2021090900,97,41233.0,13,2021-09-10T00:26:32.300,13.0,TB,right,0.48,0.26,2.28,4.73,0.22,120.24,62.43,,39,train
8,2021090900,97,41233.0,14,2021-09-10T00:26:32.400,13.0,TB,right,0.72,0.41,2.92,4.87,0.28,115.86,59.7,,39,train
9,2021090900,97,41233.0,15,2021-09-10T00:26:32.500,13.0,TB,right,1.0,0.6,3.49,4.56,0.34,110.4,57.9,,39,train


In [17]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 684037 entries, 0 to 684036
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   gameId            684037 non-null  int64  
 1   playId            684037 non-null  int64  
 2   nflId             684037 non-null  float64
 3   frameId           684037 non-null  int64  
 4   time              684037 non-null  object 
 5   jerseyNumber      684037 non-null  float64
 6   team              684037 non-null  object 
 7   playDirection     684037 non-null  object 
 8   x                 684037 non-null  float64
 9   y                 684037 non-null  float64
 10  s                 684037 non-null  float64
 11  a                 684037 non-null  float64
 12  dis               684037 non-null  float64
 13  o                 684037 non-null  float64
 14  dir               684037 non-null  float64
 15  event             66348 non-null   object 
 16  cluster_label     68