__1. Set up data mining environment__

Load the processed data, remove unnecessary attributes, rename attributes for ease of referencing

In [1]:
import pandas as pd
import math, random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

# Specify the input CSV file and row number to split at
input_file = 'merged_datasets.csv'
# Function to separate CSV based on row number

# Read the input CSV file
df = pd.read_csv(input_file, header="infer")

# Split the DataFrame based on the row number
df = df.drop(columns=["HUMAN PLAYER POSITION (X) metres","HUMAN PLAYER POSITION (Y) metres","INITITAL VELOCITY OF SHUTTELCOCK(m/s)","INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE)","SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE)","OUT_OF_COURT_SHOTS"])
df = df.rename(columns={"SHUTTLECOCK POSITIION IN AIR(X ) metres":"X","SHUTTLECOCK POSITIION IN AIR(Y) metres":"Y","SHUTTLECOCK POSITIION IN AIR(Z) metres":"Z"})

print(df.shape)
df

(65006, 4)


Unnamed: 0,SHOT_ID,X,Y,Z
0,1,4.075543,1.000000,1.662912
1,1,4.152007,1.000000,1.724866
2,1,4.228906,1.000000,1.788139
3,1,4.302100,1.000000,1.845245
4,1,4.376877,1.000000,1.904128
...,...,...,...,...
65001,448,8.953783,8.783813,1.096171
65002,448,8.968729,8.798245,0.870647
65003,448,8.984867,8.813831,0.639024
65004,448,9.001328,8.829726,0.398100


__2. Create training and testing set__

70:30 split used

In [2]:
trainingRatio = 0.7
numShots = df["SHOT_ID"].max()
print(numShots)
trainsetSize = int(numShots*trainingRatio)
print(trainsetSize)
testsetSize = numShots-trainsetSize
traindata = df.loc[df["SHOT_ID"]<= trainsetSize]
testdata = df.loc[df["SHOT_ID"] > trainsetSize]
print(traindata.head())
print(testdata.head())

448
313
   SHOT_ID         X    Y         Z
0        1  4.075543  1.0  1.662912
1        1  4.152007  1.0  1.724866
2        1  4.228906  1.0  1.788139
3        1  4.302100  1.0  1.845245
4        1  4.376877  1.0  1.904128
       SHOT_ID         X         Y         Z
44351      314  4.051489  2.010944  2.685567
44352      314  4.098172  2.020867  2.768408
44353      314  4.148763  2.031621  2.852487
44354      314  4.199231  2.042348  2.933508
44355      314  4.247574  2.052623  3.014149


__4. Functions to create X_train, Y_train, X_test and Y_test__

prepare_data(): Create feature set (X,Y,Z) and class labels (adjacent X,Y,Z)

prepare_data_deltas(): Create feature set (deltaX, deltaY, deltaZ) and class labels (adjacent deltaX, deltaY, deltaZ)

In [33]:
# Individual data points in X, Y
def prepare_data(data, sequence_length):
    X = []
    Y = []
    for shot_id in data['SHOT_ID'].unique(): # for each shot id
        shot_data = data.loc[data['SHOT_ID'] == shot_id][['X', 'Y', 'Z']].values
        for i in range(len(shot_data)-sequence_length):
            X.append(shot_data[i:i+sequence_length]) # get 1 set of datapoints
            Y.append(shot_data[i+sequence_length]) # get set of values that the datapoints should predict
    X = np.array(X)
    X = X.reshape(X.shape[0], X.shape[1]*X.shape[2])
    Y = np.array(Y)
    Y = Y.reshape(-1, 3)
    print(X.shape, Y.shape)
    return X, Y

# Delta between consecutive data points in X, Y
def prepare_data_deltas(data, sequence_length):
    X = []
    Y = []
    # if seq len = 5,
    # 0  1  2  3  4 -> 4 diffs
    for shot_id in data['SHOT_ID'].unique(): # for each shot id
        shot_data = (data.loc[data['SHOT_ID'] == shot_id][['X', 'Y', 'Z']].values) # get xyz
        for i in range(len(shot_data)-sequence_length):
            for j in range(sequence_length-1):
                X1 = shot_data[j]
                X2 = shot_data[j+1] 
                X.append(X2 - X1)
            Y1 = shot_data[sequence_length+i-1]
            Y2 = shot_data[sequence_length+i]
            Y.append(Y2 - Y1)
    X = np.array(X)
    print("X pre-reshape: ", X.shape)
    X = X.reshape(-1, (sequence_length-1)*3)
    Y = np.array(Y)
    Y = Y.reshape(-1, 3)
    # final shape should be n rows of 3
    print(X.shape, Y.shape)
    return X, Y

__5. Create X_train, Y_train, X_test, Y_test__

Sequence length dictates how many data points to use per shot

_For prepare_data():_

  - Attribute set contains 0..n XYZ values for each shot, and class labels containing 1..n+1 XYZ values of each shot

_For prepare_data_deltas():_

  - Attribute set contains n XYZ values for each shot, referring to i+1['X'] - i['X']. i+1['Y'] - i['Y'] and so on for values i in 0 to n.

  - Class labels contain n XYZ values for each shot, referring to i+2['X'] - i+1['X'], similar for Y and Z, for values i in 0 to n.

In [34]:
sequence_length = 15
X_train, Y_train = prepare_data(traindata, sequence_length)

X_test, Y_test = prepare_data(testdata, sequence_length)

X_delta_train, Y_delta_train = prepare_data_deltas(traindata, sequence_length)

X_delta_test, Y_delta_test = prepare_data_deltas(testdata, sequence_length)

(39656, 45) (39656, 3)
(18630, 45) (18630, 3)
X pre-reshape:  (555184, 3)
(39656, 42) (39656, 3)
X pre-reshape:  (260820, 3)
(18630, 42) (18630, 3)


__6. Gaussian Process Regressor 1 (prepare_data())__

Train Gaussian Regressor on X_train and Y_train obtained through prepare_data()

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
import joblib

kernel = RBF()
gpr = GaussianProcessRegressor(kernel=kernel)
gpr.n_features_in_ = 3
gpr.fit(X_train, Y_train)
joblib.dump(gpr, 'gpr.pkl')


__7. Gaussian Process Regressor 1 Performance__

GPR prediction, RMSE and plot

In [None]:
# Convert predictions back to original scale for plotting
from sklearn.metrics import mean_squared_error

Y_pred = gpr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)

print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

# 3D Plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

ax.scatter(Y_test[:,0], Y_test[:,1], Y_test[:,2], color='b', label='Actual')
ax.scatter(Y_pred[:,0], Y_pred[:,1], Y_pred[:,2], color='r',label='Predicted')
plt.legend()
plt.show()

__8. Gaussian Process Regressor 2 (predict_data_deltas())__

GPR_delta model, trained on X_delta_train and Y_delta_trained

In [33]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
import joblib

kernel = RBF()
gpr_delta = GaussianProcessRegressor(kernel=kernel)
gpr_delta.n_features_in_ = 3
gpr_delta.fit(X_delta_train, Y_delta_train)
joblib.dump(gpr_delta, 'gpr_delta.pkl')

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


GaussianProcessRegressor(kernel=RBF(length_scale=1))

__9. GPR2 Performance__

GPR prediction, RMSE and plot

In [38]:
# Convert predictions back to original scale for plotting
from sklearn.metrics import mean_squared_error

Y_pred = gpr_delta.predict(X_delta_test)

# Evaluate the model
mse = mean_squared_error(Y_delta_test, Y_pred)
rmse = np.sqrt(mse)

print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

Y_test_coord = np.zeros_like(Y_pred)

# Edit points to plot (Y_pred and Y_test are the deltas)
for i in range(len(Y_pred)):
    Y_pred[i] = Y_pred[i] + X_test[i] # X_test contains actual coordinates from which the deltas were derived
    Y_test_coord[i] = Y_delta_test[i] + X_test[i]

# 3D Plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

ax.scatter(Y_test_coord[:,0], Y_test_coord[:,1], Y_test_coord[:,2], color='b', label='Actual')
ax.scatter(Y_pred[:,0], Y_pred[:,1], Y_pred[:,2], color='r',label='Predicted')
plt.legend()
plt.show()

NameError: name 'Y_pred' is not defined