## Gaussian Process Regression
### 1. Introduction
### 2. Data Preparation

In [1]:
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF 

In [2]:
# pandas will automatically convert the 'NaN' values to np.nan
red_ball_coordinates = pd.read_csv('../test-data/red_ball_coordinates_128px_2024-02-27_11h29m.csv', delimiter=',')
red_ball_coordinates

Unnamed: 0,u0,v0,u1,v1,u2,v2,u3,v3,u4,v4,u5,v5,u6,v6
0,0.0,42.0,87.0,39.0,,,44.0,83.0,89.0,83.0,,,69.0,122.0
1,0.5,42.5,87.0,40.0,,,45.0,84.0,89.0,84.0,,,68.5,120.5
2,0.5,43.5,87.0,41.0,,,45.0,85.0,89.0,85.0,,,68.5,120.5
3,1.0,45.0,87.5,42.0,,,45.0,86.0,89.0,86.0,,,68.5,120.5
4,1.0,45.5,88.0,43.0,,,45.0,86.0,90.0,87.0,,,68.5,120.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,68.0,10.0,55.0,26.0,64.0,59.0,111.0,59.0,59.5,69.0,54.5,105.0,68.0,118.5
497,67.0,9.0,55.0,26.0,63.0,58.0,110.0,58.0,59.5,69.0,54.0,103.0,68.0,118.5
498,66.0,8.0,55.0,26.0,62.0,57.0,109.0,57.0,59.5,69.0,53.0,102.5,68.0,118.5
499,65.0,5.0,55.0,26.0,61.0,55.0,107.5,55.0,59.5,69.0,52.0,101.0,68.0,118.5


In [3]:
xyz_coordinates = pd.read_csv('../data/positions_xyz_128px_017.csv', delimiter=',')
xyz_coordinates

Unnamed: 0,x,y,z
0,6.000000,4.000000,14.00000
1,6.000000,7.000000,14.00000
2,6.000000,7.000000,14.00000
3,6.000000,7.000000,14.00000
4,6.169909,6.993336,14.07999
...,...,...,...
1872,3.642321,5.456923,12.76173
1873,3.751164,5.628386,12.83502
1874,3.867227,5.792615,12.90916
1875,3.990135,5.948880,12.98405


In [4]:
xyz_coordinates = xyz_coordinates.reset_index(drop=True)
# remove the first 4 rows
xyz_coordinates = xyz_coordinates.iloc[4:]
xyz_coordinates

Unnamed: 0,x,y,z
4,6.169909,6.993336,14.07999
5,6.339272,6.973373,14.15992
6,6.507547,6.940200,14.23974
7,6.674192,6.893964,14.31939
8,6.838673,6.834871,14.39882
...,...,...,...
1872,3.642321,5.456923,12.76173
1873,3.751164,5.628386,12.83502
1874,3.867227,5.792615,12.90916
1875,3.990135,5.948880,12.98405


### 3. Data Preprocessing

In [5]:
# split into features and target
X = red_ball_coordinates
# remove the first 3
y = xyz_coordinates

In [6]:
from sklearn.impute import SimpleImputer
# scale the features and the target
from sklearn.preprocessing import StandardScaler

#### 3.1. Impute the NaN values with mean SimpleImputer

In [7]:
imputer = SimpleImputer(strategy='mean')
X_imputed_mean = imputer.fit_transform(X)
y_imputed_mean = imputer.fit_transform(y)

#### 3.2. Impute the NaN values with median SimpleImputer

In [8]:
imputer = SimpleImputer(strategy='median')
X_imputed_median = imputer.fit_transform(X)
y_imputed_median = imputer.fit_transform(y)

### 3.3. Impute the NaN values with KNNImputer

In [9]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
X_imputed_knn = imputer.fit_transform(X)

### 4. Gaussian Process Regression

In [10]:
total_samples = len(X_imputed_mean)
train_samples = int(0.8 * total_samples)

# Split the data
X_train = X_imputed_mean[:train_samples]
y_train = y[:train_samples]

X_test = X_imputed_mean[train_samples:]
y_test = y[train_samples:]

In [11]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()
kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=20)

In [12]:
X_scaled_train = scaler_X.fit_transform(X_train)
y_scaled_train = scaler_y.fit_transform(y_train)

gp.fit(X_scaled_train, y_scaled_train)



### 5. Predictions

In [13]:
X_test_scaled = scaler_X.transform(X_test)
y_pred, sigma = gp.predict(X_test_scaled, return_std=True)
y_pred = scaler_y.inverse_transform(y_pred)
y_pred

array([[ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.36852505],
       [ 6.23323086,  4.10877013, 14.368

### 6. Conclusion

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print(f'R2 Squared: {r2_score(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')    

ValueError: Found input variables with inconsistent numbers of samples: [1473, 101]

#### 6.1 Visualize the results
#### 6.1.1 3D Visualization of the predicted ball trajectory

In [None]:
from utils.plot_maker_gaussian_process import plot_maker_3d
import numpy as np

plot_maker_3d(np.array(y_test), np.array(y_pred), '3d Scatter plot subset 1')

In [None]:
y_pred = np.array(y_pred)
sigma = np.array(sigma)
y_test = np.array(y_test)

In [None]:
# Get the data for each dimension (x, y, z)
y_pred_x, y_pred_y, y_pred_z = y_pred[:, 0], y_pred[:, 1], y_pred[:, 2]
sigma_x, sigma_y, sigma_z = sigma[:, 0], sigma[:, 1], sigma[:, 2]
y_test_x, y_test_y, y_test_z = y_test[:, 0], y_test[:, 1], y_test[:, 2]

#### 6.1.2 Gaussian Process In Time with Covariance For Each Dimension

In [None]:
from utils.plot_maker_gaussian_process import gaussian_process_plot
# for the x dimension
gaussian_process_plot(y_test_x, y_pred_x, sigma_x)

In [None]:
# for the y dimension
gaussian_process_plot(y_test_y, y_pred_y, sigma_y)

In [None]:
# for the z dimension
gaussian_process_plot(y_test_z[:100], y_pred_z[:100], sigma_z[:100])