## Gaussian Process Regression
### 1. Introduction
### 2. Data Preparation

In [7]:
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split

In [8]:
# Load the pixel-coordinate table (columns u0, v0, ..., u6, v6).
# Missing / 'NaN' entries in the CSV are parsed by pandas as np.nan.
red_ball_coordinates = pd.read_csv(
    '../data/red_ball_coordinates_128px_full.csv',
    sep=',',
)
red_ball_coordinates

Unnamed: 0,u0,v0,u1,v1,u2,v2,u3,v3,u4,v4,u5,v5,u6,v6
0,,,45.038462,83.307692,86.925926,39.333333,0.333333,41.500000,,,88.136364,82.727273,68.5,118.5
1,,,43.350000,83.850000,87.000000,40.111111,0.444444,42.555556,,,89.500000,84.500000,68.5,118.5
2,,,43.791667,84.666667,87.310345,41.034483,0.444444,43.444444,,,89.473684,85.631579,68.5,118.5
3,,,43.411765,85.705882,87.400000,41.400000,0.666667,44.500000,,,89.777778,86.500000,68.5,118.5
4,,,44.000000,86.285714,87.740741,42.000000,1.062500,45.437500,,,89.944444,87.222222,68.5,118.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25352,68.500000,118.166667,60.500000,69.000000,,,86.988235,82.435294,55.0,24.0,,,,
25353,68.500000,118.166667,60.500000,69.000000,,,92.730337,87.629213,55.0,24.0,,,,
25354,68.571429,118.857143,60.500000,69.000000,,,110.329545,101.409091,55.0,24.0,,,,
25355,68.571429,118.857143,60.500000,69.000000,,,104.370787,97.269663,55.0,24.0,,,,


In [9]:
# Load the 3D positions (columns x, y, z) that serve as regression targets.
xyz_coordinates = pd.read_csv(
    '../data/positions_xyz_128px_full.csv',
    sep=',',
)
xyz_coordinates

Unnamed: 0,x,y,z
0,10.138330,3.169981,15.099980
1,10.133340,3.269852,15.199850
2,10.125010,3.369500,15.299500
3,10.113370,3.468816,15.398820
4,10.098430,3.567688,15.497690
...,...,...,...
25352,5.115520,4.872968,9.123255
25353,4.845301,5.128964,8.955405
25354,4.577969,5.373679,8.790463
25355,4.314191,5.604666,8.628434


In [10]:
# Replace the row index with a fresh 0..n-1 index; drop=True discards the old
# index instead of inserting it as a new column.
# NOTE(review): the frame comes straight from pd.read_csv without index_col, so
# this looks like a no-op kept as a safeguard before the column-wise concat - confirm.
xyz_coordinates = xyz_coordinates.reset_index(drop=True)
xyz_coordinates

Unnamed: 0,x,y,z
0,10.138330,3.169981,15.099980
1,10.133340,3.269852,15.199850
2,10.125010,3.369500,15.299500
3,10.113370,3.468816,15.398820
4,10.098430,3.567688,15.497690
...,...,...,...
25352,5.115520,4.872968,9.123255
25353,4.845301,5.128964,8.955405
25354,4.577969,5.373679,8.790463
25355,4.314191,5.604666,8.628434


In [11]:
# Join the pixel-coordinate features and the xyz targets column-wise so that
# row i of the result pairs the features of sample i with its 3D position.
n_feature_rows = red_ball_coordinates.shape[0]
n_target_rows = xyz_coordinates.shape[0]
if n_feature_rows != n_target_rows:
    # Fail loudly instead of leaving `combined_data` as a non-DataFrame
    # placeholder, which would only surface later as an unrelated `.iloc` error.
    raise ValueError(
        f'Row count mismatch: red_ball_coordinates has {n_feature_rows} rows, '
        f'xyz_coordinates has {n_target_rows} rows'
    )
combined_data = pd.concat([red_ball_coordinates, xyz_coordinates], axis=1)
combined_data

Unnamed: 0,u0,v0,u1,v1,u2,v2,u3,v3,u4,v4,u5,v5,u6,v6,x,y,z
0,,,45.038462,83.307692,86.925926,39.333333,0.333333,41.500000,,,88.136364,82.727273,68.5,118.5,10.138330,3.169981,15.099980
1,,,43.350000,83.850000,87.000000,40.111111,0.444444,42.555556,,,89.500000,84.500000,68.5,118.5,10.133340,3.269852,15.199850
2,,,43.791667,84.666667,87.310345,41.034483,0.444444,43.444444,,,89.473684,85.631579,68.5,118.5,10.125010,3.369500,15.299500
3,,,43.411765,85.705882,87.400000,41.400000,0.666667,44.500000,,,89.777778,86.500000,68.5,118.5,10.113370,3.468816,15.398820
4,,,44.000000,86.285714,87.740741,42.000000,1.062500,45.437500,,,89.944444,87.222222,68.5,118.5,10.098430,3.567688,15.497690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25352,68.500000,118.166667,60.500000,69.000000,,,86.988235,82.435294,55.0,24.0,,,,,5.115520,4.872968,9.123255
25353,68.500000,118.166667,60.500000,69.000000,,,92.730337,87.629213,55.0,24.0,,,,,4.845301,5.128964,8.955405
25354,68.571429,118.857143,60.500000,69.000000,,,110.329545,101.409091,55.0,24.0,,,,,4.577969,5.373679,8.790463
25355,68.571429,118.857143,60.500000,69.000000,,,104.370787,97.269663,55.0,24.0,,,,,4.314191,5.604666,8.628434


### 3. Data Preprocessing

In [12]:
# Separate the model inputs from the regression targets by column position:
# the leading 14 columns are the pixel coordinates (u0, v0, ..., u6, v6),
# everything after them (x, y, z) is the target.
n_feature_columns = 14
X = combined_data.iloc[:, :n_feature_columns]
y = combined_data.iloc[:, n_feature_columns:]

In [13]:
from sklearn.impute import SimpleImputer
# scale the features and the target
from sklearn.preprocessing import StandardScaler

#### 3.1. Impute the NaN values with the column mean (SimpleImputer)

In [14]:
# Mean-impute missing entries column by column.
# Features and targets each get their own imputer: calling fit_transform on a
# single shared instance would refit it on y and discard the feature means,
# leaving `imputer` unusable for transforming further feature data.
# NOTE: both imputers are fitted on the complete dataset, i.e. before any
# train/test split has been made.
imputer = SimpleImputer(strategy='mean')
X_imputed_mean = imputer.fit_transform(X)
target_imputer_mean = SimpleImputer(strategy='mean')
y_imputed_mean = target_imputer_mean.fit_transform(y)

#### 3.2. Impute the NaN values with the column median (SimpleImputer)

In [15]:
# Median-impute missing entries column by column.
# Features and targets each get their own imputer: calling fit_transform on a
# single shared instance would refit it on y and discard the feature medians,
# leaving `imputer` unusable for transforming further feature data.
# NOTE: both imputers are fitted on the complete dataset, i.e. before any
# train/test split has been made.
imputer = SimpleImputer(strategy='median')
X_imputed_median = imputer.fit_transform(X)
target_imputer_median = SimpleImputer(strategy='median')
y_imputed_median = target_imputer_median.fit_transform(y)

#### 3.3. Impute the NaN values with KNNImputer

In [16]:
from sklearn.impute import KNNImputer

# Fill each missing feature value from the 5 nearest rows (KNNImputer).
knn_neighbors = 5
imputer = KNNImputer(n_neighbors=knn_neighbors)
imputer.fit(X)
X_imputed_knn = imputer.transform(X)

### 4. Gaussian Process Regression

In [17]:
# Hold out 20% of the samples for evaluation.
# The split is made on the raw features and the mean imputer is fitted on the
# training rows only, so no statistics of the test rows leak into the model
# inputs. The same random_state keeps the split itself reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_imputer = SimpleImputer(strategy='mean')
X_train = train_imputer.fit_transform(X_train_raw)
X_test = train_imputer.transform(X_test_raw)

# Separate scalers so features and targets are standardised independently.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Constant * RBF kernel; both hyperparameters are tuned during gp.fit.
kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5))
# random_state makes the randomly initialised optimizer restarts reproducible.
# NOTE(review): exact GP training cost grows cubically with the number of
# training rows and is repeated for each of the 20 restarts - confirm this is
# tractable on the full training split.
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=20, random_state=42)

In [None]:
# Learn the standardisation statistics from the training split, then apply them.
scaler_X.fit(X_train)
X_scaled_train = scaler_X.transform(X_train)
scaler_y.fit(y_train)
y_scaled_train = scaler_y.transform(y_train)

# Fit the Gaussian process on the standardised training data.
gp.fit(X_scaled_train, y_scaled_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


### 5. Predictions

In [None]:
# Scale the test features with the statistics learned on the training split.
X_test_scaled = scaler_X.transform(X_test)
# The GP was trained on standardised targets, so the predicted mean and the
# predictive standard deviation both come back in standardised units.
y_pred, sigma = gp.predict(X_test_scaled, return_std=True)
y_pred = scaler_y.inverse_transform(y_pred)
# A standard deviation is unaffected by the mean shift; only the per-target
# scale factor has to be undone to express sigma in the same units as y_pred.
if sigma.ndim == 1:
    # NOTE(review): older scikit-learn releases may return one std per sample
    # rather than one per target; add a target axis so the scaling broadcasts.
    sigma = sigma[:, None]
sigma = sigma * scaler_y.scale_
y_pred

### 6. Conclusion

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Report each regression metric on the held-out split, one line per metric.
metric_functions = (
    ('R2 Squared', r2_score),
    ('MSE', mean_squared_error),
    ('MAE', mean_absolute_error),
)
for metric_label, metric_fn in metric_functions:
    print(f'{metric_label}: {metric_fn(y_test, y_pred)}')

#### 6.1 Visualize the results
#### 6.1.1 3D Visualization of the predicted ball trajectory

In [None]:
from utils.plot_maker_gaussian_process import plot_maker_3d
import numpy as np

# Draw true vs. predicted positions for three row selections of the test split.
# The first selection is the complete test split; the other two are 20-row windows.
plot_selections = (
    (slice(None), '3d Scatter plot subset 1'),
    (slice(100, 120), '3d Scatter plot subset 2'),
    (slice(200, 220), '3d Scatter plot subset 3'),
)
for row_window, plot_title in plot_selections:
    plot_maker_3d(np.array(y_test[row_window]), np.array(y_pred[row_window]), plot_title)

In [None]:
# Split predictions, predictive std and ground truth into their x / y / z columns.
# y_test comes out of train_test_split as a DataFrame, and tuple indexing such
# as y_test[:, 0] is not valid on a DataFrame, so convert it to an ndarray first
# (np.asarray leaves an existing ndarray untouched).
y_test_values = np.asarray(y_test)
y_pred_x, y_pred_y, y_pred_z = y_pred[:, 0], y_pred[:, 1], y_pred[:, 2]
sigma_x, sigma_y, sigma_z = sigma[:, 0], sigma[:, 1], sigma[:, 2]
y_test_x, y_test_y, y_test_z = y_test_values[:, 0], y_test_values[:, 1], y_test_values[:, 2]

#### 6.1.2 Gaussian Process Predictions with Predictive Standard Deviation for Each Dimension

In [None]:
from utils.plot_maker_gaussian_process import gaussian_process_plot
# x dimension: ground truth, prediction and predictive std for the first 100 test rows
first_rows = slice(0, 100)
gaussian_process_plot(y_test_x[first_rows], y_pred_x[first_rows], sigma_x[first_rows])

In [None]:
# y dimension: ground truth, prediction and predictive std for the first 100 test rows
first_rows = slice(0, 100)
gaussian_process_plot(y_test_y[first_rows], y_pred_y[first_rows], sigma_y[first_rows])

In [None]:
# z dimension: ground truth, prediction and predictive std for the first 100 test rows
first_rows = slice(0, 100)
gaussian_process_plot(y_test_z[first_rows], y_pred_z[first_rows], sigma_z[first_rows])