## Gaussian Process Regression
### 1. Introduction
### 2. Data Preparation

In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF 
from sklearn.model_selection import train_test_split

In [93]:
# pandas will automatically convert the 'NaN' values to np.nan
red_ball_coordinates = pd.read_csv('data/red_ball_coordinates_40px_022.csv', delimiter=',')
red_ball_coordinates

Unnamed: 0,u0,v0,u1,v1,u2,v2,u3,v3,u4,v4,u5,v5,u6,v6
0,,,,,,,,,,,,,,
1,,,,,,,6.500000,32.000000,21.000000,30.000000,,,,
2,,,20.5,18.0,,,7.000000,32.000000,,,,,,
3,,,20.5,18.0,,,6.500000,32.000000,20.000000,31.500000,,,,
4,,,20.5,20.0,,,6.333333,33.666667,20.333333,31.333333,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,,,,,,,,,,,,,20.000000,37.500000
1001,,,,,,,,,,,,,20.000000,37.500000
1002,,,,,,,,,,,,,20.000000,37.500000
1003,,,,,,,,,,,,,20.000000,37.500000


In [94]:
xyz_coordinates = pd.read_csv('data/positions_xyz_40px_022.csv', delimiter=',')
xyz_coordinates

Unnamed: 0,x,y,z
0,6.000000,4.000000,15.00000
1,6.000000,4.000000,15.00000
2,6.000000,4.000000,15.00000
3,6.000000,4.000000,15.00000
4,6.016666,4.199852,15.04667
...,...,...,...
1004,5.030804,1.933471,12.28625
1005,5.014458,1.793189,12.24048
1006,4.998124,1.662711,12.19475
1007,4.981801,1.542617,12.14904


In [95]:
# remove the first rows of xyz_coordinates
xyz_coordinates = xyz_coordinates.iloc[4:]
xyz_coordinates = xyz_coordinates.reset_index(drop=True)
xyz_coordinates

Unnamed: 0,x,y,z
0,6.016666,4.199852,15.04667
1,6.033333,4.398816,15.09333
2,6.049999,4.596008,15.14000
3,6.066665,4.790552,15.18666
4,6.083330,4.981584,15.23332
...,...,...,...
1000,5.030804,1.933471,12.28625
1001,5.014458,1.793189,12.24048
1002,4.998124,1.662711,12.19475
1003,4.981801,1.542617,12.14904


In [96]:
combined_data = []
# combine the two dataframes
if red_ball_coordinates.shape[0] == xyz_coordinates.shape[0]:
    combined_data = pd.concat([red_ball_coordinates, xyz_coordinates], axis=1)
combined_data

Unnamed: 0,u0,v0,u1,v1,u2,v2,u3,v3,u4,v4,u5,v5,u6,v6,x,y,z
0,,,,,,,,,,,,,,,6.016666,4.199852,15.04667
1,,,,,,,6.500000,32.000000,21.000000,30.000000,,,,,6.033333,4.398816,15.09333
2,,,20.5,18.0,,,7.000000,32.000000,,,,,,,6.049999,4.596008,15.14000
3,,,20.5,18.0,,,6.500000,32.000000,20.000000,31.500000,,,,,6.066665,4.790552,15.18666
4,,,20.5,20.0,,,6.333333,33.666667,20.333333,31.333333,,,,,6.083330,4.981584,15.23332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,,,,,,,,,,,,,20.000000,37.500000,5.030804,1.933471,12.28625
1001,,,,,,,,,,,,,20.000000,37.500000,5.014458,1.793189,12.24048
1002,,,,,,,,,,,,,20.000000,37.500000,4.998124,1.662711,12.19475
1003,,,,,,,,,,,,,20.000000,37.500000,4.981801,1.542617,12.14904


### 3. Data Preprocessing

In [97]:
# split into features and target
X = combined_data.iloc[:, :14]
y = combined_data.iloc[:, 14:]

### 4. Gaussian Process Regression

In [98]:
kernel = 1.0 * RBF(length_scale=1.0)
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
gp.fit(X_train, y_train)

ValueError: Input X contains NaN.
GaussianProcessRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values