## Gaussian Process Regression
### 1. Introduction
### 2. Data Preparation

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF 
from sklearn.model_selection import train_test_split

In [65]:
# Load the per-frame red-ball pixel coordinates (u*/v* columns).
# Missing detections are parsed by pandas as np.nan automatically.
red_ball_coordinates = pd.read_csv(
    'data/red_ball_coordinates_60px_026.csv',
    sep=',',
)
red_ball_coordinates

Unnamed: 0,u0,v0,u1,v1,u2,v2,u3,v3,u4,v4,u5,v5,u6,v6
0,,,40.571429,18.714286,,,20.500000,39.500000,41.500000,39.500000,,,,
1,,,40.600000,19.000000,,,20.500000,39.500000,41.500000,39.500000,,,,
2,,,40.500000,19.500000,,,20.500000,39.500000,41.500000,40.000000,,,,
3,,,40.666667,19.666667,,,20.800000,40.600000,42.000000,40.000000,,,,
4,,,40.666667,19.666667,,,21.400000,41.200000,42.200000,41.400000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3036,5.333333,27.333333,41.800000,17.200000,2.666667,47.666667,24.333333,40.000000,42.750000,39.000000,,,49.5,55.0
3037,5.500000,27.500000,41.500000,15.000000,3.000000,47.500000,24.666667,40.333333,42.333333,36.666667,,,49.5,55.0
3038,6.500000,27.500000,41.500000,15.250000,3.000000,48.000000,24.666667,40.333333,44.000000,41.000000,,,49.5,55.0
3039,6.666667,27.666667,41.750000,15.250000,4.400000,49.200000,25.500000,38.000000,42.666667,36.666667,,,49.5,55.0


In [66]:
# Load the x/y/z positions that serve as regression targets.
xyz_coordinates = pd.read_csv(
    'data/positions_xyz_60px_026.csv',
    sep=',',
)
xyz_coordinates

Unnamed: 0,x,y,z
0,7.140000,3.070000,15.00000
1,10.140000,3.070000,15.00000
2,10.140000,3.070000,15.00000
3,10.140000,3.070000,15.00000
4,10.138330,3.169981,15.09998
...,...,...,...
3040,9.429935,5.008091,16.93809
3041,9.364073,5.083331,17.01333
3042,9.295740,5.156333,17.08633
3043,9.225012,5.227018,17.15702


In [67]:
# Remove the first rows of xyz_coordinates so that it has one row per row of
# red_ball_coordinates (the positions file has 4 more rows than the pixel file).
# NOTE(review): presumably the 4 surplus rows are leading rows recorded before
# the first image frame -- confirm against the data acquisition.
N_LEADING_ROWS_TO_DROP = 4

# This cell overwrites its own input, so guard the trim: without the length
# check, every re-run of the cell would silently drop another 4 rows.
if len(xyz_coordinates) != len(red_ball_coordinates):
    xyz_coordinates = (
        xyz_coordinates
        .iloc[N_LEADING_ROWS_TO_DROP:]
        .reset_index(drop=True)  # restart the index at 0 so it matches red_ball_coordinates
    )
xyz_coordinates

Unnamed: 0,x,y,z
0,10.138330,3.169981,15.09998
1,10.133340,3.269852,15.19985
2,10.125010,3.369500,15.29950
3,10.113370,3.468816,15.39882
4,10.098430,3.567688,15.49769
...,...,...,...
3036,9.429935,5.008091,16.93809
3037,9.364073,5.083331,17.01333
3038,9.295740,5.156333,17.08633
3039,9.225012,5.227018,17.15702


In [68]:
# Combine the pixel coordinates and the xyz positions column-wise.
# pd.concat(axis=1) aligns on the index, so both frames must have the same
# number of rows; fail loudly instead of silently leaving an empty list that
# only breaks later with an unrelated-looking error.
n_ball_rows = red_ball_coordinates.shape[0]
n_xyz_rows = xyz_coordinates.shape[0]
if n_ball_rows != n_xyz_rows:
    raise ValueError(
        "Cannot combine data: red_ball_coordinates has "
        f"{n_ball_rows} rows but xyz_coordinates has {n_xyz_rows} rows."
    )

combined_data = pd.concat([red_ball_coordinates, xyz_coordinates], axis=1)
combined_data

Unnamed: 0,u0,v0,u1,v1,u2,v2,u3,v3,u4,v4,u5,v5,u6,v6,x,y,z
0,,,40.571429,18.714286,,,20.500000,39.500000,41.500000,39.500000,,,,,10.138330,3.169981,15.09998
1,,,40.600000,19.000000,,,20.500000,39.500000,41.500000,39.500000,,,,,10.133340,3.269852,15.19985
2,,,40.500000,19.500000,,,20.500000,39.500000,41.500000,40.000000,,,,,10.125010,3.369500,15.29950
3,,,40.666667,19.666667,,,20.800000,40.600000,42.000000,40.000000,,,,,10.113370,3.468816,15.39882
4,,,40.666667,19.666667,,,21.400000,41.200000,42.200000,41.400000,,,,,10.098430,3.567688,15.49769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3036,5.333333,27.333333,41.800000,17.200000,2.666667,47.666667,24.333333,40.000000,42.750000,39.000000,,,49.5,55.0,9.429935,5.008091,16.93809
3037,5.500000,27.500000,41.500000,15.000000,3.000000,47.500000,24.666667,40.333333,42.333333,36.666667,,,49.5,55.0,9.364073,5.083331,17.01333
3038,6.500000,27.500000,41.500000,15.250000,3.000000,48.000000,24.666667,40.333333,44.000000,41.000000,,,49.5,55.0,9.295740,5.156333,17.08633
3039,6.666667,27.666667,41.750000,15.250000,4.400000,49.200000,25.500000,38.000000,42.666667,36.666667,,,49.5,55.0,9.225012,5.227018,17.15702


### 3. Data Preprocessing

In [69]:
# Split into features and targets by column *name* rather than by position,
# so the split does not depend on a magic column count and raises a KeyError
# if an expected column is missing.
# Features: pixel coordinates u0, v0, u1, v1, ..., u6, v6 (same order as in the file).
feature_columns = [f'{axis}{k}' for k in range(7) for axis in ('u', 'v')]
# Targets: the position columns.
target_columns = ['x', 'y', 'z']

X = combined_data[feature_columns]
y = combined_data[target_columns]

In [70]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Standardize the features and the targets with *separate* scalers.
# Re-fitting a single scaler on y would overwrite the feature statistics, so
# new inputs could no longer be transformed consistently and predictions could
# not be mapped back to the original units via y_scaler.inverse_transform.
# StandardScaler ignores NaNs when fitting and keeps them in the output, so the
# missing values are still present for the imputation steps that follow.
# NOTE(review): both scalers are fitted on the full data set before the
# train/test split, so test-set statistics leak into training -- consider
# fitting on the training split only.
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)
y_scaled = y_scaler.fit_transform(y)

# Backward-compatible alias: the original cell left `scaler` fitted on y.
scaler = y_scaler

#### 3.1. Impute the NaN values with the column mean (SimpleImputer)

In [71]:
# Replace every NaN by the mean of its (standardized) column.
# The same imputer object is fitted first on the features, then on the targets.
imputer = SimpleImputer(strategy='mean')
X_imputed_mean = imputer.fit(X_scaled).transform(X_scaled)
y_imputed_mean = imputer.fit(y_scaled).transform(y_scaled)

#### 3.2. Impute the NaN values with the column median (SimpleImputer)

In [72]:
# Replace every NaN by the median of its (standardized) column.
# The same imputer object is fitted first on the features, then on the targets.
imputer = SimpleImputer(strategy='median')
X_imputed_median = imputer.fit(X_scaled).transform(X_scaled)
y_imputed_median = imputer.fit(y_scaled).transform(y_scaled)

#### 3.3. Impute the NaN values with KNNImputer

In [73]:
from sklearn.impute import KNNImputer

# Fill each NaN from the 5 nearest neighbouring rows.
# NOTE(review): unlike the mean/median variants, this runs on the unscaled
# X and y rather than X_scaled / y_scaled -- confirm this is intended.
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit(X).transform(X)
y_imputed = imputer.fit(y).transform(y)

### 4. Gaussian Process Regression

In [74]:
# Kernel: constant amplitude (initial value 1.0) times an isotropic RBF
# (initial length scale 1.0); both hyperparameters are tuned during fit().
kernel = 1.0 * RBF(length_scale=1.0)

# The optimizer is restarted 10 times from randomly drawn hyperparameters.
# Fix random_state so those draws -- and therefore the fitted model -- are
# reproducible across Restart & Run All.
gp = GaussianProcessRegressor(
    kernel=kernel,
    optimizer='fmin_l_bfgs_b',
    n_restarts_optimizer=10,
    random_state=42,
)

In [None]:
# Hold out 20 % of the mean-imputed samples for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed_mean,
    y_imputed_mean,
    test_size=0.2,
    random_state=42,
)

# Fit the Gaussian process on the training split; as the last expression of
# the cell, the fitted estimator is also displayed.
gp.fit(X=X_train, y=y_train)

In [ ]:
# Make predictions on the test data (posterior mean and standard deviation).
y_pred, sigma = gp.predict(X_test, return_std=True)

# Visualize the results.
# The model maps 14 features to 3 targets, so a 1-D input grid cannot be
# passed to gp.predict and X_train cannot be scattered against y_train.
# Instead, compare predictions with the held-out targets, one panel per target.
y_true = np.asarray(y_test)
if y_true.ndim == 1:
    y_true = y_true[:, np.newaxis]
y_pred = np.asarray(y_pred).reshape(y_true.shape)

# Depending on the scikit-learn version, the std is returned per sample or per
# sample and target; broadcast it to the shape of the predictions.
sigma = np.asarray(sigma)
if sigma.ndim == 1:
    sigma = sigma[:, np.newaxis]
sigma = np.broadcast_to(sigma, y_pred.shape)

n_targets = y_true.shape[1]
target_names = ['x', 'y', 'z'] if n_targets == 3 else [f'target {j}' for j in range(n_targets)]

fig, axes = plt.subplots(1, n_targets, figsize=(5 * n_targets, 5), squeeze=False)
for j, ax in enumerate(axes[0]):
    # Sort the test samples by their true value so the curves are readable.
    order = np.argsort(y_true[:, j])
    sample_rank = np.arange(order.size)
    mean_j = y_pred[order, j]
    half_width = 1.96 * sigma[order, j]  # 95 % interval of a Gaussian

    ax.scatter(sample_rank, y_true[order, j], c='r', s=8, label='Test Data')
    ax.plot(sample_rank, mean_j, 'k', lw=1, zorder=9, label='Predicted Mean')
    ax.fill_between(sample_rank, mean_j - half_width, mean_j + half_width,
                    alpha=0.2, color='k', label='95% Confidence Interval')
    ax.set_title(f'Target {target_names[j]}')
    ax.set_xlabel('Test sample (sorted by true value)')
    ax.set_ylabel(f'{target_names[j]} (standardized)')
    ax.legend()
fig.tight_layout()
plt.show()