In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler

In [4]:
# Directory and file name of the parquet table to analyse.
data_path = "/home/cal/jvega/Documents/exoplanets/data/"
file = "gridparams_full.parquet.gzip"

# Load the full table into a DataFrame using the fastparquet backend.
df = pd.read_parquet(f"{data_path}{file}", engine='fastparquet')

In [5]:
# Preview the first five rows to confirm the table loaded with the expected columns.
df.head(n=5)

Unnamed: 0,file,radius,period,a,b,t0,t12,t23,t14,u,f,phi,SNR,SNR_prime,RMS_noingress
0,r0.169-a_55.798-b_0.000-period2.415-f0.500-phi...,0.168889,2.41452,55.798032,3.416644e-15,0,0.002327,0.011448,0.016102,0.0,0.5,0.349066,5.912227e-05,5.812407e-05,1.36624e-15
1,r0.071-a_55.798-b_0.433-period0.890-f0.389-phi...,0.070556,0.889517,55.798032,0.4333333,0,0.000398,0.004173,0.004968,0.0,0.388889,0.349066,0.0001081182,8.435717e-05,0.0008254048
2,r0.071-a_2.000-b_0.433-period2621.607-f0.500-p...,0.070556,2621.606561,2.0,0.4333333,0,37.761194,362.739297,438.261684,0.0,0.5,2.792527,0.0005103985,0.0004968444,1.21687e-13
3,r0.005-a_2.000-b_0.578-period48.290-f0.222-phi...,0.005,48.290406,2.0,0.5777778,0,0.054367,6.714642,6.823375,0.0,0.222222,2.094395,1.41128e-07,1.377701e-07,1.760224e-12
4,r0.234-a_7.573-b_0.289-period0.328-f0.444-phi1...,0.234444,0.327701,7.572958,0.2888889,0,0.003413,0.009787,0.016613,0.0,0.444444,1.047198,8.306537e-05,8.20725e-05,1.741142e-05


In [20]:
# Columns used as regression inputs.
feature_columns = ['radius', 'period', 'a', 'b', 'f', 'phi']

# Candidate regression targets, taken unchanged from the table.
Y = df["SNR"]
Y_alt = df["SNR_prime"]
Y_alt_2 = df["RMS_noingress"]

# Standardise every feature column; the result is a NumPy array, not a DataFrame.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/validation/test
# split performed in a later cell, so test-set statistics influence the scaling.
# Confirm this is intended.
X = StandardScaler().fit_transform(df[feature_columns])

In [13]:
# Fixed seed so the train/validation/test partition (and therefore every metric
# computed from it) is identical on each run; without it the rows are reshuffled
# on every execution of this cell.
SPLIT_SEED = 42

# Hold out 40% of the rows, then split that hold-out in half, giving a
# 60% / 20% / 20% train / validation / test partition.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.4, random_state=SPLIT_SEED
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.5, random_state=SPLIT_SEED
)

### Tune hyperparameters

In [None]:
# Kernel ridge regression with randomized hyperparameter search (scikit-learn).
def kernel_ridge_regression(X_train: np.ndarray,
                            y_train: np.ndarray,
                            X_test: np.ndarray,
                            kernel_function,
                            n_iter: int = 100,
                            n_jobs=1,
                            random_state=None):
    """Tune a kernel ridge regressor and predict on ``X_test``.

    A ``RandomizedSearchCV`` (scored by negative mean squared error, default
    cross-validation) samples hyperparameter combinations, refits the best
    one on the full training data and uses it to predict ``X_test``. The
    selected hyperparameters are printed.

    Parameters
    ----------
    X_train : np.ndarray
        Training features passed to ``fit``.
    y_train : np.ndarray
        Training targets passed to ``fit``.
    X_test : np.ndarray
        Features to predict on with the tuned model.
    kernel_function : str
        ``'rbf'`` searches over ``alpha`` and ``gamma``;
        ``'poly'`` searches over ``alpha`` and ``degree``.
    n_iter : int, default=100
        Maximum number of sampled hyperparameter combinations. It is capped
        at the number of distinct combinations in the search space.
    n_jobs : int or None, default=1
        Forwarded to ``RandomizedSearchCV``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``RandomizedSearchCV``; set it for a reproducible search.

    Returns
    -------
    np.ndarray
        Predictions of the best estimator for ``X_test``.

    Raises
    ------
    ValueError
        If ``kernel_function`` is neither ``'poly'`` nor ``'rbf'``.
    """
    # Resolve the kernel first so an unsupported name fails immediately with a
    # clear message instead of an unbound-variable error further down.
    if kernel_function == 'poly':
        banner = "Polynomial kernel"
        kernel_name = 'polynomial'
        kernel_param = 'degree'
        param_distributions = {
            'alpha': np.logspace(-2, 1, 30),
            'degree': np.arange(2, 5),
        }
    elif kernel_function == 'rbf':
        banner = "RBF kernel"
        # Use the built-in string kernel: KernelRidge ignores ``gamma`` for
        # callable kernels (such as gaussian_process.kernels.RBF), which would
        # make the gamma search a no-op.
        kernel_name = 'rbf'
        kernel_param = 'gamma'
        param_distributions = {
            'alpha': np.logspace(-2, 1, 30),
            'gamma': np.logspace(-1, 2, 10),
        }
    else:
        raise ValueError(
            f"Unsupported kernel_function {kernel_function!r}; "
            "expected 'poly' or 'rbf'."
        )

    print(banner)
    kernel_ridge = KernelRidge(kernel=kernel_name)

    # All candidate values are finite lists, so sampling is without
    # replacement; asking for more draws than combinations only triggers a
    # warning, so cap the request at the grid size.
    n_combinations = int(np.prod([len(values) for values in param_distributions.values()]))
    n_draws = min(n_iter, n_combinations)

    # find the best hyperparameters
    kernel_ridge_tuned = RandomizedSearchCV(
        kernel_ridge,
        param_distributions=param_distributions,
        n_iter=n_draws,
        n_jobs=n_jobs,
        scoring='neg_mean_squared_error',
        random_state=random_state,
    )

    kernel_ridge_tuned.fit(X_train, y_train)
    best_params = kernel_ridge_tuned.best_params_
    best_alpha = best_params['alpha']

    if kernel_param == 'gamma':
        best_gamma = best_params['gamma']
        print(f"Best gamma: {best_gamma}")
    else:
        best_degree = best_params['degree']
        print(f"Best degree: {best_degree}")

    # ``predict`` delegates to the best estimator refit on all training data.
    ytest_pred = kernel_ridge_tuned.predict(X_test)
    print(f"Best alpha: {best_alpha}")

    return ytest_pred

In [22]:
# Tune and fit the RBF-kernel model, then score its predictions on the test split.
ytest_pred = kernel_ridge_regression(
    X_train, Y_train, X_test, kernel_function='rbf'
)
mse_rbf = mean_squared_error(y_true=Y_test, y_pred=ytest_pred)
print(f"Mean squared error: {mse_rbf}")

RBF kernel


KeyboardInterrupt: 

In [None]:
# Tune and fit the polynomial-kernel model, then score its predictions on the test split.
ytest_pred = kernel_ridge_regression(
    X_train, Y_train, X_test, kernel_function='poly'
)
mse_poly = mean_squared_error(y_true=Y_test, y_pred=ytest_pred)
print(f"Mean squared error: {mse_poly}")