### Imports and Load Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('ALL_data.csv')

In [3]:
data.head()

Unnamed: 0,sample_id,OC,spc.X7498,spc.X7496.1,spc.X7494.2,spc.X7492.3,spc.X7490.3,spc.X7488.4,spc.X7486.5,spc.X7484.5,...,spc.X1699,spc.X1697.1,spc.X1695.2,spc.X1693.2,spc.X1691.3,spc.X1689.4,spc.X1687.4,spc.X1685.5,spc.X1683.6,spc.X1681.7
0,228241,2.78,0.597123,0.585532,0.57542,0.577522,0.589599,0.592888,0.577229,0.560463,...,1.497683,1.511603,1.526784,1.541859,1.554136,1.565369,1.574856,1.583577,1.59743,1.613562
1,228242,2.82,0.574771,0.563346,0.553535,0.555874,0.568379,0.572621,0.55797,0.541848,...,1.491209,1.504699,1.519252,1.533783,1.545827,1.556991,1.566309,1.574412,1.587285,1.602902
2,228243,2.75,0.543495,0.532264,0.521497,0.523761,0.537179,0.541816,0.526906,0.509963,...,1.455947,1.46976,1.484671,1.499452,1.511718,1.523022,1.532564,1.541187,1.554432,1.569966
3,228244,2.37,0.510573,0.49801,0.487154,0.489219,0.501669,0.505493,0.490587,0.474807,...,1.496655,1.510801,1.526071,1.541353,1.554054,1.565749,1.575605,1.584386,1.59789,1.614017
4,228245,2.6,0.498573,0.485602,0.474588,0.477919,0.492342,0.497191,0.481731,0.464248,...,1.463118,1.476103,1.490475,1.505019,1.517131,1.528273,1.537486,1.5456,1.558462,1.573774


### PLSR Model

In [124]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

X = data.drop(['sample_id', 'OC'], axis=1).copy().values
y = np.sqrt(data['OC'].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=96819)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [129]:
kf = KFold(n_splits=10, shuffle=True, random_state=96819)
param_grid = {'n_components':range(20,30)}

pls = PLSRegression()

searcher = GridSearchCV(pls, param_grid=param_grid, cv=kf, scoring='neg_mean_squared_error')
searcher.fit(X_train, y_train)
best_model = searcher.best_estimator_
print(searcher.best_params_)

{'n_components': 26}


In [126]:
y_pred = best_model.predict(X_test)
print('RMSE:')
print(np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE
0.2257637709655425


In [127]:
print('R Squared:')
print(r2_score(y_test, y_pred))

R Squared
0.9674808771824825
