# Read input files and define function

In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle 

#==============================================================================
#  Define EMSC
#==============================================================================
#%% Extended multiplicative signal correction
def EMSC(X, reference, degree=4):
    """Extended multiplicative signal correction against a reference spectrum.

    Every row of ``X`` is fitted by least squares as a multiple of
    ``reference`` plus a polynomial baseline. The fitted baseline is then
    subtracted and the remainder is divided by the fitted reference
    coefficient.

    Parameters
    ----------
    X : 2-D array
        One spectrum per row, with ``len(reference)`` columns.
    reference : 1-D array
        Spectrum that every row is scaled towards.
    degree : int, default 4
        Number of polynomial baseline terms. The terms are the powers
        ``0 .. degree - 1`` evaluated on an evenly spaced grid from -1 to 1,
        so the highest power used is ``degree - 1``.

    Returns
    -------
    2-D array with the same shape as ``X`` holding the corrected spectra.
    """
    axis = np.linspace(-1, 1, len(reference))
    # Coefficients [1, 0, ..., 0] with `power` zeros evaluate to axis**power
    baseline_terms = [
        np.polyval([1] + [0] * power, axis) for power in range(degree)
    ]
    # Row 0 is the reference, the remaining rows are the baseline terms
    design = np.vstack([reference, np.vstack(baseline_terms)])
    # One column of coefficients per spectrum in X
    coefs = np.linalg.lstsq(design.T, X.T, rcond=None)[0]
    fitted_baseline = coefs[1:, :].T @ design[1:, :]
    reference_scale = coefs[:1, :].T
    return (X - fitted_baseline) / reference_scale


#==============================================================================
#  Read train and test file
#==============================================================================
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load train.pkl / test.pkl when they come from a trusted source.
# The context managers close each file handle as soon as it has been read.
with open("train.pkl", "rb") as pickle_train:
    train_object = pickle.load(pickle_train)
with open("test.pkl", "rb") as pickle_test:
    test_object = pickle.load(pickle_test)

#==============================================================================
#  Read dictionary objects into arrays and Matrices
#==============================================================================
# Shift axis flattened to 1-D; later used to build boolean column masks
columns = train_object['shifts'].flatten()

# Training entries: spectra, target values and replicate identifiers
X_train, y_train, replicates_train = (
    train_object[key] for key in ('RamanCal', 'IodineCal', 'repCal')
)

# Test entries: spectra and replicate identifiers (no target is read)
X_test, replicates_test = (
    test_object[key] for key in ('RamanVal', 'repVal')
)

# Technical preprocessing

In [2]:
#==============================================================================
#  Keep only the shifts between 500 and 3100 - train and test
#==============================================================================
# One boolean mask over the shift axis, shared by both data sets
in_range = (columns >= 500) & (columns <= 3100)
colnames = columns[in_range]

X_cut_train = X_train[:, in_range]
X_cut_test = X_test[:, in_range]

# Train and test are both corrected against the same reference spectrum:
# row 1343 of the cut training spectra.
emsc_reference = X_cut_train[1343, :]
X_emsc_train = EMSC(X_cut_train, emsc_reference, degree=7)
X_emsc_test = EMSC(X_cut_test, emsc_reference, degree=7)

# Creating DataFrames which are used later

In [3]:
#==============================================================================
#  Create dataframes for data visualization
#==============================================================================
# Column labels: replicate id, one label per retained shift, then the target
col_str = ['replicates', *(str(shift) for shift in colnames), 'Iodine']

# The test frame carries no target, so it uses every label except 'Iodine'
test_values = np.concatenate(
    [replicates_test[:, np.newaxis], X_emsc_test], axis=1)
test_df = pd.DataFrame(test_values, columns=col_str[:-1])

train_values = np.concatenate(
    [replicates_train[:, np.newaxis], X_emsc_train, y_train], axis=1)
train_df = pd.DataFrame(train_values, columns=col_str)

In [18]:
# Average the rows that share a replicate id; with the groupby defaults the
# 'replicates' column becomes the index of the result.
train_avg_df = train_df.groupby('replicates').mean()
test_avg_df = test_df.groupby('replicates').mean()

# PLS Regression as an intermediate step - Regularized
## Third best result

In [19]:
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet

from sklearn.pipeline import  make_pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from sklearn.cross_decomposition import PLSRegression

In [20]:
class PLSRegressionWrapper(PLSRegression):
    """PLSRegression variant whose ``fit_transform`` returns ``transform(X)``.

    NOTE(review): presumably this exists so the model can act as a
    transformer step that emits only the X projection (see the commented-out
    pipeline cell below) - confirm.
    """

    def fit_transform(self, X, Y):
        """Fit the model on ``(X, Y)`` and return ``self.transform(X)``."""
        self.fit(X, Y)
        return self.transform(X)

    def transform(self, X):
        """Delegate to ``PLSRegression.transform``, passing ``X`` only."""
        return super().transform(X)

In [22]:
# my_pipe = make_pipeline(PLSRegressionWrapper(n_components=4, scale=False), LinearRegression())
# my_pipe.fit(X_train,y_train)

In [23]:
# 14-component PLS without internal scaling; the remaining settings are
# passed explicitly rather than left to the library defaults.
pls_wrap = PLSRegressionWrapper(
    n_components=14,
    scale=False,
    max_iter=100,
    tol=1e-06,
    copy=True,
)

In [26]:
# Fit PLS on the replicate-averaged training frame. Its last column is
# 'Iodine' (the target), so it is excluded from the predictors.
train_predictors = train_avg_df.iloc[:, :-1].values
train_target = train_avg_df['Iodine'].values
pls_train = pls_wrap.fit_transform(train_predictors, train_target)
pls_train.shape

(155, 14)

In [28]:
# Project the replicate-averaged test spectra with the already fitted PLS model
pls_test = pls_wrap.transform(test_avg_df.values)
pls_test.shape

(77, 14)

In [30]:
# Regression target: the replicate-averaged 'Iodine' column as a plain array
y = train_avg_df.loc[:, 'Iodine'].values

In [31]:
# Elastic net on the PLS output, with 10-fold cross-validation.
# NOTE(review): pls_wrap was fitted on every training row before this
# cross-validation runs, so the CV error may be optimistic - confirm.
regr = ElasticNetCV(random_state=0, cv=10)
regr.fit(X=pls_train, y=y)

ElasticNetCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
             l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=None,
             normalize=False, positive=False, precompute='auto', random_state=0,
             selection='cyclic', tol=0.0001, verbose=0)

In [32]:
# Selected regularisation strength, then the fitted intercept (one per line)
print(regr.alpha_, regr.intercept_, sep='\n')

29.282741949805924
24.760859999999997


In [38]:
# NOTE(review): np.min(regr.mse_path_) is the smallest single entry of the
# whole MSE path and np.min(regr.alphas_) is the smallest alpha in the grid.
# In the recorded output that smallest alpha equals regr.alpha_, i.e. the
# chosen alpha sits on the edge of the grid - worth confirming.
fit_summary = [
    ('l1 ratio: ', regr.l1_ratio_),
    ('coeffs: ', regr.coef_),
    ('mse for all : ', np.min(regr.mse_path_)),
    ('alphas :', np.min(regr.alphas_)),
    (' number of iterations: ', regr.n_iter_),
]
for label, value in fit_summary:
    print(label, value)

l1 ratio:  0.5
coeffs:  [-0.00039013  0.00012346  0.00042981  0.00027377 -0.00031605  0.00043853
 -0.00022825  0.00032238 -0.00051599 -0.0007331   0.00058015 -0.00030464
 -0.00030282 -0.00031607]
mse for all :  0.04145264158938757
alphas : 29.282741949805924
 number of iterations:  2


In [40]:
# Predict the target for the PLS-transformed test set
y_test = regr.predict(X=pls_test)

In [41]:
# Two-column frame: predictions as "label", then the row position as "Id"
output = (
    pd.DataFrame(y_test)
    .rename(columns={0: "label"})
    .assign(Id=lambda frame: frame.index)
)
output.to_csv("pls_elastinet.csv", index=False)