In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle 

#==============================================================================
#  Define EMSC
#==============================================================================
#%% Extended multiplicative signal correction
def EMSC(X, reference, degree=4):
    """Extended multiplicative signal correction of the rows of ``X``.

    Every row of ``X`` is fitted by least squares to a basis made of
    ``reference`` and the monomials x**0 ... x**(degree-1), evaluated on an
    evenly spaced grid over [-1, 1] with ``len(reference)`` points. The fitted
    polynomial contribution is then subtracted from the row and the remainder
    is divided by the fitted coefficient of ``reference``.

    Parameters
    ----------
    X : array with one spectrum per row and ``len(reference)`` columns.
    reference : 1-D reference spectrum.
    degree : number of polynomial terms in the basis (orders 0 .. degree-1).

    Returns
    -------
    Array with the same shape as ``X`` holding the corrected spectra.
    """
    axis = np.linspace(-1, 1, len(reference))
    # Coefficients [1, 0, ..., 0] with `order` zeros make polyval evaluate x**order
    monomials = [np.polyval([1] + [0] * order, axis) for order in range(degree)]
    # Row 0 is the reference spectrum, the remaining rows are the polynomial terms
    emsc_basis = np.vstack([reference, np.vstack(monomials)])
    # One column of coefficients per spectrum
    coefs = np.linalg.lstsq(emsc_basis.T, X.T, rcond=None)[0]
    reference_scale = coefs[:1, :].T
    baseline = coefs[1:, :].T @ emsc_basis[1:, :]
    return (X - baseline) / reference_scale


#==============================================================================
#  Read train and test file
#==============================================================================
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# The context managers close each file handle as soon as its object is loaded.
with open("train.pkl", "rb") as pickle_train:
    train_object = pickle.load(pickle_train)
with open("test.pkl", "rb") as pickle_test:
    test_object = pickle.load(pickle_test)

#==============================================================================
#  Read dictionary objects into arrays and Matrices
#==============================================================================
# Shift axis, flattened to 1-D so it can be used for boolean column masks
columns = train_object['shifts'].flatten()

# Training entries: spectra, target values and replicate ids
X_train, y_train, replicates_train = (
    train_object[key] for key in ('RamanCal', 'IodineCal', 'repCal')
)

# Test entries: spectra and replicate ids (no target in the test object)
X_test, replicates_test = (
    test_object[key] for key in ('RamanVal', 'repVal')
)


In [2]:

#==============================================================================
#  Keep only the shifts between 500 and 3100 - train and test
#==============================================================================
SHIFT_MIN, SHIFT_MAX = 500, 3100
# Row of the cut training matrix that serves as the EMSC reference spectrum
REFERENCE_ROW = 1343

# Build the column mask once and reuse it for the labels and both matrices
shift_mask = (columns >= SHIFT_MIN) & (columns <= SHIFT_MAX)
colnames = columns[shift_mask]

X_cut_train = X_train[:, shift_mask]
# The same training spectrum is the reference for both train and test
reference_spectrum = X_cut_train[REFERENCE_ROW, :]
X_emsc_train = EMSC(X_cut_train, reference_spectrum, degree=7)

X_cut_test = X_test[:, shift_mask]
X_emsc_test = EMSC(X_cut_test, reference_spectrum, degree=7)

In [3]:
#==============================================================================
#  Create dataframes for data visualization
#==============================================================================
# Column labels: 'replicates' first, then one string label per retained shift
col_str = ['replicates'] + [str(shift) for shift in colnames]

# Test frame: replicate id column followed by the corrected spectra
test_df = pd.DataFrame(
    np.concatenate((replicates_test[:, np.newaxis], X_emsc_test), axis=1),
    columns=col_str,
)

# The training frame carries the target as an extra last column
col_str = col_str + ['Iodine']

train_df = pd.DataFrame(
    np.concatenate((replicates_train[:, np.newaxis], X_emsc_train, y_train), axis=1),
    columns=col_str,
)

In [None]:
#==============================================================================
# Trying PCA and Random Forest regression in a gridsearch
#==============================================================================
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import  make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit a 20-component PCA on the EMSC-corrected training spectra and keep the
# transformed training data for inspection.
pca = PCA(n_components=20)
X_pca_train = pca.fit_transform(X_emsc_train)

In [None]:
# PCA scores as a frame (integer column labels) with the target appended as 'target'
pca_df = pd.DataFrame(X_pca_train).assign(target=y_train)

In [None]:
# import seaborn as sns
# %matplotlib inline
# sns.set(style="darkgrid")
# sns.pairplot( pca_df  )

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import  make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [None]:
# PLS regression with 25 components, scale=False and max_iter=5000,
# fitted on all EMSC-corrected training spectra against the flattened target.
pls6 = PLSRegression(n_components=25, scale=False, max_iter= 5000)
pls6.fit(X_emsc_train, y_train.flatten() )

In [None]:
# MSE on the training data itself: the model is scored on the same rows it was
# fitted on, so this is not an estimate of out-of-sample error.
print( mean_squared_error(y_train.flatten(), pls6.predict(X_emsc_train) ))
# Predictions for the test spectra, written to CSV in the next cell
y_test_pred = pls6.predict(X_emsc_test)

In [None]:
# Predictions as column 'label' plus an 'Id' column holding the 0-based row index
output = (
    pd.DataFrame(y_test_pred)
    .assign(Id=lambda frame: frame.index)
    .rename(columns={0: "label"})
)
output.to_csv("submission_ca05_4.csv", index=False)

In [None]:
# Preview the first rows of the submission frame
output.head()

In [None]:
# Build one cross-validation group label per training row.
# Rows are binned by replicate id in steps of 5 ([0, 5), [5, 10), ..., [150, 155))
# and every bin gets a distinct label drawn at random from 1..31.
# The label is looked up from each row's own replicate id, so the result does
# not depend on the rows being sorted by replicate.
import random

N_GROUPS = 31
REPLICATES_PER_GROUP = 5

random.seed(3)

# Draw the labels without replacement, one per bin, in bin order
my_list = list(range(1, N_GROUPS + 1))
bin_labels = []
for _ in range(N_GROUPS):
    random_item_from_list = random.choice(my_list)
    my_list.remove(random_item_from_list)
    bin_labels.append(random_item_from_list)
# Float labels, matching the dtype the group array had when built by concatenation
bin_labels = np.asarray(bin_labels, dtype=float)

# digitize returns i with edges[i-1] <= x < edges[i]; subtract 1 for a 0-based bin
bin_edges = np.arange(N_GROUPS + 1) * REPLICATES_PER_GROUP
bin_index = np.digitize(train_df.replicates.to_numpy(), bin_edges) - 1

# Fail loudly instead of producing a group array shorter than the data
out_of_range = (bin_index < 0) | (bin_index >= N_GROUPS)
if out_of_range.any():
    raise ValueError(
        "replicate ids must lie in [0, %d); found %d row(s) outside that range"
        % (N_GROUPS * REPLICATES_PER_GROUP, int(out_of_range.sum()))
    )

groups = bin_labels[bin_index]
print(len(groups))
print( set(groups))

In [None]:
from sklearn.model_selection import GroupKFold
# Group-aware K-fold with 31 splits.
# NOTE(review): 31 presumably matches the number of distinct labels in `groups` — confirm.
gkf = GroupKFold(31)

In [None]:
# Sanity check: print the train/test sizes of every grouped split on one line
for train_id, test_id in gkf.split(X_emsc_train, y_train.flatten(), groups):
    print(len(train_id), len( test_id), end=' ; ')

In [None]:
# Candidate hyper-parameter values for the PLS step
comp_range = list(range(14, 19))
scale_range = [True, False]
max_iter_range = [200, 300, 500]

# Single-step pipeline; its step is addressed as 'plsregression' in the grid below
pls6 = make_pipeline(PLSRegression())

param_grid = [
    dict(
        plsregression__n_components=comp_range,
        plsregression__scale=scale_range,
        plsregression__max_iter=max_iter_range,
    )
]

# Exhaustive search scored by negated MSE, using the grouped splitter `gkf`
gs = GridSearchCV(
    estimator=pls6,
    param_grid=param_grid,
    cv=gkf,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
)

# `groups` is forwarded to the splitter so it can keep each group in one fold
gs.fit(X_emsc_train, y_train.flatten(), groups=groups)


In [None]:
# Best cross-validated score (negated MSE, per the scoring string passed to the
# search), the parameter combination that produced it, and the refitted estimator.
print(gs.best_score_)
print(gs.best_params_)
gs.best_estimator_

In [None]:
# Mean test score of every parameter combination in the grid
gs.cv_results_['mean_test_score']

In [None]:
# Rebinds `gs` (until here the GridSearchCV object) to a plain PLSRegression with
# hard-coded hyper-parameters, fitted on all training spectra.
# NOTE(review): presumably these are the best parameters reported by the grid
# search above — confirm.
gs=  PLSRegression(copy=True, max_iter=200, n_components=14,
                               scale=False, tol=1e-06)
gs.fit(X_emsc_train, y_train.flatten() )

In [None]:
# MSE on the training data itself (same rows the model was fitted on)
print( mean_squared_error(y_train.flatten(), gs.predict(X_emsc_train) ) )
# Predictions for the test spectra, written to CSV in the next cell
y_test_pred = gs.predict(X_emsc_test)

In [None]:
# Predictions as column 'label' plus an 'Id' column holding the 0-based row index
output = (
    pd.DataFrame(y_test_pred)
    .assign(Id=lambda frame: frame.index)
    .rename(columns={0: "label"})
)
output.to_csv("submission_ca05_10.csv", index=False)

# Try running with just the averages

In [None]:
# Average all rows that share a replicate id; 'replicates' becomes the index
train_avg_df = train_df.groupby('replicates').mean()
test_avg_df = test_df.groupby('replicates').mean()

In [None]:
# Display the replicate-averaged training frame
train_avg_df

In [None]:
# PLS regression with 16 components, scale=False and max_iter=500.
# NOTE(review): `pls_avg` is not fitted or referenced in the visible part of
# the notebook — confirm whether it is still needed.
pls_avg =  PLSRegression(copy=True, max_iter=500, n_components=16,
                               scale=False, tol=1e-06)

In [None]:
# Candidate hyper-parameter values for the PLS step
comp_range = list(range(14, 19))
scale_range = [True, False]
max_iter_range = [200, 300, 500]

# Single-step pipeline; its step is addressed as 'plsregression' in the grid below
pls6 = make_pipeline(PLSRegression())

param_grid = [
    dict(
        plsregression__n_components=comp_range,
        plsregression__scale=scale_range,
        plsregression__max_iter=max_iter_range,
    )
]

# Exhaustive search scored by negated MSE with cv=10
gs = GridSearchCV(
    estimator=pls6,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
)

# Features are every column except the last one; the target is the 'Iodine' column
gs.fit(
    train_avg_df.iloc[:, :-1].values,
    train_avg_df['Iodine'].values,
)

In [None]:
# Best cross-validated score (negated MSE, per the scoring string passed to the
# search), the parameter combination that produced it, and the refitted estimator.
print(gs.best_score_)
print(gs.best_params_)
gs.best_estimator_

In [None]:
# Predict on the replicate-averaged test spectra; store the result as column
# 'label' plus an 'Id' column holding the 0-based row index
output = (
    pd.DataFrame(gs.predict(test_avg_df.iloc[:, :].values))
    .assign(Id=lambda frame: frame.index)
    .rename(columns={0: "label"})
)
output.to_csv("avg_plsr_1.csv", index=False)

**This gave a score of 0.73 on the test dataset**