In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import lines
import matplotlib.collections as collections
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score
# mape is from scikit V0.24, unstable, taken from nightly build
from sklearn.metrics import r2_score, mean_squared_error as mse, mean_absolute_percentage_error as mape
from scipy.stats import f, pearsonr, linregress
from scipy.signal import argrelmax

In [None]:
from HottelingT2 import selectVars
from helpers import cal_val_split, rmspe, get_line_ends
from stats import nromalized_wls, rmspe, calc_error, ratio_ratio_r

In [2]:
COLORS = {
    #57: '#ff98a6',
    #64: '#3dae46',
    69: '#4270dd',
    74: '#d44958',
    76: '#3dae46',
    81: '#a22bdd',
    87: '#cc7929',
    96: '#f0e521',
    102: '#796f00'
}
ROWS = 3
COLS = 3
SIZE = 4

In [3]:
data = pd.read_csv("20201025_Dataset_plots_n216.csv")
for col in ['Ca', 'Cb', 'TChl']:
    data[col] = data[col]*1000

In [4]:
data.groupby('LeafType').agg({'DAT': 'unique'})

Unnamed: 0_level_0,DAT
LeafType,Unnamed: 1_level_1
FlagLeaf,"[69, 74, 76, 81, 87, 96, 102]"
FullyDeveloped,"[57, 64]"


In [5]:
def findOptimalLoadings(X, y, maxLoadings):
    scores = np.zeros((maxLoadings, 2))
    for i in range(len(scores)):
        ncomp = i+1
        pls = PLSRegression(n_components=ncomp, scale=False) # range starts from 0
        cv=10
        scores[i] = ncomp, cross_val_score(pls, X, y, cv=cv, scoring='neg_mean_squared_error').sum()
    # argrelmax instead of argelmin - looking for min MSE but scoring returns -MSE. So search for max -MSE
    # argrelmax can return many minima. get the first local minima [0][0]
    try:
        min_ind = argrelmax(scores[:, 1])[0][0]
    except IndexError: # no max
        min_ind = 0
    return scores[min_ind, 0].astype(int)

In [6]:
flagData = data[(data.LeafType == 'FlagLeaf')].copy()
flagData = flagData.drop(columns=map(str, range(329, 400)))
wls_columns = flagData.columns[11:]
flagData['Ratio'] = flagData.Ca/flagData.Cb
param_data = flagData[flagData.columns.difference(wls_columns)]
wls_data = flagData[wls_columns]
cal, val = cal_val_split(param_data)

In [7]:
def runModel(cal, val, predict, wls, maxLoadings=10, alpha=[0.2, 0.15, 0.1, 0.05, 0.01]):
    '''
    select the important variables for the model
    cal - calibration set (boolean series)
    val - validation set (boolean series)
    predit - column containing variable to predict
    wls - dataframe containing X variables of model (wavelengths)
    loadings - max ammount of loadings in the PLS model
    '''
    
    # R2, RMSE, is validation, is selection
    dataStore = []
    Xcal = wls[cal]
    Xval = wls[val]
    ycal = predict[cal]
    yval = predict[val]
    loadings = findOptimalLoadings(Xcal, ycal, maxLoadings)
    pls = PLSRegression(n_components = loadings, scale=False)
    pls.fit(Xcal, ycal)
    calPredict = pls.predict(Xcal)
    valPredict = pls.predict(Xval)
    rmse =  mse(ycal, calPredict, squared=False)
    dataStore.append([predict.name, pls.score(Xcal, ycal),rmse, rmspe(ycal, rmse), wls.columns.to_numpy(), len(wls.columns), 0, 0, None])
    
    weights = pls.x_weights_
    
    K = np.zeros((len(alpha), 8))
    V = []
    predicts = []
    coefs = []
    models = []
    for i, a in enumerate(alpha):
        selected = selectVars(weights, wls, alpha=a)
        if len(selected.columns) < 2:
            continue
        t2_Xcal = selected[cal]
        t2_Xval = selected[val]
        t2_loadings = findOptimalLoadings(t2_Xcal, ycal, np.min([maxLoadings, len(selected.columns)-1]))
        t2_pls = PLSRegression(n_components = t2_loadings, scale=False)
        t2_pls.fit(t2_Xcal, ycal)
        score_cal = t2_pls.score(t2_Xcal, ycal)
        yhat_cal = t2_pls.predict(t2_Xcal)
        rmse_cal = mse(ycal, yhat_cal, squared=False)
        rmspe_cal = rmspe(ycal, rmse_cal)
        
        score = t2_pls.score(t2_Xval, yval)
        yhat = t2_pls.predict(t2_Xval)
        rmse = mse(yval, yhat, squared=False)
        rmspe_val = rmspe(yval, rmse)
        K[i] = score_cal, rmse_cal, rmspe_cal,  score, rmse, rmspe_val, len(selected.columns), a
        V.append(selected.columns)
        predicts.append(((ycal ,yhat_cal.flatten()),(yval ,yhat.flatten())))
        coefs.append((t2_pls.coef_))
        models.append(t2_pls)
    
    # minimal RMSE
    rmse_arr = K[:, 4]
    rmse_arr = rmse_arr[rmse_arr != 0]
    p = np.where(rmse_arr == np.min(rmse_arr))[0]
    minData = K[p[0]]  
    minVars = V[p[0]]
    t2predicts = [pd.DataFrame({'obs': obs, 'pred': pred}) for obs, pred in predicts[p[0]]]
    t2coefs = coefs[p[0]] #
    t2model = models[p[0]] # chosen T2PLSR model

    dataStore.append([predict.name, minData[0], minData[1], minData[2], minVars.to_numpy(),
                      len(minVars), 0, 1, minData[7]]) # cal-sel
    dataStore.append([predict.name, minData[3], minData[4], minData[5], minVars.to_numpy(),
                      len(minVars), 1, 1, minData[7]]) # val-sel
    collection = pd.DataFrame(data=dataStore, columns=['Type', 'R2', 'RMSE', 'RMSPE', 'vars', 'var_len', 'val',
                                                       'selection', 'alpha'])
    return collection, t2predicts, t2coefs, t2model

In [8]:
def model_wls_coefs(data, coefs, filename):
    print(filename)
    arr = np.zeros((len(coefs), 2))
    arr[:, 1] = coefs[:, 0]
    arr[:, 0] = data.vars[1]
    
    df = pd.DataFrame(data=arr, columns=['wl', 'coef'])
    df.to_csv(f"./coef tables/{filename}.csv", ",", index=False)

In [9]:
def populateModels(cal, val, predicteds, excluders, wls):
    '''
    run model based on the same cal-val split for all the predicteds values
    cal - calibration boolean series selector
    val - validation boolean series selector
    predicteds - dataframe in which each column is to be predicteds
    excluders = dataframe in which each column can be used to exclude samples (union with cal & val)
    wls - X variables (wavelenghts).
    ====
    predictes, excluders & wls come from the same original dataframe, flagData
    CAUTION! This function has HARD-CODED exclusions that change the cal/val split or remove points
    '''
    keywords = ['data', 'T2Predict', 'coefs', 'models']
    models = {}

    for col, predict in predicteds.iteritems():
        include = ~(excluders.NotAnalizedFor == col) # if True should stay
        ncal = cal & include
        nval = val & include
        if col == 'Cb':
            switcher = (excluders.Plots == 517) & (excluders.DAT == 81) 
            ncal = ncal & ~switcher # turn off switcher
            nval = nval | switcher # turn on switcher
        # print(f"predict: {col} cal: {cal.sum()} val: {val.sum()} ncal: {ncal.sum()} nval: {nval.sum()}")
        models[col] = dict(zip(keywords, runModel(ncal, nval, predict, wls)))
    return models


In [10]:
def drawCoefs(ax, wls, coefs, Xvars, units):
    full_wls = Xvars.columns.astype(int)
    lmin, lmax = full_wls.min(), full_wls.max()
    coef_min = coefs.min()
    coef_max = coefs.max()*1.2
    ax.set_xlim(lmin, lmax)
    xticks = np.arange(lmin, lmax, math.floor((lmax-lmin)/10))
    ax.set_xticks(xticks)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylabel(f"Coefficient {units}")
    ax.set_xlabel("Wavelength (nm)")
    ax.scatter(wls.astype(int), coefs, marker='.')

def drawFiltered(ax, data, selected, alpha):
    wls = data.columns.astype(int)
    lmin, lmax = wls.min(), wls.max()
    ax.set_xlim(lmin, lmax)
    xticks = np.arange(lmin, lmax, math.floor((lmax-lmin)/10))
    ax.set_xticks(xticks)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylabel('Reflectance spectra')
    ax.set_xlabel('Wavelength (nm)')
    
    ax.plot(wls, data.T)
    wls = np.arange(lmin, lmax, 1)
    ix = np.in1d(wls, selected)
    collection = collections.BrokenBarHCollection.span_where(
    wls, ymin=0, ymax=ax.get_ylim()[1], where=ix == False, facecolor='red', alpha=0.3) # mark the non-selected
    ax.add_collection(collection)
    alph = r"$\alpha$"
    ax.text(0.02, 0.8, f"{alph} = {alpha}\nselected wavelengths = {len(selected)}", transform=ax.transAxes)

def makeLegend(legend):
    handels = [lines.Line2D([], [], markerfacecolor='none', markeredgecolor='black', marker='o', linestyle='None', label='Calibration'), 
                   lines.Line2D([], [], color='black', marker='+', linestyle='None', label='Validation')]
    return handels+[lines.Line2D([], [], color=color, marker='.', linestyle='None', label=name) for name, color in legend.items()]

def finalFig(drawFinals, analysis, param_data, filename):
    rows = len(analysis)
    fig, axes = plt.subplots(rows, COLS, figsize=((SIZE*COLS, SIZE*rows)))
    
    nextNumber = 1
    ax = 0
    for key, results in analysis.items():
        nextNumber = drawFinals(axes[ax], results['data'], results['models'], results['coefs'], 
                                param_data[key], num=nextNumber)
        ax += 1

    handles = makeLegend(COLORS)
    plt.figlegend(handles=handles, loc="upper center", ncol=len(handles), bbox_to_anchor=(0.5, 0.12))
    fig.tight_layout()
    fig.subplots_adjust(bottom=0.18)
    fig.text(0.02, 0.02, "(A)-(C) are for the chlorophyl a content, (D)-(F) are for chlorophyl b content and (G)-(I) are for total chlorophyl content.\n(A), (D), (G) are y-y plots for the T2PLS models. Statistics shown for model validation.\nLine of optimal fit is colored green, line of best fit for the validation data is colored red.\n(B), (E), (H) are coeffient plots for each the selected wavelengths for the model.\n(C), (F), (I) show the full spectra of the validation set. Filtered wavelengths have red background.")
    #plt.show()
    plt.savefig(f'./graphs/{filename}.png')
    plt.close()

def ratio_graph(filename, cal, val, rmse):
    alpha = 0.4
    colors = {
        'cal': 'blue',
        'val': 'red'
    }
    fig, ax = plt.subplots(1, 1)
    # plot the dots
    # 0 -real data, 1 - model predicted
    ax.scatter(cal[0], cal[1], color=colors['cal'], alpha=alpha, label="calib")
    ax.scatter(val[0], val[1], color=colors['val'], alpha=alpha, label="valid")
    # calculate best fit lines
    z_cal = np.polyfit(cal[0], cal[1], 1)
    z_val = np.polyfit(val[0], val[1], 1)

    # plot best fit lines
    # ax.plot(*get_line_ends(cal[0], np.polyval(z_cal, cal[0])), color='black', linestyle='solid', linewidth=1)
    # ax.plot(*get_line_ends(val[0], np.polyval(z_val, val[0])), color='black', linestyle=(0, (5, 10)), linewidth=1)

    # plot optimal fit line
    ax.plot([0, 1], [0, 1], linewidth=1, c='black', linestyle='dotted', transform=ax.transAxes)

    # calc R2s
    R2C = r2_score(cal[0], cal[1])
    R2V = r2_score(val[0], val[1])

    # draw R2s
    R2C_str = r"─── $Cal: {R}^{2}$"
    R2V_str = r"─ ─ $Val: {R}^{2}$"
    ax.text(0.55, 0.1, f"{R2C_str}={R2C:.2f}\n{R2V_str}={R2V:.2f}", transform=ax.transAxes)
    # labels
    ax.set_xlabel(f"Measured chl-a/chl-b")
    ax.set_ylabel(f"Predicted chl-a/chl-b")
    ax.legend()
    fig.suptitle(f"Ratio prediction from Chl-a/b prediction. RMSE: {rmse:.2f}")
    fig.savefig(f"./graphs/{filename}.png")
    plt.close()

In [11]:
def finalGraphs(cal, val, groups, Xvars, colors):
    ''' supply inner function with location of flag leaf internal and external'''
    lx = 0.02
    ly = 0.93
    markers = {'cal': 'o', 'val': '+'}
    alpha=0.6
    units = r'($\mu gcm^{-2}$)'
    predict_text = {
        'Ca': 'Chl-a',
        'Cb': 'Chl-b',
        'TChl': 'TChl',
        'Ratio': 'Chl-a/Chl-b'
    }
    def inner(axes, data, t2model, coefs, predict, num):
        # valid and calib alpha & selected vars are the same. The plots expect the valid dataset
        valid_model_info = data.loc[(data['val'] == 1) & (data['selection'] == 1), ["vars", "alpha"]]
        selected = valid_model_info.vars.iloc[0]
        selected_alpha = valid_model_info.alpha.iloc[0]
        
        ax0 = axes[0]
        ax0.set_xlabel(f"Observed {predict_text[predict.name]} {units}")
        ax0.set_ylabel(f"Predicted {predict_text[predict.name]} {units}")
        ax0.set_aspect(adjustable='box', aspect='equal') # MAYBE
        
        axLetter = chr(ord('`')+num)
        num += 1
        
        real_cal = []
        real_val = []
        hat_cal = []
        hat_val = []
        group_names = groups.unique()
        for name in group_names:
            group_selector = groups == name
            Xvars_cal = Xvars.loc[cal & group_selector, selected]
            Xvars_val = Xvars.loc[val & group_selector, selected]
            y_cal = predict[cal & group_selector]
            y_val = predict[val & group_selector]
            y_cal_hat = t2model.predict(Xvars_cal)
            y_val_hat = t2model.predict(Xvars_val)
            real_cal.append(y_cal)
            real_val.append(y_val)
            hat_cal.append(y_cal_hat)
            hat_val.append(y_val_hat)
            
            ax0.scatter(y_cal, y_cal_hat, facecolors='none', edgecolors=colors[name], marker=markers['cal'], alpha=alpha)
            ax0.scatter(y_val, y_val_hat, color=colors[name], marker=markers['val'], alpha=alpha)
        
        merged_truth = np.concatenate([*real_cal, *real_val])
        merged_real_cal = np.concatenate(real_cal)
        merged_hat_cal = np.concatenate(hat_cal)
        merged_real_val = np.concatenate(real_val)
        merged_hat_val = np.concatenate(hat_val)
        R2V = data[data['val'] == 1].R2.iloc[0]
        R2C = data[data['val'] == 0].R2.iloc[0]
        #RMSPE = data[data['val'] == 1].RMSPE.iloc[0]
        z_cal = np.polyfit(merged_real_cal, merged_hat_cal, 1)
        z_val = np.polyfit(merged_real_val, merged_hat_val, 1)
        
        ax0.plot(*get_line_ends(merged_real_cal, np.polyval(z_cal, merged_real_cal)), color='black', linestyle="solid", linewidth=1)
        ax0.plot(*get_line_ends(merged_real_val, np.polyval(z_val, merged_real_val)), color='black', linestyle=(0, (5, 10)), linewidth=1)
        #ax0.plot(merged_truth, merged_truth, linewidth=1, c='black', linestyle='dotted')
        ax0.plot((0, 1), (0, 1), linewidth=1, c='black', linestyle='dotted', transform=ax0.transAxes)
        ax0.set_yticks(ax0.get_xticks())
        ax0.set_ylim(ax0.get_xlim())
        print(ax0.get_xlim())
        #ax0.set_xlim(merged_truth.min(), merged_truth.max())
        #ax0.set_ylim(merged_truth.min(), merged_truth.max())
        R2C_str = r"─── $Cal: {R}^{2}$"
        R2V_str = r"─ ─ $Val: {R}^{2}$"
        ax0.text(0.55, 0.1, f"{R2C_str}={R2C:.2f}\n{R2V_str}={R2V:.2f}", transform=ax0.transAxes)
        ax0.text(lx, ly, f"{axLetter}", weight='bold', transform=ax0.transAxes)
        
        ax1 = axes[1]
        axLetter = chr(ord('`')+num)
        ax1.text(lx, ly, f"{axLetter}", weight='bold', transform=ax1.transAxes)
        num += 1
        drawCoefs(ax1, selected, coefs.T[0], Xvars, units)
        
        ax2 = axes[2]
        axLetter = chr(ord('`')+num)
        ax2.text(lx, ly, f"{axLetter}", weight='bold', transform=ax2.transAxes)
        num += 1
        drawFiltered(ax2, Xvars[val], selected.astype(int), selected_alpha)
        return num
    return inner

In [12]:
normalize_by = ['614', '525', '415', '406']
datasets = { wl: wls_data.divide(wls_data[wl], axis=0) for wl in normalize_by }
datasets['common'] = wls_data

predicteds = param_data[['Ca', 'Cb', 'TChl']]#, 'Ratio']]
excluders = param_data[['Plots', 'DAT', 'NotAnalizedFor']]

In [14]:
models = {name: populateModels(cal, val, predicteds, excluders, dataset) for name, dataset in datasets.items()}

In [15]:
SAVE_CSV = True
if SAVE_CSV:
    #allData = pd.concat([CaData, CbData, TChlData, Flag_CaData, Flag_CbData, Flag_TChlData])
    #allData.to_csv("./stat_table.csv", ",")
    for key, model in models.items():
        for predict, frames in model.items():
            model_wls_coefs(frames['data'], frames['coefs'], f"{key}_{predict}")

614_Ca
614_Cb
614_TChl
525_Ca
525_Cb
525_TChl
415_Ca
415_Cb
415_TChl
406_Ca
406_Cb
406_TChl
common_Ca
common_Cb
common_TChl


In [16]:
for key, model in models.items():
    dfs = []
    for frames in model.values():
        dfs.append(frames['data'])
    pd.concat(dfs).to_csv(f"./model_stats/{key}_table.csv")

In [17]:
for key, model in models.items():
    drawFinals = finalGraphs(cal, val, param_data['DAT'], datasets[key], COLORS)
    finalFig(drawFinals, model, predicteds, f"n_graph_{key}")
plt.show()

(19.762608599999997, 49.6255274)
(4.2593906, 13.1576074)
(24.029371099999995, 62.628324899999996)
(19.762608599999997, 49.6255274)
(4.2593906, 13.1576074)
(24.029371099999995, 62.628324899999996)
(19.762608599999997, 49.6255274)
(4.2593906, 13.1576074)
(24.029371099999995, 62.628324899999996)
(19.762608599999997, 49.6255274)
(4.2593906, 13.1576074)
(24.029371099999995, 62.628324899999996)
(19.762608599999997, 49.6255274)
(4.2593906, 13.1576074)
(24.029371099999995, 62.628324899999996)


## Ratio based on models

In [18]:
a_min_wl, a_min, b_min_wl, b_min = None, 999, None, 999
for wl, model in models.items():
    Ca = model['Ca']['data']
    Cb = model['Cb']['data']
    ca_min = Ca.loc[(Ca.val == 1), 'RMSE'].iloc[0]
    cb_min = Cb.loc[(Cb.val == 1), 'RMSE'].iloc[0]
    if ca_min < a_min:
        a_min = ca_min
        a_min_wl = wl
    if cb_min < b_min:
        b_min = cb_min
        b_min_wl = wl

print(a_min_wl, a_min, b_min_wl, b_min)

614 1.9150318106446242 525 0.5174065632967012


In [19]:
remove_514 = param_data[((param_data.Plots == 514) & (param_data.DAT == 81))].index[0]
switch_517 = param_data[((param_data.Plots == 517) & (param_data.DAT == 81))].index[0]
ProtoCaCal, ProtoCaVal = models[a_min_wl]['Ca']['T2Predict']
CbCal, CbVal = models[b_min_wl]['Cb']['T2Predict']
# remove 514 from CaVal
# move 517 from CaCal to CaVal
CaVal = ProtoCaVal.drop(index=remove_514)
CaVal.loc[switch_517] = ProtoCaCal.loc[switch_517]
CaCal = ProtoCaCal.drop(index=switch_517)
ratio_data = param_data[['Ca', 'Cal_Val', 'Cb', 'DAT', 'Ratio', 'Treatment', 'Genotype']].copy()
ratio_data.loc[switch_517, 'Cal_Val'] = 'Val' # switch 517 to Val
cal_ratio_predict = CaCal.pred/CbCal.pred
val_ratio_predict = CaVal.pred/CbVal.pred # [val][estimated]
ratio_data.loc[ratio_data.Cal_Val == 'Cal', 'Pred_Ratio'] = cal_ratio_predict #.reshape(1, -1)[0]
ratio_data.loc[ratio_data.Cal_Val == 'Val', 'Pred_Ratio'] = val_ratio_predict #.reshape(1, -1)[0]
averaged_ratio = ratio_data.groupby(['DAT', 'Treatment', 'Genotype']).agg({'Ratio': 'mean', 'Pred_Ratio': 'mean'}).reset_index()
ratio_data['edgecolors'] = ratio_data.DAT.map(COLORS)
ratio_data['facecolors'] = ratio_data.apply(lambda row: row.edgecolors if row.Treatment == 'WW' else "none", axis=1)
averaged_ratio['edgecolors'] = averaged_ratio.DAT.map(COLORS)
averaged_ratio['facecolors'] = averaged_ratio.apply(lambda row: row.edgecolors if row.Treatment == 'WW' else "none", axis=1)

In [None]:
val_ratio_data = ratio_data[(ratio_data.Cal_Val == 'Val') & (~ratio_data.Pred_Ratio.isna())]
avg_ratio = averaged_ratio.Ratio
avg_pred_ratio = averaged_ratio.Pred_Ratio
avg_slope, avg_intercept, _, avg_p, _ = linregress(avg_ratio, avg_pred_ratio)
avg_r2 = 1-((((avg_ratio-avg_pred_ratio)**2).sum())/(((avg_ratio-avg_ratio.mean())**2).sum()))
avg_rmse = mse(avg_ratio, avg_pred_ratio, squared=False)
avg_minmax = np.array([avg_ratio.min(), avg_ratio.max()])

val_ratio = val_ratio_data.Ratio
val_pred_ratio = val_ratio_data.Pred_Ratio
val_slope, val_intercept, _, val_p, _ = linregress(val_ratio, val_pred_ratio)
val_r2 = 1-((((val_ratio-val_pred_ratio)**2).sum())/(((val_ratio-val_ratio.mean())**2).sum()))
val_rmse = mse(val_ratio, val_pred_ratio, squared=False)
val_minmax = np.array([val_ratio.min(), val_ratio.max()])

R2_str = r"${R}^{2}$"
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
ax0 = axes[0]
ax = axes[1]
ax0.set_xlabel('Chl-a:b (observed)', weight='bold')
ax0.set_ylabel('Chl-a:b (best PLSR)', weight='bold')
ax0.set_aspect(adjustable='datalim', aspect='equal')
ax.set_xlabel('Chl-a:b (observed)', weight='bold')
ax.set_ylabel('Chl-a:b (best PLSR)', weight='bold')
ax.set_aspect(adjustable='datalim', aspect='equal')


ax0.scatter(x=val_ratio, y=val_pred_ratio, color=val_ratio_data.edgecolors, facecolor=val_ratio_data.facecolors)
ax0.plot([0, 1], [0, 1], color='black', linestyle='dotted', transform=ax0.transAxes) # 1:1 line
ax0.plot(val_minmax, val_minmax*val_slope+val_intercept, color='black') # regression line
ax0.text(s='a', x=0.02, y=0.93, weight='bold', transform=ax0.transAxes)
ax0.text(s=f"n = {len(val_ratio_data)}\n{R2_str} = {val_r2:.2f}\nRMSE: {val_rmse:.2f}\np<0.0001", x=0.02, y=0.75, transform=ax0.transAxes)

ax.scatter(x=avg_ratio, y=avg_pred_ratio, color=averaged_ratio.edgecolors, facecolor=averaged_ratio.facecolors)
ax.plot([0, 1], [0, 1], color='black', linestyle='dotted', transform=ax.transAxes) # 1:1 line
ax.plot(avg_minmax, avg_minmax*avg_slope+avg_intercept, color='black') # regression line
ax.text(s='b', x=0.02, y=0.93, weight='bold', transform=ax.transAxes)
ax.text(s=f"n = {len(averaged_ratio)}\n{R2_str} = {avg_r2:.2f}\nRMSE: {avg_rmse:.2f}\np<0.0001", x=0.02, y=0.75, transform=ax.transAxes)

handles = [lines.Line2D([], [], markerfacecolor='none', markeredgecolor='black', marker='o', linestyle='None', label='WL'), 
                   lines.Line2D([], [], color='black', marker='o', linestyle='None', label='WW')]
handles += [lines.Line2D([], [], color=color, marker='.', linestyle='None', label=f'{name} DAT') for name, color in COLORS.items()]
plt.figlegend(handles=handles, loc="upper center", ncol=len(handles), bbox_to_anchor=(0.5, 0.2))
fig.tight_layout()
fig.subplots_adjust(bottom=0.3)
fig.text(0.02, 0.02, '''1:1 graph of the relationship between observed Chl-a:b vs PLSR predicted Chl-a:b (i.e. Chl-a:b
calculated from the best performing PLSR for Chl-a and Chl-b separately). (a) presents result from calibration model on the
30 % validation dataset, and (b) presents averaged dataset from calibration model on all (100%) the dataset.''')
plt.show()

## Ratio PLS
Now that I have everything ordered in nice dicts we can move on to ratio PLS

In [27]:
models['614']['Ca']['T2Predict'][1].shape

(50, 2)

In [47]:
param_data[((param_data.Plots == 517) & (param_data.DAT == 81))].index[0]

86

In [58]:
by_model = {}
remove_514 = param_data[((param_data.Plots == 514) & (param_data.DAT == 81))].index[0]
switch_517 = param_data[((param_data.Plots == 517) & (param_data.DAT == 81))].index[0]
for name, model in models.items():
    info = {}
    CaCal, CaVal = model['Ca']['T2Predict']
    CbCal, CbVal = model['Cb']['T2Predict']
    # remove 514 from CaVal
    # move 517 from CaCal to CaVal
    CaVal2 = CaVal.drop(index=remove_514)
    CaVal2.loc[switch_517] = CaCal.loc[switch_517]
    CaCal2 = CaCal.drop(index=switch_517)
        
    cal_real_data = CaCal2.obs/CbCal.obs
    cal_model_predict = CaCal2.pred/CbCal.pred
    val_real_data = CaVal2.obs/CbVal.obs
    val_model_predict = CaVal2.pred/CbVal.pred
    by_model[name] = {
        'cal': (cal_real_data, cal_model_predict),
        'val': (val_real_data, val_model_predict),
        'rmse': mse(val_real_data, val_model_predict, squared=False)
    }

In [63]:
for name, cal_val in by_model.items():
    cal = cal_val['cal']
    val = cal_val['val']
    rmse = cal_val['rmse']
    ratio_graph(f"ratio_{name}", cal, val, rmse)

## Deviation cause
1. Is there a relation between the errors of the chl-a & chl-b predictions and the Ratio predictions?
0. What is the relationship between the chl-a/b prediction and the denovo prediction of the ratio?

In [471]:
#predicts.append(((ycal ,yhat_cal),(yval ,yhat)))
#for key, model in models.items():
def relationship_plot(models, key, cal=True, denovo=True):
    predictors = ['Ca', 'Cb', 'Ratio']
    model = models[key]
    if denovo:
        series = [calc_error(model, predictor, cal) for predictor in predictors]
    else:
        series = [calc_error(model, predictor, cal) for predictor in predictors[:-1]]
        ratio_model = by_model[key]
        if cal:
            ratio_set = ratio_model['cal']
        else:
            ratio_set = ratio_model['val']
        nonovo_error = ratio_set[0]-ratio_set[1] 
        nonovo_error.name = 'Ratio'
        series.append(nonovo_error)
    errors = pd.concat(series, axis=1)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection = '3d')

    ax.set_xlabel("Ca")
    ax.set_ylabel("Cb")
    ax.set_zlabel("Ratio")

    ax.scatter(errors.Ca, errors.Cb, errors.Ratio)

    plt.show()

#relationship_plot(models, 'common')


3


In [475]:
relationship_plot(models, '406', denovo=False)

3


In [502]:
def ratio_ratio_plot(models, key, cal=True):
    model = models[key]['Ratio']
    ratio = model['T2Predict']
    ratio_model = by_model[key]
    if cal:
        nonovo = ratio_model['cal']
        denovo = ratio[0]
    else:
        nonovo = ratio_model['val']
        denovo = ratio[1]
    
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.set_xlabel("Ratio")
    ax.set_ylabel("Nonovo ratio")

    ax.scatter(denovo[1].flatten(), nonovo[1])
    ax.plot([0, 1], [0,1], transform=ax.transAxes)

    plt.show()

In [503]:
for key in by_model.keys():
    print(key, ratio_ratio_r(models, key))

614 (0.9489131034373915, 1.5866874809347982e-75)
525 (0.9787835633699455, 4.192062973272186e-103)
415 (0.49824785909393715, 1.0070073700832216e-10)
406 (0.9107922975945921, 2.405971561519362e-58)
common (0.8247782522850662, 3.1805829878314884e-38)


In [504]:
for key in by_model.keys():
    print(key, ratio_ratio_r(models, key, cal=False))

614 (0.945217957624768, 2.361254501913028e-31)
525 (0.9821413487997065, 5.741939874651324e-46)
415 (0.4669365481796376, 0.00011443108472969538)
406 (0.9309743751099661, 2.2054783412742584e-28)
common (0.7956864035833336, 6.621255590680845e-15)


In [3]:
plt.get_backend()

'TkAgg'