# Supervised learning

In [4]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [None]:
# Load the training feature matrix and the target values; in both files the first CSV column
# is used as the row index. (The `_sc` suffix presumably means "already scaled" - confirm.)
X_train_sc = pd.read_csv("X_train_sc.csv", index_col=0)
y_train = pd.read_csv("y_train.csv", index_col=0)

In [None]:
# get k best scores between features and label -> pearson, spearman, f_regression and mutual_info_regression
def get_k_best_corrs(k, scores, columns=None):
    """
    Returns the k highest-scoring features as a {feature name: score} dictionary.

    Parameters
    ----------
    k: int
        Number of top-scoring features to keep. 0 yields an empty dictionary; a value larger
        than the number of scores keeps every feature.
    scores: sequence of float
        One relevance score per feature, positionally aligned with `columns`
    columns: sequence (default=None)
        Feature names. When None, the columns of the global `X_train_sc` are used.

    Returns
    -------
    dict
        Maps feature name -> score, in ascending score order (best feature last).

    Raises
    ------
    ValueError
        If `k` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # `[-0:]` slices the *whole* array, so asking for 0 features used to return all of them
        return {}
    if columns is None:
        columns = X_train_sc.columns
    names = pd.Index(columns)
    values = np.asarray(scores)
    # a single argsort provides both the names and the scores of the k best features
    idxs = np.argsort(values)[-k:]
    return dict(zip(names[idxs], values[idxs]))

In [None]:
def _read_scores(path):
    """Reads one float per non-blank line from the text file at `path` and returns them as a list."""
    # `with` closes the handle deterministically (the bare `open(...)` calls never closed it);
    # blank lines are skipped instead of crashing on float("")
    with open(path) as handle:
        return [float(line) for line in handle if line.strip()]


# per-feature scores, one value per line
pearson_corrs = _read_scores("pearson.txt")
spearman_corrs = _read_scores("spearman.txt")
f_values = _read_scores("f_values.txt")
mutual_info = _read_scores("mutual_info.txt")

Por fim, aplicámos algoritmos de aprendizagem supervisionada de modo a conseguirmos fazer previsões acerca da variável de output (<b>tm</b>) a partir de sequências de aminoácidos cujo valor de termostabilidade é desconhecido. Para isso, utilizámos sete algoritmos distintos implementados na biblioteca <b>sklearn</b>.

- <b>LinearRegression</b> (LR)
- <b>KNeighborsRegressor</b> (KNR)
- <b>RandomForestRegressor</b> (RFR)
- <b>SVR</b> -> Support Vector Regressor
- <b>MLPRegressor</b> (MLPR) -> Multi Layer Perceptron Regressor
- <b>AdaBoostRegressor</b> (ADA)
- <b>HistGradientBoostingRegressor</b> (HGBR)

### Get best combination dataset-model

In [35]:
from sklearn.ensemble import AdaBoostRegressor as ADA
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import cross_validate, KFold
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.neural_network import MLPRegressor as MLPR
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

Começámos por determinar, a partir de cada um dos métodos de seleção de features referidos anteriormente (<b>Pearson</b>, <b>Spearman</b>, <b>ANOVA f-values</b> e <b>informação mútua</b>), qual a melhor combinação <b>modelo / número de features selecionadas</b>.

In [55]:
# Numbers of top-ranked features evaluated by default in `test_corr_models`
DEFAULT_NUMS = [200, 400, 600, 800]

In [56]:
def test_corr_models(model, method, corrs, cv=5, num_feats=DEFAULT_NUMS, random_state=None):
    """
    Cross-validates a model on the k best-scoring features, for every k in `num_feats`, and
    prints the per-fold and mean train / test scores.

    Parameters
    ----------
    model: sklearn estimator
        An initialized sklearn model
    method: str
        Name of the feature-selection method (only used in the printed messages)
    corrs: sequence of float
        One score per feature, used to rank the columns of the global `X_train_sc`
    cv: int (default=5)
        Number of folds used in cross-validation
    num_feats: iterable of int (default=DEFAULT_NUMS)
        Numbers of best features to evaluate
    random_state: int or None (default=None)
        Seed for the shuffled KFold split; None keeps the original unseeded behaviour
    """
    print(f"Getting results for {model.__class__.__name__} using {method} to select features...\n")
    # y_train is loaded as a single-column DataFrame; estimators expect a 1-D target and
    # otherwise emit a DataConversionWarning on every fit
    target = y_train.squeeze(axis="columns")
    for k in num_feats:
        # get best features and cross-validate (a plain list is a safe .loc indexer)
        best_feats = list(get_k_best_corrs(k, corrs))
        result = cross_validate(estimator=model,
                                X=X_train_sc.loc[:, best_feats],
                                y=target,
                                cv=KFold(n_splits=cv, shuffle=True, random_state=random_state),
                                return_train_score=True)
        # print results (the variable names are part of the printed text via the `=` specifier)
        mean_train = np.mean(result["train_score"])
        mean_test = np.mean(result["test_score"])
        print(f"Results for {k} best features")
        print(f"Train scores: {result['train_score']} -> {mean_train = :.4f}")
        print(f"Test scores: {result['test_score']} -> {mean_test = :.4f}\n")

<b>Pearson</b>

In [58]:
# Evaluate each model on the default feature counts, ranking features by Pearson correlation
pearson_models = [LR, RFR, SVR, MLPR, ADA, HGBR]
for i, Model in enumerate(pearson_models):
    test_corr_models(model=Model(), method="Pearson correlation", corrs=pearson_corrs)
    # separator between consecutive models, but not after the last one
    is_last = i == len(pearson_models) - 1
    if not is_last:
        print("----------------------------------------------------------------------------------------------------\n")

Getting results for LinearRegression using Pearson correlation to select features...

Results for 200 best features
Train scores: [0.31830325 0.31392415 0.31633945 0.31182916 0.3066358 ] -> mean_train = 0.3134
Test scores: [0.27919092 0.29974493 0.29115746 0.30118958 0.33140716] -> mean_test = 0.3005

Results for 400 best features
Train scores: [0.39371228 0.38799135 0.39360352 0.39383614 0.39987173] -> mean_train = 0.3938
Test scores: [0.38300769 0.35232374 0.37767527 0.37743098 0.35830092] -> mean_test = 0.3697

Results for 600 best features
Train scores: [0.42371335 0.4146264  0.42294396 0.42536268 0.42076352] -> mean_train = 0.4215
Test scores: [0.37103912 0.37512818 0.38437575 0.3865769  0.40484428] -> mean_test = 0.3844

Results for 800 best features
Train scores: [0.43373677 0.43318539 0.43969612 0.43861505 0.43600294] -> mean_train = 0.4362
Test scores: [0.41054996 0.3655078  0.38919964 0.39444717 0.36578661] -> mean_test = 0.3851

----------------------------------------------



Results for 200 best features
Train scores: [0.52105186 0.51666136 0.51690926 0.52675541 0.52715188] -> mean_train = 0.5217
Test scores: [0.43688842 0.45549741 0.42600597 0.45694164 0.44466639] -> mean_test = 0.4440





Results for 400 best features
Train scores: [0.46204741 0.49887642 0.4694671  0.45133553 0.50712451] -> mean_train = 0.4778
Test scores: [0.42765406 0.46299803 0.43444477 0.43355555 0.45313489] -> mean_test = 0.4424





Results for 600 best features
Train scores: [0.50828932 0.48730197 0.53997773 0.52672009 0.51295359] -> mean_train = 0.5150
Test scores: [0.48259759 0.42385795 0.48960782 0.47330764 0.47066749] -> mean_test = 0.4680





Results for 800 best features
Train scores: [0.55768337 0.53201379 0.57710713 0.54583701 0.53376656] -> mean_train = 0.5493
Test scores: [0.50072421 0.44872076 0.47927216 0.48096196 0.48445523] -> mean_test = 0.4788

----------------------------------------------------------------------------------------------------

Getting results for AdaBoostRegressor using Pearson correlation to select features...

Results for 200 best features
Train scores: [0.30878064 0.31747994 0.2821283  0.33309379 0.30994882] -> mean_train = 0.3103
Test scores: [0.31653165 0.28572524 0.27650359 0.31991506 0.28990877] -> mean_test = 0.2977

Results for 400 best features
Train scores: [0.31737068 0.33604038 0.30093692 0.30342464 0.29648123] -> mean_train = 0.3109
Test scores: [0.3274757  0.33384534 0.29182668 0.27977979 0.28082602] -> mean_test = 0.3028

Results for 600 best features
Train scores: [0.31718018 0.28876691 0.30003358 0.29836457 0.32519596] -> mean_train = 0.3059
Test scores: [0.29306934 0.26952092 

<b>Pearson</b> (further testing)

Novos testes para um novo modelo (<b>KNeighborsRegressor</b>) e para uma gama de features distinta da anterior (para os modelos <b>SVR</b>, <b>MLPRegressor</b> e <b>HistGradientBoostingRegressor</b>).

In [62]:
# KNR was not in the first Pearson sweep; evaluate it on the default feature counts
test_corr_models(model=KNR(), method="Pearson correlation", corrs=pearson_corrs)

Getting results for KNeighborsRegressor using Pearson correlation to select features...

Results for 200 best features
Train scores: [0.58503702 0.58997743 0.59379933 0.58816645 0.58779664] -> mean_train = 0.5890
Test scores: [0.3928789  0.36819901 0.36770372 0.38222477 0.39573199] -> mean_test = 0.3813

Results for 400 best features
Train scores: [0.61136161 0.60893551 0.60805813 0.61107387 0.60937698] -> mean_train = 0.6098
Test scores: [0.41522946 0.427479   0.40978524 0.41295209 0.41445057] -> mean_test = 0.4160

Results for 600 best features
Train scores: [0.61533186 0.61624814 0.61941333 0.6153637  0.62533147] -> mean_train = 0.6183
Test scores: [0.43304784 0.4479805  0.43914913 0.43093723 0.39303128] -> mean_test = 0.4288

Results for 800 best features
Train scores: [0.6232438  0.6171689  0.62247578 0.61839789 0.62404356] -> mean_train = 0.6211
Test scores: [0.41433254 0.43647186 0.4307311  0.4435146  0.42248312] -> mean_test = 0.4295



In [59]:
# SVR with a single, much larger feature count (2000) than the default sweep
test_corr_models(model=SVR(), method="Pearson correlation", corrs=pearson_corrs, num_feats=[2000])

Getting results for SVR using Pearson correlation to select features...

Results for 2000 best features
Train scores: [0.25385633 0.24365601 0.24720895 0.2442057  0.25017659] -> mean_train = 0.2478
Test scores: [0.22252446 0.23985421 0.24987935 0.2456937  0.24385969] -> mean_test = 0.2404



In [60]:
# MLPR with 1000 and 1200 features, i.e. beyond the default sweep's maximum of 800
test_corr_models(model=MLPR(), method="Pearson correlation", corrs=pearson_corrs, num_feats=[1000, 1200])

Getting results for MLPRegressor using Pearson correlation to select features...





Results for 1000 best features
Train scores: [0.56490654 0.55768887 0.55608917 0.55828499 0.52461717] -> mean_train = 0.5523
Test scores: [0.46331716 0.46617747 0.48015719 0.49865836 0.47098379] -> mean_test = 0.4759





Results for 1200 best features
Train scores: [0.60046975 0.62332129 0.58112659 0.60393118 0.58597108] -> mean_train = 0.5990
Test scores: [0.46578919 0.49174316 0.497005   0.47763045 0.49677766] -> mean_test = 0.4858





In [65]:
# HGBR over 800, 900, ..., 2000 features (Pearson ranking)
test_corr_models(model=HGBR(), method="Pearson correlation", corrs=pearson_corrs, num_feats=range(800, 2001, 100))

Getting results for HistGradientBoostingRegressor using Pearson correlation to select features...

Results for 800 best features
Train scores: [0.66663238 0.66909042 0.66829338 0.64459554 0.64295293] -> mean_train = 0.6583
Test scores: [0.5530253  0.53940209 0.53295276 0.53978152 0.52986614] -> mean_test = 0.5390

Results for 900 best features
Train scores: [0.65743789 0.66979245 0.66498664 0.6695574  0.66648605] -> mean_train = 0.6657
Test scores: [0.52125799 0.53635422 0.54399451 0.55205156 0.55057303] -> mean_test = 0.5408

Results for 1000 best features
Train scores: [0.66787554 0.67396278 0.67286435 0.66773249 0.66925989] -> mean_train = 0.6703
Test scores: [0.54568328 0.52206459 0.53540264 0.55007844 0.54338797] -> mean_test = 0.5393

Results for 1100 best features
Train scores: [0.69352346 0.6948638  0.68931223 0.69610607 0.68214437] -> mean_train = 0.6912
Test scores: [0.57321944 0.57223914 0.59692036 0.55132183 0.55748441] -> mean_test = 0.5702

Results for 1200 best features


Para este método de seleção de features (<b>correlação de Pearson</b>) obtivemos um score máximo de <b>0.5740</b> para a combinação <b>HistGradientBoostingRegressor / 1500 features</b>. No entanto, selecionámos a combinação <b>HistGradientBoostingRegressor / 1200 features </b> por se encontrar numa região mais estável no espaço de procura do número de features ótimo. Neste caso, o score obtido foi <b>0.5716</b>.

------------------------------------------------------------------------------------------------------------------------

In [67]:
# filter warnings -> MLPR related
# Only the two sklearn warning categories these runs are presumably triggering are silenced
# (non-convergence and column-vector targets); a blanket "ignore" would also hide
# deprecation and numerical warnings for the rest of the notebook.
import warnings
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

for warning_category in (ConvergenceWarning, DataConversionWarning):
    warnings.filterwarnings("ignore", category=warning_category)

<b>Spearman</b>

In [68]:
# Evaluate each model on the default feature counts, ranking features by Spearman correlation
spearman_models = [LR, KNR, RFR, SVR, MLPR, ADA, HGBR]
for i, Model in enumerate(spearman_models):
    test_corr_models(model=Model(), method="Spearman correlation", corrs=spearman_corrs)
    # separator between consecutive models, but not after the last one
    is_last = i == len(spearman_models) - 1
    if not is_last:
        print("----------------------------------------------------------------------------------------------------\n")

Getting results for LinearRegression using Spearman correlation to select features...

Results for 200 best features
Train scores: [0.30276032 0.30999642 0.29627182 0.30485768 0.30062867] -> mean_train = 0.3029
Test scores: [0.29231614 0.26266536 0.31845467 0.28441096 0.30099507] -> mean_test = 0.2918

Results for 400 best features
Train scores: [0.38744808 0.38428109 0.3826784  0.33979449 0.38501327] -> mean_train = 0.3758
Test scores: [0.3538583  0.36657723 0.33417186 0.30842076 0.36367539] -> mean_test = 0.3453

Results for 600 best features
Train scores: [0.41855357 0.41669686 0.41262532 0.41615306 0.42218293] -> mean_train = 0.4172
Test scores: [0.37755745 0.38235578 0.39904554 0.39122124 0.34962146] -> mean_test = 0.3800

Results for 800 best features
Train scores: [0.43666678 0.42842225 0.43421242 0.43776558 0.43959521] -> mean_train = 0.4353
Test scores: [0.39523955 0.38230352 0.37197668 0.39334337 0.39326577] -> mean_test = 0.3872

---------------------------------------------

<b>Spearman</b> (further testing)

Novos testes para uma gama de features distinta da anterior. Testes efetuados para o melhor modelo - <b>HistGradientBoostingRegressor</b> - e para o modelo <b>LinearRegression</b> (progressão de scores promissora).

In [69]:
# LR over 900, 1000, ..., 2000 features (Spearman ranking)
test_corr_models(model=LR(), method="Spearman correlation", corrs=spearman_corrs, num_feats=range(900, 2001, 100))

Getting results for LinearRegression using Spearman correlation to select features...

Results for 900 best features
Train scores: [0.44845659 0.44690669 0.45188674 0.44772917 0.4460683 ] -> mean_train = 0.4482
Test scores: [0.37239857 0.40371682 0.36711783 0.40791929 0.41591596] -> mean_test = 0.3934

Results for 1000 best features
Train scores: [0.44677465 0.45609028 0.45454997 0.46056612 0.44835278] -> mean_train = 0.4533
Test scores: [0.38473902 0.39825773 0.40052927 0.38249102 0.41902439] -> mean_test = 0.3970

Results for 1100 best features
Train scores: [0.45673772 0.46337917 0.45801098 0.45532306 0.46020887] -> mean_train = 0.4587
Test scores: [0.39463812 0.37497419 0.41201434 0.42376544 0.39754653] -> mean_test = 0.4006

Results for 1200 best features
Train scores: [0.46731002 0.46380396 0.47062831 0.46190505 0.46045568] -> mean_train = 0.4648
Test scores: [0.39199155 0.39522075 0.38187573 0.42181252 0.41388638] -> mean_test = 0.4010

Results for 1300 best features
Train score

In [70]:
# HGBR over 900, 1000, ..., 2000 features (Spearman ranking)
test_corr_models(model=HGBR(), method="Spearman correlation", corrs=spearman_corrs, num_feats=range(900, 2001, 100))

Getting results for HistGradientBoostingRegressor using Spearman correlation to select features...

Results for 900 best features
Train scores: [0.66640621 0.66761633 0.66318797 0.66438972 0.66299885] -> mean_train = 0.6649
Test scores: [0.5385035  0.53013342 0.53696804 0.54630403 0.53912704] -> mean_test = 0.5382

Results for 1000 best features
Train scores: [0.66843968 0.66608647 0.66707098 0.67020146 0.65024564] -> mean_train = 0.6644
Test scores: [0.54170773 0.53339188 0.55513566 0.52840985 0.54995223] -> mean_test = 0.5417

Results for 1100 best features
Train scores: [0.66817317 0.65897502 0.66969823 0.66850961 0.66736228] -> mean_train = 0.6665
Test scores: [0.54798365 0.54820868 0.54489859 0.53689083 0.53303332] -> mean_test = 0.5422

Results for 1200 best features
Train scores: [0.66697576 0.67070654 0.67109184 0.67369563 0.66743542] -> mean_train = 0.6700
Test scores: [0.56181496 0.53415526 0.54248205 0.5224809  0.55233844] -> mean_test = 0.5427

Results for 1300 best feature

Através deste método de seleção de features (<b>correlação de Spearman</b>) obtivemos um score máximo de <b>0.5458</b> para a combinação <b>HistGradientBoostingRegressor / 1500 features</b>. No entanto, mais uma vez, selecionámos a combinação <b>HistGradientBoostingRegressor / 1200 features </b> por se encontrar numa região mais estável no espaço de procura do número de features ótimo. Neste caso, o score obtido foi <b>0.5427</b>.

------------------------------------------------------------------------------------------------------------------------

<b>Univariate linear regression</b>

In [71]:
# Evaluate each model on the default feature counts, ranking features by their f-values
f_value_models = [LR, KNR, RFR, SVR, MLPR, ADA, HGBR]
for i, Model in enumerate(f_value_models):
    test_corr_models(model=Model(), method="univariate linear regression", corrs=f_values)
    # separator between consecutive models, but not after the last one
    is_last = i == len(f_value_models) - 1
    if not is_last:
        print("----------------------------------------------------------------------------------------------------\n")

Getting results for LinearRegression using univariate linear regression to select features...

Results for 200 best features
Train scores: [0.31373502 0.31453924 0.30417023 0.31123685 0.31630096] -> mean_train = 0.3120
Test scores: [0.28783469 0.29779471 0.30135783 0.31145979 0.29140639] -> mean_test = 0.2980

Results for 400 best features
Train scores: [0.39764482 0.39875594 0.39529086 0.39313225 0.39097182] -> mean_train = 0.3952
Test scores: [0.36595936 0.36158467 0.37670695 0.37031297 0.38661595] -> mean_test = 0.3722

Results for 600 best features
Train scores: [0.42097787 0.42265563 0.42186017 0.42585712 0.42528314] -> mean_train = 0.4233
Test scores: [0.40439878 0.40036036 0.40637312 0.38183613 0.30668828] -> mean_test = 0.3799

Results for 800 best features
Train scores: [0.43038356 0.43612914 0.43978573 0.44180339 0.43237401] -> mean_train = 0.4361
Test scores: [0.38657825 0.40212593 0.34410856 0.37866801 0.42163509] -> mean_test = 0.3866

-------------------------------------

<b>Univariate linear regression</b> (further testing)

Novos testes para uma gama de features distinta da anterior (apenas para o melhor modelo - <b>HistGradientBoostingRegressor</b>).

In [72]:
# HGBR over 900, 1000, ..., 2000 features (f-value ranking)
test_corr_models(model=HGBR(), method="univariate linear regression", corrs=f_values, num_feats=range(900, 2001, 100))

Getting results for HistGradientBoostingRegressor using univariate linear regression to select features...

Results for 900 best features
Train scores: [0.66688224 0.6693543  0.64699053 0.65804901 0.65496276] -> mean_train = 0.6592
Test scores: [0.54599391 0.53878915 0.53546626 0.55122983 0.52327278] -> mean_test = 0.5390

Results for 1000 best features
Train scores: [0.6694098  0.66224539 0.66859652 0.66918313 0.67199177] -> mean_train = 0.6683
Test scores: [0.54907084 0.55617924 0.53640704 0.54427348 0.51604045] -> mean_test = 0.5404

Results for 1100 best features
Train scores: [0.69247604 0.69628291 0.6926902  0.69139486 0.69126861] -> mean_train = 0.6928
Test scores: [0.5784869  0.56210165 0.56201292 0.5760861  0.57289338] -> mean_test = 0.5703

Results for 1200 best features
Train scores: [0.69627107 0.69432081 0.69199877 0.69308179 0.69435136] -> mean_train = 0.6940
Test scores: [0.56833448 0.57382256 0.57845046 0.56835063 0.5636445 ] -> mean_test = 0.5705

Results for 1300 best

Utilizando os <b>ANOVA f-values</b> como método de seleção de features obtivemos um score máximo de <b>0.5731</b> para a combinação <b>HistGradientBoostingRegressor / 1800 features</b>. Todavia, tal como havia acontecido nos casos anteriores, selecionámos a combinação <b>HistGradientBoostingRegressor / 1300 features </b> por se encontrar numa região mais estável no espaço de procura do número de features ótimo. Neste caso, o score obtido foi <b>0.5710</b>.

------------------------------------------------------------------------------------------------------------------------

<b>Mutual information regression</b>

In [73]:
# Evaluate each model on the default feature counts, ranking features by mutual information
mutual_info_models = [LR, KNR, RFR, SVR, MLPR, ADA, HGBR]
for i, Model in enumerate(mutual_info_models):
    test_corr_models(model=Model(), method="mutual information regression", corrs=mutual_info)
    # separator between consecutive models, but not after the last one
    is_last = i == len(mutual_info_models) - 1
    if not is_last:
        print("----------------------------------------------------------------------------------------------------\n")

Getting results for LinearRegression using mutual information regression to select features...

Results for 200 best features
Train scores: [0.29933053 0.30549307 0.30197076 0.30446767 0.29626695] -> mean_train = 0.3015
Test scores: [0.2959669  0.2691783  0.28315959 0.27444436 0.30818651] -> mean_test = 0.2862

Results for 400 best features
Train scores: [0.39009861 0.39224099 0.39230819 0.3884802  0.38664143] -> mean_train = 0.3900
Test scores: [0.34586822 0.34771363 0.35038695 0.36931395 0.37310897] -> mean_test = 0.3573

Results for 600 best features
Train scores: [0.4271809  0.42857239 0.42821025 0.4340767  0.42882269] -> mean_train = 0.4294
Test scores: [0.39660344 0.23434375 0.38580482 0.36781109 0.39394364] -> mean_test = 0.3557

Results for 800 best features
Train scores: [0.45024162 0.44818107 0.44927905 0.44205742 0.45097461] -> mean_train = 0.4481
Test scores: [0.38582873 0.3408515  0.34684105 0.4190393  0.38900584] -> mean_test = 0.3763

------------------------------------

<b>Mutual information regression</b> (further testing)

Obtivemos os melhores scores utilizando o modelo <b>HistGradientBoostingRegressor</b> e um número de features entre <b>200</b> e <b>400</b>. Então, efetuámos novas validações cruzadas do modelo tendo em conta números de features em torno desta gama de valores (de <b>100</b> a <b>400</b>, em passos de 50).

In [199]:
# HGBR over 100, 150, ..., 400 features (mutual-information ranking)
test_corr_models(model=HGBR(), method="mutual information regression", corrs=mutual_info, num_feats=range(100, 401, 50))

Getting results for HistGradientBoostingRegressor using mutual information regression to select features...

Results for 100 best features
Train scores: [0.64638056 0.647755   0.65273351 0.64679615 0.64730562] -> mean_train = 0.6482
Test scores: [0.56075003 0.54913101 0.52380878 0.56463768 0.54638954] -> mean_test = 0.5489

Results for 150 best features
Train scores: [0.68533162 0.68011919 0.68446605 0.68878994 0.68179735] -> mean_train = 0.6841
Test scores: [0.59444313 0.58825639 0.5846732  0.56242807 0.61195944] -> mean_test = 0.5884

Results for 200 best features
Train scores: [0.69143393 0.68617854 0.68891249 0.689917   0.69075993] -> mean_train = 0.6894
Test scores: [0.58727969 0.59127946 0.58798012 0.58635699 0.58367   ] -> mean_test = 0.5873

Results for 250 best features
Train scores: [0.68964904 0.69257095 0.69232035 0.69044551 0.69519141] -> mean_train = 0.6920
Test scores: [0.5855083  0.5921357  0.58939877 0.59280559 0.57684508] -> mean_test = 0.5873

Results for 300 best fe

Utilizando a <b>informação mútua</b> como método de seleção de features obtivemos um score máximo de <b>0.5884</b> para a combinação <b>HistGradientBoostingRegressor / 150 features</b>. No entanto, tal como sucedeu em todos os casos anteriores, selecionámos a combinação <b>HistGradientBoostingRegressor / 200 features</b> por se encontrar numa região mais estável no espaço de procura do número de features ótimo. Neste caso, o score obtido foi <b>0.5873</b>, o melhor resultado até ao momento.

------------------------------------------------------------------------------------------------------------------------

<b>SelectFromModel</b>

De seguida, de modo a testar outro método de seleção de features, utilizámos a classe <b>SelectFromModel</b> do <b>sklearn</b>. A seleção de features é realizada tendo por base o modelo <b>RandomForestRegressor</b> já que, a par do modelo <b>LinearRegression</b>, é o único que apresenta o atributo <b>feature_importances_</b> (<b>coef_</b> no caso do modelo <b>LinearRegression</b>), necessário para a definição de quais features manter e eliminar. As features selecionadas são depois partilhadas na definição do dataset que alimenta os restantes modelos de machine learning (<b>LR</b>, <b>KNR</b>, <b>SVR</b>, <b>MLPR</b>, <b>ADA</b>, <b>HGBR</b>).

In [20]:
from sklearn.feature_selection import SelectFromModel

In [83]:
def cv_select_from_model(models: list, cv=5, random_state=None):
    """
    Cross-validates models using the features selected by sklearn's SelectFromModel with a Random
    Forest Regressor as estimator. Returns the computed feature mask for further use.

    Parameters
    ----------
    models: list
        A list object containing uninitialized sklearn models
    cv: int (default=5)
        Number of folds used in cross-validation
    random_state: int or None (default=None)
        Seed forwarded to the selector's Random Forest and to the shuffled KFold split;
        None keeps the original unseeded behaviour

    Returns
    -------
    feature_mask: array of bool
        The selector's support mask over the columns of `X_train_sc` (True = feature kept)
    """
    # y_train is loaded as a single-column DataFrame; estimators expect a 1-D target and
    # otherwise emit a DataConversionWarning on every fit
    target = y_train.squeeze(axis="columns")
    # select best features according to RFR feature importances
    # features whose absolute importance value is greater or equal to the mean importance are kept
    # NOTE(review): the selector is fitted on every training row (labels included) before the
    # folds are split, so the test scores printed below are likely optimistic - confirm whether
    # selection should instead happen inside each fold (e.g. through a Pipeline).
    selector = SelectFromModel(estimator=RFR(random_state=random_state))
    selector.fit(X_train_sc, target)
    feature_mask = selector.get_support()
    # new dataframe containing the features selected by SelectFromModel
    X_train_new = X_train_sc.iloc[:, feature_mask]
    # iterate through models and cross-validate
    for Model in models:
        InitModel = Model()
        print(f"Getting results for {InitModel.__class__.__name__} using 'SelectFromModel' to select features...\n")
        # cross-validate
        result = cross_validate(estimator=InitModel,
                                X=X_train_new,
                                y=target,
                                cv=KFold(n_splits=cv, shuffle=True, random_state=random_state),
                                return_train_score=True)
        # print cross-validation results (variable names are part of the printed text)
        mean_train = np.mean(result["train_score"])
        mean_test = np.mean(result["test_score"])
        print(f"Train scores: {result['train_score']} -> {mean_train = :.4f}")
        print(f"Test scores: {result['test_score']} -> {mean_test = :.4f}\n")
    return feature_mask

In [84]:
# Cross-validate all seven models on the SelectFromModel features and keep the returned mask
feature_mask = cv_select_from_model(models=[LR, KNR, RFR, SVR, MLPR, ADA, HGBR])

Getting results for LinearRegression using 'SelectFromModel' to select features...

Train scores: [0.47917286 0.4787124  0.47911779 0.47851584 0.47531577] -> mean_train = 0.4782
Test scores: [0.3846648  0.41275611 0.3885643  0.38054965 0.4225524 ] -> mean_test = 0.3978

Getting results for KNeighborsRegressor using 'SelectFromModel' to select features...

Train scores: [0.64117131 0.64691899 0.65368522 0.64261927 0.65363452] -> mean_train = 0.6476
Test scores: [0.48092228 0.4558046  0.45069206 0.48195009 0.43438383] -> mean_test = 0.4608

Getting results for RandomForestRegressor using 'SelectFromModel' to select features...

Train scores: [0.89198975 0.8930869  0.89418728 0.89629146 0.89215876] -> mean_train = 0.8935
Test scores: [0.53534368 0.5273811  0.49640657 0.51720304 0.52155594] -> mean_test = 0.5196

Getting results for SVR using 'SelectFromModel' to select features...

Train scores: [0.2441218  0.23882408 0.2410639  0.24465293 0.24739255] -> mean_train = 0.2432
Test scores: [

Os resultados sugerem que o melhor modelo é o <b>HistGradientBoostingRegressor</b> com um score de <b>0.5742</b>.

------------------------------------------------------------------------------------------------------------------------

<b>Pearson + Spearman + mutual information</b>

Na tentativa de melhorar ligeiramente o nosso melhor modelo (combinação <b>HistGradientBoostingRegressor</b> / <b>200 features</b> utilizando a <b>informação mútua</b> como método de seleção de features), decidimos combinar as melhores features selecionadas a partir de cada método de seleção (com exceção dos <b>ANOVA f-values</b>). Para isso, definimos um lower bound de <b>150</b> features e um upper bound de <b>350</b> features, selecionámos as melhores features tendo em conta cada método de seleção e combinámo-las através da disjunção dos nomes associados às mesmas. Finalmente, efetuámos novas validações cruzadas 5-fold a partir dos datasets resultantes.

In [201]:
N_SPLITS = 5  # folds used for every cross-validation in this sweep
# y_train is loaded as a single-column DataFrame; estimators expect a 1-D target
y_train_1d = y_train.squeeze(axis="columns")
for num in [150, 200, 250, 300, 350]:
    best_pearson = list(get_k_best_corrs(num, pearson_corrs).keys())
    best_spearman = list(get_k_best_corrs(num, spearman_corrs).keys())
    best_mutual_info = list(get_k_best_corrs(num, mutual_info).keys())
    # union of the three selections; sorted so the column order is the same in every kernel
    # session (the iteration order of a set of strings is not stable across sessions)
    best_feats = sorted(set(best_pearson + best_spearman + best_mutual_info))
    print(f"Cross-validating HistGradientBoostingRegressor using {len(best_feats)} ({num} comb) features...")
    x_train_psmi = X_train_sc[best_feats]
    result = cross_validate(estimator=HGBR(),
                            X=x_train_psmi,
                            y=y_train_1d,
                            cv=KFold(n_splits=N_SPLITS, shuffle=True),
                            return_train_score=True)
    # variable names are part of the printed text via the `=` specifier
    mean_train = np.mean(result["train_score"])
    mean_test = np.mean(result["test_score"])
    print(f"Train scores: {result['train_score']} -> {mean_train = :.4f}")
    print(f"Test scores: {result['test_score']} -> {mean_test = :.4f}\n")

Cross-validating HistGradientBoostingRegressor using 305 (150 comb) features...
Train scores: [0.68301761 0.69117031 0.68361854 0.69151876 0.68589539] -> mean_train = 0.6870
Test scores: [0.60715197 0.56753502 0.59320281 0.57107169 0.5921822 ] -> mean_test = 0.5862

Cross-validating HistGradientBoostingRegressor using 404 (200 comb) features...
Train scores: [0.6932829  0.68857836 0.69084999 0.69144631 0.6913527 ] -> mean_train = 0.6911
Test scores: [0.58061507 0.58997917 0.59100267 0.5860717  0.58747708] -> mean_test = 0.5870

Cross-validating HistGradientBoostingRegressor using 498 (250 comb) features...
Train scores: [0.69726495 0.69041586 0.69251149 0.69384081 0.69802487] -> mean_train = 0.6944
Test scores: [0.57633606 0.60185842 0.59790227 0.58521013 0.57564917] -> mean_test = 0.5874

Cross-validating HistGradientBoostingRegressor using 588 (300 comb) features...
Train scores: [0.69776227 0.69850622 0.69715715 0.69354673 0.6991712 ] -> mean_train = 0.6972
Test scores: [0.58759311 

Os resultados sugerem que o número ótimo de features (tendo em consideração o espaço de procura estabelecido) é <b>250</b> para cada método de seleção. Após disjunção dos nomes das features resultantes da seleção por cada um dos métodos, obtivemos um total de <b>498</b> features e um score médio de teste na validação cruzada 5-fold de <b>0.5874</b>.

### Optimize hyperparameters of the best models

In [31]:
# best number of features per selection method for the combined set (from {150, 200, 250, 300, 350})
best_k = 250
best_pearson = list(get_k_best_corrs(best_k, pearson_corrs).keys())
best_spearman = list(get_k_best_corrs(best_k, spearman_corrs).keys())
best_mutual_info = list(get_k_best_corrs(best_k, mutual_info).keys())

# union of the three selections; sorted so the resulting column order is the same in every
# kernel session (the iteration order of a set of strings is not stable across sessions)
best_features_psmi = sorted(set(best_pearson + best_spearman + best_mutual_info))

Tendo em consideração os melhores modelos para cada método de seleção de features (<b>Pearson</b>, <b>Spearman</b>, <b>ANOVA f-values</b>, <b>informação mútua</b>, <b>SelectFromModel</b> e combinação <b>Pearson + Spearman + informação mútua</b>), procedemos à otimização dos seus hiperparâmetros. Invariavelmente, o melhor modelo foi o <b>HistGradientBoostingRegressor</b>. Consequentemente, apenas definimos um espaço de procura para a otimização de hiperparâmetros para este modelo. Considerámos os seguintes hiperparâmetros: <b>learning_rate</b>, <b>max_iter</b>, <b>max_leaf_nodes</b>, <b>min_samples_leaf</b> e <b>warm_start</b>. De modo a efetuar a procura dos mesmos, recorremos à classe <b>RandomizedSearchCV</b> do <b>sklearn</b>. Não é utilizada uma procura exaustiva (por exemplo, utilizando a classe <b>GridSearchCV</b> do <b>sklearn</b>), já que utilizando a grelha de hiperparâmetros definida em baixo seriam treinados <b>2500</b> modelos (<b>500</b> combinações de hiperparâmetros * <b>5 folds</b> na validação cruzada) para cada método de seleção de features, tornando a complexidade temporal do problema bastante elevada.

In [32]:
from sklearn.model_selection import RandomizedSearchCV

In [33]:
# Search space for the hyperparameter optimization: 2 * 5 * 5 * 5 * 2 = 500 combinations.
# NOTE(review): `warm_start` presumably has no effect when each candidate is fitted once on a
# fresh estimator - confirm against the sklearn documentation before keeping it in the space.
HYPER = dict(
    learning_rate=[0.05, 0.1],
    max_iter=np.arange(100, 301, 50),
    max_leaf_nodes=np.arange(31, 64, 8),
    min_samples_leaf=np.arange(16, 33, 4),
    warm_start=[True, False],
)

<b>Pearson, Spearman, univariate linear regression and mutual information regression</b>
<br>(hyperparameter optimization using the above methods to select features to train the models)

In [26]:
def grid_search(models:list, methods:list, corrs:list, num_feats:list, cv=5, n_iter=40, random_state=42):
    """Run one randomized hyperparameter search per (model, feature-selection) configuration.

    Despite its name, this function uses ``RandomizedSearchCV``: it samples ``n_iter``
    candidates from the module-level ``HYPER`` space instead of trying every combination.
    Entry ``i`` of each input list describes configuration ``i``.

    Parameters
    ----------
    models : list
        Estimator classes (not instances); each one is instantiated with its defaults.
    methods : list
        Names of the feature-selection methods; only used in the progress messages.
    corrs : list
        One score sequence per configuration, forwarded to ``get_k_best_corrs``.
    num_feats : list
        Number of top-scoring features to keep for each configuration.
    cv : int, default 5
        Number of shuffled K-fold splits.
    n_iter : int, default 40
        Number of hyperparameter combinations sampled per search.
    random_state : int or None, default 42
        Seed shared by the fold shuffling, the candidate sampling and (when the estimator
        exposes a ``random_state`` parameter) the estimator itself. Pass ``None`` to get
        the previous unseeded behaviour.

    Returns
    -------
    list of dict
        ``best_params_`` of every search, in input order.

    Raises
    ------
    ValueError
        If the four input lists do not all have the same length (``zip`` would otherwise
        silently drop the surplus configurations).
    """
    sizes = {len(models), len(methods), len(corrs), len(num_feats)}
    if len(sizes) != 1:
        raise ValueError("models, methods, corrs and num_feats must have the same length")

    results = []
    for mo, me, co, nf in zip(models, methods, corrs, num_feats):
        estimator = mo()
        # seed the estimator only if it actually has such a parameter
        if "random_state" in estimator.get_params():
            estimator.set_params(random_state=random_state)
        print(f"Optimizing {type(estimator).__name__} using {me} to select features ({nf})...")
        print("----------")
        # materialise the selected column names as a list before indexing
        x_train_gs = X_train_sc.loc[:, list(get_k_best_corrs(nf, co).keys())]
        gs = RandomizedSearchCV(estimator=estimator,
                                param_distributions=HYPER,
                                n_iter=n_iter,
                                cv=KFold(n_splits=cv, shuffle=True, random_state=random_state),
                                random_state=random_state,
                                verbose=3)
        # flatten the target to 1-D so each fit does not emit a column-vector warning
        # (assumes y_train holds a single target column -- TODO confirm)
        gs.fit(x_train_gs, np.ravel(y_train))
        results.append(gs.best_params_)
        print("----------")
        print(f"Optimal hyperparameters: {gs.best_params_}")
        print(f"Best score: {gs.best_score_}\n")
    return results

In [1]:
# Parallel lists: position i of each list describes one search configuration
# (feature-selection method, its score vector, how many features to keep, model class).
methods = ["Pearson correlation", "Spearman correlation", "univariate linear regression", "mutual information regression"]
corrs = [pearson_corrs, spearman_corrs, f_values, mutual_info]
num_feats = [1200, 1200, 1300, 200]
# the same regressor class is used for every configuration
models = [HGBR] * len(methods)

In [2]:
# run the search for every configuration; entry i holds the winning hyperparameters of configuration i
best_params = grid_search(
    models=models,
    methods=methods,
    corrs=corrs,
    num_feats=num_feats,
)

Os hiperparâmetros ótimos para cada modelo foram, então, guardados numa variável <b>best_params</b> de modo a realizar nova validação cruzada de cada um dos modelos (nesta ocasião, treinados com hiperparâmetros otimizados).

<b>SelectFromModel</b>
<br>(hyperparameter optimization using SelectFromModel to select the best features to train the models)

In [4]:
# Randomized hyperparameter search for HGBR on the columns selected by `feature_mask`.
# NOTE(review): `feature_mask` is not defined in this part of the notebook; presumably it comes
# from an earlier SelectFromModel cell -- verify it exists on a fresh kernel run.
print("Optimizing HistGradientBoostingRegressor using SelectFromModel to select features...")
print("----------")
x_train_sfm = X_train_sc.iloc[:, feature_mask]
# the estimator, the fold shuffling and the candidate sampling are all seeded so the search
# can be reproduced
gs_sfm = RandomizedSearchCV(estimator=HGBR(random_state=42),
                            param_distributions=HYPER,
                            n_iter=40,
                            cv=KFold(n_splits=5, shuffle=True, random_state=42),
                            random_state=42,
                            verbose=3)
# flatten the target to 1-D to avoid a column-vector warning on every fit
# (assumes y_train holds a single target column -- TODO confirm)
gs_sfm.fit(x_train_sfm, np.ravel(y_train))
print("----------")
print(f"Optimal hyperparameters: {gs_sfm.best_params_}")
print(f"Best score: {gs_sfm.best_score_}")

<b>Pearson + Spearman + mutual information</b>
<br>(hyperparameter optimization using a combination of feature selection methods to select the best features)

In [3]:
# Randomized hyperparameter search for HGBR on the union of the Pearson, Spearman and
# mutual-information selections (`best_features_psmi`).
print("Optimizing HistGradientBoostingRegressor using a combination of feature selection methods...")
print("----------")
x_train_psmi = X_train_sc[best_features_psmi]
# the estimator, the fold shuffling and the candidate sampling are all seeded so the search
# can be reproduced
gs_psmi = RandomizedSearchCV(estimator=HGBR(random_state=42),
                             param_distributions=HYPER,
                             n_iter=40,
                             cv=KFold(n_splits=5, shuffle=True, random_state=42),
                             random_state=42,
                             verbose=3)
# flatten the target to 1-D to avoid a column-vector warning on every fit
# (assumes y_train holds a single target column -- TODO confirm)
gs_psmi.fit(x_train_psmi, np.ravel(y_train))
print("----------")
print(f"Optimal hyperparameters: {gs_psmi.best_params_}")
print(f"Best score: {gs_psmi.best_score_}")

Os hiperparâmetros ótimos referentes aos modelos obtidos pelos restantes métodos de seleção de features serão futuramente acedidos através do atributo <b>best_params_</b> de objetos da classe <b>RandomizedSearchCV</b> (<b>gs_sfm.best_params_</b> e <b>gs_psmi.best_params_</b>).

### Cross-validate best models (with optimized hyperparameters)

Procedemos, então, a uma validação cruzada 10-fold dos melhores modelos obtidos através de cada um dos métodos de seleção de features.

<b>Pearson, Spearman, univariate linear regression and mutual information regression</b>
<br>(cross-validation of the best models with optimized hyperparameters)

In [292]:
# only cross-validates HistGradientBoostingRegressor, as it was invariably the best model
def cv_best(models, params: list, methods: list, corrs: list, num_feats: list, cv=10, random_state=42):
    """Cross-validate each tuned model on its selected feature subset and print the scores.

    Entry ``i`` of every input list describes configuration ``i``. For each configuration
    the estimator class is instantiated, given its tuned hyperparameters, and evaluated
    with shuffled K-fold cross-validation on the columns returned by ``get_k_best_corrs``.
    Results are only printed; the function returns ``None``.

    Parameters
    ----------
    models : list
        Estimator classes (not instances).
    params : list
        One hyperparameter dict per configuration, applied with ``set_params``.
    methods : list
        Names of the feature-selection methods; only used in the printed messages.
    corrs : list
        One score sequence per configuration, forwarded to ``get_k_best_corrs``.
    num_feats : list
        Number of top-scoring features to keep for each configuration.
    cv : int, default 10
        Number of shuffled K-fold splits.
    random_state : int or None, default 42
        Seed for the fold shuffling and (when the estimator exposes a ``random_state``
        parameter) for the estimator. Pass ``None`` for the previous unseeded behaviour.

    Raises
    ------
    ValueError
        If the five input lists do not all have the same length (``zip`` would otherwise
        silently skip configurations).
    """
    sizes = {len(models), len(params), len(methods), len(corrs), len(num_feats)}
    if len(sizes) != 1:
        raise ValueError("models, params, methods, corrs and num_feats must have the same length")

    for mo, pr, me, co, nf in zip(models, params, methods, corrs, num_feats):
        estimator = mo()
        # seed first so that an explicit random_state inside `pr` still takes precedence
        if "random_state" in estimator.get_params():
            estimator.set_params(random_state=random_state)
        estimator.set_params(**pr)
        print(f"Cross-validating ({cv}-fold) {type(estimator).__name__} using {me} ({nf} features)...")
        print(f"Optimized hyperparameters: {pr}")
        x_train = X_train_sc.loc[:, list(get_k_best_corrs(nf, co).keys())]
        result = cross_validate(estimator=estimator,
                                X=x_train,
                                # 1-D target avoids a column-vector warning on every fit
                                # (assumes y_train holds a single target column -- TODO confirm)
                                y=np.ravel(y_train),
                                cv=KFold(n_splits=cv, shuffle=True, random_state=random_state),
                                return_train_score=True)
        # the variable names below appear verbatim in the printed output ("mean_train = ...")
        mean_train = np.mean(result["train_score"])
        mean_test = np.mean(result["test_score"])
        print(f"Train scores: {result['train_score']} -> {mean_train = :.4f}")
        print(f"Test scores: {result['test_score']} -> {mean_test = :.4f}\n")

In [293]:
# every argument comes from an earlier cell (models, best_params, methods, corrs, num_feats)
cv_best(
    models=models,
    params=best_params,
    methods=methods,
    corrs=corrs,
    num_feats=num_feats,
)

Cross-validating (10-fold) HistGradientBoostingRegressor using Pearson correlation (1200 features)...
Optimized hyperparameters: {'warm_start': False, 'min_samples_leaf': 20, 'max_leaf_nodes': 63, 'max_iter': 300, 'learning_rate': 0.05}
Train scores: [0.73233652 0.73407498 0.73019125 0.74800317 0.73221829 0.71548573
 0.75295105 0.74165818 0.73646702 0.7178306 ] -> mean_train = 0.7341
Test scores: [0.56623301 0.60612089 0.57279343 0.57305721 0.56753691 0.60907319
 0.59046368 0.57254022 0.61033073 0.58555759] -> mean_test = 0.5854

Cross-validating (10-fold) HistGradientBoostingRegressor using Spearman correlation (1200 features)...
Optimized hyperparameters: {'warm_start': False, 'min_samples_leaf': 28, 'max_leaf_nodes': 63, 'max_iter': 250, 'learning_rate': 0.05}
Train scores: [0.7046133  0.72345338 0.68532331 0.74236339 0.72630959 0.69784186
 0.68999517 0.73570951 0.72595013 0.71897368] -> mean_train = 0.7151
Test scores: [0.53102774 0.55152941 0.55988561 0.55823386 0.5664989  0.55229

Os resultados da validação cruzada demonstram que o melhor score foi obtido utilizando a <b>informação mútua</b> como método de seleção de features (<b>0.6060</b>). O modelo respetivo será, então, comparado com os modelos obtidos a partir dos métodos de seleção de features <b>SelectFromModel</b> e <b>Pearson + Spearman + mutual information</b>.

<b>SelectFromModel</b>
<br>(cross-validation of the best model with optimized hyperparameters)

In [294]:
# 10-fold cross-validation of HGBR, with the hyperparameters found by `gs_sfm`, on the columns
# selected by `feature_mask`.
print("Cross-validating (10-fold) HistGradientBoostingRegressor using SelectFromModel to select features...")
print(f"Optimized hyperparameters: {gs_sfm.best_params_}")
x_train_sfm = X_train_sc.iloc[:, feature_mask]
# estimator and fold shuffling are seeded so the reported scores can be reproduced
result = cross_validate(estimator=HGBR(random_state=42, **gs_sfm.best_params_),
                        X=x_train_sfm,
                        # 1-D target avoids a column-vector warning on every fit
                        # (assumes y_train holds a single target column -- TODO confirm)
                        y=np.ravel(y_train),
                        cv=KFold(n_splits=10, shuffle=True, random_state=42),
                        return_train_score=True)
# np.mean keeps the average correct without repeating the number of folds
mean_train = np.mean(result["train_score"])
mean_test = np.mean(result["test_score"])
print(f"Train scores: {result['train_score']} -> {mean_train = :.4f}")
print(f"Test scores: {result['test_score']} -> {mean_test = :.4f}\n")

Cross-validating (10-fold) HistGradientBoostingRegressor using SelectFromModel to select features...
Optimized hyperparameters: {'warm_start': False, 'min_samples_leaf': 32, 'max_leaf_nodes': 55, 'max_iter': 200, 'learning_rate': 0.05}
Train scores: [0.72930953 0.72559338 0.72952691 0.73186194 0.73014827 0.7371733
 0.71938107 0.72341578 0.75435627 0.76341954] -> mean_train = 0.7344
Test scores: [0.57799925 0.62441839 0.60640188 0.57680143 0.60326652 0.59459631
 0.60107882 0.56927381 0.58052564 0.54540325] -> mean_test = 0.5880



O score obtido na validação cruzada foi de <b>0.5880</b>. Sendo assim, o melhor modelo até ao momento é o obtido através da seleção de features pela <b>informação mútua</b>.

<b>Pearson + Spearman + mutual information</b>
<br>(cross-validation of the best model with optimized hyperparameters)

In [39]:
# 10-fold cross-validation of HGBR, with the hyperparameters found by `gs_psmi`, on the combined
# Pearson + Spearman + mutual-information feature set.
print("Cross-validating (10-fold) HistGradientBoostingRegressor using a combination of feature selection methods...")
print(f"Optimized hyperparameters: {gs_psmi.best_params_}")
x_train_psmi = X_train_sc[best_features_psmi]
# estimator and fold shuffling are seeded so the reported scores can be reproduced
result = cross_validate(estimator=HGBR(random_state=42, **gs_psmi.best_params_),
                        X=x_train_psmi,
                        # 1-D target avoids a column-vector warning on every fit
                        # (assumes y_train holds a single target column -- TODO confirm)
                        y=np.ravel(y_train),
                        cv=KFold(n_splits=10, shuffle=True, random_state=42),
                        return_train_score=True)
# np.mean keeps the average correct without repeating the number of folds
mean_train = np.mean(result["train_score"])
mean_test = np.mean(result["test_score"])
print(f"Train scores: {result['train_score']} -> {mean_train = :.4f}")
print(f"Test scores: {result['test_score']} -> {mean_test = :.4f}\n")

Cross-validating (10-fold) HistGradientBoostingRegressor using a combination of feature selection methods...
Optimized hyperparameters: {'warm_start': True, 'min_samples_leaf': 32, 'max_leaf_nodes': 63, 'max_iter': 300, 'learning_rate': 0.05}
Train scores: [0.75380586 0.74244005 0.74206205 0.74833658 0.74580747 0.75685111
 0.74250112 0.74689319 0.75487387 0.74426017] -> mean_train = 0.7478
Test scores: [0.58764092 0.59907974 0.62072532 0.62004219 0.60299351 0.60183481
 0.60394989 0.59682932 0.58152845 0.59126593] -> mean_test = 0.6006



Finalmente, obteve-se um score médio de <b>0.6006</b> na validação cruzada referente ao modelo obtido pela combinação de métodos de seleção de features <b>Pearson + Spearman + mutual information</b>. Verificamos, então, que o melhor modelo é o obtido através da seleção de features pela <b>informação mútua</b>, sendo este utilizado para efetuar previsões acerca dos dados de teste (sequências de aminoácidos sem label associada).

### Use best model to predict labels in test.csv

In [5]:
from feature_extraction import get_dataset_with_features

# read the test sequences, build the feature table with get_dataset_with_features,
# and save that table to disk
test = pd.read_csv(filepath_or_buffer="Files/test.csv")
test_data = get_dataset_with_features(test)
test_data.to_csv(path_or_buf="Files/data_test.csv")

In [74]:
# keep only the feature columns: the positional slice drops the first two columns and the
# last one (the test sequences have no label to split off)
# NOTE(review): confirm which columns sit at positions 0, 1 and -1 of test_data
X_test = test_data.iloc[:, 2:-1]

In [75]:
# show the test feature matrix as the cell output
X_test

Unnamed: 0,SeqLength,A,R,N,D,C,E,Q,G,H,...,MoreauBrotoAuto_Mutability22,MoreauBrotoAuto_Mutability23,MoreauBrotoAuto_Mutability24,MoreauBrotoAuto_Mutability25,MoreauBrotoAuto_Mutability26,MoreauBrotoAuto_Mutability27,MoreauBrotoAuto_Mutability28,MoreauBrotoAuto_Mutability29,MoreauBrotoAuto_Mutability30,pH
0,221,9.955,1.357,8.597,6.787,1.810,3.620,5.882,8.597,0.0,...,-0.021,-0.025,-0.032,-0.023,-0.019,-0.016,-0.019,-0.026,-0.021,8
1,221,9.955,1.357,8.597,6.787,1.810,3.167,5.882,8.597,0.0,...,-0.023,-0.027,-0.034,-0.025,-0.021,-0.018,-0.021,-0.028,-0.023,8
2,220,10.000,1.364,8.636,6.818,1.818,3.182,5.909,8.636,0.0,...,-0.025,-0.029,-0.035,-0.027,-0.023,-0.020,-0.023,-0.030,-0.025,8
3,221,9.955,1.357,8.597,6.787,2.262,3.167,5.882,8.597,0.0,...,-0.022,-0.026,-0.033,-0.024,-0.020,-0.017,-0.020,-0.027,-0.022,8
4,221,9.955,1.357,8.597,6.787,1.810,3.167,5.882,8.597,0.0,...,-0.023,-0.027,-0.034,-0.025,-0.021,-0.018,-0.021,-0.028,-0.023,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,221,9.502,1.357,8.597,6.787,1.810,3.167,5.882,8.597,0.0,...,-0.023,-0.027,-0.034,-0.025,-0.021,-0.018,-0.021,-0.028,-0.023,8
2409,221,9.502,1.357,8.597,6.787,1.810,3.167,5.882,8.597,0.0,...,-0.014,-0.018,-0.025,-0.016,-0.012,-0.009,-0.012,-0.019,-0.014,8
2410,221,9.502,1.357,9.050,6.787,1.810,3.167,5.882,8.597,0.0,...,-0.029,-0.033,-0.040,-0.031,-0.028,-0.024,-0.028,-0.035,-0.029,8
2411,221,9.502,1.357,8.597,6.787,1.810,3.167,5.882,8.597,0.0,...,-0.017,-0.021,-0.027,-0.019,-0.015,-0.012,-0.015,-0.022,-0.017,8


In [76]:
# scale X_test column-wise with min-max scaling
# NOTE(review): fit_transform fits a brand-new MinMaxScaler on the test set itself, so the test
# features are not scaled with the statistics that produced X_train_sc (that scaler is not
# visible in this part of the notebook). If the training scaler is available, reuse it with
# .transform(X_test) so both sets share one scale -- TODO confirm.
X_test_arr = preprocessing.MinMaxScaler().fit_transform(X_test)
# wrap the scaled array in a DataFrame again so columns can be selected by name
X_test_sc = pd.DataFrame(data=X_test_arr, columns=X_test.columns)

In [64]:
# get best_features according to previous results: the 200 top-ranked features by mutual
# information, stored as a list so the selection is a plain, reusable column indexer
best_features = list(get_k_best_corrs(200, mutual_info).keys())

In [77]:
# restrict the train and test matrices to the selected columns, in the same column order
X_train_sc_best, X_test_sc_best = (
    frame.loc[:, best_features] for frame in (X_train_sc, X_test_sc)
)

In [353]:
# fit best model overall (best combination of model / method of feature selection / number of features / hyperparameters)
estimator = HGBR
# index 3 is the "mutual information regression" entry of the `methods` list used for the search
params = best_params[3]
# seeded so that refitting yields the same model and therefore the same submission
model = estimator(random_state=42, **params)
# 1-D target avoids a column-vector warning (assumes y_train holds a single target column -- TODO confirm)
model.fit(X_train_sc_best, np.ravel(y_train))

HistGradientBoostingRegressor(learning_rate=0.05, max_iter=300,
                              max_leaf_nodes=63, min_samples_leaf=32)

In [354]:
# predictions for the test rows, stored as a Series named "tm"
y_preds = pd.Series(
    data=model.predict(X_test_sc_best),
    name="tm",
)

In [355]:
# build the submission table (seq_id next to the predicted tm) and write it without the index column
predictions = pd.concat(objs=[test_data["seq_id"], y_preds], axis="columns")
predictions.to_csv(path_or_buf="novozymes_submission.csv", index=False)

### Results

Obtivemos um score de **0,17** (correlação de Spearman entre as previsões e os valores reais de "tm") na competição Novozymes do Kaggle. Uma vez que a distribuição das sequências de teste é distinta da de treino (um aviso explícito do Kaggle), era de esperar que não fôssemos obter um score tão elevado quanto o que obtivemos na validação cruzada.

É de notar ainda que não foi efetuada a divisão dos dados em treino e teste a partir do dataset de treino, uma vez que era esperado que, no final da competição, fosse fornecido um dataset com as sequências de teste e os respetivos valores reais de termostabilidade.