# IREI: Search Engines & Real World Data
### Víctor Morcuende Castell and Guillermo Nájera Lavid
#### Course 2022-2023

In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import ndcg_score

In [93]:
# Load data
glucose_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "glucose in blood")
bilirubin_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "bilirubin in plasma")
wbc_df = pd.read_excel('loinc_dataset-v2.xlsx', sheet_name = "White blood cells count")
glucose_target = glucose_df.pop('relevant')
bilirubin_target = bilirubin_df.pop('relevant')
wbc_target = wbc_df.pop('relevant')

In [94]:
# We eliminate irrelevant information
glucose_df = glucose_df.drop('comments', axis=1)
glucose_df = glucose_df.drop('loinc_num', axis=1)
bilirubin_df = bilirubin_df.drop('comments', axis=1)
bilirubin_df = bilirubin_df.drop('loinc_num', axis=1)
wbc_df = wbc_df.drop('comments', axis=1)
wbc_df = wbc_df.drop('loinc_num', axis=1)

In [95]:
# Using OneHotEncoder
ohe = preprocessing.OneHotEncoder(sparse_output=False)
glucose_df = pd.DataFrame(ohe.fit_transform(glucose_df),
                          columns=ohe.get_feature_names_out(glucose_df.columns.tolist()),
                          index=glucose_df.index)
bilirubin_df = pd.DataFrame(ohe.fit_transform(bilirubin_df),
                            columns=ohe.get_feature_names_out(bilirubin_df.columns.tolist()),
                            index=bilirubin_df.index)
wbc_df = pd.DataFrame(ohe.fit_transform(wbc_df),
                      columns=ohe.get_feature_names_out(wbc_df.columns.tolist()),
                      index=wbc_df.index)

WE TRAIN AN ADARANK MODEL USING THE METHOD IMPORTED FROM SKLEARN

Query: Glucose in Blood

In [96]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(glucose_df, glucose_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break


Model NDCG score: 0.9409203809871785
Iterations needed to achieve a score above 0.9: 9


Query: Bilirubin in Plasma

In [97]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(bilirubin_df, bilirubin_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 1.0
Iterations needed to achieve a score above 0.9: 49


Query: White Blood Cells Count

In [98]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(wbc_df, wbc_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.923928388719314
Iterations needed to achieve a score above 0.9: 113


#### EXTENDING THE DATASETS

In [99]:
# Load the new data
extended_glucose_df = pd.read_excel('extended_loinc_dataset-v2.xlsx', sheet_name = "glucose in blood")
extended_bilirubin_df = pd.read_excel('extended_loinc_dataset-v2.xlsx', sheet_name = "bilirubin in plasma")
extended_wbc_df = pd.read_excel('extended_loinc_dataset-v2.xlsx', sheet_name = "White blood cells count")
extended_glucose_target = extended_glucose_df.pop('relevant')
extended_bilirubin_target = extended_bilirubin_df.pop('relevant')
extended_wbc_target = extended_wbc_df.pop('relevant')

In [100]:
# We eliminate irrelevant information
extended_glucose_df = extended_glucose_df.drop('comments', axis=1)
extended_glucose_df = extended_glucose_df.drop('loinc_num', axis=1)
extended_bilirubin_df = extended_bilirubin_df.drop('comments', axis=1)
extended_bilirubin_df = extended_bilirubin_df.drop('loinc_num', axis=1)
extended_wbc_df = extended_wbc_df.drop('comments', axis=1)
extended_wbc_df = extended_wbc_df.drop('loinc_num', axis=1)

In [101]:
# Using OneHotEncoder
ohe = preprocessing.OneHotEncoder(sparse_output=False)
extended_glucose_df = pd.DataFrame(ohe.fit_transform(extended_glucose_df), 
                                   columns=ohe.get_feature_names_out(extended_glucose_df.columns.tolist()),
                                   index=extended_glucose_df.index)
extended_bilirubin_df = pd.DataFrame(ohe.fit_transform(extended_bilirubin_df), 
                                     columns=ohe.get_feature_names_out(extended_bilirubin_df.columns.tolist()),
                                     index=extended_bilirubin_df.index)
extended_wbc_df = pd.DataFrame(ohe.fit_transform(extended_wbc_df), 
                               columns=ohe.get_feature_names_out(extended_wbc_df.columns.tolist()),
                               index=extended_wbc_df.index)

WE TRAIN AN ADARANK MODEL USING THE METHOD IMPORTED FROM SKLEARN

Query: Glucose in Blood

In [102]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(extended_glucose_df, extended_glucose_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.9328509665484749
Iterations needed to achieve a score above 0.9: 3


Query: Bilirubin in Plasma

In [103]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(extended_bilirubin_df, extended_bilirubin_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.9146523650946982
Iterations needed to achieve a score above 0.9: 20


Query: White Blood Cells Count

In [104]:
iter = 0
while(True):
    iter = iter+1
    X_train, X_test, y_train, y_test = train_test_split(extended_wbc_df, extended_wbc_target, test_size=0.2)
    estimator = DecisionTreeRegressor(max_depth=1)
    model = AdaBoostRegressor(estimator=estimator, n_estimators=100, loss='linear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ndcg = ndcg_score([y_test], [y_pred], k=X_test.shape[1])
    if ndcg >= 0.9:
        print('Model NDCG score:', ndcg)
        print("Iterations needed to achieve a score above 0.9:", iter)
        break

Model NDCG score: 0.9263221439514678
Iterations needed to achieve a score above 0.9: 1
