# Model

## Import and Read Data

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from train_tabular import fit
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Temporarily switch two directory levels up (presumably the repository root)
# so that the project-local `src.imputation` module can be imported, then
# return to the starting directory.
# NOTE(review): the helpers are only imported when the working directory ends
# with "tabular_data"; started from anywhere else, `impute_dataframe` and
# `get_imputation_values` stay undefined and the later cells will fail.
print(os.getcwd())
if os.getcwd().endswith("tabular_data"):
    _notebook_dir = os.getcwd()
    os.chdir("../..")
    try:
        print(os.getcwd())
        from src.imputation import impute_dataframe,get_imputation_values
    finally:
        # Go back to the exact starting directory (instead of a hard-coded
        # relative path) even when the import fails, so the relative data
        # paths used by load_data keep resolving.
        os.chdir(_notebook_dir)
    print(os.getcwd())


# Columns selected from the raw CSV files.
columns = [
    'COR',
    'FSH',
    'FT4',
    'IGF1',
    'LH',
    'PROL',
    'TEST',
    'Patient_age',
    'Patient_gender',
    'Pre_OP_hormone_gonado',
]

# Columns kept for the training frame before it is handed to `fit`: the same
# list as `columns`, but with 'fold' in place of 'Pre_OP_hormone_gonado'.
model_features = [
    'COR',
    'FSH',
    'FT4',
    'IGF1',
    'LH',
    'PROL',
    'TEST',
    'Patient_age',
    'Patient_gender',
    'fold',
]

# Presumably silences Weights & Biases console output; must be a string.
os.environ["WANDB_SILENT"] = "true"

def load_data():
    """Read the train and test CSV files and split them into features/labels.

    The labels are taken from the "Category" column of each file. The training
    features consist of ``columns`` plus the 'fold' column; the test features
    consist of ``columns`` only.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)``.
    """
    train_frame = pd.read_csv(r'../../data/train/train_lab_data.csv')
    test_frame = pd.read_csv(r'../../data/test/test_data_pairs.csv')

    # Pull the targets out before narrowing the frames to the feature columns.
    train_labels = train_frame["Category"]
    test_labels = test_frame["Category"]

    train_features = train_frame[columns + ['fold']]
    test_features = test_frame[columns]
    return train_features, train_labels, test_features, test_labels

def preprocess_xg(X_train,X_test,Y_train,Y_test):
    """Encode the labels and mark ``Patient_gender`` as categorical.

    The label encoder is fitted on the training labels only and then applied
    to the test labels, so both sets share one label-to-integer mapping. The
    test ``Patient_gender`` column reuses the categorical dtype of the training
    column so that category codes agree between the two frames.

    Args:
        X_train: Training features; must contain a 'Patient_gender' column.
        X_test: Test features; must contain a 'Patient_gender' column.
        Y_train: Training labels.
        Y_test: Test labels.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test, le)`` where ``Y_train`` is
        the array of encoded training labels, ``Y_test`` is a single-column
        DataFrame of encoded test labels and ``le`` is the fitted encoder.
        The input frames are not modified; copies are returned.

    Raises:
        ValueError: If ``Y_test`` contains a label that does not occur in
            ``Y_train`` (raised by ``LabelEncoder.transform``).
    """
    le = LabelEncoder()
    Y_train = le.fit_transform(Y_train)
    # Reuse the mapping learned from the training labels; re-fitting on the
    # test labels could silently assign different integers to the classes.
    Y_test = pd.DataFrame(le.transform(Y_test))

    # Work on copies so the caller's frames (possibly slices of a larger
    # frame) are not modified through column assignment.
    X_train = X_train.copy()
    X_test = X_test.copy()

    X_train['Patient_gender'] = X_train['Patient_gender'].astype('category')
    # Share the training categories: converting each frame independently can
    # give the same value different category codes. Test values that never
    # occur in training become NaN.
    X_test['Patient_gender'] = X_test['Patient_gender'].astype(
        X_train['Patient_gender'].dtype
    )
    return X_train,X_test,Y_train,Y_test,le

## Setup Model

In [None]:
# Classifier shared by all training cells below. `enable_categorical=True` is
# needed because 'Patient_gender' is passed as a pandas 'category' column;
# the fixed `random_state` keeps runs repeatable.
model = XGBClassifier(
    random_state=42,
    enable_categorical=True,
)

In [None]:
# Extra settings passed to `fit` as `wandb_additional_config`; the values
# repeat the keyword arguments used when constructing the classifier.
wandbadd = dict(enable_categorical=True, random_state=42)

## Fit and Evaluate Model

In [None]:
# Train and evaluate once per fold index 0-4.
for fold in range(5):
    # TODO: learning curve
    X_train,Y_train,X_test,Y_test = load_data()

    # Derive the imputation values only from rows outside the current fold,
    # once, and apply the same values to both the training and the test frame.
    X_Train_Impute = X_train[X_train['fold'] != fold]
    imputation_values = get_imputation_values(X_Train_Impute)
    X_train = impute_dataframe(X_train, imputation_values)
    X_test = impute_dataframe(X_test, imputation_values)

    X_train,X_test,Y_train,Y_test,le = preprocess_xg(X_train,X_test,Y_train,Y_test)

    # The training frame keeps the 'fold' column (presumably used by `fit`
    # together with `fold` to split off the validation rows; confirm in
    # train_tabular); the test frame has no 'fold' column, hence [:-1].
    X_train = X_train[model_features]
    X_test = X_test[model_features[:-1]]

    # NOTE(review): class_names re-fits the encoder on the two label strings
    # and passes the encoded integers; confirm that `fit` expects these and
    # not the label strings themselves.
    fitted_model_fold = fit(
        model,
        X_train,
        Y_train,
        X_test,
        Y_test,
        fold,
        "Tab-Data-XGBoost-Data-Pairs-ohneCOR",
        "XGBoost",
        None,
        class_names=le.fit_transform(['non-prolaktinom','prolaktinom']),
        wandb_additional_config=wandbadd,
    )

In [None]:
# Final run on the complete training set (fold identifier 'all').
X_train,Y_train,X_test,Y_test = load_data()

# Compute the imputation values once from the raw training data and reuse
# them for both frames, so the test set is not imputed with values derived
# from the already-imputed training frame.
imputation_values = get_imputation_values(X_train)
X_train = impute_dataframe(X_train, imputation_values)
X_test = impute_dataframe(X_test, imputation_values)

X_train,X_test,Y_train,Y_test,le = preprocess_xg(X_train,X_test,Y_train,Y_test)

# The test frame has no 'fold' column, hence model_features[:-1].
X_train = X_train[model_features]
X_test = X_test[model_features[:-1]]

# NOTE(review): class_names re-fits the encoder on the two label strings and
# passes the encoded integers; confirm that `fit` expects these and not the
# label strings themselves.
fitted_model_fold = fit(
    model,
    X_train,
    Y_train,
    X_test,
    Y_test,
    'all',
    "Tab-Data-XGBoost-All-Data-Pairs-ohneCOR",
    "XGBoost",
    None,
    class_names=le.fit_transform(['non-prolaktinom','prolaktinom']),
    wandb_additional_config=wandbadd,
)