# Model

## Import and read Data

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from train_tabular import fit
import os
# Show the working directory: every relative path used in this notebook
# (the CSV files and the location of `src`) is resolved against it.
print(os.getcwd())
# `src.imputation` is imported by temporarily switching to the directory two
# levels up (presumably the repository root - confirm). When the notebook is
# started from any directory whose name does not end in "tabular_data" the
# import is skipped, and later cells that call `impute_dataframe` /
# `get_imputation_values` will fail with a NameError.
if os.getcwd().endswith("tabular_data"):
    _notebook_dir = os.getcwd()
    os.chdir("../..")
    try:
        from src.imputation import impute_dataframe,get_imputation_values
    finally:
        # Always return to the notebook directory, even if the import fails;
        # otherwise a re-run of this cell would silently skip the import
        # because the working directory no longer ends in "tabular_data".
        os.chdir(_notebook_dir)


# Feature columns read from the CSV files. 'Pre_OP_hormone_gonado' is loaded
# here but is not part of `model_features`.
columns = ['COR', 'FSH', 'FT4', 'IGF1', 'LH','PROL', 'TEST','Patient_age','Patient_gender','Pre_OP_hormone_gonado']
# Columns handed to the training routine. 'fold' exists only in the training
# data, so the test frame is selected without it.
model_features = ['COR', 'FSH', 'FT4', 'IGF1', 'LH','PROL', 'TEST','Patient_age','Patient_gender','fold']
# Environment switch read by Weights & Biases (presumably silences its console
# output - see the wandb documentation).
os.environ["WANDB_SILENT"] = "true"

def load_data():
    """Read the training and test CSV files and split them into features and labels.

    Both paths are relative to the current working directory. The
    ``Category`` column of each file is returned as the label series. The
    feature frames keep the columns named in the module-level ``columns``
    list; the training frame additionally keeps ``fold``.

    Returns:
        The tuple ``(X_train, Y_train, X_test, Y_test)``.
    """
    train_frame = pd.read_csv(r'../../data/train/train_lab_data.csv')
    test_frame = pd.read_csv(r'../../data/test/test_data_pairs.csv')

    train_labels, test_labels = train_frame["Category"], test_frame["Category"]

    # Only the training file carries the 'fold' column.
    train_features = train_frame[[*columns, 'fold']]
    test_features = test_frame[columns]
    return train_features, train_labels, test_features, test_labels
def preprocess_logreg(X_train,X_test):
    """Encode gender and standardise the lab values for logistic regression.

    ``Patient_gender`` is recoded as ``'male'`` -> 1 and ``'female'`` -> 0;
    any other value is left as it is. The seven lab columns are standardised
    with a single ``StandardScaler`` that is fitted on ``X_train`` only and
    then applied to both frames, so the test data is scaled with the training
    mean and standard deviation instead of its own statistics.

    Args:
        X_train: Training frame containing ``Patient_gender`` and the lab
            columns ``COR, FSH, FT4, IGF1, LH, PROL, TEST``. Any further
            columns (for example ``fold``) are passed through unchanged.
        X_test: Test frame with the same lab and gender columns. It does not
            need to contain the same pass-through columns as ``X_train``.

    Returns:
        ``(X_train_transformed, X_test_transformed)``: two new DataFrames
        with a fresh ``0..n-1`` index. In each, the scaled lab columns come
        first, followed by the frame's remaining columns in their original
        order. The input frames are not modified.
    """
    columns_to_scale = ['COR', 'FSH', 'FT4', 'IGF1', 'LH', 'PROL', 'TEST']
    gender_codes = {'male': 1, 'female': 0}

    # Fit on the training rows only. Re-fitting on the test frame would scale
    # it with different statistics than the ones the model was trained with.
    scaler = StandardScaler().fit(X_train[columns_to_scale])

    def _transform(frame):
        # Work on a copy so the caller's frame is left untouched.
        encoded = frame.copy()
        encoded['Patient_gender'] = encoded['Patient_gender'].map(
            lambda value: gender_codes.get(value, value)
        )
        # Column names come from the frame itself, so train and test may
        # carry different pass-through columns (e.g. 'fold' only in train).
        passthrough = [name for name in encoded.columns if name not in columns_to_scale]
        scaled = pd.DataFrame(
            scaler.transform(encoded[columns_to_scale]),
            columns=columns_to_scale,
            index=encoded.index,
        )
        combined = pd.concat([scaled, encoded[passthrough]], axis=1)
        return combined.reset_index(drop=True)

    return _transform(X_train), _transform(X_test)

## Setup Model

In [None]:
# Logistic-regression estimator passed to `fit` in every training cell below.
# max_iter=500 raises the solver's iteration limit above scikit-learn's
# default of 100.
model = LogisticRegression(max_iter=500)

In [None]:
# Extra configuration handed to `fit` through `wandb_additional_config`.
# Keep 'max_iter' in sync with the value used to construct `model`.
wandbadd={'max_iter':500}

## Fit Model and Evaluate Model

In [None]:
# Train on the complete training set, evaluate on the test pairs and request
# permutation importance from `fit`.
X_train,Y_train,X_test,Y_test = load_data()
# Derive the imputation values once, from the raw training data, and apply
# the same values to both splits. Computing them a second time from the
# already-imputed training frame could give the test set different values.
# (Assumes get_imputation_values has no side effects - confirm.)
imputation_values = get_imputation_values(X_train)
X_train = impute_dataframe(X_train, imputation_values)
X_test = impute_dataframe(X_test, imputation_values)
X_train,X_test = preprocess_logreg(X_train,X_test)
X_train= X_train[model_features]
# The test frame has no 'fold' column, so it is dropped by name rather than
# by position.
X_test = X_test[[name for name in model_features if name != 'fold']]
fitted_model_fold = fit(model,X_train,Y_train,X_test,Y_test,'all',"Tab-Data-LogReg-All-Data-Pairs-Permutation","LogReg",wandb_additional_config=wandbadd,perm_importance_yes=True)

In [None]:
# One run per fold index 0..4; the fold index is passed on to `fit`.
for fold in range(5):
    # Reload inside the loop so every fold starts from unmodified frames.
    X_train,Y_train,X_test,Y_test = load_data()
    # Imputation values come only from the rows that are NOT in the current
    # fold. They are computed once and reused for both frames.
    # (Assumes get_imputation_values has no side effects - confirm.)
    X_Train_Impute= X_train[X_train['fold'] != fold]
    imputation_values = get_imputation_values(X_Train_Impute)
    X_train = impute_dataframe(X_train, imputation_values)
    X_test = impute_dataframe(X_test, imputation_values)

    # NOTE(review): the whole training frame, including the rows of the
    # current fold, is passed to preprocess_logreg. Whether that matters
    # depends on how `fit` uses the 'fold' column - confirm.
    X_train,X_test = preprocess_logreg(X_train,X_test)
    X_train= X_train[model_features]
    # The test frame has no 'fold' column, so it is dropped by name.
    X_test = X_test[[name for name in model_features if name != 'fold']]
    fitted_model_fold = fit(model,X_train,Y_train,X_test,Y_test,fold,"Tab-Data-LogReg-Data-Pairs-Permutation","LogReg",wandb_additional_config=wandbadd,perm_importance_yes=True)

In [None]:
# Train on the complete training set and evaluate on the test pairs, this
# time without requesting permutation importance.
X_train,Y_train,X_test,Y_test = load_data()
# Derive the imputation values once, from the raw training data, and apply
# the same values to both splits. Computing them a second time from the
# already-imputed training frame could give the test set different values.
# (Assumes get_imputation_values has no side effects - confirm.)
imputation_values = get_imputation_values(X_train)
X_train = impute_dataframe(X_train, imputation_values)
X_test = impute_dataframe(X_test, imputation_values)
X_train,X_test = preprocess_logreg(X_train,X_test)
X_train= X_train[model_features]
# The test frame has no 'fold' column, so it is dropped by name rather than
# by position.
X_test = X_test[[name for name in model_features if name != 'fold']]
fitted_model_fold = fit(model,X_train,Y_train,X_test,Y_test,'all',"Tab-Data-LogReg-All-Data-Pairs","LogReg",wandb_additional_config=wandbadd)