In [0]:
!pip install autogluon
dbutils.library.restartPython() 


In [0]:
import os
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
warnings.filterwarnings('ignore')


In [0]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# data loading 

In [0]:
# Tab-separated table: the first row holds the column names and the first
# column is used as the row index.
df_data = pd.read_csv(
    '../data/processed_log_abundance_t2d.csv',
    sep='\t',
    header=0,
    index_col=0,
)
df_data

In [0]:
# Class balance of the label column.
df_data.target.value_counts()

In [0]:
# Separate the feature matrix from the label column; df_data itself is left untouched.
X = df_data.drop(columns='target')
y = df_data['target']

In [0]:
# Sanity check: features and labels must have the same number of rows.
(X.shape, y.shape)

# prediction

In [0]:

from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [0]:
# %pip install "typing_extensions>=4.5" --upgrade
# (quote the requirement: unquoted, the shell treats `>=4.5` as an output redirect)
# dbutils.library.restartPython()
import importlib.metadata

import typing_extensions  # confirms the package is importable in this environment

# importlib.metadata is the standard-library way to read an installed
# distribution's version; pkg_resources (setuptools) is deprecated for this.
version = importlib.metadata.version("typing_extensions")
display(version)


In [0]:
from autogluon.core.metrics import make_scorer
import torch as th


from autogluon.multimodal.optimization.losses import FocalLoss



# Wrap FocalLoss as an AutoGluon scorer named 'focal_loss' (lower is better, optimum 0).
# NOTE(review): FocalLoss is passed as-is (not instantiated); confirm it can be
# called as score_func(y_true, y_pred) and returns a number, which is how
# get_auprc is used with make_scorer further down in this notebook.
# NOTE(review): ag_focal_loss is not referenced anywhere else in the visible notebook.
# Disabled class-weighting experiment (alpha built from the positive-class frequency):
# weights = [y_train.sum()/len(y_train), 1 - y_train.sum()/len(y_train)]
# alpha = th.Tensor(weights)

ag_focal_loss = make_scorer(name='focal_loss',
                         score_func=FocalLoss, 
                         optimum=0, 
                         greater_is_better=False, 
                        #  alpha=alpha
                         )

In [0]:
# AutoGluon preset shared by every TabularPredictor.fit call in this notebook.
PRESETS = "medium_quality"

In [0]:
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score

def get_auprc(y_true, y_pred):
    """Return the area under the precision-recall curve.

    The curve is computed with ``precision_recall_curve`` and integrated with
    ``auc`` using recall as the x-axis and precision as the y-axis.

    Note: a later cell scores the multimodal model with
    ``average_precision_score`` instead, so the two AUPRC numbers come from
    different estimators.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Continuous scores for the positive class.

    Returns:
        The area under the precision-recall curve.
    """
    pr_curve = precision_recall_curve(y_true, y_pred)
    precision_vals, recall_vals = pr_curve[0], pr_curve[1]
    return auc(recall_vals, precision_vals)

# Register get_auprc as an AutoGluon metric: higher is better, best value 1.
ag_auprc_scorer = make_scorer(
    name='auprc',
    score_func=get_auprc,
    greater_is_better=True,
    optimum=1,
    needs_threshold=True,
)


In [0]:
from autogluon.tabular import TabularPredictor, TabularDataset

# Re-attach the label column to each split and wrap the result for AutoGluon.
train_data, test_data = (
    TabularDataset(pd.concat([features, labels], axis=1))
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [0]:
# Baseline: AutoGluon on the raw features, with ROC AUC as the evaluation metric.
predictor = TabularPredictor(eval_metric='roc_auc', label='target')
predictor.fit(train_data=train_data, hyperparameters='toy', presets=PRESETS)



In [0]:
# Reload the baseline predictor from the directory it was just saved to.
# `predictor.path` replaces a hard-coded, timestamped run directory, which
# only existed for one specific earlier run and breaks Restart & Run All.
predictor = TabularPredictor.load(predictor.path, require_py_version_match=False)

In [0]:
# Per-model scores on the held-out split, with AUPRC reported as an extra metric.
predictor.leaderboard(data=test_data, extra_metrics=[ag_auprc_scorer])

In [0]:
from autogluon.multimodal import MultiModalPredictor

# Train a MultiModalPredictor on the same training split, requesting focal loss.
mm_hyperparameters = {"optimization.loss_function": "focal_loss"}
predictor_mm = MultiModalPredictor(label='target')
predictor_mm.fit(train_data=train_data, hyperparameters=mm_hyperparameters)

In [0]:
from autogluon.multimodal import MultiModalPredictor

# Reload the multimodal predictor from the directory it was just saved to.
# `predictor_mm.path` replaces a hard-coded, timestamped run directory, which
# only existed for one specific earlier run and breaks Restart & Run All.
predictor_mm = MultiModalPredictor.load(predictor_mm.path)

In [0]:
# Score the multimodal predictor on the held-out split.
predictor_mm.evaluate(data=test_data)

In [0]:
from sklearn.metrics import average_precision_score

# Ground-truth labels and class-probability table for the held-out split.
y_true = test_data['target']
predictions = predictor_mm.predict_proba(test_data)
# Assuming binary classification and positive class is at index 1
y_scores = predictions.iloc[:, 1]

auprc = average_precision_score(y_true=y_true, y_score=y_scores)
auprc

In [0]:
# Display the class-probability table returned by predictor_mm.predict_proba.
predictions


# Apply an autoencoder
 

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling -- and therefore the learned encoding and every score
# derived from it below -- are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Min-max scale each feature; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Width of the bottleneck layer, i.e. the number of encoded features.
encoding_dim = 128
n_features = X_train.shape[1]

# Single-hidden-layer autoencoder: n_features -> encoding_dim -> n_features.
# The sigmoid output matches the [0, 1] range of the scaled training inputs.
input_data = Input(shape=(n_features,))
encoded = Dense(encoding_dim, activation='relu')(input_data)
decoded = Dense(n_features, activation='sigmoid')(encoded)

autoencoder = Model(input_data, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
# NOTE: the held-out test split is passed as validation data; no callback is
# configured here, so it is only used to report reconstruction loss per epoch.
autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=50,
                batch_size=32,
                shuffle=True,
                validation_data=(X_test_scaled, X_test_scaled))

# The encoder shares its layers (and trained weights) with the autoencoder.
encoder = Model(input_data, encoded)
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)



## Prediction using the encoded data

In [0]:
# Wrap the encoder outputs in DataFrames that reuse the labels' index, so the
# label column lines up row-for-row when the two are concatenated.
X_train_encoded = pd.DataFrame(X_train_encoded, index=y_train.index)
X_test_encoded = pd.DataFrame(X_test_encoded, index=y_test.index)

data_test_encoded = pd.concat([X_test_encoded, y_test], axis=1)
data_train_encoded = TabularDataset(pd.concat([X_train_encoded, y_train], axis=1))

# Same AutoGluon configuration as the raw-feature baseline, trained on the encoded features.
predictor_encod = TabularPredictor(eval_metric='roc_auc', label='target')
predictor_encod.fit(train_data=data_train_encoded, hyperparameters='toy', presets=PRESETS)

In [0]:
data_test_encoded = TabularDataset(data_test_encoded)
# Evaluate the predictor trained on the encoded features against the encoded
# test split (previously this cell re-scored the raw-feature baseline
# `predictor` on `test_data`, leaving data_test_encoded unused).
predictor_encod.evaluate(data_test_encoded)


In [0]:

# Per-model scores on the encoded test split, with AUPRC as an extra metric.
predictor_encod.leaderboard(data=data_test_encoded, extra_metrics=[ag_auprc_scorer])

## Prediction using reconstruction-error scores


In [0]:
# Reconstruct both splits with the full autoencoder (train first, then test).
X_train_decoded, X_test_decoded = (
    autoencoder.predict(scaled) for scaled in (X_train_scaled, X_test_scaled)
)
# Mean squared reconstruction error between each scaled input and its reconstruction.
mse_train = tf.keras.losses.mean_squared_error(y_true=X_train_scaled, y_pred=X_train_decoded)
mse_test = tf.keras.losses.mean_squared_error(y_true=X_test_scaled, y_pred=X_test_decoded)
mse_train

In [0]:
# Distribution of the reconstruction errors on the training split.
plt.boxplot(x=mse_train)

In [0]:
data_test_decodemse

In [0]:
X_train_decodemse = pd.DataFrame(mse_train, index=y_train.index)
data_train_decodemse = pd.concat([X_train_decodemse, y_train], axis=1)
X_test_decodemse = pd.DataFrame(mse_test, index=y_test.index)
data_test_decodemse = pd.concat([X_test_decodemse, y_test], axis=1)
predictor_decodemse = TabularPredictor(label='target', eval_metric='roc_auc')
data_train_decodemse = TabularDataset(data_train_decodemse)
predictor_decodemse.fit(data_train_decodemse, presets=PRESETS, hyperparameters='toy')

In [0]:
data_test_decodemse = TabularDataset(data_test_decodemse)
# Use the predictor that was trained on the reconstruction-error feature.
# predictor_encod was fitted on the encoded features, which this frame does
# not contain.
predictor_decodemse.leaderboard(data_test_decodemse, extra_metrics=[ag_auprc_scorer])

## Prediction using encoded data + reconstruction-error scores

In [0]:
# Feature set = encoded features followed by the reconstruction error.
# ignore_index=True renumbers the columns, so the error column does not clash
# with the first encoded column's label.
X_train_mix = pd.concat([X_train_encoded, X_train_decodemse], axis=1, ignore_index=True)
X_test_mix = pd.concat([X_test_encoded, X_test_decodemse], axis=1, ignore_index=True)

data_test_mix = pd.concat([X_test_mix, y_test], axis=1)
data_train_mix = TabularDataset(pd.concat([X_train_mix, y_train], axis=1))

predictor_mix = TabularPredictor(eval_metric='roc_auc', label='target')
predictor_mix.fit(train_data=data_train_mix, hyperparameters='toy', presets=PRESETS)

In [0]:
# Wrap the combined test frame for AutoGluon, then report per-model scores with AUPRC added.
data_test_mix = TabularDataset(data_test_mix)
predictor_mix.leaderboard(data=data_test_mix, extra_metrics=[ag_auprc_scorer])