In [13]:
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from biom import load_table
import pandas as pd
from gemelli.preprocessing import matrix_rclr
from gemelli.rpca import rpca, rpca_table_processing
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
from TRPCA import trpca, utils
from sklearn.decomposition import PCA

In [None]:
import pandas as pd
import numpy as np
from biom.table import Table
from biom.util import biom_open

def _metadata_records(metadata, ids, axis_name):
    """Return one metadata dict per ID, ordered exactly like ``ids``.

    biom's ``Table`` takes metadata as a sequence of dicts that is positionally
    aligned with the matching ID sequence, not as a mapping keyed by ID.
    """
    if metadata is None:
        return None

    # Match on string labels because the table IDs are stringified as well.
    lookup = metadata.copy(deep=False)
    lookup.index = lookup.index.astype(str)

    if not lookup.index.is_unique:
        raise ValueError(f"{axis_name} metadata index contains duplicate IDs")

    missing = [label for label in ids if label not in lookup.index]
    if missing:
        raise ValueError(
            f"{axis_name} metadata is missing {len(missing)} ID(s), "
            f"e.g. {missing[:5]}"
        )

    # Reorder rows to the table's ID order, then emit one dict per row.
    return lookup.loc[list(ids)].to_dict(orient='records')


def df_to_biom(df, sample_metadata=None, taxa_metadata=None, table_id=None):
    """
    Convert a pandas DataFrame to a BIOM table.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame where rows are features (e.g., taxa) and columns are samples
    sample_metadata : pandas.DataFrame, optional
        Metadata for samples (columns). Index must match df columns
        (compared as strings); row order does not matter
    taxa_metadata : pandas.DataFrame, optional
        Metadata for features (rows). Index must match df index
        (compared as strings); row order does not matter
    table_id : str, optional
        ID for the BIOM table
        
    Returns:
    --------
    biom.Table
        BIOM format table
    
    Raises:
    -------
    ValueError
        If a metadata frame has duplicate index labels or lacks an entry for
        one of the table's sample/observation IDs
    
    Example:
    --------
    >>> df = pd.DataFrame({
    ...     'sample1': [1, 2, 3],
    ...     'sample2': [4, 5, 6]
    ... }, index=['taxa1', 'taxa2', 'taxa3'])
    >>> biom_table = df_to_biom(df)
    """
    # Convert DataFrame to dense numpy array
    data = df.values
    
    # Get sample and observation IDs
    sample_ids = df.columns.astype(str)
    obs_ids = df.index.astype(str)
    
    # Convert metadata to per-ID records aligned with the ID order above.
    # (A dict keyed by ID, as produced by to_dict(orient='index'), is not a
    # layout the Table constructor accepts.)
    sample_metadata_records = _metadata_records(sample_metadata, sample_ids, "sample")
    taxa_metadata_records = _metadata_records(taxa_metadata, obs_ids, "taxa")
    
    # Create BIOM table
    table = Table(
        data=data,
        sample_ids=sample_ids,
        observation_ids=obs_ids,
        sample_metadata=sample_metadata_records,
        observation_metadata=taxa_metadata_records,
        table_id=table_id
    )
    
    return table

def save_biom_table(biom_table, filepath):
    """
    Write a BIOM table to disk via its HDF5 serializer.
    
    Parameters:
    -----------
    biom_table : biom.Table
        The table to write out
    filepath : str
        Destination path, opened for writing with ``biom_open``.
        Should end with '.biom'
    
    Example:
    --------
    >>> save_biom_table(biom_table, "output.biom")
    """
    # Provenance string handed to to_hdf5 as its second argument.
    generated_by = "Generated by df_to_biom"

    with biom_open(filepath, 'w') as handle:
        biom_table.to_hdf5(handle, generated_by)

In [None]:
# Candidate regressors to compare, keyed by the display name used when
# results are printed. Insertion order is the order they are iterated in.
regressors = {
    "Support Vector Regression": SVR(
        kernel='rbf',
        C=1.0,
        epsilon=0.1,
    ),
    "Gradient Boosting Regression": GradientBoostingRegressor(
        n_estimators=100,
        random_state=42,
    ),
    "K-Neighbors Regression": KNeighborsRegressor(
        n_neighbors=5,
    ),
    "Neural Network Regression": MLPRegressor(
        hidden_layer_sizes=(100,),
        activation='relu',
        solver='adam',
        max_iter=1000,
        random_state=42,
    ),
    "Random Forest Regression": RandomForestRegressor(
        n_estimators=100,
        random_state=42,
    ),
}

### 16S Example

In [None]:
# Load the 16S feature table as a dense frame of integer counts, transposed so
# that rows are samples and columns are features.
table = load_table('data/oral_2550.biom').to_dataframe(dense=True).T.astype(int)
# NOTE(review): only 'qiita_host_age' is forced to float here, while the
# regression target read in the cross-validation cell is the 'age' column --
# confirm which column is intended.
age_metadata = pd.read_csv('data/oral_2550_map.txt', sep='\t', index_col=0, dtype={'qiita_host_age': float})

# Repeated samples from the same subject are kept (no de-duplication on
# 'host_subject_id'); `subjects` below is available as the CV grouping key.
# Restrict and reorder the table rows to the metadata index so features and
# metadata are aligned row-for-row (raises KeyError if a sample is absent).
table = table.loc[age_metadata.index]

# Drop low-prevalence features: columns that are non-zero in fewer than 25 samples.
prevalence = (table != 0).sum(axis=0)
columns_to_drop = table.columns[prevalence < 25]
df = table.drop(columns=columns_to_drop).copy()

# rclr transform; entries the transform returns as NaN are replaced with 0.
df1 = pd.DataFrame(matrix_rclr(df.values), columns=df.columns, index=df.index).fillna(0)

# PCA reduction. random_state pins the output when scikit-learn selects its
# randomized SVD solver, so a fresh Restart & Run All reproduces the same
# components (42 matches the seed used elsewhere in this notebook).
pca = PCA(n_components=256, random_state=42)
arr_reduced = pca.fit_transform(df1.values)

# Cross-validation setup: grouping and stratification keys plus the splitter.
num_folds = 10
subjects = age_metadata['host_subject_id']
studies = age_metadata['qiita_study_id']

kf = StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=42)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Get non-zero values.
# The rclr transform centres log values, so genuine entries can be negative;
# zeros are the filled-in placeholders. Selecting with `!= 0` (rather than
# `> 0`) keeps the negative half of the distribution in the plots and stats.
rclr_values = df1.values
nonzero_vals = rclr_values[rclr_values != 0]

# One figure with three side-by-side panels.
fig, (ax_hist, ax_box, ax_violin) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: histogram with KDE overlay
sns.histplot(nonzero_vals, kde=True, color='#2878B5', alpha=0.6, ax=ax_hist)
ax_hist.set_title('Distribution of Non-Zero Values\n(Histogram + KDE)', fontsize=12)
ax_hist.set_xlabel('Log-transformed Abundance')
ax_hist.set_ylabel('Count')

# Panel 2: box plot
sns.boxplot(y=nonzero_vals, color='#2878B5', width=0.3, ax=ax_box)
ax_box.set_title('Distribution of Non-Zero Values\n(Box Plot)', fontsize=12)
ax_box.set_ylabel('Log-transformed Abundance')

# Panel 3: violin plot
sns.violinplot(y=nonzero_vals, color='#2878B5', alpha=0.6, ax=ax_violin)
ax_violin.set_title('Distribution of Non-Zero Values\n(Violin Plot)', fontsize=12)
ax_violin.set_ylabel('Log-transformed Abundance')

# Add stats text (summary of the non-zero values plus overall sparsity)
stats_text = (f'n = {len(nonzero_vals):,}\n'
             f'Mean = {np.mean(nonzero_vals):.2f}\n'
             f'Median = {np.median(nonzero_vals):.2f}\n'
             f'Std = {np.std(nonzero_vals):.2f}\n'
             f'% Zeros = {100 * (rclr_values == 0).sum() / rclr_values.size:.1f}%')

# Anchor the text box to the figure's top-right corner (figure coordinates).
fig.text(0.98, 0.95, stats_text,
         fontsize=10,
         bbox=dict(facecolor='white', alpha=0.8),
         ha='right',
         va='top')

fig.tight_layout()
plt.show()

In [None]:
# Model inputs: the PCA-reduced feature matrix and the 'age' metadata column.
X_np = arr_reduced
y_np = age_metadata.age

# Score each regressor with cross-validation. Folds come from `kf`, called with
# `subjects` as groups and `studies` as y; split() is invoked anew for every
# regressor so each one receives its own fold iterator.
for name, regressor in regressors.items():
    cv_scores = cross_val_score(
        regressor,
        X_np,
        y_np,
        cv=kf.split(X_np, groups=subjects, y=studies),
        scoring='neg_mean_absolute_error',
    )

    print(f"{name} - Cross-Validation Scores (Negative MAE): {cv_scores}")
    print(f"{name} - Mean CV Score: {cv_scores.mean()} +/- {cv_scores.std()}\n")