Check the data structure before prediction


In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


## presence and nonzero distribution

In [0]:
from visualization_utils import *

In [0]:
%fs ls /dbfs/data/

## diversity vs age & other factors


In [0]:
# df_age = pd.read_csv("../data/age.csv", header=None, index_col=0, sep='\t')
# # df_age = df_age[df_age[1]>=18]  # todo remove after test 
# y = df_age.to_numpy().reshape(-1, 1).flatten()

# y_class = y//10
# y_class[y_class==9] = 8 
# train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2, stratify=y_class, random_state=42)  

# Load the taxa abundance table (tab-separated, first column used as the index).
df_abundance = pd.read_csv("../data/taxa_data.txt", sep='\t', header=0, index_col=0)

# Strip square brackets from the column names. regex=False makes the literal
# match explicit: where str.replace treats the pattern as a regex, a bare '['
# is not a valid pattern.
df_abundance.columns = (
    df_abundance.columns
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)
df_abundance = df_abundance.fillna(0)

# Sum columns that share the same first space-separated token of their name.
# Grouping the transposed frame avoids the deprecated groupby(axis=1).
genus_labels = df_abundance.columns.str.split(' ').str[0]
df_genus = df_abundance.T.groupby(genus_labels).sum().T

df_genus
# X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [0]:
# Load the sample metadata table (tab-separated, first column used as the index)
# and display it as the cell output.
df_meta = pd.read_csv("../data/metadata.txt", sep='\t', header=0, index_col=0)
df_meta

In [0]:
# Metadata columns that are joined to the diversity table and plotted below.
col_factors = [
    'age',
    'antibiotics_current_use',
    'gender',
    'country',
    'non_westernized',
    'sequencing_platform',
    'disease',
    "study_condition",
]

In [0]:
# Cast the non-missing ages to int and write them back; rows without an age are
# left untouched.
# NOTE(review): if the column is float because of the missing values, the ints
# are presumably stored as floats again after assignment — confirm the dtype.
has_age = ~df_meta.age.isna()
df_meta.loc[has_age, 'age'] = df_meta.loc[has_age, 'age'].astype(int)

In [0]:
# Richness per sample: number of columns with a non-zero value in each row.
# (The original cell computed these two series twice; once is enough.)
diversity_species = df_abundance.astype(bool).sum(axis=1)
diversity_genus = df_genus.astype(bool).sum(axis=1)

df_diversity = pd.concat([diversity_genus, diversity_species], axis=1, ignore_index=False)
df_diversity.columns = ['num_genus', 'num_species']

# Attach the metadata factors, aligned on the sample index.
df_diversity = pd.concat([df_diversity, df_meta[col_factors]], axis=1, ignore_index=False)



# df_diversity = pd.concat([df_age, diversity_species, diversity_genus], axis=1, ignore_index=False)
# df_diversity.columns = ['age', 'num_species', 'num_genus']
# df_diversity.dropna(inplace=True, axis=0)
# df_diversity.age = df_diversity.age.astype(int)
# df_diversity

In [0]:
# Side-by-side histograms (with KDE) of the per-sample species and genus counts.
fig, (ax_species, ax_genus) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df_diversity['num_species'], kde=True, ax=ax_species)
ax_species.set_xlabel('# species')
ax_species.set_ylabel('Frequency')
ax_species.set_title('Distribution of Number of Species')

sns.histplot(df_diversity['num_genus'], kde=True, ax=ax_genus)
ax_genus.set_xlabel('# genus')
ax_genus.set_ylabel('Frequency')
ax_genus.set_title('Distribution of Number of Genus')

fig.tight_layout()
plt.show()

In [0]:
# Collapse both columns to two levels each: anything that is not exactly
# 'control' becomes 'case', anything that is not exactly 'healthy' becomes 'disease'.
df_health_counts = df_meta[['study_condition', 'disease']].copy()
df_health_counts['study_condition'] = df_health_counts['study_condition'].map(
    lambda status: 'control' if status == 'control' else 'case'
)
df_health_counts['disease'] = df_health_counts['disease'].map(
    lambda status: 'healthy' if status == 'healthy' else 'disease'
)

# Cross-tabulate the two binary labels (cell value = number of samples).
df_pivot = df_health_counts.pivot_table(
    index='disease', columns='study_condition', aggfunc='size', fill_value=0
)

# Show the counts as an annotated heatmap.
sns.heatmap(df_pivot, annot=True, fmt='d', cmap='Blues', cbar=False);

In [0]:

# Replace the raw disease / study_condition columns with their binarized versions.
df_diversity[["disease", 'study_condition']] = df_health_counts[['disease','study_condition']]
print(df_diversity.shape)

# One boxplot of species count per metadata factor, on a 4x2 grid.
plt.figure(figsize=(10, 20))
for panel, factor in enumerate(col_factors, start=1):
    plt.subplot(4, 2, panel)
    # Drop rows missing either the factor or the species count before plotting.
    complete_rows = df_diversity.dropna(subset=[factor, 'num_species'])
    sns.boxplot(data=complete_rows, y='num_species', x=factor)
    plt.xticks(rotation=45)
    plt.title(factor)
plt.tight_layout()

In [0]:
# Number of samples per sequencing platform.
df_diversity.sequencing_platform.value_counts()

In [0]:
import seaborn as sns

# Species count by age: all samples (left) versus samples whose sequencing
# platform is not IlluminaMiSeq (right).
age_ticks = range(0, int(df_diversity['age'].max()), 5)
not_miseq = df_diversity[df_diversity['sequencing_platform'] != 'IlluminaMiSeq']

plt.figure(figsize=(12, 5))
plt.subplot(121)
sns.boxplot(df_diversity, x='age', y='num_species')
plt.xlabel('Age')
plt.ylabel('# species')
plt.xticks(ticks=age_ticks, rotation=45)
plt.title('Species diversity by age')

plt.subplot(122)
sns.boxplot(not_miseq, x='age', y='num_species')
plt.xlabel('Age')
# The plotted variable is num_species, so label it as such (was '# genus').
plt.ylabel('# species')
plt.xticks(ticks=age_ticks, rotation=45)
plt.title('Species diversity by age (excluding IlluminaMiSeq)');

In [0]:
# Point cloud of per-sample species count against genus count.
species_counts = df_diversity['num_species']
genus_counts = df_diversity['num_genus']
plt.plot(species_counts, genus_counts, '.')
plt.xlabel('Number of Species')
plt.ylabel('Number of Genus')
plt.title('Number of Species vs Number of Genus')

## prediction using diversity

In [0]:
!pip install xgboost


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Target: age of samples with a known, positive age that are healthy controls.
has_positive_age = (~df_meta.age.isna()) & (df_meta.age > 0)
is_healthy_control = (df_meta.disease == 'healthy') & (df_meta.study_condition == 'control')
y = df_meta[has_positive_age & is_healthy_control].age
ind_samples = y.index

# Single predictor: the species count of the same samples.
X = df_diversity.loc[ind_samples, ['num_species']].to_numpy()
y = y.to_numpy()
print(X.shape, y.shape)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression and an XGBoost regressor on the training part.
model_linear = LinearRegression()
model_xg = XGBRegressor()
model_linear.fit(X_train, y_train)
model_xg.fit(X_train, y_train)

# Predict the held-out ages with both models.
y_pred_linear = model_linear.predict(X_test)
y_pred_xgb = model_xg.predict(X_test)

In [0]:
# Test-set MSE and R-squared for both models.
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = model_linear.score(X_test, y_test)
mse_xg = mean_squared_error(y_test, y_pred_xgb)
r2_xg = model_xg.score(X_test, y_test)

# One panel per model: predicted vs actual age, identity line, metrics box.
panels = [
    ('Linear Regression', y_pred_linear, r2_linear, mse_linear),
    ('XGBoost', y_pred_xgb, r2_xg, mse_xg),
]
identity = [y_test.min(), y_test.max()]
box_style = dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='white')

plt.figure(figsize=(9, 4.5))
for position, (label, predicted, r2_value, mse_value) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(y_test, predicted, alpha=0.5)
    plt.xlabel('Actual Age')
    plt.ylabel(f'Predicted Age ({label})')
    plt.title(f'Actual Age vs Predicted Age ({label})')
    plt.plot(identity, identity, 'k--', lw=2)
    plt.text(0.05, 0.95, f'R2: {r2_value:.2f}\nMSE: {mse_value:.2f}', transform=plt.gca().transAxes,
             fontsize=12, verticalalignment='top', bbox=box_style)

plt.tight_layout()


## prediction using presence

In [0]:
from sklearn.metrics import r2_score

# Features: 1.0 where a taxon has a non-zero value for the sample, else 0.0.
X = df_abundance.loc[ind_samples, :].to_numpy()
X = X.astype(bool).astype(float)

# Stratify the split by age decade; decade 9 is merged into decade 8.
y_class = y//10
y_class[y_class==9] = 8
train_idx, test_idx = train_test_split(
    range(len(y)), test_size=0.2, stratify=y_class, random_state=42
)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [0]:
# Fit XGBoost on the presence/absence features and score it on the test split.
xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Predicted vs actual age with the identity line and a metrics box.
identity = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 4.5))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(identity, identity, 'k--', lw=2)
plt.xlabel('Actual Age')
plt.ylabel('Predicted Age')
plt.title('Actual Age vs Predicted Age')
plt.text(0.05, 0.95, f'R2: {r2:.2f}\nMSE: {mse:.2f}', transform=plt.gca().transAxes,
         fontsize=12, verticalalignment='top',
         bbox=dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='white'))

plt.tight_layout()
plt.show()


In [0]:
# Pair each abundance column name with the fitted model's importance score and
# rank from most to least important.
feature_names = df_abundance.columns
feature_importance = xgb_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ten highest-ranked features.
top_ten = importance_df.head(10)
plt.figure(figsize=(6, 4))
sns.barplot(top_ten, y ='Feature', x='Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance from XGBoost Model')
plt.gca().invert_yaxis()
plt.show()

In [0]:
!pip install shap

In [0]:
!pip install numpy==1.24

In [0]:
import shap

# Explain the model on the representation it was fitted on: xgb_model was
# trained on presence/absence (0/1) values for the samples in ind_samples, so
# the raw abundance table for all samples is not a valid input for the explainer.
X_presence = df_abundance.loc[ind_samples].astype(bool).astype(float)

# Initialize the SHAP explainer with the presence data as background
explainer = shap.Explainer(xgb_model, X_presence)

# Calculate SHAP values
shap_values = explainer(X_presence)

# Plot the SHAP summary
shap.summary_plot(shap_values, X_presence)

In [0]:
# from databricks import automl

# # Combine the dataframes
# df_combined = pd.DataFrame(X)
# df_combined['age'] = y 

# # Run AutoML for regression
# summary = automl.regress(df_combined, target_col="age", timeout_minutes=30)

# # Display the summary of the AutoML run
# display(summary)


## feature correlation with target

In [0]:
from scipy.stats import spearmanr

# Spearman correlation of every species column with y for the selected samples,
# sorted ascending; columns whose correlation is NaN are dropped.
correlations = df_abundance.loc[ind_samples].apply(lambda x: spearmanr(x, y)[0])
correlations_sorted = correlations.dropna().sort_values()

plt.figure(figsize=(16, 4))

# Left: the ten lowest correlations. Right: the ten highest.
# The two panels get distinct titles so they can be told apart.
plt.subplot(1, 2, 1)
correlations_sorted.head(10).plot(kind='barh')
plt.xlabel('Spearman Correlation with Age')
plt.ylabel('Species')
plt.title('10 species with the lowest correlation to age')

plt.subplot(1, 2, 2)
correlations_sorted.tail(10).plot(kind='barh')
plt.xlabel('Spearman Correlation with Age')
plt.ylabel('Species')
plt.title('10 species with the highest correlation to age')

plt.tight_layout();

In [0]:
from scipy.stats import spearmanr

# Spearman correlation of every genus column with y for the selected samples,
# sorted ascending; columns whose correlation is NaN are dropped.
correlations = df_genus.loc[ind_samples].apply(lambda x: spearmanr(x, y)[0])
correlations_sorted = correlations.dropna().sort_values()

plt.figure(figsize=(16, 4))

# Left: the ten lowest correlations. Right: the ten highest.
# The two panels get distinct titles so they can be told apart.
plt.subplot(1, 2, 1)
correlations_sorted.head(10).plot(kind='barh')
plt.xlabel('Spearman Correlation with Age')
plt.ylabel('Genus')
plt.title('10 genera with the lowest correlation to age')

plt.subplot(1, 2, 2)
correlations_sorted.tail(10).plot(kind='barh')
plt.xlabel('Spearman Correlation with Age')
plt.ylabel('Genus')
plt.title('10 genera with the highest correlation to age')

plt.tight_layout();

In [0]:
# df_age is not defined by any active cell above this point, so load it here
# (same call as the later loading cell) instead of relying on leftover kernel state.
df_age = pd.read_csv("../data/age.csv", header=None, index_col=0, sep='\t')

# Work on a row subset under a new name rather than overwriting df_abundance,
# so re-running earlier cells still sees the full table.
df_abundance_age = df_abundance.loc[df_age.index]
correlations = df_abundance_age.apply(lambda x: spearmanr(x, df_age[1])[0])
correlations_sorted = correlations.dropna().sort_values()

plt.figure(figsize=(8, 12))
correlations_sorted.plot(kind='barh')
plt.xlabel('Spearman Correlation with Age')
# The rows are abundance columns (species), matching the title.
plt.ylabel('Species')
plt.title('Species by their Correlation to Age')
plt.yticks(fontsize=6);

In [0]:
# Twenty lowest and twenty highest correlations (Series has .tail, not .tails).
print(correlations_sorted.head(20), correlations_sorted.tail(20))

## log - drop associated - scaled data


In [0]:


# Age table: headerless, tab-separated, first column is the sample index.
df_age = pd.read_csv("../data/age.csv", header=None, index_col=0, sep='\t')
# df_age = df_age[df_age[1]>=18]  # todo remove after test
y = df_age.to_numpy().flatten()

# Stratify the split by age decade; decade 9 is merged into decade 8.
y_class = y//10
y_class[y_class==9] = 8
train_idx, test_idx = train_test_split(
    range(len(y)), test_size=0.2, stratify=y_class, random_state=42
)

# Feature matrix from the processed table, with rows ordered like df_age.
df_processed = pd.read_csv("../data/processed_log_drop08_scaled.csv", header=0, index_col=0, sep='\t')
X = df_processed.loc[df_age.index, :].to_numpy()

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]


In [0]:
# df_age = pd.read_csv("../data/age.csv", sep='\t', header=None, index_col=0)
# df_abundance = pd.read_csv("../data/processed_abundance.csv", sep='\t', header=0, index_col=0).loc[df_age.index, :]
# df_log = pd.read_csv("../data/processed_log_abundance.csv", sep='\t', header=0, index_col=0).loc[df_age.index, :]


In [0]:
# Histogram of every entry of X, pooled across all rows and columns.
sns.histplot(X.flatten());

In [0]:
# Histogram of the strictly positive entries of X only.
flat_values = X.flatten()
sns.histplot(flat_values[flat_values > 0]);

In [0]:
# plt.figure(figsize=(15, 5))
# plt.subplot(121)
# sns.histplot(df_abundance.to_numpy().flatten())
# plt.subplot(122)
# sns.histplot(df_log.to_numpy().flatten());

In [0]:
# plt.figure(figsize=(15, 5))
# plt.subplot(121)
# sns.histplot(df_abundance.to_numpy().flatten()[df_abundance.to_numpy().flatten() != 0])
# plt.subplot(122)
# sns.histplot(df_log.to_numpy().flatten()[df_log.to_numpy().flatten() >-6 ]);

In [0]:
# from sklearn.preprocessing import MinMaxScaler, StandardScaler

# scaler = StandardScaler()
# df_log_scaled = scaler.fit_transform(df_log)

# df_log_scaled.loc[df_log > -6] = pd.DataFrame(df_log_scaled, index=df_log.index, columns=df_log.columns)
# df_log_scaled

In [0]:
# sns.histplot(df_log_scaled.to_numpy().flatten(), bins=50, kde=True;

In [0]:
# df_log_scaled.describe()

In [0]:
# sns.histplot(df_log_scaled.to_numpy().flatten()[df_log_scaled.to_numpy().flatten()>0]);

In [0]:
import scipy.stats as stats

# Normality test on the non-zero entries of X, pooled across all features.
nonzero_values = X[X != 0]
stat, p_value = stats.normaltest(nonzero_values)
print(stat)
print(f" non zero data - Normality test p-value: {p_value:.5f}")

In [0]:



# # Perform normality test
# stat, p_value = stats.normaltest(df_abundance.to_numpy()[df_abundance.to_numpy() != 0])
# print(stat)
# print(f"abundance non zero data - Normality test p-value: {p_value:.5f}")
# stat, p_value = stats.normaltest(df_log.to_numpy()[df_abundance.to_numpy() != 0])
# print(stat)
# print(f"log abundance non zero data - Normality test p-value: {p_value:.5f}")

In [0]:
# Normality test on the target values; the result is shown as the cell output.
stats.normaltest(y)

In [0]:
# X_abundance = df_abundance.fillna(0).to_numpy()
# X_log = df_log.to_numpy()

# y = df_age.to_numpy().reshape(-1, 1).flatten()


## check if data is linear or nonlinear

In [0]:
# Boolean mask of the non-zero entries of X (displayed as the cell output).
X!=0

In [0]:
# Correlation of every feature with the target: last row of the joint
# correlation matrix, without the target's correlation with itself.
corr_matrix_xy = np.corrcoef(X, y, rowvar=False)[-1, :-1]

# Per-sample aggregates and their correlation with the target. Each of these is
# a single coefficient (a length-1 array), so a histogram of it is meaningless;
# the aggregate is plotted against the target instead and the coefficient is
# reported in the title.
row_sum = X.sum(axis=1)
row_presence = (X != 0).sum(axis=1)
corr_matrix_sumxy = np.corrcoef(row_sum, y, rowvar=False)[-1, :-1]
corr_matrix_sumpresencexy = np.corrcoef(row_presence, y, rowvar=False)[-1, :-1]

plt.figure(figsize=(12, 4))

plt.subplot(131)
sns.histplot(corr_matrix_xy, bins=50, kde=True)
plt.xlabel("Correlation with Target")
plt.ylabel("Feature Count")
plt.title("scaled log abundance Feature-Target Correlation dist")

plt.subplot(132)
plt.scatter(row_sum, y, s=6, alpha=0.5)
plt.xlabel("Sum of scaled log abundance per sample")
plt.ylabel("Target")
plt.title(f"sum scaled log abundance vs target (r = {corr_matrix_sumxy[0]:.2f})")

plt.subplot(133)
plt.scatter(row_presence, y, s=6, alpha=0.5)
plt.xlabel("Number of non-zero features per sample")
plt.ylabel("Target")
plt.title(f"sum presence vs target (r = {corr_matrix_sumpresencexy[0]:.2f})")

plt.tight_layout();

In [0]:
# # Compute correlation with target variable
# corr_matrix_xy = np.corrcoef(X_abundance, y, rowvar=False)[-1, :-1]


In [0]:


# # Compute correlation matrix between X and y using numpy
# corr_matrix_xlogy = np.corrcoef(X_log, y, rowvar=False)[-1, :-1]


In [0]:

# # Plot correlation
# plt.figure(figsize=(15, 5))
# plt.subplot(121)
# sns.histplot(corr_matrix_xy, bins=50, kde=True)
# plt.xlabel("Correlation with Target")
# plt.ylabel("Feature Count")
# plt.title("Abundance Feature-Target Correlation Distribution")
# plt.subplot(122)
# sns.histplot(corr_matrix_xlogy, bins=50, kde=True)
# plt.xlabel("Correlation with Target")
# plt.ylabel("Feature Count")
# plt.title("Log Abundance Feature-Target Correlation Distribution");

**-> no clear correlation between X and y, nonlinear data**

In [0]:
# Feature-by-feature correlation matrix of X (columns are the variables),
# drawn on a fixed [-1, 1] colour scale.
corr_matrix_xlogscaledx = np.corrcoef(X, rowvar=False)
sns.heatmap(
    corr_matrix_xlogscaledx,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)
plt.title("Feature Correlation Heatmap - scaled log")

In [0]:
# X_log is not defined by any active cell above (only in commented-out code),
# so rebuild it here from the log-abundance table that the commented-out
# loading cell reads.
# NOTE(review): assumes ../data/processed_log_abundance.csv exists and has the
# same columns as df_abundance — confirm.
df_log = pd.read_csv("../data/processed_log_abundance.csv", sep='\t', header=0, index_col=0).loc[df_age.index, :]
X_log = df_log.to_numpy()
corr_matrix_xlogx = np.corrcoef(X_log, rowvar=False)

In [0]:
# X_abundance is not defined by any active cell above (only in commented-out
# code), so build it here from the abundance table, restricted to the samples
# in df_age.
X_abundance = df_abundance.loc[df_age.index].fillna(0).to_numpy()
corr_matrix_xx = np.corrcoef(X_abundance, rowvar=False)

In [0]:
# Raw and log-transformed feature correlation matrices side by side, both on
# the same [-1, 1] colour scale.
heatmap_panels = [
    (121, corr_matrix_xx, "Feature Correlation Heatmap"),
    (122, corr_matrix_xlogx, "Feature Correlation Heatmap - log transformed"),
]

plt.figure(figsize=(20, 8))
for position, matrix, panel_title in heatmap_panels:
    plt.subplot(position)
    sns.heatmap(matrix, cmap="coolwarm", vmin=-1, vmax=1)
    plt.title(panel_title)
plt.tight_layout()

In [0]:
import itertools

# Get pairs of features whose absolute correlation exceeds `threshold`
def get_high_corr_pairs(corr_matrix, threshold=0.8):
    """Return the feature pairs whose absolute correlation is above a threshold.

    Parameters
    ----------
    corr_matrix : array-like
        Square correlation matrix; rows and columns are labelled with
        ``df_abundance.columns``, so its size must match that column count.
    threshold : float, default 0.8
        A pair is reported when ``abs(correlation) > threshold``.

    Returns
    -------
    list of tuple
        ``(row_label, column_label, correlation)`` for every pair above the
        diagonal that passes the threshold, in row-major order. NaN
        correlations never pass.
    """
    corr_matrix = pd.DataFrame(corr_matrix, index=df_abundance.columns, columns=df_abundance.columns)
    values = corr_matrix.to_numpy()

    # Strict upper triangle = each unordered pair once, in the same order as
    # itertools.combinations(range(n), 2), without a per-element iloc lookup.
    rows, cols = np.triu_indices(values.shape[0], k=1)
    hits = np.abs(values[rows, cols]) > threshold

    return [
        (corr_matrix.index[i], corr_matrix.columns[j], values[i, j])
        for i, j in zip(rows[hits], cols[hits])
    ]

# Highly correlated feature pairs for the raw and the log-transformed
# correlation matrices, using the function's default threshold.
high_corr_pairs_xx = get_high_corr_pairs(corr_matrix_xx)
high_corr_pairs_xlogx = get_high_corr_pairs(corr_matrix_xlogx)

# Display both lists as the cell output.
high_corr_pairs_xx, high_corr_pairs_xlogx

In [0]:
# Number of highly correlated pairs found in each matrix.
len(high_corr_pairs_xx), len(high_corr_pairs_xlogx)

In [0]:
import itertools

# Get pairs of features whose absolute correlation exceeds `threshold`
def get_high_corr_pairs(corr_matrix, threshold=0.8):
    """Return the feature pairs whose absolute correlation is above a threshold.

    This variant labels features by the default integer positions that
    ``pd.DataFrame(corr_matrix)`` assigns.

    Parameters
    ----------
    corr_matrix : array-like
        Square correlation matrix.
    threshold : float, default 0.8
        A pair is reported when ``abs(correlation) > threshold``.

    Returns
    -------
    list of tuple
        ``(row_label, column_label, correlation)`` for every pair above the
        diagonal that passes the threshold, in row-major order. NaN
        correlations never pass.
    """
    corr_matrix = pd.DataFrame(corr_matrix)
    values = corr_matrix.to_numpy()

    # Strict upper triangle = each unordered pair once, in the same order as
    # itertools.combinations(range(n), 2), without a per-element iloc lookup.
    rows, cols = np.triu_indices(values.shape[0], k=1)
    hits = np.abs(values[rows, cols]) > threshold

    return [
        (corr_matrix.index[i], corr_matrix.columns[j], values[i, j])
        for i, j in zip(rows[hits], cols[hits])
    ]

# Number of highly correlated feature pairs in the scaled-log correlation matrix.
len(get_high_corr_pairs(corr_matrix_xlogscaledx))

**-> drop correlated features within X (preprocessing)**

linear regressor

In [0]:
from sklearn.model_selection import train_test_split

# Re-split X and y 80/20 with a fixed seed. This rebinds X_train, X_test,
# y_train and y_test; no stratify argument is passed here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Train Linear Regression
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)

# Train Decision Tree (seeded so that re-runs give the same tree)
tree_model = DecisionTreeRegressor(max_depth=5, random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Compare performance. The metric is mean squared error, so name and print it
# as MSE (it was previously labelled MAE).
mse_lin = mean_squared_error(y_test, y_pred_lin)
mse_tree = mean_squared_error(y_test, y_pred_tree)

print(f"Linear Model MSE: {mse_lin:.4f}")
print(f"Decision Tree MSE: {mse_tree:.4f}")

# Calculate R2 scores
r2_lin = r2_score(y_test, y_pred_lin)
r2_tree = r2_score(y_test, y_pred_tree)

print(f"Linear Model R2: {r2_lin:.4f}")
print(f"Decision Tree R2: {r2_tree:.4f}")

# Heuristic: call the data nonlinear when the tree's error is more than 10%
# below the linear model's error.
if mse_tree < mse_lin * 0.9:
    print("Data likely has nonlinear relationships.")
else:
    print("Data is mostly linear.")





In [0]:
# Predicted vs true values for both models, each with a dashed identity line.
identity = [min(y_test), max(y_test)]
model_panels = [
    (121, y_pred_lin, f"Linear Regressor (R2: {r2_lin:.4f})"),
    (122, y_pred_tree, f"Tree Regressor (R2: {r2_tree:.4f})"),
]

plt.figure(figsize=(12, 4))
for position, predicted, panel_title in model_panels:
    plt.subplot(position)
    plt.scatter(y_test, predicted, alpha=0.5, s=6)
    plt.plot(identity, identity, color='r', linestyle='--')
    plt.xlabel("True value")
    plt.ylabel("predicted value")
    plt.title(panel_title)

plt.show()

## try prediction without dimension reduction

In [0]:

# Use the real XGBoost regressor. The original cell imported scikit-learn's
# GradientBoostingRegressor under the name XGBRegressor, which shadowed the
# xgboost class for every later cell while the output was labelled "XGBoost".
from xgboost import XGBRegressor

# Train XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# The metric is mean squared error, so name and print it as MSE.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb:.4f}")

r2_xgb = r2_score(y_test, y_pred_xgb)

plt.figure(figsize=(6, 4))
plt.scatter(y_test, y_pred_xgb, alpha=0.5, s=6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='r', linestyle='--')
plt.xlabel("True value")
plt.ylabel("predicted value")
plt.title(f"XGBoost Regressor (R2: {r2_xgb:.4f})")
plt.show()

## Dimension reduction by Feature Selection (Using XGB model)

In [0]:
# Get feature importance scores from the fitted xgb_model (one value per feature)
importance = xgb_model.feature_importances_



In [0]:
# Track the best run over the candidate feature counts. The error metric is
# mean squared error, so it is named and printed as MSE.
min_mse = float('inf')
max_r2 = float('-inf')
best_y_pred = None
best_k = 0

# Rank the features once; the ranking does not change between iterations.
ranked_features = np.argsort(importance)

# NOTE(review): k is chosen by its score on X_test/y_test, so the reported
# score is optimistic — consider selecting k on a validation split.
for k in range(100, 500, 50):

    # Select the k features with the highest importance
    top_k_features = ranked_features[-k:]
    X_train_reduced = X_train[:, top_k_features]
    X_test_reduced = X_test[:, top_k_features]

    # prediction using reduced features
    model = XGBRegressor()
    model.fit(X_train_reduced, y_train)

    # Evaluate
    y_pred = model.predict(X_test_reduced)
    mse_xgb_reduced = mean_squared_error(y_test, y_pred)
    r2_xgb_reduced = r2_score(y_test, y_pred)
    if mse_xgb_reduced < min_mse:
        min_mse = mse_xgb_reduced
        # Record the matching R2 as well; it was never updated before, so the
        # plot title always showed 0.0000.
        max_r2 = r2_xgb_reduced
        best_y_pred = y_pred
        best_k = k


plt.figure(figsize=(6, 4))

plt.scatter(y_test, best_y_pred, alpha=0.5, s=6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='r', linestyle='--')
plt.xlabel("True value")
plt.ylabel("predicted value")
plt.title(f"Xgboost prediction with reduced features (R2: {max_r2:.4f})")
plt.show()


print(f"XGBoost MSE: {min_mse:.4f}")

print(f"The best feature number k: {best_k}")


In [0]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.7, 0.8, 0.9]
# }

# # Initialize the XGBRegressor
# xgb = XGBRegressor()

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# # Fit GridSearchCV
# # Select top features
# top_k_features = np.argsort(importance)[-k:]
# X_train_reduced = X_train[:, top_k_features]
# X_test_reduced = X_test[:, top_k_features]

# grid_search.fit(X_train_reduced, y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# best_params

In [0]:
# # Initialize the XGBRegressor with the best parameters
# best_model = XGBRegressor(**best_params)

# # Fit the model on the reduced training data
# best_model.fit(X_train_reduced, y_train)

# # Predict using the reduced test data
# best_y_pred = best_model.predict(X_test_reduced)

# # Evaluate the model
# mae_best_model = mean_squared_error(y_test, best_y_pred)
# r2_best_model = r2_score(y_test, best_y_pred)

# # Display the results
# plt.figure(figsize=(6, 4))
# plt.scatter(y_test, best_y_pred, alpha=0.5, s=6)
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='r', linestyle='--')
# plt.xlabel("True value")
# plt.ylabel("Predicted value")
# plt.title(f"XGBoost prediction with best parameters (R2: {r2_best_model:.4f})")
# plt.show()

# print(f"XGBoost MAE with best parameters: {mae_best_model:.4f}")
# print(f"XGBoost R2 with best parameters: {r2_best_model:.4f}")

**Using reduced features, the XGB prediction is still poor.**

**-> use autoencoder to reduce the dimension of the features while keeping the latent data structure**

## check if data naturally clusters

In [0]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Two-component PCA projection of the training features.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# Two-component t-SNE embedding of the same features (fixed seed).
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_train)


In [0]:

# Both 2D projections side by side, points coloured by the training target.
projections = [
    (X_pca, "PCA Projection"),
    (X_tsne, "t-SNE Projection"),
]

plt.figure(figsize=(12, 5))
for position, (embedding, panel_title) in enumerate(projections, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=y_train, cmap='viridis',
                s=10, alpha=0.6, edgecolors='w', linewidths=0.5)
    plt.title(panel_title)

plt.show()

**-> the data do not form natural clusters under PCA; the points are scattered**

**t-SNE shows somewhat defined clusters but with overlap, it suggests some nonlinear structure but not very strong.**

## check data distribution

As VAE assumes a Gaussian latent space, it works best when the data follows a roughly normal distribution

In [0]:
# y_train_class is only assigned in the next cell, so compute it here to keep
# this cell runnable top to bottom: each age is mapped to the lower bound of
# its 20-year bin.
y_train_class = ((y_train//20) * 20).astype(int)
set(y_train_class)

In [0]:
import scipy.stats as stats

# Age group: lower bound of each 20-year bin.
y_train_class = ((y_train//20) * 20).astype(int)

# Inspect five randomly chosen features.
# NOTE(review): feature names are looked up by position in df_abundance.columns;
# this presumably assumes X has the same column order as df_abundance — confirm.
sampled_features = np.random.choice(X_train.shape[1], 5, replace=False)
for i in sampled_features:
    feature_name = df_abundance.columns[i]
    feature_values = X_train[:, i]
    # Only values above -6 are plotted and tested.
    above_floor = feature_values > -6

    df = pd.DataFrame(X_train[:, [i]], columns=['feature'], )
    df['age_class'] = y_train_class

    plt.figure(figsize=(4, 3))
    sns.histplot(data=df[df['feature']>-6], x='feature', kde=True, bins=50, hue='age_class')
    plt.title(f"{feature_name} Distribution")
    plt.show()

    # Normality test on the retained values; a ValueError is reported rather
    # than raised.
    try:
        stat, p_value = stats.normaltest(feature_values[above_floor])
        print(f"{feature_name} - Normality test p-value: {p_value:.5f}")
    except ValueError:
        print(f"{feature_name} - Skewness test is not valid with less than 8 samples; {sum(above_floor)} samples were given.")

**-> features are not Gaussian-like, but rather quite skewed (very sparse before log)**

**conclusion: try both AE and VAE to reduce the feature dimension**