In [96]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import joblib


In [88]:
# Embeddings: fetch each split of the Hugging Face dataset and convert it to a pandas DataFrame.
def _load_embedding_split(split_name):
    """Load one split of the BOLD embeddings dataset and return it as a DataFrame."""
    return load_dataset("LofiAmazon/BOLD-Embeddings-Amazon", split=split_name).to_pandas()


embeddings_train = _load_embedding_split("train")
embeddings_test = _load_embedding_split("test")
embeddings_val = _load_embedding_split("validation")

In [146]:
# Ecological layers: environmental covariates per sample, identified by `processid`.
# NOTE(review): this absolute path is tied to one workspace layout — adjust it when running elsewhere.
ECO_DATA_CSV = "/workspace/amazon-lofi-beats/environmental_data/processed/geo_eDNA_data.csv"

# The join key followed by the covariates that are kept for modelling.
eco_csv_columns = [
    'processid',
    'WorldClim2_BIO_Temperature_Seasonality',
    'WorldClim2_BIO_Precipitation_Seasonality',
    'WorldClim2_BIO_Annual_Precipitation',
    'EarthEnvTopoMed_Elevation',
    'EsaWorldCover_TreeCover',
    'CHELSA_exBIO_GrowingSeasonLength',
    'WCS_Human_Footprint_2009',
    'GHS_Population_Density',
    'CHELSA_BIO_Annual_Mean_Temperature',
]

# `usecols` makes the parser skip every column that would be discarded anyway, instead of loading
# the full CSV into memory first. read_csv ignores the order of `usecols`, so the trailing selection
# restores the column order listed above.
ecoDf = pd.read_csv(ECO_DATA_CSV, usecols=eco_csv_columns, low_memory=False)[eco_csv_columns]


In [182]:
# Ecological covariates used as model inputs alongside the embedding vector.
eco_feature_columns = [
    'WorldClim2_BIO_Temperature_Seasonality',
    'WorldClim2_BIO_Precipitation_Seasonality',
    'WorldClim2_BIO_Annual_Precipitation',
    'EarthEnvTopoMed_Elevation',
    'EsaWorldCover_TreeCover',
    'CHELSA_exBIO_GrowingSeasonLength',
    'WCS_Human_Footprint_2009',
    'GHS_Population_Density',
    'CHELSA_BIO_Annual_Mean_Temperature',
]

# Full list of input columns: the list-valued 'embeddings' column first, then the covariates.
features = ['embeddings'] + eco_feature_columns

In [183]:
def split_embed(df):
    """Expand the list-valued ``embeddings`` column into one column per element.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``embeddings`` column whose cells are sequences
        (lists / arrays) of numbers.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``embeddings`` removed and columns ``Attribute1`` ...
        ``AttributeN`` appended after the remaining original columns. The
        input frame is not modified and its index is preserved.
    """
    embeddings = df['embeddings']

    if embeddings.map(pd.api.types.is_list_like).all():
        # Build the expanded frame with a single constructor call. `Series.apply(pd.Series)`
        # creates one Series object per row, which is far slower on large frames, and it
        # returns a Series (no second axis) for empty input.
        attributes_df = pd.DataFrame(embeddings.tolist(), index=df.index)
    else:
        # Some cells are scalars or missing values: keep the permissive per-row expansion.
        attributes_df = embeddings.apply(pd.Series)

    # Rename columns to a generic, 1-based name
    attributes_df.columns = [f'Attribute{i+1}' for i in range(attributes_df.shape[1])]

    # Join on the index so each row keeps its own expanded values, then drop the raw column
    return df.join(attributes_df).drop(columns=['embeddings'])


def encode_class(Y):
    """Convert class labels to integer codes using a freshly fitted ``LabelEncoder``.

    Every call fits its own encoder on ``Y`` alone, so codes returned by two
    separate calls are only comparable when both inputs contain exactly the
    same set of labels.

    Parameters
    ----------
    Y : array-like
        Class labels to encode.

    Returns
    -------
    The integer-encoded labels produced by ``LabelEncoder.transform``.
    """
    return LabelEncoder().fit(Y).transform(Y)

In [184]:
# Prepare dataset
from sklearn.preprocessing import LabelEncoder

# Merge embeddings and eco data. The left join keeps every embedding row; rows whose processid
# is absent from ecoDf get NaN in the ecological columns.
eco_embedding_train = pd.merge(embeddings_train, ecoDf, on='processid', how='left')
eco_embedding_test = pd.merge(embeddings_test, ecoDf, on='processid', how='left')
eco_embedding_val = pd.merge(embeddings_val, ecoDf, on='processid', how='left')


# Split data into X and y
X_train = split_embed(eco_embedding_train[features])
X_test = split_embed(eco_embedding_test[features])
X_val = split_embed(eco_embedding_val[features])

# Fit ONE encoder on the training genera and reuse its mapping for every split. Fitting a separate
# encoder per split assigns codes by each split's own sorted label set, so the same integer can
# mean different genera in train, validation and test, which makes the reported scores meaningless
# whenever the splits do not contain exactly the same genera.
label_encoder = LabelEncoder().fit(eco_embedding_train['genus'])
genus_to_code = {genus: code for code, genus in enumerate(label_encoder.classes_)}

y_train = label_encoder.transform(eco_embedding_train['genus'])

# Genera that never occur in the training split cannot be predicted by any model trained on it;
# they are encoded as -1 so they count as errors instead of raising or aliasing a trained class.
y_test = eco_embedding_test['genus'].map(genus_to_code).fillna(-1).astype(int).to_numpy()
y_val = eco_embedding_val['genus'].map(genus_to_code).fillna(-1).astype(int).to_numpy()


In [None]:


# Seed shared by the stochastic estimators so that a re-run reproduces the same models.
RANDOM_STATE = 42

# Dictionary of models and their hyperparameters.
# Each entry maps a display name to the estimator ('model') and the grid searched over ('params').
models_params = {
    'SVM': {
        'model': SVC(),
        'params': {'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']}
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        # 'auto' was only an alias of 'sqrt' for classifiers and is rejected by scikit-learn >= 1.3,
        # where it makes every fit using it fail; search two distinct, valid settings instead.
        'params': {'n_estimators': [10, 50, 100], 'max_features': ['sqrt', 'log2']}
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5, 7, 9]}
    },
    'XGBoost': {
        # NOTE(review): `use_label_encoder` looks deprecated in recent xgboost releases — confirm
        # against the installed version before removing it.
        'model': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
        'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
    }
}

# Accumulators: one validation record per model, plus the tuned estimators keyed by model name.
results = []
models = {}

# Train and optimize each model
for model_name, mp in models_params.items():
    # Exhaustive search over the model's grid with 5-fold cross-validation on the training split.
    clf = GridSearchCV(estimator=mp['model'], param_grid=mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)

    # Best configuration found by the search, exposed as best_estimator_.
    best_model = clf.best_estimator_

    # Score the tuned model on the validation split.
    y_val_pred = best_model.predict(X_val)
    accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
    report = classification_report(y_true=y_val, y_pred=y_val_pred)

    # Save the best model to the current working directory
    joblib.dump(best_model, f'{model_name}_best_model.pkl')

    # Keep the estimator for the test-set evaluation and store results for analysis
    models[model_name] = best_model
    validation_record = {
        'Model': model_name,
        'Best Parameters': clf.best_params_,
        'Validation Accuracy': accuracy,
        'Classification Report': report,
    }
    results.append(validation_record)

# Test models on the test set: one record per tuned model, in the order they were trained.
test_results = []
for model_name, model in models.items():
    y_test_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
    report = classification_report(y_true=y_test, y_pred=y_test_pred)

    test_record = {
        'Model': model_name,
        'Test Accuracy': accuracy,
        'Test Classification Report': report,
    }
    test_results.append(test_record)

# Tabulate the per-model records and write each table to a CSV in the working directory.
# index=False keeps the positional row index out of the files.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf='model_validation_results.csv', index=False)

test_results_df = pd.DataFrame(data=test_results)
test_results_df.to_csv(path_or_buf='model_test_results.csv', index=False)

print("Training, validation, and testing complete. Results saved.")

