In [346]:
import pandas as pd
import numpy as np
import seaborn as sns
from Bio import SeqIO
from sklearn.metrics.pairwise import cosine_similarity

In [89]:
from sklearn.metrics import roc_curve, auc 
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import roc_auc_score, roc_curve

In [442]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification

In [113]:
from phylodm import PhyloDM
import dendropy

In [108]:
from skbio.stats.ordination import pcoa
from skbio import DistanceMatrix
from sklearn.manifold import TSNE

In [376]:
from tqdm import tqdm

In [210]:
import dendropy
from phylodm import PhyloDM

### Metabolic trait prediction

In [1334]:
# Read the space-separated embedding table (no header row; the first column
# becomes the index) and discard the "<unk>" placeholder row.
co_embedding = (
    pd.read_csv(
        "../../data/social_niche_embedding_100.txt",
        sep=" ",
        header=None,
        index_col=0,
        low_memory=False,
    )
    .drop(index="<unk>")
)

# Index labels of every embedded entry; reused below to align the other tables.
fid = co_embedding.index.values

In [1335]:
# Build a rank-per-column taxonomy table indexed by "<col0>.<col1>.<col2>" IDs
# and restricted to the entries that have an embedding (`fid`).
taxmap_raw = pd.read_csv("../Pretraining_data_profile/Data/taxmap_slv_ssu_ref_nr_138.2.txt", sep="\t", low_memory=False)

# Join the first three columns with "." to form the ID. This is done column-wise
# instead of row-by-row: the original `temp[0]` relied on the deprecated
# positional fallback of Series.__getitem__ and was needlessly slow.
acc = (
    taxmap_raw.iloc[:, 0].astype(str)
    + "." + taxmap_raw.iloc[:, 1].astype(str)
    + "." + taxmap_raw.iloc[:, 2].astype(str)
).tolist()

# Split the ";"-separated lineage in "path" into columns and keep the first seven.
taxonomy = taxmap_raw.loc[:, "path"].str.split(';', expand=True)
taxonomy.index = acc
taxonomy = taxonomy.iloc[:, 0:7]
taxonomy.columns = ["k", "p", "c", "o", "f", "g", "s"]

# Keep only (and in the order of) the embedded IDs; raises KeyError if any is missing.
taxonomy = taxonomy.loc[fid]

In [1360]:
# Table with (at least) a `group_1` column and a `Traits` column, used in the next cells.
metabolic_res = pd.read_csv("Data/metabolic_res.csv")

In [1337]:
# Trait names listed under the "Traitar" group.
metabolic_res.loc[metabolic_res["group_1"] == "Traitar", "Traits"].values

array(['D-Xylose', 'Lactose', 'Raffinose', 'Melibiose', 'Glycerol',
       'ONPG (beta galactosidase)', 'Esculin hydrolysis', 'L-Arabinose',
       'L-Rhamnose', 'myo-Inositol', 'D-Mannose', 'Maltose', 'Trehalose',
       'Salicin', 'Sucrose', 'D-Sorbitol', 'D-Mannitol',
       'Starch hydrolysis'], dtype=object)

In [1338]:
# Trait names listed under the "BacDive" group.
metabolic_res.loc[metabolic_res["group_1"] == "BacDive", "Traits"].values

array(['Adipate', 'Esculin', 'Malate', 'Lactose', 'Glycogen', 'Salicin',
       'Raffinose', 'Starch', 'Glycerol', 'Amygdalin', 'Melibiose',
       'Arbutin', 'Gluconate', 'Maltose', 'Cellobiose', 'Turanose',
       'Melezitose', 'Trehalose', 'Gentiobiose', 'Sucrose', 'Sorbitol',
       'Gluconate', 'Maltose'], dtype=object)

In [1339]:
# Trait names used to select columns/rows from both trait sources below.
# Sorbitol is listed under both spellings ("Sorbitol" and "D-Sorbitol").
inter_metabolic = [
    "Lactose",
    "Melibiose",
    "Glycerol",
    "Maltose",
    "Trehalose",
    "Salicin",
    "Sucrose",
    "Sorbitol",
    "D-Sorbitol",
]

In [1340]:
# Predicted-trait table; the first CSV column is used as the index.
traits = pd.read_csv("Data/trait_predcit.csv", index_col=0)

# Restrict to entries that also have an embedding.
inter_id = np.intersect1d(fid, traits.index.values)

# Cast to integers and recode the value 3 as 1.
# NOTE(review): only 3 is recoded; any other non-0/1 code is left untouched — confirm this is intended.
traits = traits.loc[inter_id].astype(int).replace(3, 1)

# Keep only the columns whose name is in the shared trait list.
traits_traitor = traits.loc[:, traits.columns.isin(inter_metabolic)]

In [1341]:
# Inspect which trait columns were selected.
traits_traitor.columns

Index(['Lactose', 'Salicin', 'Glycerol', 'Melibiose', 'Maltose', 'Sucrose',
       'Trehalose', 'D-Sorbitol'],
      dtype='object')

In [1342]:
# Rename by label instead of overwriting every column name positionally: a
# positional list silently mislabels columns if the CSV column order changes.
# Only "D-Sorbitol" differs from the names used for the BacDive labels.
traits_traitor = traits_traitor.rename(columns={"D-Sorbitol": "Sorbitol"})

In [1343]:
# Show the current column labels as a plain array.
traits_traitor.columns.values

array(['Lactose', 'Salicin', 'Glycerol', 'Melibiose', 'Maltose',
       'Sucrose', 'Trehalose', 'Sorbitol'], dtype=object)

In [1344]:
# --- 1. Load and prepare the traits data ---
# Read the CSV, keep the first row for each '16s_ID', and use that ID as the index
traits = (
    pd.read_csv("Data/bacDive.csv")
    .drop_duplicates(subset='16s_ID')
    .set_index('16s_ID')
)

# --- 2. Map embedding IDs to accession numbers ---
# The accession number is the part of the embedding ID before the first period '.'
accessions_num = co_embedding.index.str.split('.').str[0]

# Helper DataFrame mapping the full embed_id to the shortened accession number
df_map = pd.DataFrame({
    'accessions': accessions_num,
    'embed_id': co_embedding.index
})

# --- 3. Align traits and embedding data ---
# Find the common accession numbers between the two datasets
inter_id = np.intersect1d(traits.index, df_map['accessions'])

# Filter the mapping and traits DataFrames to keep only the common entries;
# traits rows are emitted once per embed_id, in df_map order
df_map = df_map[df_map['accessions'].isin(inter_id)]
traits = traits.loc[df_map['accessions']]

# Update the traits index to match the full embedding ID for consistency
traits.index = df_map['embed_id']

# --- 4. Clean and standardize data in the traits DataFrame ---
# All raw values that need to be replaced (ambiguous/mixed calls become missing)
replace_dict = {"NA": np.nan, "-": "no", "+": "yes",
    "": np.nan, "+;NA": np.nan, "coccus-shaped": "coccus",
    "rod-shaped": "rod", "mixed": np.nan, "negative;variable": np.nan,
    "no;yes": np.nan, "negative;positive": np.nan, "variable": np.nan

}
traits = traits.replace(replace_dict)

# Standardize the 'cell_shape' column: keep only 'coccus' or 'rod', set others to NaN
valid_shapes = ['coccus', 'rod']
traits['cell_shape'] = traits['cell_shape'].where(traits['cell_shape'].isin(valid_shapes), np.nan)

# --- 5. Create the 'Oxygen.Preference' column ---
# Conditions and corresponding labels, in priority order (first match wins)
conditions = [
    traits['aerobe'] == 1,
    traits['facultative.anaerobe'] == 1,
    traits['anaerobe'] == 1
]
choices = ['aerobic', 'facultatively', 'anaerobic']

# np.select(conditions, choices, default=np.nan) cannot mix string choices with a
# float default: older NumPy silently stores the *string* 'nan', newer NumPy
# raises TypeError. Fill an object array instead so unmatched rows hold a real NaN.
# Labels are applied in reverse so that earlier conditions overwrite later ones,
# reproducing np.select's first-match-wins rule.
oxygen_preference = np.full(len(traits), np.nan, dtype=object)
for condition, choice in reversed(list(zip(conditions, choices))):
    oxygen_preference[condition.to_numpy()] = choice
traits['Oxygen.Preference'] = oxygen_preference

# --- 6. Final cleanup ---
# Columns to remove ('X16s_ID' is not a column produced above; errors='ignore'
# makes any absent name a no-op)
remove_cols = ['X16s_ID', 'aerobe', 'facultative.anaerobe', 'anaerobe']
traits = traits.drop(columns=remove_cols, errors='ignore')

# Drop any column where all values are missing (NaN)
traits = traits.dropna(axis=1, how='all')

  traits = pd.read_csv("/home/dongbiao/word_embedding_microbiome/all_data/gut/pheno/bacDive.csv")


In [1345]:
# Columns of the BacDive table whose name appears in the shared trait list.
traits_bacdive = traits.loc[:, traits.columns.isin(inter_metabolic)]

In [1346]:
# Load the term table and capitalise `level_3` so it can be compared against
# the shared trait list.
agg_bac = pd.read_csv("Data/agg_bac.csv")
agg_bac["level_3"] = [name.capitalize() for name in agg_bac["level_3"].values]

# Keep rows that are a shared trait, describe "builds_acid_from", and have type 1.
keep_row = (
    agg_bac["level_3"].isin(inter_metabolic)
    & (agg_bac["level_2"] == "builds_acid_from")
    & (agg_bac["type"] == 1)
)
agg_bac = agg_bac.loc[keep_row]

In [1347]:
# Show the rows that survived the filters above.
agg_bac

Unnamed: 0,terms,level_1,level_2,level_3,type,Number
1,metabolite_utilization.maltose.builds_acid_from,metabolite_utilization,builds_acid_from,Maltose,1,94
3,metabolite_utilization.sucrose.builds_acid_from,metabolite_utilization,builds_acid_from,Sucrose,1,81
6,metabolite_utilization.lactose.builds_acid_from,metabolite_utilization,builds_acid_from,Lactose,1,67
10,metabolite_utilization.trehalose.builds_acid_from,metabolite_utilization,builds_acid_from,Trehalose,1,57
20,metabolite_utilization.salicin.builds_acid_from,metabolite_utilization,builds_acid_from,Salicin,1,42
28,metabolite_utilization.melibiose.builds_acid_from,metabolite_utilization,builds_acid_from,Melibiose,1,32
44,metabolite_utilization.glycerol.builds_acid_from,metabolite_utilization,builds_acid_from,Glycerol,1,21
92,metabolite_utilization.sorbitol.builds_acid_from,metabolite_utilization,builds_acid_from,Sorbitol,1,6


In [1348]:
# Select the BacDive columns named by `agg_bac.terms` and relabel them with the
# matching `level_3` trait names (same order).
traits_bacdive = (
    traits.loc[:, agg_bac["terms"].values]
    .set_axis(agg_bac["level_3"].values, axis=1)
)

In [1350]:
# Traits evaluated in the prediction loop below.
names = [
    'Lactose',
    'Salicin',
    'Glycerol',
    'Melibiose',
    'Maltose',
    'Sucrose',
    'Trehalose',
    'Sorbitol',
]

In [1351]:
# Per-phylum count of BacDive entries that carry a usable Sorbitol label.
j = "Sorbitol"
sorbitol_labels = traits_bacdive[j].values

# A label is usable when it is neither the string 'nan' nor a real NaN
# (NaN is the only value that is not equal to itself).
temp = traits_bacdive.loc[(sorbitol_labels != 'nan') & (sorbitol_labels == sorbitol_labels)]

tax_bacdive = taxonomy.loc[temp.index.values]
tax_bacdive.groupby("p").count().sort_values("k", ascending=False)

Unnamed: 0_level_0,k,c,o,f,g,s
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bacteroidota,30,30,30,30,30,30
Bacillota,28,28,28,28,28,28
Actinomycetota,11,11,11,11,11,11
Fusobacteriota,3,3,3,3,3,3
Pseudomonadota,2,2,2,2,2,2
Campylobacterota,1,1,1,1,1,1
Synergistota,1,1,1,1,1,1


In [1352]:
# Integer encoding of the BacDive "yes"/"no" labels.
labels_dict = dict(yes=1, no=0)

# Phyla that are held out one at a time in the evaluation loop.
phylum_id = [
    "Bacillota",
    "Bacteroidota",
    "Actinomycetota",
]

In [1353]:
# Leave-one-phylum-out evaluation.
# For each trait `j` and each held-out phylum `i`:
#   * test set  = BacDive-labelled entries from phylum `i` plus every phylum
#                 that is not in `phylum_id`;
#   * train set = predicted-trait (`traits_traitor`) entries from all other phyla;
#   * features  = the corresponding rows of `co_embedding`.
# The ROC AUC is recorded only when both test classes have enough samples.

# Each test class must have MORE than this many samples for the AUC to be recorded.
MIN_TEST_SAMPLES_PER_CLASS = 5

# NOTE(review): `rf_model` is not defined in any cell visible here; it must already
# exist in the kernel (presumably a scikit-learn classifier — confirm, and define it
# with a fixed random_state above this cell so Restart & Run All works).
# NOTE(review): `auc` shadows `sklearn.metrics.auc` imported at the top of the
# notebook; the name is kept because the following cell builds `res` from it.
auc = []
traits_type = []
tax = []

# Taxonomy of the training pool; it does not depend on the trait or the phylum.
tax_traitor = taxonomy.loc[traits_traitor.index.values]

for j in names:
    # BacDive entries with a usable label for this trait: drop the string 'nan'
    # and real NaN (the only value not equal to itself). Independent of `i`.
    bacdive_labels = traits_bacdive.loc[:, j].values
    has_label = (bacdive_labels != 'nan') & (bacdive_labels == bacdive_labels)
    tax_bacdive = taxonomy.loc[traits_bacdive.index.values[has_label]]

    # Phyla outside `phylum_id` are always part of the test set.
    minor_phyla = [p for p in tax_bacdive.p.unique() if p not in phylum_id]

    for i in phylum_id:
        test_phylum = minor_phyla + [i]

        test_id = tax_bacdive.loc[[p in test_phylum for p in tax_bacdive.p]].index.values
        test_labels = traits_bacdive.loc[test_id, j].values
        test_id = test_id[test_labels == test_labels]  # defensive NaN filter

        train_id = tax_traitor.loc[[p not in test_phylum for p in tax_traitor.p]].index.values
        train_labels = traits_traitor.loc[train_id, j].values
        train_id = train_id[train_labels == train_labels]  # defensive NaN filter

        X_train = co_embedding.loc[train_id]
        y_train = traits_traitor.loc[train_id, j].values
        X_test = co_embedding.loc[test_id]
        y_test = [labels_dict[label] for label in traits_bacdive.loc[test_id, j].values]

        # AUC needs both classes in the test set, each with enough samples.
        test_classes, counts = np.unique(y_test, return_counts=True)
        if len(test_classes) < 2 or not np.all(counts > MIN_TEST_SAMPLES_PER_CLASS):
            continue

        # With a single training class predict_proba has one column and the
        # `[:, 1]` lookup below would raise IndexError; skip such splits.
        if np.unique(y_train).size < 2:
            continue

        rf_model.fit(X_train, y_train)
        y_pred_proba = rf_model.predict_proba(X_test)
        auc.append(roc_auc_score(y_test, y_pred_proba[:, 1]))
        traits_type.append(j)
        tax.append(i)

In [1354]:
# One row per (trait, held-out phylum) pair for which an AUC was recorded.
res = pd.DataFrame(dict(AUC=auc, traits=traits_type, tax=tax))

In [1355]:
# Write the results table without the row index.
res.to_csv("Data/predict_metabolics_res.csv", index=False)