# 1. Data loading

In [0]:
# Read the genus table from the catalog, convert it to pandas, and replace missing values with 0.
genus_spark_df = spark.table("onesource_eu_dev_rni.onebiome.mpa_2_genus_table")
df_genus = genus_spark_df.toPandas().fillna(0)
# Rows are reported as samples and columns as taxa.
print("Number of samples:", df_genus.shape[0], ", Number of unique taxa:", df_genus.shape[1])

In [0]:
# Keep only the columns holding at least one non-zero value, then lower-case the column labels.
has_nonzero_value = df_genus.ne(0).any(axis=0)
df_genus = df_genus.loc[:, has_nonzero_value]
df_genus.columns = df_genus.columns.str.lower()
print("Number of samples:", df_genus.shape[0], ", Number of unique taxa:", df_genus.shape[1])

# 2. Generate taxa sequences

In [0]:
# One sequence per row of df_genus: the column labels whose value is non-zero,
# ordered from the largest value to the smallest.
list_sequences = [
    sample_values[sample_values != 0].sort_values(ascending=False).index.tolist()
    for _, sample_values in df_genus.iterrows()
]

In [0]:
import matplotlib.pyplot as plt

# Length of every sequence; shared by the maximum and the histogram below.
sequence_lengths = list(map(len, list_sequences))
MAX_SEQ_LENGTH = max(sequence_lengths)
print(MAX_SEQ_LENGTH)
plt.hist(sequence_lengths, bins=100);

--> Sequence lengths range from 0 to 64 (see the histogram above); number of unique values: TODO — fill in.

In [0]:
# Collapse each list of taxa names into a single space-separated string.
sequences = list(map(" ".join, list_sequences))
len(sequences)

# 3. Save in Unity Catalog


In [0]:
from pyspark.sql import Row

# Build single-column rows (field "sequence"), one per joined taxa string,
# and overwrite the target table with them.
SequenceRow = Row("sequence")
sequence_rows = [SequenceRow(seq) for seq in sequences]
df = spark.createDataFrame(sequence_rows)
df.write.mode("overwrite").saveAsTable("onesource_eu_dev_rni.onebiome.mpa_2_genus_sequences")

# 4. Create the taxa id mapper

In [0]:
# Column labels of the filtered table, sorted in ascending order; displayed for inspection.
UNIQUE_TAXA = df_genus.columns.sort_values(ascending=True)
UNIQUE_TAXA

In [0]:
import json

# Load the previously saved mapper from its JSON file.
with open('../data/token_to_id.json', 'r') as mapper_file:
    token_to_id = json.load(mapper_file)

In [0]:
# Names known to the loaded mapper vs. column labels of the current table.
metagenomics_genus_set = set(token_to_id.keys())
dicaprio_genus_set = set(UNIQUE_TAXA)

# Names shared by both sources, and the names exclusive to each one.
superposition = metagenomics_genus_set.intersection(dicaprio_genus_set)
only_in_metagenomics = metagenomics_genus_set.difference(dicaprio_genus_set)
only_in_dicaprio = dicaprio_genus_set.difference(metagenomics_genus_set)

# Sizes: mapper set, overlap, mapper-only, table-only.
tuple(map(len, (metagenomics_genus_set, superposition, only_in_metagenomics, only_in_dicaprio)))

In [0]:
# Shell-installs matplotlib-venn, which the next cell imports as `matplotlib_venn`.
# NOTE(review): the version is unpinned and `!pip` may target a different environment
# than the kernel; consider a pinned `%pip install` in a setup cell at the top of the
# notebook — confirm how the hosting platform handles `%pip` mid-notebook before changing it.
!pip install matplotlib-venn

In [0]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# Two-set Venn diagram of the genus names from each source, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(6, 4))
venn2(
    [metagenomics_genus_set, dicaprio_genus_set],
    ('curatedMetagenomics', 'Dicaprio'),
    ax=ax,
)
ax.set_title('# of unique genera in curatedMetagenomics and Dicaprio')

# Render the figure.
plt.show()

In [0]:
# Disabled alternative, kept for reference: extends the loaded mapper in place by giving
# the taxa found only in the current table the ids that follow the last existing id,
# then writes the result to its own JSON file.
# NOTE(review): `list(token_to_id.values())[-1]` is the last-inserted id, which is
# presumably also the largest one — confirm before re-enabling.
# # option 1: extend the existing mapper
# metagenomic_last_taxa_id = list(token_to_id.values())[-1] 
# print(metagenomic_last_taxa_id)
# for idx, taxa in enumerate(only_in_dicaprio):
#     token_to_id[taxa] = idx + metagenomic_last_taxa_id + 1
# token_to_id
# with open('../data/token2id_metagenomic_dicaprio.json', 'w') as file:
#     json.dump(token_to_id, file) 

In [0]:
# option 2: create a new mapper
union = sorted(list(metagenomics_genus_set | dicaprio_genus_set))

token2id_genus = {taxa: idx + 4 for idx, taxa in enumerate(union)}
token2id_genus


In [0]:
# Number of taxa in the sorted union, i.e. the number of keys in token2id_genus.
len(union)

In [0]:
# Persist the union-based mapper built by option 2 (`token2id_genus`).
# Fix: the cell previously dumped `token_to_id`, i.e. the old mapper loaded from
# '../data/token_to_id.json', so the new file never contained the new ids.
# NOTE(review): the file name hard-codes 628, presumably len(union) — confirm it still matches.
with open('../data/token2id_628_genus.json', 'w') as mapper_file:
    json.dump(token2id_genus, mapper_file)