In [1]:
import os
import pandas as pd
import glob

In [2]:
# Work from the directory that holds the per-sample inStrain profile folders;
# the relative paths used in the following cells are resolved from here.
instrain_profiles_dir = "/fs/scratch/PAS0439/Ming/virome_ecology_core_prkaryotes/results/08_inStrain_rug4941/instrain_profiles/"
os.chdir(instrain_profiles_dir)

In [3]:
# Sample IDs are the names of the "<sample>.profile" entries in the current
# directory with the ".profile" suffix removed. Matching "*.profile" instead of
# "*" keeps unrelated files from being treated as samples, and sorting makes the
# processing order (and therefore the output row order) reproducible.
PROFILE_SUFFIX = ".profile"
samples = sorted(
    entry[: -len(PROFILE_SUFFIX)] for entry in glob.glob(f"*{PROFILE_SUFFIX}")
)

# Genome metadata table; joined later through its "user_genome" column.
genome_info = pd.read_csv("../genome_info.csv")

# Prophage/host summary, keeping only rows where a prophage is actually named
# (rows with a missing "prophage" value are dropped).
prophage = pd.read_csv(
    "../../04_prophage_rumen_mags/prophage_host_summary.csv"
).dropna(subset=["prophage"])

In [4]:
# Minimum `breadth_minCov` for a genome (population) to count as present in a sample.
MIN_GENOME_BREADTH = 0.5
# A prophage-carrying scaffold must exceed this `breadth` to count as present.
MIN_PROPHAGE_SCAFFOLD_BREADTH = 0.99
# Columns kept in both per-sample summary tables.
REPORT_COLUMNS = ["genome", "Genera", "Species", "prophage_presented"]

# Per-sample tables, one entry per sample that has something to report:
#   co_exist_df_list - all populations of species that have both a
#                      prophage-carrying and a prophage-free population
#   prophage_df_list - populations whose prophage was detected
co_exist_df_list = []
prophage_df_list = []
for f in samples:
    output_dir = f"{f}.profile/output"

    # instrain_genome: populations present in this sample, annotated with the
    # genome metadata. Genome names are stripped of their ".fasta" suffix so
    # they match `user_genome`.
    instrain_genome = (
        pd.read_csv(f"{output_dir}/{f}.profile_genome_info.tsv", sep="\t")
        .loc[lambda d: d["breadth_minCov"] >= MIN_GENOME_BREADTH]
        .assign(genome=lambda d: d["genome"].map(lambda name: name.split(".fasta")[0]))
        .merge(genome_info, left_on="genome", right_on="user_genome")
        .drop("user_genome", axis=1)
        # A genome without a species assignment is treated as its own species.
        # Filling here (before any species-level comparison) matters because
        # membership tests match NaN to NaN, which would otherwise pool every
        # unclassified genome into one pseudo-species.
        .assign(Species=lambda d: d["Species"].fillna(d["genome"]))
    )

    # Present populations that are listed as prophage hosts.
    instrain_prophage_genome = pd.merge(
        instrain_genome, prophage, left_on="genome", right_on="user_genome"
    )
    # Scaffold name is the part of the prophage identifier before the first "||".
    # A vectorized assignment creates the column even when there are no rows,
    # so samples without any prophage host no longer fail further down.
    instrain_prophage_genome["prophage_scaffold"] = instrain_prophage_genome["prophage"].map(
        lambda prophage_id: prophage_id.split("||")[0]
    )

    # Check whether the scaffolds carrying a prophage are themselves covered.
    scaffold_to_check = set(instrain_prophage_genome["prophage_scaffold"])
    instrain_scaffold = pd.read_csv(f"{output_dir}/{f}.profile_scaffold_info.tsv", sep="\t")
    scaffold_is_present = (
        instrain_scaffold["scaffold"].isin(scaffold_to_check)
        & (instrain_scaffold["breadth"] > MIN_PROPHAGE_SCAFFOLD_BREADTH)
    )
    prophage_scaffold_filtered = set(instrain_scaffold.loc[scaffold_is_present, "scaffold"])
    instrain_prophage_genome_filtered = instrain_prophage_genome.loc[
        instrain_prophage_genome["prophage_scaffold"].isin(prophage_scaffold_filtered)
    ].reset_index(drop=True)
    prophage_genomes_presented = set(instrain_prophage_genome_filtered["genome"])

    # Flag every present population by whether its prophage was detected.
    instrain_genome["prophage_presented"] = (
        instrain_genome["genome"]
        .isin(prophage_genomes_presented)
        .map({True: "Present", False: "absent"})
    )
    carries_prophage = instrain_genome["prophage_presented"] == "Present"

    # Species with at least one population carrying a prophage.
    prophage_species = set(instrain_genome.loc[carries_prophage, "Species"])
    # Of those, species that also have a population without the prophage.
    species_both = set(
        instrain_genome.loc[
            instrain_genome["Species"].isin(prophage_species) & ~carries_prophage,
            "Species",
        ]
    )

    # Record all populations of species where carriers and non-carriers co-exist.
    # This is a plain `if` (not `continue`) so that a sample without
    # co-existence still reaches the prophage-carrier record below.
    if species_both:
        co_exist_df_list.append(
            instrain_genome.loc[instrain_genome["Species"].isin(species_both), REPORT_COLUMNS]
            .assign(id=f)
        )

    # Record the populations carrying a prophage. The original code ran the
    # "Present" filter but discarded its result; the filter is applied here.
    # NOTE(review): confirm that restricting this table to "Present" rows is the
    # intended content of the prophage table.
    if prophage_species:
        prophage_df_list.append(
            instrain_genome.loc[carries_prophage, REPORT_COLUMNS].assign(id=f)
        )
    

In [5]:
# Stack the per-sample co-existence tables into one frame.
# ignore_index=True is required: each per-sample table carries its own row
# labels, so the same label occurs in many samples. With repeated labels, a
# label-based `.loc[label, ...]` assignment hits every row sharing that label
# and can overwrite valid Species values.
co_exist_df_rug4941 = pd.concat(co_exist_df_list, ignore_index=True)

# Fill missing Species with the genome name of the same row.
co_exist_df_rug4941["Species"] = co_exist_df_rug4941["Species"].fillna(
    co_exist_df_rug4941["genome"]
)

In [6]:
# Stack the per-sample prophage-carrier tables into one frame.
# ignore_index=True gives every row a unique label; the per-sample tables reuse
# the same labels, which makes label-based assignment ambiguous.
prophage_df_rug4941 = pd.concat(prophage_df_list, ignore_index=True)

# Fill missing Species with the genome name of the same row.
# The original cell repeated the fill on `co_exist_df_rug4941`, which left this
# frame's missing Species values untouched.
prophage_df_rug4941["Species"] = prophage_df_rug4941["Species"].fillna(
    prophage_df_rug4941["genome"]
)

In [7]:
# Species prevalence lookup table: keep only the two join keys and the
# prevalence value.
prevalence_columns = ["Genera", "Species", "prevelance"]
species_prevelance = pd.read_csv(
    "../../R_project/microbe_species_prevelance.csv"
).loc[:, prevalence_columns]

In [8]:
# Attach the prevalence value to each co-existence record. The left join keeps
# records whose (Genera, Species) pair has no entry in the prevalence table.
df = (
    co_exist_df_rug4941
    .merge(species_prevelance, how="left", on=["Genera", "Species"])
    .reset_index(drop=True)
)

In [9]:
# Attach the prevalence value to each prophage-carrier record. The left join
# keeps records whose (Genera, Species) pair has no entry in the prevalence table.
df1 = (
    prophage_df_rug4941
    .merge(species_prevelance, how="left", on=["Genera", "Species"])
    .reset_index(drop=True)
)

In [10]:
# Write both annotated tables to CSV without the row index.
for table, destination in (
    (df, "/fs/ess/PAS0439/MING/virome_ecology/results/R_project/genome_prophage_co_exist_rug4941.csv"),
    (df1, "/fs/ess/PAS0439/MING/virome_ecology/results/R_project/genome_prophage_rug4941.csv"),
):
    table.to_csv(destination, index = None)