# Figure 1 Genome feature plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

# Load the dataset
file_path = 'data/AmbrosiaFungiGenomeFeature.csv'
df = pd.read_csv(file_path)

# Clean and prepare the data.
# Commas are stripped (presumably thousands separators) before the float
# conversion; astype(float) still raises on any other non-numeric text.
for column in ('Genome_size', 'Total_protein'):
    df[column] = df[column].astype(str).str.replace(',', '', regex=False).astype(float)
df['Genome_size_Mb'] = df['Genome_size'] / 1e6

# Columns where unparsable entries are turned into NaN rather than raising.
for column in ('TE_CoverageTotal', 'Secret_protein', 'Scaffold_N50',
               'CompleteBUSCO', 'Total_BUSCO_group'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Num_Gene is divided below, so make sure it is numeric first; it gets the
# same comma stripping as Genome_size in case it is stored the same way.
df['Num_Gene'] = pd.to_numeric(
    df['Num_Gene'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# Derived columns used by the summary table and the figure
df['Genes_K'] = df['Num_Gene'] / 1e3
df['Secreted_K'] = df['Secret_protein'] / 1e3
df['BUSCO_Completeness'] = (df['CompleteBUSCO'] / df['Total_BUSCO_group']) * 100

# Differentiate species with the same name using Strain.
# Strain is cast to str so a numeric strain id does not break the string
# concatenation, and rows without a strain keep the plain species name
# instead of ending up with a NaN label.
species_name = df['FungusName']
name_with_strain = species_name + ' (' + df['Strain'].astype(str) + ')'
df['Display_Name'] = name_with_strain.where(df['Strain'].notna(), species_name)

# Treat CladeOrder as an ordered categorical and sort the rows by it; the
# sorted frame fixes the row order of the figure.
df['CladeOrder'] = pd.Categorical(df['CladeOrder'], ordered=True)
df_sorted = df.sort_values(by=['CladeOrder'])

# (panel title, dataframe column) pairs shared by the summary table and the figure
metrics = [
    ('Genome Size (Mb)', 'Genome_size_Mb'),
    ('TE Coverage (%)', 'TE_CoverageTotal'),
    ('Number of Genes (K)', 'Genes_K'),
    ('Secreted Proteins (K)', 'Secreted_K'),
    ('GC content (%)', 'GC_Genome'),
    ('BUSCO Completeness (%)', 'BUSCO_Completeness')
]

# Families that get their own rows in the summary table
families = ['Ophiostomataceae', 'Nectriaceae', 'Ceratocystidaceae', 'Irpicaceae']


def _median_rows(frame, family_label):
    """Return one record per (Ambrosia status, metric) with the median of `frame`.

    Records are ordered by status ('Y' then 'N') and, within a status, by the
    order of `metrics`. A status with no rows yields NaN medians.
    """
    rows = []
    for status in ('Y', 'N'):
        subset = frame.loc[frame['Ambrosia'] == status]
        rows.extend(
            {
                'Family': family_label,
                'Ambrosia': status,
                'Metric': label,
                'Median': subset[column_name].median(),
            }
            for label, column_name in metrics
        )
    return rows


# Per-family medians first, then the medians over the whole table
results = []
for family in families:
    results.extend(_median_rows(df[df['Family'] == family], family))
results.extend(_median_rows(df, 'Overall'))

# Write the long-format table to CSV
results_df = pd.DataFrame(results)
csv_output_file = 'data/AmbrosiaMediansbyFamily.csv'
results_df.to_csv(csv_output_file, index=False)
print(f"Results saved to {csv_output_file}")

# Visualization: one panel per metric, laid out side by side. Every panel
# uses Display_Name on the y axis, and only the first panel shows the labels.
fig = plt.figure(figsize=(20, 18))
gs = gridspec.GridSpec(1, len(metrics))

# Update palette to include 'outgroup'
palette = {'Y': '#2BAE66FF', 'N': '#101820FF', 'outgroup': 'white'}

first_ax = None
for i, (title, metric) in enumerate(metrics):
    ax = fig.add_subplot(gs[i])
    if i < 4:
        # The first four metrics are drawn as horizontal bars
        sns.barplot(y='Display_Name', x=metric, hue='Ambrosia', data=df_sorted, palette=palette, dodge=False, ax=ax, alpha=0.8)
    else:
        # The remaining metrics are drawn as points
        sns.scatterplot(y='Display_Name', x=metric, hue='Ambrosia', data=df_sorted, palette=palette, s=300, marker="o", ax=ax, alpha=0.8)

    # Because the y tick labels are hidden on every panel but the first, all
    # panels must use exactly the same y limits; otherwise a row in a later
    # panel can sit at a different height than its label.
    if first_ax is None:
        first_ax = ax
    else:
        ax.set_ylim(first_ax.get_ylim())

    # Dotted vertical line at the median of each Ambrosia group
    for ambrosia_status in ['Y', 'N']:
        median_value = df_sorted[df_sorted['Ambrosia'] == ambrosia_status][metric].median()
        if pd.notna(median_value):  # nothing to draw when the group has no values
            ax.axvline(x=median_value, linestyle=':', color='green' if ambrosia_status == 'Y' else 'black', label=f'Median {ambrosia_status}')

    ax.set_title(title, fontsize=15)

    # Drop the per-panel legend that seaborn adds for the hue variable
    panel_legend = ax.get_legend()
    if panel_legend is not None:
        panel_legend.remove()

    if i == 0:
        ax.set_ylabel('FungusName')
        ax.tick_params(axis='y', labelsize=15)
    else:
        ax.set_ylabel('')
        plt.setp(ax.get_yticklabels(), visible=False)
    ax.set_xlabel('')

plt.tight_layout()
plt.subplots_adjust(wspace=0.1)

# Save the figure to a PDF.
# NOTE(review): `orientation` appears to be honoured only by the PostScript
# backend, so it likely has no effect on a PDF; the page shape comes from figsize.
pdf_output_file = 'AmbrosiaFungiGenomeFeatures.pdf'
plt.savefig(pdf_output_file, bbox_inches='tight', orientation='landscape')
plt.close(fig)

print(f"PDF visualization saved to {pdf_output_file}")

# Figure 2. Comparison of genomic features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Define the color map for visualizations
color_map = {'Y': '#2BAE66FF', 'N': '#101820FF'}

# Load the dataset
file_path = 'data/AmbrosiaFungiGenomeFeature.csv'
data = pd.read_csv(file_path)

# Filter out the outgroup entries. The .copy() gives an independent frame so
# the column assignments below do not write into a slice of the original
# (which triggers pandas' SettingWithCopyWarning).
data = data[data['Ambrosia'] != 'outgroup'].copy()


def _as_number(series):
    """Strip commas from the values and convert to numbers (unparsable -> NaN)."""
    return pd.to_numeric(series.astype(str).str.replace(',', '', regex=False), errors='coerce')


# Rescale to the units used in the axis labels. The columns are converted to
# numbers first so that a column read as text (e.g. values containing
# commas) does not make the multiplication fail.
data['Genome_size'] = _as_number(data['Genome_size']) * 1e-6        # base pairs -> Mb
data['Num_Gene'] = _as_number(data['Num_Gene']) * 1e-3              # count -> thousands (K)
data['Secret_protein'] = _as_number(data['Secret_protein']) * 1e-3  # count -> thousands (K)

# Box plots comparing each feature between Ambrosia = 'Y' and Ambrosia = 'N'
sns.set(style="whitegrid")
features = ['Genome_size', 'TE_CoverageTotal', 'Num_Gene', 'Secret_protein', 'GC_Genome']
y_labels = ['Genome Size (Mb)', 'TE Coverage (%)', 'Number of Genes (K)', 'Secret Proteins (K)', 'GC Content (%)']
palette = {'Y': color_map['Y'], 'N': color_map['N']}

# One panel per feature; squeeze=False keeps `axes` a 2-D array even if the
# feature list is reduced to a single entry.
fig, axes = plt.subplots(1, len(features), figsize=(12, 4), squeeze=False)

for ax, feature, y_label in zip(axes.flatten(), features, y_labels):
    sns.boxplot(x='Ambrosia', y=feature, hue='Ambrosia', data=data, ax=ax, palette=palette)
    ax.set_title(feature)
    ax.set_ylabel(y_label)
    # hue duplicates x, so a legend adds nothing. Depending on the seaborn
    # version no legend may have been created at all, in which case
    # get_legend() returns None and there is nothing to remove.
    panel_legend = ax.get_legend()
    if panel_legend is not None:
        panel_legend.remove()

plt.tight_layout()

# Save the plot to a PDF file
pdf_path = 'GenomeSize_TEcoverage_NumGene_SecreteProtein_GCGenome.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig(fig)
plt.close(fig)  # close this figure explicitly rather than whichever is current

# Echo the output path as the cell's result
pdf_path


# Figure 5 RIP analysis

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap, BoundaryNorm

# Load the datasets
nc_rip_df = pd.read_csv("data/NC_RIP.csv")
ambrosia_fungi_df = pd.read_csv("data/AmbrosiaFungiGenomeFeature.csv")

# Species left out of the analysis (matched against the FungusName column)
excluded_species = ['Cytidiella_melzeri', 'Trametopsis_cervina', 'Irpex_rosettiformis', 'Irpex_subulatus',
                    'Pyricularia_oryzae', 'Diaporthe_amygdali', 'Phaeoacremonium_minimum', 'Pyricularia_grisea',
                    'Irpex_lacteus', 'Irpex_flavus', 'Rhizopus_microsporus', 'Scedosporium_boydii',
                    'Geosmithia_putterillii', 'Geosmithia_morbida', 'Geosmithia_flava',
                    'Mucor_mucedo', 'Aspergillus_fumigatus', 'Monilinia_laxa']

# .copy() gives an independent frame so the new columns added below are not
# written into a slice of the loaded table (SettingWithCopyWarning).
ambrosia_fungi_df = ambrosia_fungi_df[~ambrosia_fungi_df['FungusName'].isin(excluded_species)].copy()

# Categorize RIP classes.
# right=False makes every bin half-open, [low, high); a RIP_Gen_Prop of
# exactly 100.0 therefore falls outside the last bin and is left as NaN.
rip_features = [
    "Num_RIP_Win", "RIP_Gen_Prop", "LRAR_Count", "Avg_LRAR_Size",
    "Avg_LRAR_GC", "LRAR_Gen_Prop", "LRAR_Prod_Val", "LRAR_Sub_Val", "LRAR_Comp_Val"
]
bins = [0.0, 0.2, 1.0, 5.0, 10.0, 20.0, 100.0]
labels = ['Class1', 'Class2', 'Class3', 'Class4', 'Class5', 'Class6']
ambrosia_fungi_df['RIP_Class'] = pd.cut(ambrosia_fungi_df['RIP_Gen_Prop'], bins=bins, labels=labels, right=False)

# Encode Ambrosia as 1 (Y) / 0 (N) for the two-colour strip; any other value
# (e.g. 'outgroup') maps to NaN.
color_map = {'Y': '#2BAE66FF', 'N': '#101820FF'}
ambrosia_fungi_df['Ambrosia_Color'] = ambrosia_fungi_df['Ambrosia'].map({'Y': 1, 'N': 0})

# Binary presence matrix: 0 where the cell holds the "no hit" marker, 1 for
# anything else. A single vectorised comparison replaces the per-cell apply.
nc_rip_presence_df = nc_rip_df.set_index('Unnamed: 0')
nc_rip_presence_df = (nc_rip_presence_df != "No hit with E-value < 1e-5").astype(int)

# Merge on the shared identifier; the inner join keeps only identifiers
# present in both tables.
full_data = ambrosia_fungi_df.set_index('FungusName_Strain').join(nc_rip_presence_df, how='inner')
if full_data.empty:
    # Fail here with a clear message instead of deep inside the scaler.
    raise ValueError(
        "No rows left after joining the genome features with NC_RIP.csv: "
        "'FungusName_Strain' values do not match the first column of NC_RIP.csv"
    )

# Sorting by CladeOrder before visualization
full_data = full_data.sort_values('CladeOrder')

# Numeric code per RIP class (Class1 -> 1 ... Class6 -> 6), stored as float
# so the heatmap receives plain numbers rather than a categorical column.
rip_class_map = {label: i for i, label in enumerate(labels, 1)}
full_data['RIP_Class_Value'] = full_data['RIP_Class'].map(rip_class_map).astype(float)

# Standardize each RIP feature column with StandardScaler so that columns on
# different scales can share one colour scale.
scaler = StandardScaler()
scaled_features_full = scaler.fit_transform(full_data[rip_features])
scaled_features_df_full = pd.DataFrame(scaled_features_full, index=full_data.index, columns=rip_features)

# Gene presence columns (everything that came from NC_RIP.csv)
gene_presence_data_full = full_data[nc_rip_presence_df.columns]
discrete_cmap = ListedColormap(['white', 'gray'])

# Four panels side by side: Ambrosia strip, RIP class strip, scaled RIP
# features, and gene presence/absence.
fig, ax = plt.subplots(1, 4, figsize=(20, 15), gridspec_kw={'width_ratios': [0.2, 0.2, 3, 3]})

# vmin/vmax pin 0 -> 'N' colour and 1 -> 'Y' colour even when the data
# contains only one of the two values.
sns.heatmap(full_data[['Ambrosia_Color']], cmap=ListedColormap([color_map['N'], color_map['Y']]),
            vmin=0, vmax=1, cbar=False, ax=ax[0])
ax[0].set_title('Ambrosia (Y/N)')

# Fix the colour range to the full class range (1..number of classes).
# Without this the heatmap is normalised to the classes that happen to occur
# in the data, and its colours no longer match the legend built below.
n_classes = len(labels)
sns.heatmap(full_data[['RIP_Class_Value']], cmap='YlGnBu', vmin=1, vmax=n_classes,
            cbar=False, ax=ax[1], yticklabels=False)
ax[1].set_title('RIP Class Heatmap')

# Adding legend for RIP Class Heatmap: one square marker per class, coloured
# at evenly spaced positions of the same colormap. The empty proxy lines are
# created on ax[1] itself rather than on whichever axes is current.
legend_labels = [f'{i}: {label}' for i, label in enumerate(labels, 1)]
legend_colors = plt.cm.YlGnBu(np.linspace(0, 1, n_classes))
patches = [
    ax[1].plot([], [], marker="s", ms=10, ls="", mec=None, color=class_color, label=class_label)[0]
    for class_color, class_label in zip(legend_colors, legend_labels)
]
ax[1].legend(handles=patches, bbox_to_anchor=(1.5, 1), loc='upper right', ncol=1)

sns.heatmap(scaled_features_df_full, cmap="YlGnBu", ax=ax[2], yticklabels=False)
ax[2].set_title('Adjusted Heatmap of RIP Features by Species')
sns.heatmap(gene_presence_data_full, cmap=discrete_cmap, cbar_kws={'label': 'Presence (Yes) / Absence (No)'}, ax=ax[3], yticklabels=False)
ax[3].set_title("Gene Presence/Absence Heatmap")
plt.tight_layout()

# Save the figure to a PDF.
# NOTE(review): `orientation` appears to be honoured only by the PostScript
# backend, so it likely has no effect on a PDF; the page shape comes from figsize.
pdf_path = 'RIP_analysis.pdf'
plt.savefig(pdf_path, bbox_inches='tight', orientation='landscape')
plt.close(fig)

print(f"PDF saved: {pdf_path}")
