# Analysis of variance on protein length, gene length and exon number of duplicated, gained and retained genes in 13 extant trematodes since their most recent common ancestor

To perform this analysis, first run OMA standalone and pyHam to create a dataframe called trematode_df (see Ancestral_trematode_genome_vs_extant_trematode_genome.ipynb). Because trematode_df contains only the protein lengths taken from the FASTA files, the gene IDs of each species have to be obtained and the gene length in bp calculated from the GFF files. The number of exons per gene can also be obtained from the GFF files. Here we assume a ready dataframe in which each gene of each species is categorized as duplicated, gained or retained since the trematode ancestor, together with its gene length, protein length and exon number.

In [None]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
import statsmodels.stats.multicomp
import scipy.stats as stats
from sklearn.preprocessing import normalize
import numpy as np
import matplotlib.pyplot as plt

# Working directory and the cumulative table that every later cell reads from
working_dir = 'C:/Users/nzaja/Documents/Onedrive/onedrivedocuments'
cumulative_table_path = f"{working_dir}/proteinlength/cumulative_table.txt"
trematode_df = pd.read_csv(cumulative_table_path, sep=",")

You might first want to create a boxplot for gene length, protein length or exon number per species before you perform the analysis.

In [None]:
# Boxplots of protein length per species, split by gene class.
# To plot another metric, change y="protein_len" to "gene_len" or
# "exon_number" and adjust the y-axis label.

# Display names, listed in the alphabetical order of the species keys.
mylabels = [
    "Atriophallophorus winterbourni",
    "Clonorchis sinensis",
    "Echinostoma caproni",
    "Fasciola hepatica",
    "Opisthorchis felineus",
    "Opisthorchis viverrini",
    "Schistosoma bovis",
    "Schistosoma curassoni",
    "Schistosoma haematobium",
    "Schistosoma japonicum",
    "Schistosoma mansoni",
    "Schistosoma mattheei",
    "Schistosoma margrebowiei",
    "Trichobilharzia regenti",
]
colours = ["palevioletred", "gold", "darkturquoise"]

# The tick labels below are applied by position, so the box order must be
# fixed explicitly; otherwise seaborn uses the order in which species happen
# to appear in the table and the labels may end up on the wrong boxes.
species_order = sorted(trematode_df["species"].dropna().unique())

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    x="species",
    y="protein_len",
    hue="class",
    order=species_order,
    palette=colours,
    data=trematode_df,
    showfliers=False,  # outliers are hidden
    ax=ax,
)
ax.set_xticklabels(mylabels, rotation=90)
ax.set_ylabel("Protein Length")
ax.set_xlabel("Species")

Calculate the summary statistics.

In [None]:
# Protein FASTA file names, one per species.
trematode_genome_filenames = [
    "Atriophallophorus_red3.agouti.run1.all.maker.proteins.fa",
    "Clonorchis_sinensis_GCA_003604175.1_ASM360417v1_protein.fa",
    "Echinostoma_caproni_GCA_900618425.1_Egypt_0011_upd_protein.fa",
    "Fasciola_hepatica_GCA_002763495.2_1.0.allpaths.pg_protein.fa",
    "Opisthorchis_felineus_GCA_004794785.1_ICG_Ofel_1.0_protein.fa",
    "Opisthorchis_viverrini_GCA_001990785.1_1.0.pg.lrna_protein.fa",
    "Schistosoma_bovis_GCA_003958945.1_ASM395894v1_protein.fa",
    "Schistosoma_curassoni_GCA_900618015.1_Dakar_0011_upd_protein.fa",
    "Schistosoma_haematobium_GCA_000699445.1_1.0_protein.fa",
    "Schistosoma_japonicum_GCA_006368765.1_ASM636876v1_protein.fa",
    "Schistosoma_mansoni_GCA_000237925.2_ASM23792v2_protein.fa",
    "Schistosoma_mattheei_GCA_900617995.1_Denwood_0011_upd_protein.fa",
    "Schistostoma_margrebowiei_GCA_900618395.1_Zambia_0011_upd_protein.fa",
    "Trichobilharzia_regenti_GCA_900618515.1_v1_0_4_001_upd_protein.fa",
]

# Species identifier = everything before the first dot of the file name,
# e.g. "Atriophallophorus_red3" or "Clonorchis_sinensis_GCA_003604175".
trematode_genomes = [
    filename.partition(".")[0] for filename in trematode_genome_filenames
]

from collections import Counter  # used for the mode; it was never imported

# Count, mean, median and mode of one metric for the duplicated genes of each
# species. Change stat_column to 'gene_len' or 'exon_number' for the other
# metrics, and gene_class to summarise another class.
# Reference values noted by the original author for the retained genes of
# Atriophallophorus: mean 548.7659275891103, median 412, modes 239 and 369.
stat_column = 'protein_len'
gene_class = 'duplicated'

for species in trematode_genomes:
    in_group = (trematode_df['species'] == species) & (trematode_df['class'] == gene_class)
    raw_values = trematode_df.loc[in_group, stat_column].tolist()
    n = len(raw_values)
    print(species, n)

    # Without this guard an empty group would raise ZeroDivisionError below.
    if n == 0:
        print("No " + gene_class + " genes found")
        continue

    # mean
    print("Mean ", sum(raw_values) / n)

    # median: middle value, or the average of the two middle values
    ordered = sorted(raw_values)
    middle = n // 2
    if n % 2 == 0:
        median = (ordered[middle] + ordered[middle - 1]) / 2
    else:
        median = ordered[middle]
    print("Median " + str(median))

    # mode: every value that reaches the highest frequency. The maximum is
    # computed once instead of once per distinct value.
    counts = Counter(ordered)
    highest_count = max(counts.values())
    mode = [value for value, count in counts.items() if count == highest_count]

    # If all values are distinct, each of them "wins" and there is no mode.
    if len(mode) == n:
        get_mode = "No mode found"
    else:
        get_mode = "Mode " + ', '.join(map(str, mode))

    print(get_mode)

Plot the distribution of protein length, gene length or exon number per species.

In [None]:
# (species key in the dataframe, line colour, legend label), in plotting order.
# The key "Schistostoma_margrebowiei_..." is spelled as it appears in the data.
_SPECIES_STYLES = (
    ("Atriophallophorus_red3", "red", "Atriophallophorus winterbourni"),
    ("Opisthorchis_felineus_GCA_004794785", "maroon", "Opisthorchis felineus"),
    ("Clonorchis_sinensis_GCA_003604175", "sandybrown", "Clonorchis sinensis"),
    ("Echinostoma_caproni_GCA_900618425", "purple", "Echinostoma caproni"),
    ("Fasciola_hepatica_GCA_002763495", "magenta", "Fasciola hepatica"),
    ("Opisthorchis_viverrini_GCA_001990785", "darksalmon", "Opisthorchis viverrini"),
    ("Schistosoma_bovis_GCA_003958945", "olive", "Schistosoma bovis"),
    ("Schistosoma_curassoni_GCA_900618015", "yellowgreen", "Schistosoma curassoni"),
    ("Schistosoma_haematobium_GCA_000699445", "lawngreen", "Schistosoma haematobium"),
    ("Schistosoma_japonicum_GCA_006368765", "lightgreen", "Schistosoma japonicum"),
    ("Schistosoma_mansoni_GCA_000237925", "g", "Schistosoma mansoni"),
    ("Schistosoma_mattheei_GCA_900617995", "mediumseagreen", "Schistosoma mattheei"),
    ("Schistostoma_margrebowiei_GCA_900618395", "mediumturquoise", "Schistosoma margrebowiei"),
    ("Trichobilharzia_regenti_GCA_900618515", "darkblue", "Trichobilharzia regenti"),
)


def make_size_plot(df, protein_len_cutoff=2000, alpha=0.5, figsize=(12,7), title="title goes here",
                   column="protein_len", xlabel="Protein Length"):
    """Draw one kernel density curve of `column` per species on a shared axis.

    Parameters
    ----------
    df : DataFrame with a 'species' column and the column named by `column`.
    protein_len_cutoff : rows whose `column` value exceeds this are left out.
    alpha : accepted for backward compatibility; it was never applied to the
        curves and is still not, so existing figures keep their appearance.
    figsize : size of the figure passed to matplotlib.
    title : title of the plot.
    column : metric to plot ('protein_len', 'gene_len' or 'exon_number').
    xlabel : label of the x axis; set it to match `column`.

    Returns None; the figure is drawn as a side effect.
    """
    fig, ax = plt.subplots(figsize=figsize)

    subset_df = df[df[column] <= protein_len_cutoff]
    for species, colour, label in _SPECIES_STYLES:
        values = subset_df.loc[subset_df['species'] == species, column]
        # kdeplot draws the same density curve as the deprecated
        # distplot(..., hist=False) did.
        sns.kdeplot(values, color=colour, label=label, ax=ax)

    ax.set_xlabel(xlabel)
    ax.legend(fontsize=12)
    ax.set_title(title, fontsize=20)


# all species combined
make_size_plot(trematode_df, title="All trematode species")

Prepare a table for the ANOVA. We use protein length (protein_len) here, but it can be replaced with gene length or exon number.

In [None]:
# Keep only the two factors and the measured metric, then reshape to long
# format (species, class, variable, value). The 'class' column is renamed to
# 'gene' for the later steps.
protein_length_df = trematode_df[['species', 'class', 'protein_len']]
protein_length_2_df = (
    protein_length_df
    .melt(id_vars=['species', 'class'])
    .rename(columns={'class': 'gene'})
)

In [None]:
# Log-transform the values; useful when the data has many outliers and is
# not normally distributed.
import numpy as np

# A value of 0 cannot be log-transformed, so zeros are treated as missing and
# the affected rows are dropped. Only the 'value' column is inspected; the
# factor columns are left untouched.
measured = protein_length_2_df['value'].replace(0, np.nan)
protein_length_2_df = protein_length_2_df[measured.notna()].copy()

protein_length_log = np.log(protein_length_2_df['value'])
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(protein_length_log.describe())

# assign() overwrites an existing LOG_Values column, so re-running this cell
# does not pile up duplicate columns the way insert(..., allow_duplicates=True) did.
protein_length_2_df = protein_length_2_df.assign(LOG_Values=protein_length_log)
#min-max normalization
def normalize(column):
    """Rescale `column` linearly so its minimum maps to 0 and its maximum to 1.

    `column` must support .min(), .max() and element-wise arithmetic (for
    example a pandas Series or a numpy array). If all values are equal the
    denominator is zero and no finite result is produced.

    Note: this definition rebinds the name `normalize` that the file imports
    from sklearn.preprocessing.
    """
    lowest, highest = column.min(), column.max()
    return (column - lowest) / (highest - lowest)

# Scale the log-transformed values to the range 0-1 and store them next to
# the raw and log-transformed values.
protein_length_log_normalized = normalize(protein_length_log)
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(protein_length_log_normalized.describe())

# assign() overwrites an existing LOG_NORM_Values column, so re-running this
# cell does not create a duplicate column.
protein_length_2_df = protein_length_2_df.assign(LOG_NORM_Values=protein_length_log_normalized)

In [None]:
# Summary statistics of the log-transformed, normalized values: overall, per
# gene class, per species, and per gene class within species.
# The response column is selected before summarising so that researchpy only
# aggregates that column, and every table is printed because a notebook cell
# only displays its last expression.
response = 'LOG_NORM_Values'
print(rp.summary_cont(protein_length_2_df[response]))
for grouping in (['gene'], ['species'], ['gene', 'species']):
    print(rp.summary_cont(protein_length_2_df.groupby(grouping)[response]))

In [None]:
# Fit the linear model: species, gene class and their interaction as
# categorical predictors of the log-transformed, normalized values.
model = ols('LOG_NORM_Values ~ C(species) + C(gene) + C(species):C(gene)', data=protein_length_2_df).fit()
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}")

# Full model summary. Printed explicitly: a bare expression in the middle of
# a cell is evaluated but never shown.
print(model.summary())

# Type 2 ANOVA table (sums of squares, df, F and p per factor); effect sizes
# are derived from it afterwards.
res = sm.stats.anova_lm(model, typ=2)
print(res)
def anova_table(aov):
    """Return a copy of an ANOVA table extended with mean squares and effect sizes.

    `aov` is a DataFrame with the columns 'sum_sq', 'df', 'F' and 'PR(>F)'
    whose LAST row is the residual (error) term. The input is not modified.

    Added columns:
      mean_sq  = sum_sq / df
      eta_sq   = SS_effect / SS_total
      omega_sq = (SS_effect - df_effect * MS_error) / (SS_total + MS_error)
    eta_sq and omega_sq are left as NaN on the residual row.
    """
    # Work on a copy so the caller's table is not changed as a side effect.
    aov = aov.copy()

    aov['mean_sq'] = aov['sum_sq'] / aov['df']

    ss_total = aov['sum_sq'].sum()
    # The residual is addressed by position with .iloc: the index holds term
    # labels, and plain [-1] on such a Series is deprecated positional access.
    ms_error = aov['mean_sq'].iloc[-1]

    # Effects are every row except the residual; assignment aligns on the
    # index, which leaves the residual row as NaN.
    ss_effect = aov['sum_sq'].iloc[:-1]
    df_effect = aov['df'].iloc[:-1]
    aov['eta_sq'] = ss_effect / ss_total
    aov['omega_sq'] = (ss_effect - df_effect * ms_error) / (ss_total + ms_error)

    cols = ['sum_sq', 'mean_sq', 'df', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    return aov[cols]
# ANOVA table with mean squares, eta squared and omega squared added; as the
# last expression of the cell it is what a notebook shows as the cell output.
anova_table(res)

In [None]:
# Tukey's HSD post hoc test: checks which pairs of group means differ
# significantly, separately for each independent factor of the model.
# The comparisons use the same response the ANOVA model was fitted on
# (LOG_NORM_Values) so that the post hoc results refer to the same quantity;
# switch this single line to 'value' to compare the raw measurements instead.
response = protein_length_2_df['LOG_NORM_Values']

# First we test gene category (retained, duplicated, gained)
mc_gene = statsmodels.stats.multicomp.MultiComparison(response, protein_length_2_df['gene'])
mc_gene_results = mc_gene.tukeyhsd()
print(mc_gene_results)

# Then we test species. The names mc / mc_results are kept for this last
# test, which is what they referred to at the end of the original cell.
mc = statsmodels.stats.multicomp.MultiComparison(response, protein_length_2_df['species'])
mc_results = mc.tukeyhsd()
print(mc_results)