In [1]:
import os
import sys
import warnings
import matplotlib

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

sys.path.insert(0, "..")
from utils import load_all, get_dataset_names, scheirer_ray_hare_test
from utils import RESULTSPATH
from overall_comparision.general_analyses import calc_start_end_lengths

plt.style.use("seaborn")

## Load data and preprocessing

In [2]:
dfnames = get_dataset_names(cutoff=40)
dfs, _ = load_all(dfnames, expected=False)

## Create figure 3

In [3]:
def create_comparision_matrix(dfs: list, dfnames: list):
    '''
        compare the given datasets pairwise in their difference of start and
        end sequence length by creating a matrix.
        :param dfs: The list of DataFrames containing the data, preprocessed
            with sequence_df(df)
        :param dfnames: The names associated with each DataFrame in `dfs`

        :return: None
    '''    
    plot_list, _ = calc_start_end_lengths(dfs, dfnames)

    plt.figure(figsize=(10, 9))
    plt.rc("font", size=20)
    # initialize an empty matrix
    matrix_size = len(plot_list)
    matrix = [[0] * matrix_size for _ in range(matrix_size)]
    # calculate the differences and populate the matrix
    for i, d1 in enumerate(plot_list):
        for j, d2 in enumerate(plot_list):
            if i == j:
                matrix[i][j] = np.nan
            else:
                # cliffs delta
                U, p = stats.mannwhitneyu(d1, d2)
                cliffs_delta = 2*U / (len(d1)*len(d2)) - 1
                matrix[i][j] = abs(cliffs_delta)
                color = "black" if abs(cliffs_delta) > 0.4 else "white"
                plt.annotate(f"{cliffs_delta:.2f}", xy=(j, i), color=color, ha='center', va='center', fontsize=10, fontweight='bold')
                
    warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
    plt.imshow(matrix, cmap="viridis", interpolation="nearest")
    plt.colorbar(fraction=0.046, pad=0.04, location="right", label="Cliff's delta (absolute values)")
    plt.xticks(np.arange(len(dfnames)), [f"{n}    " for n in dfnames], rotation=90)
    plt.yticks(np.arange(len(dfnames)), [f"{n}    " for n in dfnames])
    plt.tight_layout()
    plt.grid(False)

    save_path = os.path.join(RESULTSPATH, "datasplits")
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    plt.savefig(os.path.join(save_path, "cliffs_d_comparision_matrix.png"), dpi=300)
    plt.close()

create_comparision_matrix(dfs, dfnames)

## Define functions for more finer splits

In [4]:
IAV_dfnames = get_dataset_names(cutoff=40, selection="IAV")
IBV_dfnames = get_dataset_names(cutoff=40, selection="IBV")

vitro_dfnames = get_dataset_names(cutoff=40, selection="in vitro")
vivo_dfnames = get_dataset_names(cutoff=40, selection="in vivo mouse")    
human_dfnames = get_dataset_names(cutoff=40, selection="in vivo human")

def get_IV_type(datasetname):
    if datasetname in IAV_dfnames:
        return "IAV"
    elif datasetname in IBV_dfnames:
        return "IBV"
    else:
        return "error"
    
def get_host_system(datasetname):
    if datasetname in vitro_dfnames:
        return "in vitro"
    elif datasetname in vivo_dfnames:
        return "in vivo mouse"
    elif datasetname in human_dfnames:
        return "in vivo human"
    else:
        return "error"

### Pooled data and for single segments

In [5]:
p_iv_list = list()
p_host_list = list()
p_inter_list = list()
for seg in ["Pooled", "PB2", "PB1", "PA"]:
    t_dfs = list()
    if seg == "Pooled":
        t_dfs = dfs
    else:
        ns = 0
        for df in dfs:
            ns += len(df)
            t_dfs.append(df[df["Segment"] == seg])
        print(ns)

    t_diff_3_5, _ = calc_start_end_lengths(t_dfs, dfnames)

    data = pd.DataFrame({
        "Measure": list(),
        "IV_type": list(),
        "Host_system": list()
    })

    for dfname, values_3_5 in zip(dfnames, t_diff_3_5):
        temp_data = pd.DataFrame({
            "Measure": [np.mean(values_3_5)],
            "IV_type": [get_IV_type(dfname)],
            "Host_system": [get_host_system(dfname)]
        })
        data = pd.concat([data, temp_data], ignore_index=True)

    data = data[data["Host_system"] != "in vivo mouse"].reset_index(drop=True)

    H_iv, p_iv, H_host, p_host, H_int, p_int = scheirer_ray_hare_test(data)
    print(seg)
    print("IV type host  interaction")
    print(round(H_iv,2), round(H_host,2), round(H_int,2))
    print(round(p_iv,4), round(p_host,4), round(p_int,4))

    p_iv_list.append(p_iv)
    p_host_list.append(p_host)
    p_inter_list.append(p_int)


Pooled
IV type host  interaction
9.45 5.35 1.2
0.0021 0.0207 0.273
23365
PB2
IV type host  interaction
4.76 5.35 5.89
0.0292 0.0207 0.0152
23365


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["End_L"] = df["full_seq"].str.len() - df["End"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["End_L"] = df["full_seq"].str.len() - df["End"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["End_L"] = df["full_seq"].str.len() - df["End"]


PB1
IV type host  interaction
7.71 4.42 3.87
0.0055 0.0355 0.0491
23365
PA
IV type host  interaction
6.65 4.42 4.93
0.0099 0.0355 0.0264


## Supplementary Figure

In [None]:
def compare_iav_ibv(dfs1: list, dfnames1: list, dfs2: list, dfnames2: list, categories: list, analysis: str)-> None:
    '''
        compare the given datasets in their difference of start and end
        sequence length.
        :param dfs: The list of DataFrames containing the data, preprocessed
            with sequence_df(df)
        :param dfnames: The names associated with each DataFrame in `dfs`

        :return: None
    '''
    data1, _ = calc_start_end_lengths(dfs1, dfnames1)
    data2, _ = calc_start_end_lengths(dfs2, dfnames2)
    list1 = [item for sublist in data1 for item in sublist]
    list2 = [item for sublist in data2 for item in sublist]
    plot_list = [list1, list2]

    fig, axs = plt.subplots(1, 1, figsize=(5, 1.5), tight_layout=True)
    position_list = np.arange(0, 2)
    violin_parts = axs.violinplot(plot_list, position_list, showextrema=False, points=1000, showmeans=True, vert=False)
    for pc in violin_parts["bodies"]:
        pc.set_edgecolor("black")

    for i, d in enumerate(plot_list):
        y_p = np.random.uniform(i-0.3, i+0.3, len(d))
        plt.scatter(d, y_p, c="darkgrey", s=2, zorder=0)

    axs.set_yticks(position_list)
    if analysis == "IAV_IBV":
        step = (12, 12)
    elif analysis == "vivo_vitro":
        step = (6, 6)
    elif analysis == "vitro_IAV":
        step = (0, 0)
    elif analysis == "vivo_IBV":
        step = (0, 0)
    elif analysis == "IAV_vitro_vivo":
        step = (2, 2)
    elif analysis == "IBV_vitro_vivo":
        step = (0, 0)

    axs.set_yticklabels([f"{' '*step[0]}{categories[0]} (n={len(plot_list[0])})", f"{' '*step[0]}{categories[1]} (n={len(plot_list[1])})"])

    axs.set_xlabel("5'-end length - 3'-end length")
    axs.set_xlim(right=340)
    axs.set_xticks([-300, -200, -100, 0, 100, 200, 300])

    save_path = os.path.join(RESULTSPATH, "datasplits")
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    plt.savefig(os.path.join(save_path, f"{analysis}_3_5_ends.png"))
    plt.close()


def select_PB2(dfs):
    return [df[df["Segment"] == "PB2"] for df in dfs]

if __name__ == "__main__":
    IAV_dfnames = get_dataset_names(cutoff=40, selection="IAV")
    IAV_dfs, _ = load_all(IAV_dfnames)
    IAV_dfs = select_PB2(IAV_dfs)
    IBV_dfnames = get_dataset_names(cutoff=40, selection="IBV")
    IBV_dfs, _ = load_all(IBV_dfnames)
    IBV_dfs = select_PB2(IBV_dfs)
    categories = ["IAV", "IBV"]
    compare_iav_ibv(IAV_dfs, IAV_dfnames, IBV_dfs, IBV_dfnames, categories, analysis="IAV_IBV")

####### further analysis
# in vitro against in vivo datasets
    vitro_dfnames = get_dataset_names(cutoff=40, selection="in vitro")
    vitro_dfs, _ = load_all(vitro_dfnames)
    vitro_dfs = select_PB2(vitro_dfs)
    human_dfnames = get_dataset_names(cutoff=40, selection="in vivo human")
    human_dfs, _ = load_all(human_dfnames)
    human_dfs = select_PB2(human_dfs)

    categories = ["in vitro", "human"]
    compare_iav_ibv(vitro_dfs, vitro_dfnames, human_dfs, human_dfnames, categories, analysis="vivo_vitro")

# in vitro all IAV against BLEE and Sheng
    vitro_iav_dfs = list()
    vitro_iav_dfnames = list()
    vitro_ibv_dfs = list()
    vitro_ibv_dfnames = list()
    for df, dfname in zip(vitro_dfs, vitro_dfnames):
        if dfname in ["Alnaji2019_BLEE", "Sheng2018"]:
            vitro_ibv_dfs.append(df)
            vitro_ibv_dfnames.append(dfname)
        else:
            vitro_iav_dfs.append(df)
            vitro_iav_dfnames.append(dfname)

    categories = ["IAV in vitro", "IBV in vitro"]
    compare_iav_ibv(vitro_iav_dfs, vitro_iav_dfnames, vitro_ibv_dfs, vitro_ibv_dfnames, categories, analysis="vitro_IAV")

# in vivo human all IBV against Berry A
    vivo_iav_dfs = list()
    vivo_iav_dfnames = list()
    vivo_ibv_dfs = list()
    vivo_ibv_dfnames = list()
    for df, dfname in zip(human_dfs, human_dfnames):
        if dfname == "Berry2021_A":
            vivo_iav_dfs.append(df)
            vivo_iav_dfnames.append(dfname)
        else:
            vivo_ibv_dfs.append(df)
            vivo_ibv_dfnames.append(dfname)

    categories = ["IAV human", "IBV human"]
    compare_iav_ibv(vivo_iav_dfs, vivo_iav_dfnames, vivo_ibv_dfs, vivo_ibv_dfnames, categories, analysis="vivo_IBV")

# IAV vitro vs vivo human
    categories = ["IAV in vitro", "IAV human"]
    compare_iav_ibv(vitro_iav_dfs, vitro_iav_dfnames, vivo_iav_dfs, vitro_ibv_dfnames, categories, analysis="IAV_vitro_vivo")

# IBV vitro vs vivo human
    categories = ["IBV in vitro", "IBV human"]
    compare_iav_ibv(vitro_ibv_dfs, vivo_ibv_dfnames, vivo_ibv_dfs, vivo_ibv_dfnames, categories, analysis="IBV_vitro_vivo")


### Calculate absolute differences for supplement

In [14]:
def compare_iav_ibv(dfs1: list, dfnames1: list, dfs2: list, dfnames2: list, categories: list)-> None:
    data1, _ = calc_start_end_lengths(dfs1, dfnames1)
    data2, _ = calc_start_end_lengths(dfs2, dfnames2)
    list1 = [item for sublist in data1 for item in sublist]
    list2 = [item for sublist in data2 for item in sublist]

    print(f"difference of lengths for {categories}")
    print(np.mean(list1))
    print(np.mean(list2))
    print(abs(np.mean(list1)) - abs(np.mean(list2)))


if __name__ == "__main__":
    IAV_dfnames = get_dataset_names(cutoff=40, selection="IAV")
    IAV_dfs, _ = load_all(IAV_dfnames)
    IBV_dfnames = get_dataset_names(cutoff=40, selection="IBV")
    IBV_dfs, _ = load_all(IBV_dfnames)
    categories = ["IAV", "IBV"]
    compare_iav_ibv(IAV_dfs, IAV_dfnames, IBV_dfs, IBV_dfnames, categories)

####### further analysis
# in vitro against in vivo datasets
    vitro_dfnames = get_dataset_names(cutoff=40, selection="in vitro")
    vitro_dfs, _ = load_all(vitro_dfnames)
    human_dfnames = get_dataset_names(cutoff=40, selection="in vivo human")
    human_dfs, _ = load_all(human_dfnames)

    categories = ["in vitro", "human"]
    compare_iav_ibv(vitro_dfs, vitro_dfnames, human_dfs, human_dfnames, categories)

# in vitro all IAV against BLEE and Sheng
    vitro_iav_dfs = list()
    vitro_iav_dfnames = list()
    vitro_ibv_dfs = list()
    vitro_ibv_dfnames = list()
    for df, dfname in zip(vitro_dfs, vitro_dfnames):
        if dfname in ["Alnaji2019_BLEE", "Sheng2018"]:
            vitro_ibv_dfs.append(df)
            vitro_ibv_dfnames.append(dfname)
        else:
            vitro_iav_dfs.append(df)
            vitro_iav_dfnames.append(dfname)

    categories = ["IAV in vitro", "IBV in vitro"]
    compare_iav_ibv(vitro_iav_dfs, vitro_iav_dfnames, vitro_ibv_dfs, vitro_ibv_dfnames, categories)

# in vivo human all IBV against Berry A
    vivo_iav_dfs = list()
    vivo_iav_dfnames = list()
    vivo_ibv_dfs = list()
    vivo_ibv_dfnames = list()
    for df, dfname in zip(human_dfs, human_dfnames):
        if dfname == "Berry2021_A":
            vivo_iav_dfs.append(df)
            vivo_iav_dfnames.append(dfname)
        else:
            vivo_ibv_dfs.append(df)
            vivo_ibv_dfnames.append(dfname)

    categories = ["IAV human", "IBV human"]
    compare_iav_ibv(vivo_iav_dfs, vivo_iav_dfnames, vivo_ibv_dfs, vivo_ibv_dfnames, categories)

# IAV vitro vs vivo human
    categories = ["IAV in vitro", "IAV human"]
    compare_iav_ibv(vitro_iav_dfs, vitro_iav_dfnames, vivo_iav_dfs, vitro_ibv_dfnames, categories)

# IBV vitro vs vivo human
    categories = ["IBV in vitro", "IBV human"]
    compare_iav_ibv(vitro_ibv_dfs, vivo_ibv_dfnames, vivo_ibv_dfs, vivo_ibv_dfnames, categories)


difference of lengths for ['IAV', 'IBV']
-66.00065954359583
-12.273790951638066
53.72686859195777
difference of lengths for ['in vitro', 'human']
-62.478020854630955
-24.276261772518716
38.20175908211224
difference of lengths for ['IAV in vitro', 'IBV in vitro']
-63.72420826623725
-37.62098501070664
26.103223255530608
difference of lengths for ['IAV human', 'IBV human']
-93.03412073490814
-8.770642201834862
84.26347853307328
difference of lengths for ['IAV in vitro', 'IAV human']
-63.72420826623725
-93.03412073490814
-29.309912468670895
difference of lengths for ['IBV in vitro', 'IBV human']
-37.62098501070664
-8.770642201834862
28.85034280887178
