In [None]:
import sys

import pandas as pd

sys.path.insert(0, "..")
from utils import load_all, get_dataset_names

## Data loading and preprocessing

In [3]:
# Names of the datasets used throughout the notebook, obtained with cutoff=40.
# NOTE(review): the meaning of `cutoff` is defined in utils.get_dataset_names — confirm there.
dfnames = get_dataset_names(cutoff=40)
# load_all returns two values; only the first is kept. Later cells iterate it
# pairwise with `dfnames` and read `.shape` / `["Segment"]` from each item, so it
# is presumably one DataFrame per dataset name — TODO confirm against utils.load_all.
dfs, _ = load_all(dfnames, expected=False)

In [4]:
def _names_for(selection):
    """Call get_dataset_names with cutoff=40 and the given `selection`."""
    return get_dataset_names(cutoff=40, selection=selection)


# Dataset names per virus-type selection (membership is tested by get_IV_type).
IAV_dfnames = _names_for("IAV")
IBV_dfnames = _names_for("IBV")

# Dataset names per host-system selection (membership is tested by get_host_system).
vitro_dfnames = _names_for("in vitro")
vivo_dfnames = _names_for("in vivo mouse")
human_dfnames = _names_for("in vivo human")

def get_IV_type(datasetname):
    """Return the virus-type label of a dataset.

    :param datasetname: dataset name to look up
    :return: "IAV" if the name is in IAV_dfnames, otherwise "IBV" if it is in
             IBV_dfnames, otherwise the sentinel string "error"
    """
    # IAV membership is checked first, so it wins if a name is in both lists.
    if datasetname in IAV_dfnames:
        return "IAV"
    return "IBV" if datasetname in IBV_dfnames else "error"
    
def get_host_system(datasetname):
    """Return the host-system label of a dataset.

    :param datasetname: dataset name to look up
    :return: "in vitro", "in vivo mouse" or "in vivo human" depending on which
             of vitro_dfnames, vivo_dfnames, human_dfnames contains the name
             (checked in that order); the sentinel string "error" if none does
    """
    if datasetname in vitro_dfnames:
        return "in vitro"
    if datasetname in vivo_dfnames:
        return "in vivo mouse"
    if datasetname in human_dfnames:
        return "in vivo human"
    return "error"

In [5]:
# Count the rows of every loaded dataset, broken down by host system (columns)
# and virus type (rows). Slot 2 of each list and the "overall" column hold the
# totals, matching the index ["IAV", "IBV", "overall"] used below.
data = {
    "in vitro": [0, 0, 0],
    "in vivo mouse": [0, 0, 0],
    "in vivo human": [0, 0, 0],
    "overall": [0, 0, 0]
}
# Position of each virus type inside the per-host lists.
type_rows = {"IAV": 0, "IBV": 1}

for dataset_df, dfname in zip(dfs, dfnames):
    IV_type = get_IV_type(dfname)
    host = get_host_system(dfname)
    # Both helpers return "error" for names they do not know. Fail loudly
    # instead of silently booking such a dataset in the IBV row.
    if IV_type not in type_rows or host not in data:
        raise ValueError(
            f"cannot classify dataset {dfname!r}: type={IV_type!r}, host={host!r}"
        )

    n = dataset_df.shape[0]
    index = type_rows[IV_type]
    # Every dataset contributes to its own host column and to "overall",
    # once in its type row and once in the totals row.
    for column in (host, "overall"):
        data[column][index] += n
        data[column][2] += n

df = pd.DataFrame(data, index=["IAV", "IBV", "overall"])
df

Unnamed: 0,in vitro,in vivo mouse,in vivo human,overall
IAV,11517,6492,1080,19089
IBV,541,0,3735,4276
overall,12058,6492,4815,23365


In [7]:
# For each listed segment, print a table of row counts broken down by host
# system (columns) and virus type (rows); only rows whose "Segment" column
# equals the segment name are counted.
type_rows = {"IAV": 0, "IBV": 1}  # position of each virus type in the count lists

for s in ["PB2", "PB1", "PA"]:
    # Fresh counters per segment; slot 2 and the "overall" column hold totals.
    data = {
        "in vitro": [0, 0, 0],
        "in vivo mouse": [0, 0, 0],
        "in vivo human": [0, 0, 0],
        "overall": [0, 0, 0]
    }

    print(s)
    for dataset_df, dfname in zip(dfs, dfnames):
        IV_type = get_IV_type(dfname)
        host = get_host_system(dfname)
        # Both helpers return "error" for names they do not know. Fail loudly
        # instead of silently booking such a dataset in the IBV row.
        if IV_type not in type_rows or host not in data:
            raise ValueError(
                f"cannot classify dataset {dfname!r}: type={IV_type!r}, host={host!r}"
            )

        n = dataset_df[dataset_df["Segment"] == s].shape[0]
        index = type_rows[IV_type]
        # Every dataset contributes to its own host column and to "overall",
        # once in its type row and once in the totals row.
        for column in (host, "overall"):
            data[column][index] += n
            data[column][2] += n

    df = pd.DataFrame(data, index=["IAV", "IBV", "overall"])
    print(df)

PB2
         in vitro  in vivo mouse  in vivo human  overall
IAV          2621           1209            102     3932
IBV           101              0           1186     1287
overall      2722           1209           1288     5219
PB1
         in vitro  in vivo mouse  in vivo human  overall
IAV          4289           2518            161     6968
IBV            83              0           1512     1595
overall      4372           2518           1673     8563
PA
         in vitro  in vivo mouse  in vivo human  overall
IAV          3416           2560            689     6665
IBV            77              0            830      907
overall      3493           2560           1519     7572
