In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
metadata_df = pd.read_csv("~/mrc/project/sita_eisa/processed/181120_eisametadata.tsv", sep="\t")

metadata_df.head()

Unnamed: 0,GSE,Study,Organism,Cell/Tissue type,Treatment(s),Time (hr),Concentration (µg/ml),Genotype(s),Disease model,Nmin,...,lps_only,p_intron,EISA_nsig,EISA_Up:Down,DESeq_nsig,DESeq_Up:Down,SITA,Remarks,Other seq,Other comments
0,GSE110316,Cancer,Mus musculus,Fibroblasts,,,,"CEBPB-LIP (OE), CEBPB-LAP (OE) vs. CEBPB (KO)",,3.0,...,,0.0508,2,1.0,17,0.888889,FALSE?,,,
1,GSE116780,Inflammation,Homo sapiens,U937 monocytes,Control vs. LPS,6.0,0.01,A20 (KO) vs. WT,,3.0,...,1.0,0.008402,9,0.8,239,2.793651,FALSE?,Very low proportion of introns.,,
2,GSE116780,Inflammation,Homo sapiens,U937 monocytes,Control vs. LPS,6.0,0.01,A20 (KO) vs. WT,,3.0,...,1.0,0.008963,17,1.125,1062,1.13253,FALSE?,Very low proportion of introns.,,
3,GSE134443,Inflammation,Mus musculus,MZB B lymphocytes (treated),"Unstimulated vs. LPS, anti-IgM",4.0,10.0,TRPM5 (KO) vs. WT,,2.0,...,0.0,0.09391,6,0.5,2196,1.204819,FALSE?,,,
4,GSE134443,Inflammation,Mus musculus,MZB B lymphocytes (treated),"Unstimulated vs. LPS, anti-IgM",4.0,10.0,TRPM5 (KO) vs. WT,,2.0,...,0.0,0.06968,2,1.0,2323,1.16093,FALSE?,,,WT-unstimulated vs. KO_LPS o


In [3]:
# General GSE stats
total_gse = len(metadata_df['GSE'].unique())
total_comp = len(metadata_df.loc[metadata_df['SITA'] != ''])

print(f"""
Total GSEs: {total_gse}
Total comparisons: {total_comp}
""")



Total GSEs: 98
Total comparisons: 268



In [6]:
# Comparison stats for each dataset
datasets = ['Inflammation', 'Neurodegeneration']
for dataset in datasets:
    complete_comp = metadata_df.loc[metadata_df['Study'] == dataset]
    total_gse = len(complete_comp['GSE'].unique())
    total_comp = len(complete_comp)

    print(f"Total GSEs for {dataset}: {total_gse}")
    print(f"Total comparisons for {dataset}: {total_comp}\n")

    sita_values = complete_comp['SITA'].value_counts(dropna=False).to_dict()

    for sita, value in sita_values.items():
        print(f"{sita}: {value} ({round((value / total_comp) * 100, 2)}%)")

    true_studies = complete_comp.loc[complete_comp['SITA'] == 'TRUE']['GSE'].unique()

    print(f"""Total studies with confident SITA: {len(true_studies)} ({round((len(true_studies) / total_gse) * 100)}%)
    GSEs: {', '.join(list(true_studies))}\n""")

Total GSEs for Inflammation: 53
Total comparisons for Inflammation: 125

FALSE: 31 (24.8%)
TRUE: 28 (22.4%)
nan: 22 (17.6%)
TRUE?: 15 (12.0%)
FALSE*: 15 (12.0%)
FALSE?: 9 (7.2%)
FALSE*?: 5 (4.0%)
Total studies with confident SITA: 17 (32%)
    GSEs: GSE103719, GSE109834, GSE119380, GSE123596, GSE134443, GSE139592, GSE143241, GSE147943, GSE158889, GSE62641, GSE80304, GSE90046, GSE92618, GSE94144, GSE95078, GSE97538, GSE98563

Total GSEs for Neurodegeneration: 44
Total comparisons for Neurodegeneration: 141

nan: 68 (48.23%)
TRUE: 20 (14.18%)
FALSE*: 17 (12.06%)
TRUE?: 16 (11.35%)
FALSE: 10 (7.09%)
FALSE?: 5 (3.55%)
FALSE*?: 4 (2.84%)
FALSE?*: 1 (0.71%)
Total studies with confident SITA: 12 (27%)
    GSEs: GSE102563, GSE109171, GSE109906, GSE117868, GSE132508, GSE135539, GSE136158, GSE136789, GSE154428, GSE43366, GSE43879, GSE74724



In [5]:
# Analysis of NA comparisons
studies = ['Inflammation', 'Neurodegeneration']
condition = 'TRUE'

for study in studies:
    sub_df = metadata_df.loc[metadata_df['Study'] == study]

    # na_comps = metadata_df.loc[metadata_df['SITA'].isna()]
    # nonna_comps = metadata_df.loc[~(metadata_df['SITA'].isna())]

    comps = sub_df.loc[metadata_df['SITA'] == condition]
    non_comps = sub_df.loc[metadata_df['SITA'] != condition]

    print(f"Total {study} comparisons: {len(na_comps)}")

    fig, axs = plt.subplots(1, 4, figsize=(10, 5))

    axs[0].boxplot([comps['p_intron'], non_comps['p_intron']], labels=[condition, f'non-{condition}'])
    axs[0].set_title("Proportion of introns")

    axs[1].boxplot([comps['DESeq_nsig'], non_comps['DESeq_nsig']], labels=[condition, f'non-{condition}'])
    axs[1].set_title("Significant DESeq2 genes")

    axs[2].boxplot([comps['DESeq_Up:Down'].dropna(), non_comps['DESeq_Up:Down'].dropna()], labels=[condition, f'non-{condition}'])
    axs[2].set_ylim([-1, 10])
    axs[2].set_title("DESeq2 up:down")

    axs[3].boxplot([comps['Min Reads (M)'].dropna(), non_comps['Min Reads (M)'].dropna()], labels=[condition, f'non-{condition}'])
    axs[3].set_title('Min Reads (M)')

    fig.suptitle(f"{study}", y=1.05, fontsize=16)
    fig.tight_layout()

NameError: name 'na_comps' is not defined

In [6]:
lps_wt_df = metadata_df.loc[(metadata_df['Study'] == 'Inflammation') &
                            (metadata_df['wt_comp'] == 1) &
                            (metadata_df['lps_only'] == 1)]

true_df = lps_wt_df.loc[lps_wt_df['SITA'] == 'TRUE']
false_df = lps_wt_df.loc[lps_wt_df['SITA'] == 'FALSE']
na_df = lps_wt_df.loc[lps_wt_df['SITA'].isna()]

true_df.to_csv("../processed/inflammation_true.txt", sep="\t")
false_df.to_csv("../processed/inflammation_false.txt", sep="\t")
na_df.to_csv("../processed/inflammation_na.txt", sep="\t")

In [13]:
nd_df = metadata_df.loc[(metadata_df['Study'] == 'Neurodegeneration')]

nd_gse = len(nd_df['GSE'].unique())

microglia_df = metadata_df.loc[(metadata_df['Cell/Tissue type'].str.contains('microglia')) |
(metadata_df['Cell/Tissue type'].str.contains('Microglia'))]

mg_gse = len(microglia_df['GSE'].unique())
mg_comp = len(microglia_df) - 3

print(f"""
Total GSEs with microglia: {mg_gse} ({mg_gse / nd_gse * 100}%)
Total comparisons with microglia: {mg_comp} ({mg_comp / len(nd_df['GSE']) * 100}%)

""")



Total GSEs with microglia: 33 (75.0%)
Total comparisons with microglia: 86 (60.99290780141844%)


