In [1]:
import pandas as pd
import anndata as ad

In [19]:
# Read the AnnData object stored in the .h5ad file below; every later cell works on `sc_counts`.
counts_path = "../data/sc_counts_reannotated_with_counts.h5ad"
sc_counts = ad.read_h5ad(counts_path)

In [20]:
# Keep only observations whose cell_count_by_well_celltype is strictly greater than 10
# (anything with a value of 10 or less is dropped).
sc_counts = sc_counts[sc_counts.obs["cell_count_by_well_celltype"].gt(10)]

In [26]:
# Count the number of distinct cell types observed for each sm_name.
# NOTE(review): this step was described as "filter out sm_names that have less than 2 donors",
# but the statement below neither looks at donor_id nor filters anything - it only builds a
# per-sm_name count of unique cell_type values. Confirm whether a donor-based filter is missing.
celltype_counts = sc_counts.obs.groupby('sm_name').cell_type.nunique()

  celltype_counts = sc_counts.obs.groupby('sm_name').cell_type.nunique()


In [43]:
# sm_name values whose per-donor cell type fractions are printed and reviewed
# in the cells that follow.
fractions_outliers = [
    "ABT737",
    "Alvocidib",
    "BMS-387032",
    "CGP 60474",
    "Canertinib",
    "Foretinib",
    "Ganetespib (STA-9090)",
    "IN1451",
    "Navitoclax",
    "PF-04691502",
    "Palbociclib",
    "Proscillaridin A;Proscillaridin-A",
    "R428",
    "Tamatinib",
    "UNII-BXU45ZH6LI",
]

In [44]:
# drop donors with large inconsistency in cell type fractions

In [None]:
# Partial list of compounds with large cross-donor inconsistency in cell type fractions.
# The third entry was previously the truncated string "BMS-3870", which matches no sm_name;
# it is spelled out as "BMS-387032", the name used everywhere else in this notebook.
compounds_with_large_inconsistency = [
    "ABT737",
    "Alvocidib",
    "BMS-387032",
    "CGP 60474",
    "Canertinib",
]

In [48]:
# For a few reference compounds, print the normalized cell type composition of every donor
# that has cells for that compound.
for sm_name in ['Dabrafenib', 'Belinostat', 'Dimethyl Sulfoxide']:
    print(sm_name)
    # Row mask for the current compound; it does not change inside the donor loop.
    is_compound = sc_counts.obs.sm_name == sm_name
    for donor in sc_counts[is_compound].obs.donor_id.unique():
        print(donor)
        is_donor = sc_counts.obs.donor_id == donor
        donor_fractions = sc_counts[is_compound & is_donor].obs.cell_type.value_counts(normalize=True)
        print(donor_fractions)
        print()
    print("--------------------------------------------------")

Dabrafenib
Donor 1
cell_type
T cells          0.500774
Myeloid cells    0.226019
B cells          0.216091
NK cells         0.057117
Name: proportion, dtype: float64

Donor 2
cell_type
T cells          0.588915
Myeloid cells    0.245536
B cells          0.107132
NK cells         0.058417
Name: proportion, dtype: float64

Donor 3
cell_type
T cells          0.582358
Myeloid cells    0.210734
B cells          0.137489
NK cells         0.069419
Name: proportion, dtype: float64

--------------------------------------------------
Belinostat
Donor 1
cell_type
T cells          0.584990
B cells          0.248982
Myeloid cells    0.112332
NK cells         0.053697
Name: proportion, dtype: float64

Donor 2
cell_type
T cells          0.691750
B cells          0.141951
Myeloid cells    0.094547
NK cells         0.071752
Name: proportion, dtype: float64

Donor 3
cell_type
T cells          0.702660
B cells          0.127778
Myeloid cells    0.104701
NK cells         0.064861
Name: proportion, dtype: 

In [45]:
# For every compound listed in fractions_outliers, print the normalized cell type
# composition of each donor that has cells for that compound.
for sm_name in fractions_outliers:
    print(sm_name)
    # Row mask for the current compound; it does not change inside the donor loop.
    is_compound = sc_counts.obs.sm_name == sm_name
    for donor in sc_counts[is_compound].obs.donor_id.unique():
        print(donor)
        is_donor = sc_counts.obs.donor_id == donor
        donor_fractions = sc_counts[is_compound & is_donor].obs.cell_type.value_counts(normalize=True)
        print(donor_fractions)
        print()
    print("--------------------------------------------------")

ABT737
Donor 1
cell_type
T cells          0.678431
Myeloid cells    0.223529
NK cells         0.098039
Name: proportion, dtype: float64

Donor 2
cell_type
T cells          0.636364
Myeloid cells    0.327922
NK cells         0.035714
Name: proportion, dtype: float64

Donor 3
cell_type
T cells          0.732523
Myeloid cells    0.212766
NK cells         0.054711
Name: proportion, dtype: float64

--------------------------------------------------
Alvocidib
Donor 1
cell_type
T cells    1.0
Name: proportion, dtype: float64

Donor 3
cell_type
T cells    1.0
Name: proportion, dtype: float64

--------------------------------------------------
BMS-387032
Donor 1
cell_type
T cells     0.786325
NK cells    0.213675
Name: proportion, dtype: float64

Donor 2
cell_type
T cells    1.0
Name: proportion, dtype: float64

Donor 3
cell_type
T cells     0.77451
NK cells    0.22549
Name: proportion, dtype: float64

--------------------------------------------------
CGP 60474
Donor 1
cell_type
T cells    1.0

In [49]:
# Alvocidib: only T cells, and only in 2 donors -> drop the compound entirely.
sc_counts = sc_counts[sc_counts.obs["sm_name"].ne("Alvocidib")]

In [52]:
# BMS-387032: Donor 2 contains only T cells, while the other two donors are consistent with
# each other (but show just 2 cell types). Keep those 2 cell types and drop Donor 2 for this
# compound, i.e. keep every row that is NOT (BMS-387032 and Donor 2).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "BMS-387032") | (sc_counts.obs.donor_id != "Donor 2")
]

In [55]:
# CGP 60474: only T cells remain for this compound -> drop it entirely.
sc_counts = sc_counts[sc_counts.obs["sm_name"].ne("CGP 60474")]

In [61]:
# Canertinib: Myeloid cell proportions vary too much across donors -> drop Myeloid cells for
# this compound, i.e. keep every row that is NOT (Canertinib and Myeloid cells).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "Canertinib") | (sc_counts.obs.cell_type != "Myeloid cells")
]

In [63]:
# Foretinib: large variation in Myeloid cell proportions (and some in T cells) -> drop Myeloid
# cells for this compound, i.e. keep every row that is NOT (Foretinib and Myeloid cells).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "Foretinib") | (sc_counts.obs.cell_type != "Myeloid cells")
]

In [65]:
# Ganetespib (STA-9090): Donor 2 has no Myeloid cells and a small NK cell proportion -> drop
# Donor 2 for this compound, i.e. keep every row that is NOT (Ganetespib and Donor 2).
# NOTE(review): the stated plan for this compound was "Skip Myeloid, remove donor 2", but this
# statement only removes Donor 2; Myeloid cells of the remaining donors are kept. Confirm
# whether a Myeloid filter is missing here.
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "Ganetespib (STA-9090)") | (sc_counts.obs.donor_id != "Donor 2")
]

In [67]:
# IN1451: Donor 2 has no NK or B cells -> drop Donor 2 for this compound,
# i.e. keep every row that is NOT (IN1451 and Donor 2).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "IN1451") | (sc_counts.obs.donor_id != "Donor 2")
]

In [69]:
# Navitoclax: Donor 3 lacks B cells and has different T and Myeloid proportions -> drop
# Donor 3 for this compound, i.e. keep every row that is NOT (Navitoclax and Donor 3).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "Navitoclax") | (sc_counts.obs.donor_id != "Donor 3")
]

In [71]:
# PF-04691502: Myeloid cells are present only in Donor 3 -> drop Myeloid cells for this
# compound, i.e. keep every row that is NOT (PF-04691502 and Myeloid cells).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "PF-04691502") | (sc_counts.obs.cell_type != "Myeloid cells")
]

In [73]:
# Proscillaridin A;Proscillaridin-A: Myeloid proportions vary very strongly (about 4x) ->
# drop Myeloid cells for this compound, i.e. keep every row that is NOT
# (Proscillaridin A;Proscillaridin-A and Myeloid cells).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "Proscillaridin A;Proscillaridin-A")
    | (sc_counts.obs.cell_type != "Myeloid cells")
]

In [75]:
# R428: NK cell proportions vary strongly (close to 3x) -> drop NK cells for this compound,
# i.e. keep every row that is NOT (R428 and NK cells).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "R428") | (sc_counts.obs.cell_type != "NK cells")
]

In [77]:
# UNII-BXU45ZH6LI: large variation across all cell types plus missing cell types ->
# drop the compound entirely.
sc_counts = sc_counts[sc_counts.obs["sm_name"].ne("UNII-BXU45ZH6LI")]

In [None]:
# BMS-387032: drop Myeloid cells and B cells for this compound, i.e. keep every row that is
# NOT (BMS-387032 and cell_type in {Myeloid cells, B cells}).
sc_counts = sc_counts[
    (sc_counts.obs.sm_name != "BMS-387032")
    | ~sc_counts.obs.cell_type.isin(["Myeloid cells", "B cells"])
]