# Explore Round 2 Data Patterns
An exploration of the round 2 data to understand prominent patterns.

In [1]:
import pathlib

import kaleido
import pandas as pd
import plotly.express as px
import plotly.io as pio
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


## Find the root of the git repo on the host system

In [2]:
# Start the search at the directory the notebook is executed from.
cwd = pathlib.Path.cwd()

# Check the working directory first, then each ancestor from nearest to
# farthest; the first one that holds a ".git" directory is the repository root.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Stop early when the notebook is not being run from inside a Git checkout.
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# Inputs

In [3]:
# Location of the pilot data, relative to the repository root found above.
pilot_path = root_dir / "big_drive/alsf/ALSF_pilot_data"

# strict=True raises FileNotFoundError immediately if the directory is missing.
round2_profiles_path = (
    pilot_path / "preprocessed_profiles_Round_2_data/Round_2_data"
).resolve(strict=True)

# Sort the matches so the row order of the combined frame does not depend on
# the order in which the filesystem happens to list the files.
profile_files = sorted(round2_profiles_path.rglob("*feature_selected.parquet"))

if not profile_files:
    raise FileNotFoundError(
        f"No '*feature_selected.parquet' files found under {round2_profiles_path}"
    )

# Read each file exactly once. join="inner" keeps only the columns shared by
# every file, in the column order of the first file. Passing a set of column
# names to the reader instead made the column order depend on string hashing,
# which differs between kernel sessions.
round2df = pd.concat(
    [pd.read_parquet(pq_file) for pq_file in profile_files],
    axis=0,
    join="inner",
)

# Columns shared by every profile file (kept as a set, as before).
common_cols = set(round2df.columns)

In [4]:
# Output locations for figures, relative to the current working directory.
round2_figure_path = pathlib.Path("round2_figures_path")
int_figure_path = round2_figure_path / "interactive_figures"
static_figure_path = round2_figure_path / "static_figures"

# Create both output folders (and the shared parent) if they do not exist yet.
for figure_dir in (int_figure_path, static_figure_path):
    figure_dir.mkdir(parents=True, exist_ok=True)

# High-Level Data Characteristics

In [5]:
# Report the (rows, columns) shape of the combined round 2 profiles.
print(round2df.shape)

(1050944, 445)


In [6]:
# Collapse the per-cell missing-value mask to one flag: first per column,
# then across columns.
has_nan = round2df.isna().any().any()

print("Are there any NaN entries?", f"Answer: {has_nan}", sep="\n")

Are there any NaN entries?


Answer: False


## Metadata Exploration

In [7]:
# Distinct values of the plate column, in order of first appearance.
round2df["Metadata_Plate"].unique()

array(['BR00145438', 'BR00145440', 'BR00145439', 'BR00145818',
       'BR00145817', 'BR00145816'], dtype=object)

In [8]:
# Distinct values of the time point column (units are not stated in this notebook).
round2df["Metadata_time_point"].unique()

array([24, 72, 48])

In [9]:
# Same time point query as the preceding cell, shown under a printed heading.
print("\nUnique timepoints:")
round2df["Metadata_time_point"].unique()


Unique timepoints:


array([24, 72, 48])

In [10]:
# Distinct values of the seeding density column, shown under a printed heading.
print("\nUnique Seeding Densities:")
round2df["Metadata_seeding_density"].unique()


Unique Seeding Densities:


array([ 1000,  2000,  4000,  8000, 12000])

In [11]:
# Number of rows per seeding density, most frequent first.
round2df["Metadata_seeding_density"].value_counts()

Metadata_seeding_density
12000    353135
8000     301038
4000     211533
2000     119399
1000      65839
Name: count, dtype: int64

In [12]:
# Distinct values of the cell line column, in order of first appearance.
round2df["Metadata_cell_line"].unique()

array(['A-673', 'SKNMC', 'CHP-212', 'NB-1', 'KPNYN', 'SHSY5Y', 'U2-OS',
       'SKNDZ', 'CHLA-10', 'CHLA-25', 'CHLA-113', 'CHLA-200', 'CHLA-218'],
      dtype=object)

In [13]:
# Number of distinct values in the Metadata_Image_Count_Cells column.
# NOTE(review): presumably a per-image cell count — confirm against the profile schema.
round2df["Metadata_Image_Count_Cells"].nunique()

844

In [14]:
# Number of rows per plate, most frequent first.
round2df["Metadata_Plate"].value_counts()

Metadata_Plate
BR00145440    387332
BR00145439    225179
BR00145818    155145
BR00145438    137004
BR00145817     82987
BR00145816     63297
Name: count, dtype: int64

# UMAP Figures

In [15]:
# Experimental factors used both to stratify the sample and to label the embedding.
group_cols = ["Metadata_cell_line", "Metadata_seeding_density", "Metadata_time_point"]

# Draw 50 rows from every (cell line, seeding density, time point) combination
# with a fixed seed so the subsample is repeatable.
vdf = round2df.groupby(group_cols).sample(n=50, random_state=0)

# Only non-metadata columns are passed to UMAP; metadata columns are detected
# by a case-insensitive "metadata" substring match on the column name.
is_metadata_col = vdf.columns.str.contains("metadata", case=False)

umap_obj = UMAP(random_state=0)
embedding = umap_obj.fit_transform(vdf.loc[:, ~is_metadata_col])

# Attach the grouping labels positionally: the sampled frame keeps its original
# index, so it is reset to line up with the freshly created embedding frame.
umapdf = pd.DataFrame(embedding, columns=["umap0", "umap1"]).assign(
    **{col: vdf[col].reset_index(drop=True) for col in group_cols}
)

# Cast the numeric factors to strings so they are treated as discrete categories.
umapdf = umapdf.astype(
    {"Metadata_seeding_density": str, "Metadata_time_point": str}
)

  warn(


In [16]:
# Needed to display in chrome
# NOTE(review): in the recorded run this call returned the path of a Chrome
# binary inside the virtual environment; presumably that is the browser Kaleido
# uses for the fig.write_image calls in the cells below — confirm against the
# Kaleido documentation.
kaleido.get_chrome_sync()

PosixPath('/home/camo/projects/pediatric_cancer_atlas_analysis/1.exploratory_data_analysis/.venv/lib/python3.11/site-packages/choreographer/cli/browser_exe/chrome-linux64/chrome')

In [17]:
# Scatter plot of the 2-D UMAP embedding, coloured by cell line using the
# qualitative Dark24 palette.
fig = px.scatter(
    umapdf,
    x="umap0",
    y="umap1",
    color="Metadata_cell_line",
    title="UMAP by Cell Line",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)

fig.show()

# The files carry a "round2_" prefix: this notebook analyses the round 2 data
# and writes into the round 2 figure folders, so the earlier "round1_" prefix
# mislabelled the outputs.
fig.write_image(
    static_figure_path / "round2_cell_line_umap.png", width=2000, height=1200
)
# include_plotlyjs="embed" bundles plotly.js into the HTML file.
fig.write_html(
    int_figure_path / "round2_cell_line_umap.html",
    full_html=True,
    include_plotlyjs="embed",
)

In [18]:
# Sequential light-to-dark palette, one colour per seeding density.
blue_green_colors = ["#edf8fb", "#b2e2e2", "#66c2a4", "#2ca25f", "#006d2c"]

# The densities are stored as strings for discrete colouring. Plotly assigns
# colours in order of first appearance, so pin an ascending numeric order to
# guarantee that lighter colours map to lower densities.
density_order = sorted(umapdf["Metadata_seeding_density"].unique(), key=float)

fig = px.scatter(
    umapdf,
    x="umap0",
    y="umap1",
    color="Metadata_seeding_density",
    title="UMAP by Seeding Density",
    color_discrete_sequence=blue_green_colors,
    category_orders={"Metadata_seeding_density": density_order},
)

fig.show()

# The files carry a "round2_" prefix: this notebook analyses the round 2 data
# and writes into the round 2 figure folders, so the earlier "round1_" prefix
# mislabelled the outputs.
fig.write_image(
    static_figure_path / "round2_seeding_density_umap.png", width=2000, height=1200
)
# include_plotlyjs="embed" bundles plotly.js into the HTML file.
fig.write_html(
    int_figure_path / "round2_seeding_density_umap.html",
    full_html=True,
    include_plotlyjs="embed",
)

In [19]:
# Sequential light-to-dark palette, one colour per time point.
blue_green_short_colors = ["#edf8fb", "#99d8c9", "#2ca25f"]

# The time points are stored as strings for discrete colouring. Plotly assigns
# colours in order of first appearance, so pin an ascending numeric order to
# guarantee that lighter colours map to earlier time points.
time_point_order = sorted(umapdf["Metadata_time_point"].unique(), key=float)

fig = px.scatter(
    umapdf,
    x="umap0",
    y="umap1",
    color="Metadata_time_point",
    title="UMAP by Time Point",
    color_discrete_sequence=blue_green_short_colors,
    category_orders={"Metadata_time_point": time_point_order},
)

fig.show()

# The files carry a "round2_" prefix: this notebook analyses the round 2 data
# and writes into the round 2 figure folders, so the earlier "round1_" prefix
# mislabelled the outputs.
fig.write_image(
    static_figure_path / "round2_time_point_umap.png", width=2000, height=1200
)
# include_plotlyjs="embed" bundles plotly.js into the HTML file.
fig.write_html(
    int_figure_path / "round2_time_point_umap.html",
    full_html=True,
    include_plotlyjs="embed",
)

# Treemap Cell Count Figures

In [20]:
# Hierarchy shown in the treemap, outermost level first.
treemap_levels = [
    "Metadata_cell_line",
    "Metadata_seeding_density",
    "Metadata_time_point",
]

# Count the rows for every observed combination of the three factors.
combination_counts = round2df.value_counts(subset=treemap_levels)
cellcountdf = combination_counts.reset_index(name="count")

# Tile area is proportional to the number of rows in each combination.
fig = px.treemap(
    cellcountdf,
    path=treemap_levels,
    values="count",
    title="Treemap of Category Combinations",
)
fig.show()

# Save a static PNG and a self-contained interactive HTML copy.
treemap_png = static_figure_path / "cell_count_treemap.png"
treemap_html = int_figure_path / "cell_count_treemap.html"
fig.write_image(treemap_png, width=2000, height=1200)
fig.write_html(treemap_html, full_html=True, include_plotlyjs="embed")