# Motivation

A notebook for preparing the sample info files needed to run differential expression analysis (DEA) with sleuth. I wanted this step to be kept separate from all the other analyses.

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

import multiprocess as mp


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Root of the shared data directory, relative to this notebook.
data_base = "../../data/"

# Output folders: one for figures and one for each family of sleuth info files.
figures_path = "tissue_diff/"
results_path_diff = "tissue_diff/"  # used for the NHU differentiation comparisons
results_path_split = "non_tum_split/"  # used for the cluster-vs-cluster comparisons

# Healthy bladder sample sheet, indexed by sample name, with pyarrow-backed dtypes.
healthy_metadata = pd.read_csv(
    f"{data_base}/metadata/healthy_bladder_metadata.tsv",
    sep="\t",
    index_col="Sample",
    dtype_backend="pyarrow",
)

# Prep healthy for sleuth

In [3]:
# Healthy sample sheet without sample "Y2796_P0", plus a copy of every sample
# name in which "-" is replaced by "_" (used for matching against folder names).
healthy_metadata = (
    pd.read_csv(
        f"{data_base}/metadata/healthy_bladder_metadata.tsv",
        sep="\t",
        index_col="Sample",
    )
    .drop("Y2796_P0")
    .assign(sample_prcsd=lambda meta: meta.index.str.replace("-", "_"))
)

# Directory listing: the column headed "./" holds the folder names. Keep the
# name as written (folder_orig) next to an underscore-normalised copy (folder_prcsd).
folders_viking = (
    pd.read_csv(f"{data_base}/metadata/viking_directories.tsv", sep="\t")
    .drop(columns=["Unnamed: 1"])
    .rename(columns={"./": "folder"})
    .assign(
        folder_orig=lambda listing: listing["folder"].astype(str),
        folder_prcsd=lambda listing: listing["folder"].str.replace("-", "_"),
    )
)

In [4]:
# Pair every healthy sample with the abundance.h5 file of its folder on Viking.
# A folder and a metadata row belong together when they agree on
#   * the sample id   -> first "_"-separated token of both names, and
#   * the tissue type -> last token of the folder name, second token of the
#                        processed sample name.
base_path_viking = "/mnt/scratch/projects/biol-cancerinf-2020/Raw-Data/20240212-Healthy-Bladder-Vlad/kallisto-gencode-v42"

# Index the metadata once by (sample id, tissue type) instead of rescanning the
# whole sheet for every folder. setdefault keeps the first row seen per key,
# which is the row a top-to-bottom scan with an early exit would have picked.
meta_lookup = {}
for sample_name, sample_prcsd in healthy_metadata["sample_prcsd"].items():
    meta_cmp = sample_prcsd.split("_")
    if len(meta_cmp) < 2:
        # No tissue-type token to compare; the sample is reported as unmatched below.
        continue
    meta_lookup.setdefault((meta_cmp[0], meta_cmp[1]), sample_name)

matched_samples = {}
for raw_orig, raw_prcsd in zip(folders_viking["folder_orig"], folders_viking["folder_prcsd"]):
    # Strip the leading "./" the same way for both spellings of the folder name.
    folder_orig = raw_orig.split("./")[-1]
    folder = raw_prcsd.split("./")[-1]

    comps = folder.split("_")
    key = (comps[0], comps[-1])
    if key in meta_lookup:
        matched_samples[meta_lookup[key]] = f"{base_path_viking}/{folder_orig}/abundance.h5"

test_df = pd.DataFrame.from_dict(matched_samples, orient="index", columns=["folder"])
healthy_metadata["folder"] = test_df["folder"]

# The assignment above aligns on the index, so samples without a folder end up
# with a missing path. Report them here instead of letting empty paths reach
# the sleuth info files unnoticed.
unmatched_samples = healthy_metadata.index[healthy_metadata["folder"].isna()]
if len(unmatched_samples) > 0:
    print(f"WARNING: no folder matched for {len(unmatched_samples)} sample(s): {list(unmatched_samples)}")

In [5]:
# Write one sleuth info file per pairwise comparison of NHU differentiation states.
#
# The list of pairs is deliberately NOT called `combinations`: that name is bound
# to itertools.combinations by a later cell, and re-running this cell afterwards
# would rebind it to a list and break the cell that calls the function.
tissue_comparisons = [("ABS-Ca", "P0"), ("UD", "P0"), ("ABS-Ca", "UD")]
sel_cols = ["express", "folder", "tissue_type"]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(results_path_diff, exist_ok=True)

for cluster_1, cluster_2 in tissue_comparisons:
    print(f"{cluster_1} - {cluster_2}")

    # Boolean indexing returns a new frame, so the assignments below never touch
    # healthy_metadata itself. "express" is the numeric group code (1 / 2).
    group_1 = healthy_metadata[healthy_metadata["NHU_differentiation"] == cluster_1].assign(
        express=1, tissue_type=cluster_1
    )
    group_2 = healthy_metadata[healthy_metadata["NHU_differentiation"] == cluster_2].assign(
        express=2, tissue_type=cluster_2
    )

    # Stack both groups, expose the sample name as a regular column and rename
    # "folder" to "path" for the info file.
    comb_df = (
        pd.concat([group_1[sel_cols], group_2[sel_cols]], axis=0)
        .reset_index(names="sample")
        .rename(columns={"folder": "path"})
    )
    comb_df.to_csv(os.path.join(results_path_diff, f"sleuth_{cluster_1}_{cluster_2}.info"), sep="\t")
    print(comb_df.shape)

ABS-Ca - P0
(73, 4)
UD - P0
(37, 4)
ABS-Ca - UD
(66, 4)


## Using non-tum subtyping

In [11]:
from collections import Counter
from itertools import combinations


# Column of the Morpheus table that holds the cluster assignment of each sample.
cluster_model = "dendrogram_cut"

morp_df = pd.read_csv(
    "../network_II/standard/Morpheus/non_tum/prcsd_morpheus_cs_7.tsv",
    sep="\t",
    index_col="sample",
)

# Cluster sizes (kept for verifying) and the sorted set of distinct cluster labels.
counter_values = Counter(morp_df[cluster_model])
unique_values = np.sort(morp_df[cluster_model].unique())

# Attach the abundance file paths matched earlier; the assignment aligns on sample name.
morp_df["folder"] = test_df["folder"]

In [12]:
# Write one sleuth info file for every unordered pair of cluster labels.
sel_cols = ["express", "folder", "NHU_differentiation"]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(results_path_split, exist_ok=True)

for cluster_1, cluster_2 in combinations(unique_values, 2):
    print(f"{cluster_1} - {cluster_2}")

    # Boolean indexing returns a new frame, so morp_df itself is left untouched.
    # "express" carries the cluster label as an integer group code.
    group_1 = morp_df[morp_df[cluster_model] == cluster_1].assign(express=int(cluster_1))
    group_2 = morp_df[morp_df[cluster_model] == cluster_2].assign(express=int(cluster_2))

    # Stack both groups, expose the sample name as a regular column and rename
    # "folder" to "path" for the info file.
    comb_df = (
        pd.concat([group_1[sel_cols], group_2[sel_cols]], axis=0)
        .reset_index(names="sample")
        .rename(columns={"folder": "path"})
    )
    comb_df.to_csv(os.path.join(results_path_split, f"sleuth_{cluster_1}_{cluster_2}.info"), sep="\t")
    print(comb_df.shape)

1.0 - 2.0
(22, 4)
1.0 - 3.0
(22, 4)
1.0 - 5.0
(7, 4)
1.0 - 6.0
(17, 4)
1.0 - 7.0
(31, 4)
2.0 - 3.0
(38, 4)
2.0 - 5.0
(23, 4)
2.0 - 6.0
(33, 4)
2.0 - 7.0
(47, 4)
3.0 - 5.0
(23, 4)
3.0 - 6.0
(33, 4)
3.0 - 7.0
(47, 4)
5.0 - 6.0
(18, 4)
5.0 - 7.0
(32, 4)
6.0 - 7.0
(42, 4)
