In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
import gc
import anndata as ad

In [None]:
# Show the current working directory (presumably resolved by IPython automagic as %pwd);
# the relative paths used when loading files below depend on it.
pwd

In [None]:
# Load the AnnData object from an .h5ad file; the path is relative to the working directory.
adata = sc.read_h5ad('adata_scpoli_final_all_genes.h5ad')

In [None]:
# Number of observations (rows of adata.obs) per Unique_ID.
adata.obs.groupby('Unique_ID').size()

In [None]:
# Re-check the working directory before reading the relative Zarr path in the next cell
# (presumably IPython automagic for %pwd).
pwd

In [None]:
# Read an AnnData object from a Zarr store; the path is relative to the working directory.
test = ad.read_zarr('metrics/out/metrics/prepare/dataset~manual_genes_binned/file_id~sysvi/label~Level_1_refined--batch~Dataset/prepare.zarr')

In [None]:
# Display the Level_1_refined column of the obs table of the object read from Zarr.
test.obs.Level_1_refined

In [None]:
# Show the current working directory again (presumably IPython automagic for %pwd).
pwd

In [None]:
# Remove the column limit so wide obs tables are displayed in full.
pd.options.display.max_columns = None

In [None]:
# Preview the first rows of the observation metadata.
adata.obs.head()

# Outlier

In [None]:
# Number of observations for each value of the 'outlier' column.
adata.obs.groupby('outlier').size()

# Metastatic samples

In [None]:
# Number of observations for each Tissue value.
adata.obs.groupby('Tissue').size()

In [None]:
# Distinct Unique_IDs among the observations whose Tissue is 'Unknown'.
adata.obs.groupby('Tissue')['Unique_ID'].unique()['Unknown']

In [None]:
# Unique_IDs that have at least one observation with Tissue == 'Unknown'.
unknown_ids = set(adata.obs.loc[adata.obs['Tissue'] == 'Unknown', 'Unique_ID'])

# Unique_IDs that have at least one observation with any other Tissue value.
other_ids = set(adata.obs.loc[adata.obs['Tissue'] != 'Unknown', 'Unique_ID'])

# Keep the IDs that occur exclusively under 'Unknown'.
filtered_unknown_ids = unknown_ids.difference(other_ids)
count_filtered = len(filtered_unknown_ids)

print("Filtered Unique_IDs:", filtered_unknown_ids)
print("Count:", count_filtered)

In [None]:
import re

In [None]:
# Subtract the IDs that contain neither 'HT' nor 'PDAC' from the hard-coded total.
# NOTE(review): 43 is presumably the count printed for filtered_unknown_ids — confirm,
# and consider using len(filtered_unknown_ids) instead of the literal.
43 - sum(re.search(r'HT|PDAC', uid) is None for uid in filtered_unknown_ids)

#### Schlesinger GSM4293555 is also pancreas
#### Ding is also pancreas
#### "Adj norm" samples are adjacent normal tissue

In [None]:
# IDs that match none of the known naming patterns (HT, PDAC, Adj, GSM).
[uid for uid in filtered_unknown_ids if re.search(r'HT|PDAC|Adj|GSM', uid) is None]

### All samples with "Unknown" Tissue are pancreas

In [None]:
# Distinct Unique_IDs belonging to each Dataset.
adata.obs.groupby('Dataset')['Unique_ID'].unique()

In [None]:
# Unique_IDs that have at least one observation annotated with Tissue == 'Liver'.
liver_ids = set(adata.obs.loc[adata.obs['Tissue'] == 'Liver', 'Unique_ID'])
liver_ids

In [None]:
# Number of distinct Unique_IDs per Dataset across all observations.
adata.obs.groupby('Dataset')['Unique_ID'].nunique()

In [None]:
# Number of distinct Unique_IDs per Dataset, restricted to observations whose
# Unique_ID is in liver_ids (IDs with at least one 'Liver' observation).
adata[adata.obs.Unique_ID.isin(liver_ids)].obs.groupby('Dataset')['Unique_ID'].nunique()

In [None]:
# The distinct Unique_IDs per Dataset, restricted to observations whose
# Unique_ID is in liver_ids.
adata[adata.obs.Unique_ID.isin(liver_ids)].obs.groupby('Dataset')['Unique_ID'].unique()

In [None]:
# Drop every observation annotated as Liver; .copy() turns the view into an independent object.
adata_filtered = adata[adata.obs['Tissue'] != "Liver"].copy()

In [None]:
# Keep only the variables selected by var['Manual_Genes'] and copy the result.
# NOTE(review): presumably Manual_Genes is a boolean mask over genes — confirm its dtype.
adata_filtered_mg = adata_filtered[:, adata_filtered.var.Manual_Genes].copy()

In [None]:
# Check whether X holds integer-valued entries (raw counts) or fractional values.
# Only the explicitly stored entries of a sparse matrix need testing because the
# implicit zeros are integers; this avoids densifying the full matrix (twice).
# A dense X is handled as well instead of failing on a missing .todense().
from scipy.sparse import issparse

stored_values = (
    adata_filtered_mg.X.tocsr().data
    if issparse(adata_filtered_mg.X)
    else np.asarray(adata_filtered_mg.X)
)
if np.all(stored_values == np.floor(stored_values)):
    print("🔢 adata.X contains raw counts (integers).")
else:
    print("📊 adata.X contains log-normalized values (floats).")

# Bin the raw counts

In [None]:
import numpy as np
from scipy.sparse import issparse
import logging

def bin_data(adata, binning, key_to_process = None, result_binned_key="binned_data", bin_edges_key=None):
    """
    Bin non-negative values into per-row quantile bins, writing results into `adata`.

    Each row is binned independently. Zeros keep the value 0; the non-zero
    entries of a row receive integer bin indices in ``1 .. binning - 1``,
    determined by ``binning - 1`` quantile edges computed from that row's
    non-zero values. A row that is entirely zero is kept as zeros and a
    warning is logged.

    Parameters:
        adata (AnnData): The input data object (modified in place).
        binning (int): Number of bins including the zero bin (integer >= 2).
        key_to_process (str | None): Key in `adata.layers` to process;
            `adata.X` is used when None.
        result_binned_key (str): Key in `adata.layers` under which the binned
            matrix is stored.
        bin_edges_key (str | None): Key in `adata.obsm` under which the per-row
            bin edges (a leading 0 followed by the quantile edges) are stored.
            When None, the legacy key ``"bin_edges"`` is used for the default
            `result_binned_key`; for any other result key the edges go to
            ``f"{result_binned_key}_bin_edges"`` so that binning the same
            object several times does not overwrite earlier edges.

    Returns:
        None

    Raises:
        ValueError: If `binning` is not an integer, is smaller than 2, or the
            data contains negative values.
    """
    if not isinstance(binning, int):
        raise ValueError(f"Binning must be an integer, but got {binning}.")
    if binning < 2:
        # With fewer than 2 bins there is no quantile edge to digitize against.
        raise ValueError(f"Binning must be at least 2, but got {binning}.")

    # The original referenced an undefined `logger`; obtain one here.
    logger = logging.getLogger(__name__)

    if bin_edges_key is None:
        if result_binned_key == "binned_data":
            bin_edges_key = "bin_edges"
        else:
            bin_edges_key = f"{result_binned_key}_bin_edges"

    layer_data = adata.layers[key_to_process] if key_to_process is not None else adata.X
    # `.toarray()` works for every SciPy sparse container, unlike the `.A` shorthand.
    layer_data = layer_data.toarray() if issparse(layer_data) else np.asarray(layer_data)

    min_value = layer_data.min()
    if min_value < 0:
        raise ValueError(f"Expecting non-negative data, but got min value {min_value}.")

    binned_rows = []
    bin_edges = []

    for row in layer_data:
        if row.max() == 0:
            logger.warning("Row contains all zeros. Consider filtering such rows.")
            binned_rows.append(np.zeros_like(row, dtype=np.int64))
            # Same length as the edges of a regular row: 1 leading zero + (binning - 1) edges.
            bin_edges.append(np.array([0] * binning))
            continue

        non_zero_ids = row.nonzero()
        non_zero_row = row[non_zero_ids]

        # Quantile edges from the row's non-zero values (first edge = min, last = max).
        bins = np.quantile(non_zero_row, np.linspace(0, 1, binning - 1))

        # digitize yields 1 for the smallest values and binning - 1 for the maximum.
        non_zero_digits = np.digitize(non_zero_row, bins)
        binned_row = np.zeros_like(row, dtype=np.int64)
        binned_row[non_zero_ids] = non_zero_digits

        binned_rows.append(binned_row)
        bin_edges.append(np.concatenate([[0], bins]))

    # Store the binned data and the matching bin edges.
    adata.layers[result_binned_key] = np.stack(binned_rows)
    adata.obsm[bin_edges_key] = np.stack(bin_edges)

In [None]:
# Bin adata_filtered_mg.X into 50 bins; the result is stored in the default layer 'binned_data'.
bin_data(adata_filtered_mg, binning=50)

In [None]:
# Bin the same matrix again with 25 bins, stored in the separate layer 'binned_data_25_bins'.
bin_data(adata_filtered_mg, binning=25, result_binned_key='binned_data_25_bins')

In [None]:
# Save the filtered, binned object to an .h5ad file relative to the working directory.
adata_filtered_mg.write('adata_mg_binned.h5ad')

In [None]:
# Display the summary representation of the binned object.
adata_filtered_mg

In [None]:
# Display the layer written by bin_data under its default key.
adata_filtered_mg.layers['binned_data']

In [None]:
# Sanity-check the 'binned_data' layer (the 50-bin output of bin_data).
# The values inspected here are bin indices, not raw counts, so the printed
# labels say so. The variable name `raw_counts` is kept unchanged in case
# other cells refer to it.
raw_counts = adata_filtered_mg.layers['binned_data']
print(f"Are binned values integers? {np.all(raw_counts.astype(int) == raw_counts)}")
print(f"Mean binned value: {np.mean(raw_counts)}")
print(f"Range of binned values: {np.min(raw_counts)} to {np.max(raw_counts)}")
print(f"Percentage of zero entries: {np.mean(raw_counts == 0) * 100:.2f}%")
print("-" * 50)