#### Importing all the required **Python** libraries

In [46]:
import os
import pathlib
import json
import pandas as pd
import numpy as np
import scanpy as sc
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")
import anndata as ad

%load_ext autoreload
%autoreload 2
#%load_ext lab_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Load the in-house atlas sample sheet (semicolon-separated, header on the first row).
metadata = pd.read_csv(
    '../../../supplementary_data/mouse/atlas_metadata_inhouse.csv',
    sep=';',
    header=0,
)

In [10]:
# Keep only the samples whose `larry` column is 'yes'.
is_larry_sample = metadata['larry'].eq('yes')
metadata = metadata.loc[is_larry_sample]

In [11]:
# Index the sample sheet by sample ID so rows can be addressed as `.loc[sample, column]`.
metadata = metadata.set_index(keys='sample_ID')


In [12]:
# Replace missing values with empty strings, modifying `metadata` in place.
metadata.fillna(value='', inplace=True)

## read GEP

In [14]:
# Per-cell annotations copied verbatim from the sample sheet:
# obs column name -> metadata column name (insertion order = obs column order).
obs_from_metadata = {
    'model': 'model',
    'barcoded': 'larry',
    'sex': 'sex',
    'strain': 'strain',
    'genotype': 'genotype group',
    'treatment': 'treatment',
    'cell_filtering': 'enriched fraction',
}

adatas = []   # one AnnData per successfully loaded sample
batches = []  # sample IDs, in the same order as `adatas`
for sample in metadata.index.values:
    try:
        sample_adata = sc.read_10x_h5(metadata.loc[sample, 'path'])
        sample_adata.var_names_make_unique()
        n_cells = sample_adata.n_obs

        # Fall back to the sample ID when the sheet has no mouse ID for this sample.
        mouse_id = metadata.loc[sample, 'mouse_ID']
        donor = sample if mouse_id == '' else mouse_id
        sample_adata.obs['donor_id'] = [donor] * n_cells
        sample_adata.obs['disease'] = ['PDAC'] * n_cells
        for obs_column, metadata_column in obs_from_metadata.items():
            sample_adata.obs[obs_column] = [metadata.loc[sample, metadata_column]] * n_cells
    except Exception as e:
        # Best effort: report the failing sample and carry on with the others.
        print(f"Error processing sample {sample}: {e}")
    else:
        adatas.append(sample_adata)
        batches.append(sample)

In [19]:
# Merge all per-sample objects into one AnnData, keeping only the variables
# shared by every sample (inner join) and recording each cell's sample of
# origin in `obs['sampleID']`.
# NOTE(review): `AnnData.concatenate` is deprecated upstream in favour of
# `anndata.concat`; it is kept here because the later cells rely on the
# obs-name and `.var` layout this call produces -- confirm before migrating.
first_adata, remaining_adatas = adatas[0], adatas[1:]
adata = first_adata.concatenate(
    remaining_adatas,
    join='inner',
    batch_key='sampleID',
    batch_categories=batches,
)

In [21]:
# Keep only the first column of `.var` and discard the rest.
# NOTE(review): presumably the remaining columns are per-sample duplicates
# created by the concatenation above -- confirm.
adata.var = adata.var.iloc[:, [0]]

## barcodes

#### 8442

In [30]:
# Load the barcode table written by the 202205 `larry_targeted` run.
barcodes = pd.read_csv(
    '/mnt/storage/Daniele/preprocessed_data/mouse_pancreas/scRNA/202205/larry_targeted/barcodes_output.csv'
)

In [40]:
# Restrict the table to samples BC_14 and BC_16 and index it by cell.
is_selected_sample = barcodes['sample'].isin(["BC_14", "BC_16"])
barcodes = barcodes.loc[is_selected_sample].set_index('cell')
# Drop the first three remaining columns.
# NOTE(review): presumably these are annotation rather than count columns -- confirm.
barcodes = barcodes.iloc[:, 3:]

In [51]:
# Flag a cell as positive when the sum of its values across all remaining
# columns exceeds 10. The row sum is computed with a single vectorised
# `DataFrame.sum(axis=1)` instead of calling a Python lambda once per row
# through `DataFrame.apply(axis=1)`; the resulting boolean Series is the same.
barcodes['positive'] = barcodes.sum(axis=1) > 10

In [53]:
# Keep the per-cell positive flag for this batch; `barcodes` is reused below.
barcodes_8442 = barcodes.loc[:, 'positive']

#### 9091

In [64]:
# Load the merged barcode table written by the 202310 `larry_targeted` run.
barcodes = pd.read_parquet(
    '/mnt/storage/Daniele/preprocessed_data/mouse_pancreas/scRNA/202310/larry_targeted/merged.parquet'
)

In [67]:
# The sample label is whatever follows the last '_' in each index entry.
index_tail = barcodes.index.str.rsplit('_', n=1)
barcodes['sample'] = index_tail.str.get(-1)

In [71]:
# Keep only rows whose sample was successfully loaded into `batches`.
in_loaded_batches = barcodes['sample'].isin(batches)
barcodes = barcodes.loc[in_loaded_batches]
# Drop the last column again so that it is not included in the row sums.
barcodes = barcodes.iloc[:, :-1]

In [75]:
# Flag a cell as positive when the sum of its values across all remaining
# columns exceeds 10. The row sum is computed with a single vectorised
# `DataFrame.sum(axis=1)` instead of calling a Python lambda once per row
# through `DataFrame.apply(axis=1)`; the resulting boolean Series is the same.
barcodes['positive'] = barcodes.sum(axis=1) > 10

In [76]:
# Keep the per-cell positive flag for this batch; `barcodes` is reused below.
barcodes_9091 = barcodes.loc[:, 'positive']

#### map cells

In [77]:
# Stack both per-batch flag Series into one lookup indexed by cell.
# `Series.append` was removed in pandas 2.0; `pd.concat` is the supported
# replacement and preserves the same order (9091 entries first, then 8442).
barcodes = pd.concat([barcodes_9091, barcodes_8442])

In [92]:
# Sample labels as they appear in `adata.obs_names` -> replacement labels.
sample_label_fixes = (('BC16_Tumor', 'BC_16'), ('BC14_Tumor', 'BC_14'))


def _to_barcode_key(obs_name):
    """Rewrite one `adata` obs name into the key used to look up `barcodes`."""
    # Apply at most one label fix: the first label found in the name wins.
    for old_label, new_label in sample_label_fixes:
        if old_label in obs_name:
            obs_name = obs_name.replace(old_label, new_label)
            break
    # Replace the character at position 18 with '_'.
    # NOTE(review): assumes obs names look like `<18-char barcode>-<sample>` -- confirm.
    return f'{obs_name[:18]}_{obs_name[19:]}'


cells_matched = [_to_barcode_key(obs_name) for obs_name in adata.obs_names]
adata.obs['cells_matched'] = cells_matched


In [100]:
# Look up every cell's flag in `barcodes`; cells with no entry become False.
larry_flags = adata.obs['cells_matched'].map(barcodes)
larry_flags = larry_flags.fillna(False)
adata.obs['larry_positive'] = larry_flags.astype('category')

In [101]:
# The helper column was only needed for the lookup; remove it in place.
adata.obs.drop(columns='cells_matched', inplace=True)

In [106]:
# Save the merged, annotated object as an .h5ad file (`write` is the short alias of `write_h5ad`).
adata.write_h5ad(
    '/mnt/storage/Daniele/atlases/mouse/01_mouse_larry_barcoded_raw.h5ad'
)