# scvi prep

* Going to perform scvi integration to show results are comparable to harmony integration. Xiaohan will perform integration, however scvi needs to be done on raw data so preparing raw object here (take original raw object but then remove cells that were excluding during cleaning and transfer relevant metadata)

In [1]:
import scanpy as sc
import os
import pandas as pd

In [3]:
#read in the files
data_raw = sc.read_h5ad('/scratch/user/s4436039/scdata/Myeloid_Objects/NR_data_intersect.h5ad') #raw data
data_clean = sc.read_h5ad('/scratch/user/s4436039/scdata/Python_Integration_Sep/NRclean_clustered2.h5ad') #cleaned but processed data

In [4]:
data_raw

AnnData object with n_obs × n_vars = 623908 × 15074
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id'
    var: 'name'

In [5]:
data_clean

AnnData object with n_obs × n_vars = 498023 × 1268
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'keep_or_remove', 'leiden', 'celltype', 'leiden_DC', 'leiden_DC_v2', 'exclude_annots', 'leiden_M', 'Technology', 'cancer_type_broad', 'clean-leiden_0.8', 'clean-subcluster', 'clean-subcluster-v2', 'YS_sig', 'HSC_sig', 'CD5nDC2_up_vDC3', 'DC3_up_vCD5nDC2', 'CD5pDC2_up_vDC3', 'DC3_up_vCD5pDC2', 'DC3_up_vMono', 'Mono_up_vDC3', 'DC1_sig', 'DC2_sig', 'DC3_sig', 'DC_Axl_sig', 'DC2_up_DC3', 'DC3_up_DC2', 'mregDC_sig', 'cDC1_mregDC', 'cDC2_mregDC', 'NR_annotati

### subset data_raw to only the cells in data_clean based on barcode:

In [6]:
# Get the list of barcodes (cell names) in data_clean
barcodes_to_keep = data_clean.obs.index

In [7]:
# subset data_raw to just those in data_clean
data_raw_subset = data_raw[data_raw.obs.index.isin(barcodes_to_keep), :]

In [8]:
data_raw_subset

View of AnnData object with n_obs × n_vars = 498023 × 15074
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id'
    var: 'name'

### Transfer desired metadata to data_raw_subset
* Technology
* NR_annotations_simple
* sample_type_major2
* cancer_with_H
* cancer_broadest
* patient_treatment

In [9]:
meta_columns = ['Technology','NR_annotations_simple','sample_type_major2','cancer_with_H','cancer_broadest','patient_treatment']

In [10]:
# Subset the metadata in data_clean to match data_raw_subset's barcodes
metadata_to_transfer = data_clean.obs.loc[data_raw_subset.obs.index, meta_columns]

In [11]:
# Add these columns to data_raw_subset's obs
for col in meta_columns:
    data_raw_subset.obs[col] = metadata_to_transfer[col]

  data_raw_subset.obs[col] = metadata_to_transfer[col]


In [12]:
# Verify that the metadata was transferred
data_raw_subset.obs.tail()

Unnamed: 0,nCount_RNA,nFeature_RNA,sample_type,cancer_type,patient_id,sample_id,percent.mt,site,sample_type_major,cancer_subtype,integration_id,ident,dataset_id,Technology,NR_annotations_simple,sample_type_major2,cancer_with_H,cancer_broadest,patient_treatment
GSE180661_HGSOC_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY_TTTCATGGTTACCTGA,-4.499666,-9.853381,Adnexa,HGSOC,Left Adnexa,SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,1779.0,ovary,primary tumour,HGSOC,GSE180661_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,all_integrated_intersect,GSE180661,10x3',Classical monocytes,primary tumour,HGSOC,HGSOC,Treatment naive
GSE180661_HGSOC_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY_TTTCGATGTGACGTCC,-4.871771,-10.33713,Adnexa,HGSOC,Left Adnexa,SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,1541.0,ovary,primary tumour,HGSOC,GSE180661_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,all_integrated_intersect,GSE180661,10x3',Classical monocytes,primary tumour,HGSOC,HGSOC,Treatment naive
GSE180661_HGSOC_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY_TTTCGATTCTGCGAGC,-4.766977,-8.292435,Adnexa,HGSOC,Left Adnexa,SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,2596.0,ovary,primary tumour,HGSOC,GSE180661_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,all_integrated_intersect,GSE180661,10x3',Tissue resident macrophage,primary tumour,HGSOC,HGSOC,Treatment naive
GSE180661_HGSOC_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY_TTTGATCGTTGCTCCT,-4.13331,-8.758132,Adnexa,HGSOC,Left Adnexa,SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,1200.0,ovary,primary tumour,HGSOC,GSE180661_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,all_integrated_intersect,GSE180661,10x3',Tissue resident macrophage,primary tumour,HGSOC,HGSOC,Treatment naive
GSE180661_HGSOC_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY_TTTGGTTGTGTACAGG,-4.476278,-10.355681,Adnexa,HGSOC,Left Adnexa,SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,1895.0,ovary,primary tumour,HGSOC,GSE180661_SPECTRUM-OV-045_S1_CD45P_LEFT_OVARY,all_integrated_intersect,GSE180661,10x3',Classical monocytes,primary tumour,HGSOC,HGSOC,Treatment naive


In [13]:
data_raw_subset

AnnData object with n_obs × n_vars = 498023 × 15074
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id', 'Technology', 'NR_annotations_simple', 'sample_type_major2', 'cancer_with_H', 'cancer_broadest', 'patient_treatment'
    var: 'name'

In [14]:
#set directory 
os.chdir('/scratch/user/s4436039/scdata/Myeloid_Objects')

In [15]:
#export raw cleaned myeloid object as a h5ad object 
data_raw_subset.write_h5ad("NRclean_Raw_Data.h5ad", compression="gzip")

``` rsync -azvhp /scratch/user/s4436039/scdata/Myeloid_Objects/NRclean_Raw_Data.h5ad /QRISdata/Q5935/nikita/scdata/2025-04-29_Raw-data-for-scvi/ ```

### This object will then undergo scvi integration by Xiaohan