#### Adding extra metadata to samples: kit technology (3' vs 5') and patient sex, so that I can check that these are well integrated


# Set up workspace:

In [1]:
#load packages I need
import os
import tools
import scanpy as sc
import pandas as pd
import dandelion as ddl
from tqdm import tqdm
import matplotlib.pyplot as plt
import scanpy.external as sce
from matplotlib.pyplot import rc_context

In [2]:
# Work from the project directory so the relative file names used in later
# cells (e.g. "NR_cleandata.h5ad") resolve against it.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook somewhere else.
PROJECT_DIR = '/scratch/user/s4436039/scdata/Python_Integration_Sep'
os.chdir(PROJECT_DIR)
# Last expression of the cell: displays the new working directory as a check.
os.getcwd()

'/scratch/user/s4436039/scdata/Python_Integration_Sep'

Print software versions:

In [3]:
# Print the versions of scanpy and its main dependencies so the run is reproducible.
sc.logging.print_header()

scanpy==1.10.2 anndata==0.10.8 umap==0.5.6 numpy==1.26.4 scipy==1.14.0 pandas==2.2.2 scikit-learn==1.5.0 statsmodels==0.14.2 igraph==0.11.5 pynndescent==0.5.12


In [4]:
# Print the versions of dandelion and its main dependencies so the run is reproducible.
ddl.logging.print_header()

dandelion==0.3.8 pandas==2.2.2 numpy==1.26.4 matplotlib==3.9.2 networkx==3.3 scipy==1.14.0


Set plotting parameters

In [5]:
# Global scanpy figure styling: 80 dpi on a white background.
figure_style = {"dpi": 80, "facecolor": "white"}
sc.settings.set_figure_params(**figure_style)

In [6]:
# Load the cleaned AnnData object (path is relative to the working directory).
input_path = "NR_cleandata.h5ad"
data = sc.read_h5ad(input_path)

In [7]:
# Display the AnnData summary (dimensions plus the obs / var / uns keys) as the cell output.
data

AnnData object with n_obs × n_vars = 498382 × 15074
    obs: 'nCount_RNA', 'nFeature_RNA', 'sample_type', 'cancer_type', 'patient_id', 'sample_id', 'percent.mt', 'site', 'sample_type_major', 'cancer_subtype', 'integration_id', 'ident', 'dataset_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'keep_or_remove', 'leiden', 'celltype', 'leiden_DC', 'leiden_DC_v2', 'exclude_annots', 'leiden_M'
    var: 'name', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'log1p'

In [12]:
# Create the "Technology" metadata column, starting every cell at 10x 3';
# the cells that follow overwrite this default for the non-3' datasets.
default_technology = "10x3'"
data.obs["Technology"] = default_technology

In [11]:
# Tally how many cells each dataset contributes.
dataset_column = data.obs['dataset_id']
dataset_counts = dataset_column.value_counts()
print(dataset_counts)

dataset_id
GSE180661       171225
GSE154826        66592
GSE131907        32876
GSE234933        31940
PRJCA005422      26133
GSE235676        20872
GSE161529        16403
GSE189903        14644
GSE162498        11380
GSE200218         8872
GSE176078         8179
GSE224090         7929
GSE197177         7260
GSE164690         7249
GSE217517         7036
GSE139324         6675
GSE184880         6304
PMID32561858      5758
GSE223063         5443
GSE112271         4472
GSE195861         4083
GSE188737         3465
GSE173468         3342
GSE156405         3101
GSE213243         2665
PRJNA907381       2480
GSE167297         1974
GSE162025         1610
GSE183916         1501
GSE214295         1491
GSE231535         1384
GSE225600         1192
GSE234129          924
GSE154778          843
GSE215120          751
GSE199515          334
Name: count, dtype: int64


In [13]:
# Datasets to be labelled as 10x 5' chemistry.
five_prime_datasets = [
    "GSE213243",
    "GSE189903",
    "GSE162025",
    "GSE173468",
    "GSE188737",
    "GSE234933",
    "GSE183916",
    "GSE234129",
]
# Slicing an AnnData returns a view, not a copy. Take an explicit .copy() so the
# obs assignment made on this subset later modifies a real object instead of
# relying on anndata's implicit copy-on-modify (which emits a warning).
data_5_tech = data[data.obs["dataset_id"].isin(five_prime_datasets)].copy()

In [14]:
# Label every cell in the 5' subset as 10x 5' chemistry.
# NOTE(review): if data_5_tech is still a view of `data`, anndata presumably
# turns it into a standalone copy on this assignment, so `data` itself is not
# changed here — confirm.
data_5_tech.obs["Technology"] = "10x5'"



In [15]:
# Write the 5' labels back onto the full object, matching cells by obs name.
# Assign through data.obs.loc so the write targets the obs frame itself:
# data.obs["Technology"].update(...) mutates an intermediate Series, which
# pandas warns will never work once Copy-on-Write is the default (pandas 3.0).
# Assumes obs names are unique — TODO confirm.
data.obs.loc[data_5_tech.obs_names, "Technology"] = data_5_tech.obs["Technology"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




In [18]:
# Datasets whose kit technology is not known.
unknown_tech_datasets = ["PRJCA005422", "PMID32561858"]
# Slicing an AnnData returns a view, not a copy. Take an explicit .copy() so the
# obs assignment made on this subset later modifies a real object instead of
# relying on anndata's implicit copy-on-modify (which emits a warning).
data_unknown_tech = data[data.obs["dataset_id"].isin(unknown_tech_datasets)].copy()

In [19]:
# Label every cell in the unknown-technology subset as "Unknown".
# NOTE(review): if data_unknown_tech is still a view of `data`, anndata
# presumably turns it into a standalone copy on this assignment, so `data`
# itself is not changed here — confirm.
data_unknown_tech.obs["Technology"] = "Unknown"



In [22]:
# Pull out the colon cells from the unknown-technology subset so they can be
# relabelled separately (the CRC samples).
# Slicing an AnnData returns a view, not a copy. Take an explicit .copy() so the
# obs assignment made on this subset later modifies a real object instead of
# relying on anndata's implicit copy-on-modify (which emits a warning).
data_unknown_CRC = data_unknown_tech[data_unknown_tech.obs["site"].isin(["colon"])].copy()

In [23]:
# Relabel the colon cells of the unknown-technology subset as 10x 3'.
# NOTE(review): if data_unknown_CRC is still a view of data_unknown_tech,
# anndata presumably turns it into a standalone copy on this assignment, so
# data_unknown_tech is not changed here — confirm.
data_unknown_CRC.obs["Technology"] = "10x3'"



In [24]:
# Write the colon (3') labels back into the unknown-technology subset, matching
# cells by obs name.
# Assign through .obs.loc so the write targets the obs frame itself:
# .obs["Technology"].update(...) mutates an intermediate Series, which pandas
# warns will never work once Copy-on-Write is the default (pandas 3.0).
# Assumes obs names are unique — TODO confirm.
data_unknown_tech.obs.loc[data_unknown_CRC.obs_names, "Technology"] = data_unknown_CRC.obs["Technology"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




In [25]:
# Write the labels of the unknown-technology subset ("Unknown", plus any cells
# already relabelled within that subset) back onto the full object, matching
# cells by obs name.
# Assign through data.obs.loc so the write targets the obs frame itself:
# data.obs["Technology"].update(...) mutates an intermediate Series, which
# pandas warns will never work once Copy-on-Write is the default (pandas 3.0).
# Assumes obs names are unique — TODO confirm.
data.obs.loc[data_unknown_tech.obs_names, "Technology"] = data_unknown_tech.obs["Technology"]

In [26]:
# Tally how many cells carry each technology label, as a check on the labelling above.
technology_column = data.obs['Technology']
tech_counts = technology_column.value_counts()
print(tech_counts)

Technology
10x3'      409706
10x5'       60091
Unknown     28585
Name: count, dtype: int64


In [27]:
# Save the annotated object, gzip-compressed.
# NOTE: this is the same file name the data was read from, so the input file is overwritten.
output_path = "NR_cleandata.h5ad"
data.write_h5ad(output_path, compression="gzip")