### 1. General info of dataset GSE221776

This is the Jupyter Notebook for dataset GSE221776. Its dataset includes one annotation and uni txt file for both CD4 and CD8 T cells. As seen below, in the txt file, each row is a gene and each column is a cell.

Thus, we need to transform this txt file and generate an overall AnnData object for all samples. 



In [1]:
# Environment setup
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as anndata
import scipy

In [2]:
# inspect the dataset
cd4_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE221776/GSE221776_sc_PBT_CD4_umi.txt.gz'
cd4_input = pd.read_csv(cd4_path, sep='\t', index_col=0) # the first column contains gene names and is the index

print(cd4_input.head()) 
print(cd4_input.shape) # (15095 rows, 15893 columns)

       gene_name  AAACCTGAGAGTACCG.1  AAACCTGAGCAATATG.1  AAACCTGAGCACACAG.1  \
1     AL627309.1                   0                   0                   0   
2     AP006222.2                   0                   0                   0   
3  RP11-206L10.3                   0                   0                   0   
4  RP11-206L10.2                   0                   0                   0   
5  RP11-206L10.9                   0                   0                   0   

   AAACCTGAGTGTCCAT.1  AAACCTGAGTTAAGTG.1  AAACCTGCACGGTAAG.1  \
1                   0                   0                   0   
2                   0                   0                   0   
3                   0                   0                   0   
4                   0                   0                   0   
5                   0                   0                   0   

   AAACCTGCAGCCTTTC.1  AAACCTGGTCCATCCT.1  AAACCTGGTTCTGAAC.1  ...  \
1                   0                   0                 

In [4]:
# inspect the annotation file
cd4_annotation_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE221776/GSE221776_sc_PBT_CD4_annotation.txt.gz'
cd4_annotation = pd.read_csv(cd4_annotation_path, sep='\t', index_col=0) # the first column contains gene names and is the index

cd4_annotation

Unnamed: 0_level_0,clonotype.tag,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,percent.hs,Run,Project,Species,donor,...,n_sG,tra.v,tra.j,trb.v,trb.j,cell_classification_deconv,paper_cluster,Cell.Type,library_name,donor_publicationID
cell_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGAGTACCG-1,clonotype39,3210,1645,1.59,14.17,0.56,NV035,AdUp01,Hu,BT5,...,,,,TRBV4-1,TRBJ1-2,TREG,6,CD4,PBT_batch1_library1_7donors_Gex,PBT_12
AAACCTGAGCAATATG-1,clonotype866,2791,1502,4.87,13.79,0.11,NV035,AdUp01,Hu,BT1,...,,TRAV1-1,TRAJ11,TRBV20-1,TRBJ1-6,TREG,6,CD4,PBT_batch1_library1_7donors_Gex,PBT_01
AAACCTGAGCACACAG-1,,3270,1433,3.06,21.01,0.40,NV035,AdUp01,Hu,BT1,...,,,,,,CTL,2,CD4,PBT_batch1_library1_7donors_Gex,PBT_01
AAACCTGAGTGTCCAT-1,clonotype583,3695,1668,1.62,16.39,0.27,NV035,AdUp01,Hu,BT7,...,,,,TRBV4-1,TRBJ2-1,CTL,1,CD4,PBT_batch1_library1_7donors_Gex,PBT_02
AAACCTGAGTTAAGTG-1,,3327,1469,4.39,25.51,0.27,NV035,AdUp01,Hu,BT5,...,,,,,,TN,3,CD4,PBT_batch1_library1_7donors_Gex,PBT_12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGGAACAGCACCGTC-12,clonotype46460,1893,912,1.74,28.19,1.27,NV099,AdUp07,Hu,BT38,...,,TRAV8-4,TRAJ17,TRBV19,TRBJ1-1,TN,3,CD4,PBT_batch5_library4_6donors_Gex,PBT_15
TTGGCAAGTTGCGCAC-12,clonotype46539,2922,1179,3.76,31.79,0.31,NV099,AdUp07,Hu,BT46,...,,TRAV13-1,TRAJ49,TRBV20-1,TRBJ2-3,TCM,0,CD4,PBT_batch5_library4_6donors_Gex,PBT_34
TTGTAGGGTAGCGCTC-12,,3692,1449,1.25,30.23,0.60,NV099,AdUp07,Hu,BT46,...,,,,,,CTL,1,CD4,PBT_batch5_library4_6donors_Gex,PBT_34
TTTCCTCTCGCCATAA-12,clonotype45367,919,691,4.35,3.81,0.00,NV099,AdUp07,Hu,BT46,...,,TRAV38-2DV8,TRAJ45,TRBV5-1,TRBJ2-2,TFH,9,CD4,PBT_batch5_library4_6donors_Gex,PBT_34


In [5]:
cd4_annotation.columns

Index(['clonotype.tag', 'nCount_RNA', 'nFeature_RNA', 'percent.mt',
       'percent.ribo', 'percent.hs', 'Run', 'Project', 'Species', 'donor',
       'tissue', 'site', 'Tumor.grade', 'Diagnosis', 'Diagnosis.subclass',
       'UMAP_1', 'UMAP_2', 'TRA.tag', 'TRB.tag', 'TRA.nt.chains.tag',
       'TRB.nt.chains.tag', 'TRA.aa.chains.tag', 'TRB.aa.chains.tag',
       'clon.size.tag', 'clon.proportion.tag', 'expDegree', 'sGroup_tag',
       'pattern', 'n_sG', 'tra.v', 'tra.j', 'trb.v', 'trb.j',
       'cell_classification_deconv', 'paper_cluster', 'Cell.Type',
       'library_name', 'donor_publicationID'],
      dtype='object')

In [30]:
useful_columns = ['site', 'Diagnosis', 'Diagnosis.subclass', 'cell_classification_deconv', 'Cell.Type', 'donor_publicationID']
useful_cd4_annotation = cd4_annotation[useful_columns]
useful_cd4_annotation

Unnamed: 0_level_0,site,Diagnosis,Diagnosis.subclass,cell_classification_deconv,Cell.Type,donor_publicationID
cell_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAACCTGAGAGTACCG-1,brain,Choroid plexus papilloma,Choroid plexus papilloma,TREG,CD4,PBT_12
AAACCTGAGCAATATG-1,brain,Low grade glioma,Pilocytic astrocytoma,TREG,CD4,PBT_01
AAACCTGAGCACACAG-1,brain,Low grade glioma,Pilocytic astrocytoma,CTL,CD4,PBT_01
AAACCTGAGTGTCCAT-1,brain,Low grade glioma,Pilocytic astrocytoma,CTL,CD4,PBT_02
AAACCTGAGTTAAGTG-1,brain,Choroid plexus papilloma,Choroid plexus papilloma,TN,CD4,PBT_12
...,...,...,...,...,...,...
TTGGAACAGCACCGTC-12,brain,Meningioma,Meningioma,TN,CD4,PBT_15
TTGGCAAGTTGCGCAC-12,brain,Medulloblastoma,Medulloblastoma,TCM,CD4,PBT_34
TTGTAGGGTAGCGCTC-12,brain,Medulloblastoma,Medulloblastoma,CTL,CD4,PBT_34
TTTCCTCTCGCCATAA-12,brain,Medulloblastoma,Medulloblastoma,TFH,CD4,PBT_34


In [20]:
useful_cd4_annotation['donor_publicationID'].unique()

array(['PBT_12', 'PBT_01', 'PBT_02', 'PBT_10', nan, 'PBT_05', 'PBT_11',
       'PBT_35', 'PBT_25', 'PBT_26', 'PBT_24', 'PBT_20', 'PBT_32',
       'PBT_30', 'PBT_09', 'PBT_21', 'PBT_23', 'PBT_13', 'PBT_22',
       'PBT_29', 'PBT_31', 'PBT_04', 'PBT_14', 'PBT_07', 'PBT_17',
       'PBT_03', 'PBT_18', 'PBT_33', 'PBT_27', 'PBT_28', 'PBT_19',
       'PBT_08', 'PBT_38', 'PBT_34', 'PBT_06', 'PBT_36', 'PBT_15'],
      dtype=object)

In [24]:
cd4_subset = useful_cd4_annotation[useful_cd4_annotation['donor_publicationID'].isna()]
cd4_subset

Unnamed: 0_level_0,site,Diagnosis,Diagnosis.subclass,cell_classification_deconv,Cell.Type,donor_publicationID
cell_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAACGGGGTCCGAGTC-1,brain,,,TCM,CD4,
AAACGGGTCACAACGT-1,brain,,,TCM,CD4,
AAAGATGCAGACAAAT-1,brain,,,TN,CD4,
AAAGATGGTGACAAAT-1,brain,,,CTL,CD4,
AAAGCAAGTTTAGCTG-1,brain,,,TCM,CD4,
...,...,...,...,...,...,...
CCCTCCTGTAGCTAAA-12,brain,,,TCM,CD4,
GACCTGGGTTGTCTTT-12,brain,,,Cell_Cycle,CD4,
GCATGTATCGGCCGAT-12,brain,,,TCM,CD4,
GTGCATAGTTCGAATC-12,brain,,,CTL,CD4,


<span style="color:red">**PROBLEM:**</span> 898 cells from the CD4 file are from unknown donors

In [8]:
# inspect the CD8 data

cd8_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE221776/GSE221776_sc_PBT_CD8_umi.txt.gz'
cd8_input = pd.read_csv(cd8_path, sep='\t', index_col=0) # the first column contains gene names and is the index

print(cd8_input.head()) 
print(cd8_input.shape) # (14674, 27902)

       gene_name  AAACCTGAGCGATATA.1  AAACCTGCACCGAAAG.1  AAACCTGGTCACCCAG.1  \
1     AL627309.1                   0                   0                   0   
2     AP006222.2                   0                   0                   0   
3  RP11-206L10.3                   0                   0                   0   
4  RP11-206L10.2                   0                   0                   0   
5  RP11-206L10.9                   1                   0                   0   

   AAACCTGTCCTCTAGC.1  AAACGGGAGAACTCGG.1  AAAGATGAGACTGGGT.1  \
1                   0                   0                   0   
2                   0                   0                   0   
3                   0                   0                   0   
4                   0                   0                   0   
5                   0                   0                   0   

   AAAGATGCACAGGTTT.1  AAAGATGCAGGACGTA.1  AAAGATGGTAAGTTCC.1  ...  \
1                   0                   0                 

In [9]:
# inspect the annotation file
cd8_annotation_path = '/scratch/user/s4543064/xiaohan-john-project/data/GSE221776/GSE221776_sc_PBT_CD8_annotation.txt.gz'
cd8_annotation = pd.read_csv(cd8_annotation_path, sep='\t', index_col=0) # the first column contains gene names and is the index

cd8_annotation

Unnamed: 0_level_0,clonotype.tag,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,percent.hs,Run,Project,Species,donor,...,n_sG,tra.v,tra.j,trb.v,trb.j,cell_classification_deconv,paper_cluster,Cell.Type,library_name,donor_publicationID
cell_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGCGATATA-1,,2131,1183,3.89,19.33,0.23,NV035,AdUp01,Hu,,...,,,,,,GZMK_HI,0,CD8,PBT_batch1_library1_7donors_Gex,
AAACCTGCACCGAAAG-1,,2275,1153,5.27,21.53,0.09,NV035,AdUp01,Hu,BT4,...,,,,,,TRM,1,CD8,PBT_batch1_library1_7donors_Gex,PBT_10
AAACCTGGTCACCCAG-1,clonotype312,3297,1519,1.70,23.38,1.27,NV035,AdUp01,Hu,BT3,...,,TRAV3,TRAJ34,,,GZMK_HI,0,CD8,PBT_batch1_library1_7donors_Gex,PBT_05
AAACCTGTCCTCTAGC-1,,1263,843,5.06,9.72,0.00,NV035,AdUp01,Hu,BT3,...,,,,,,TRM,1,CD8,PBT_batch1_library1_7donors_Gex,PBT_05
AAACGGGAGAACTCGG-1,clonotype247,1529,916,5.88,16.07,0.07,NV035,AdUp01,Hu,BT3,...,,,,TRBV19,TRBJ2-1,TRM,1,CD8,PBT_batch1_library1_7donors_Gex,PBT_05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCACAAACC-10,clonotype45287,1560,1009,0.83,10.71,0.13,NV099,AdUp07,Hu,BT38,...,,TRAV19,TRAJ24,TRBV5-4,TRBJ2-1,GZMK_HI,0,CD8,PBT_batch5_library2_6donors_Gex,PBT_15
TTTGGTTTCCGAGCCA-10,,3079,1385,1.75,21.20,3.67,NV099,AdUp07,Hu,BT38,...,,,,,,CD16p_Effector,5,CD8,PBT_batch5_library2_6donors_Gex,PBT_15
TTTGGTTTCGAGAACG-10,,4367,1671,1.60,25.39,0.16,NV099,AdUp07,Hu,BT38,...,,,,,,GZMK_HI,0,CD8,PBT_batch5_library2_6donors_Gex,PBT_15
TTTGGTTTCTCCGGTT-10,,1220,749,4.26,16.30,0.08,NV099,AdUp07,Hu,BT46,...,,,,,,GZMK_HI,0,CD8,PBT_batch5_library2_6donors_Gex,PBT_34


In [10]:
cd8_annotation.columns

Index(['clonotype.tag', 'nCount_RNA', 'nFeature_RNA', 'percent.mt',
       'percent.ribo', 'percent.hs', 'Run', 'Project', 'Species', 'donor',
       'tissue', 'site', 'Tumor.grade', 'Diagnosis', 'Diagnosis.subclass',
       'UMAP_1', 'UMAP_2', 'TRA.tag', 'TRB.tag', 'TRA.nt.chains.tag',
       'TRB.nt.chains.tag', 'TRA.aa.chains.tag', 'TRB.aa.chains.tag',
       'clon.size.tag', 'clon.proportion.tag', 'expDegree', 'sGroup_tag',
       'pattern', 'n_sG', 'tra.v', 'tra.j', 'trb.v', 'trb.j',
       'cell_classification_deconv', 'paper_cluster', 'Cell.Type',
       'library_name', 'donor_publicationID'],
      dtype='object')

In [25]:
useful_cd8_annotation = cd8_annotation[useful_columns]
useful_cd8_annotation

Unnamed: 0_level_0,site,Diagnosis,Diagnosis.subclass,cell_classification_deconv,Cell.Type,donor_publicationID
cell_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAACCTGAGCGATATA-1,brain,,,GZMK_HI,CD8,
AAACCTGCACCGAAAG-1,brain,Low grade glioma,Pilocytic astrocytoma,TRM,CD8,PBT_10
AAACCTGGTCACCCAG-1,brain,Low grade glioma,Pilocytic astrocytoma,GZMK_HI,CD8,PBT_05
AAACCTGTCCTCTAGC-1,brain,Low grade glioma,Pilocytic astrocytoma,TRM,CD8,PBT_05
AAACGGGAGAACTCGG-1,brain,Low grade glioma,Pilocytic astrocytoma,TRM,CD8,PBT_05
...,...,...,...,...,...,...
TTTGGTTTCACAAACC-10,brain,Meningioma,Meningioma,GZMK_HI,CD8,PBT_15
TTTGGTTTCCGAGCCA-10,brain,Meningioma,Meningioma,CD16p_Effector,CD8,PBT_15
TTTGGTTTCGAGAACG-10,brain,Meningioma,Meningioma,GZMK_HI,CD8,PBT_15
TTTGGTTTCTCCGGTT-10,brain,Medulloblastoma,Medulloblastoma,GZMK_HI,CD8,PBT_34


In [26]:
useful_cd8_annotation['donor_publicationID'].unique()

array([nan, 'PBT_10', 'PBT_05', 'PBT_12', 'PBT_01', 'PBT_35', 'PBT_11',
       'PBT_02', 'PBT_26', 'PBT_25', 'PBT_32', 'PBT_24', 'PBT_30',
       'PBT_09', 'PBT_20', 'PBT_21', 'PBT_29', 'PBT_04', 'PBT_07',
       'PBT_13', 'PBT_22', 'PBT_14', 'PBT_23', 'PBT_17', 'PBT_31',
       'PBT_03', 'PBT_19', 'PBT_33', 'PBT_18', 'PBT_27', 'PBT_28',
       'PBT_08', 'PBT_38', 'PBT_15', 'PBT_06', 'PBT_34', 'PBT_37',
       'PBT_36', 'PBT_16'], dtype=object)

In [27]:
len(useful_cd8_annotation['donor_publicationID'].unique())

39

In [28]:
cd8_subset = useful_cd8_annotation[useful_cd8_annotation['donor_publicationID'].isna()]
cd8_subset

Unnamed: 0_level_0,site,Diagnosis,Diagnosis.subclass,cell_classification_deconv,Cell.Type,donor_publicationID
cell_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAACCTGAGCGATATA-1,brain,,,GZMK_HI,CD8,
AAAGATGAGACTGGGT-1,brain,,,GZMK_HI,CD8,
AAATGCCAGACTGGGT-1,brain,,,MAIT,CD8,
AAATGCCTCACATACG-1,brain,,,TCF7high_CCR7high,CD8,
AACACGTCAATGTAAG-1,brain,,,GZMK_HI,CD8,
...,...,...,...,...,...,...
TCAGCTCAGTATCGAA-10,brain,,,TCF7high_CCR7low,CD8,
TCAGGATTCCACGCAG-10,brain,,,MAIT,CD8,
TCCACACCATGGGAAC-10,brain,,,MAIT,CD8,
TCTTTCCCATATGCTG-10,brain,,,CD16p_Effector,CD8,


<span style="color:red">**PROBLEM:**</span> 1569 cells from the CD8 file are from unknown donors

### 2. Overall AnnData object of the dataset

<span style="color:red">**IMPORTANT:**</span> transpose the DataFrame.values to match the AnnData.X

1. `DataFrame.columns`: cell barcodes, which go into `.obs`
2. `DataFrame.index`: gene names, `.var`
3. `DataFrame.values`: the transpose of the expression matrix, `.X`

In [None]:
# Load the metadata from the paper's Supplementary Table 1
age_sex_recurrent = {                
                'PBT_01': [4   ,'male', 'primary'],
                'PBT_02': [11  ,'male', 'primary'],
                'PBT_03': [4   ,'female', 'primary'],
                'PBT_04': [8   ,'female', 'primary'],
                'PBT_05': [6   ,'female', 'primary'],
                'PBT_06': [9   ,'male', 'primary'],
                'PBT_07': [4   ,'female', 'primary'],
                'PBT_08': [5   ,'male', 'primary'],
                'PBT_09': [8   ,'male', 'primary'],
                'PBT_10': [11  ,'female', 'primary'],
                'PBT_11': [1.5 ,'female', 'primary'],
                'PBT_12': [17  ,'female', 'primary'],
                'PBT_13': [2   ,'female', 'primary'],
                'PBT_14': [13  ,'female', 'recurrent'],
                'PBT_15': [14  ,'female', 'primary'],
                'PBT_16': [2   ,'female', 'primary'],
                'PBT_17': [1.5 ,'male', 'primary'],
                'PBT_18': [2   ,'female', 'primary'],
                'PBT_19': [6   ,'female', 'primary'],
                'PBT_20': [9   ,'female', 'recurrent'],
                'PBT_21': [1.75,'male', 'recurrent'],
                'PBT_22': [8   ,'male', 'primary'],
                'PBT_23': [13  ,'female', 'primary'],
                'PBT_24': [12  ,'male', 'primary'],
                'PBT_25': [0.2 ,'female', 'primary'],
                'PBT_26': [10  ,'female', 'primary'],
                'PBT_27': [7   ,'male', 'primary'],
                'PBT_28': [5   ,'male', 'primary'],
                'PBT_29': [2   ,'male', 'primary'],
                'PBT_30': [4   ,'female', 'recurrent'],
                'PBT_31': [7   ,'male', 'primary'],
                'PBT_32': [6   ,'female', 'primary'],
                'PBT_33': [13  ,'female', 'primary'],
                'PBT_34': [5   ,'male', 'primary'],
                'PBT_35': [11  ,'female', 'primary'],
                'PBT_36': [9   ,'male', 'primary'],
                'PBT_37': [17  ,'female', 'primary'],
                'PBT_38': [12  ,'female', 'primary'],
                'PBT_39': [5   ,'male', 'primary'],
                'PBT_40': [5   ,'male', 'primary']
                }


,
'PBT_01': [4   ,'male', 'primary'],
'PBT_02': [11  ,'male', 'primary'],
'PBT_03': [4   ,'female', 'primary'],
'PBT_04': [8   ,'female', 'primary'],
'PBT_05': [6   ,'female', 'primary'],
'PBT_06': [9   ,'male', 'primary'],
'PBT_07': [4   ,'female', 'primary'],
'PBT_08': [5   ,'male', 'primary'],
'PBT_09': [8   ,'male', 'primary'],
'PBT_10': [11  ,'female', 'primary'],
'PBT_11': [1.5 ,'female', 'primary'],
'PBT_12': [17  ,'female', 'primary'],
'PBT_13': [2   ,'female', 'primary'],
'PBT_14': [13  ,'female', 'recurrent'],
'PBT_15': [14  ,'female', 'primary'],
'PBT_16': [2   ,'female', 'primary'],
'PBT_17': [1.5 ,'male', 'primary'],
'PBT_18': [2   ,'female', 'primary'],
'PBT_19': [6   ,'female', 'primary'],
'PBT_20': [9   ,'female', 'recurrent'],
'PBT_21': [1.75,'male', 'recurrent'],
'PBT_22': [8   ,'male', 'primary'],
'PBT_23': [13  ,'female', 'primary'],
'PBT_24': [12  ,'male', 'primary'],
'PBT_25': [0.2 ,'female', 'primary'],
'PBT_26': [10  ,'female', 'primary'],
'PBT_27': [7   ,'male', 'primary'],
'PBT_28': [5   ,'male', 'primary'],
'PBT_29': [2   ,'male', 'primary'],
'PBT_30': [4   ,'female', 'recurrent'],
'PBT_31': [7   ,'male', 'primary'],
'PBT_32': [6   ,'female', 'primary'],
'PBT_33': [13  ,'female', 'primary'],
'PBT_34': [5   ,'male', 'primary'],
'PBT_35': [11  ,'female', 'primary'],
'PBT_36': [9   ,'male', 'primary'],
'PBT_37': [17  ,'female', 'primary'],
'PBT_38': [12  ,'female', 'primary'],
'PBT_39': [5   ,'male', 'primary'],
'PBT_40': [5   ,'male', 'primary']

In [10]:
inputs = (cd4_input, cd8_input)
annotations = (useful_cd4_annotation, useful_cd8_annotation)

age_sex_dict = {
'PBT_01':	[4	,M	Low grade glioma	Pilocytic astrocytoma	I	New
'PBT_02':	[11,	M	Low grade glioma	Pilocytic astrocytoma	I	New
'PBT_03':	[4	,F	Low grade glioma	Pilocytic astrocytoma	I	New
'PBT_04':	[8	,F	Low grade glioma	Diffuse astrocytoma	II	New
'PBT_05':	[6	,F	Low grade glioma	Pilocytic astrocytoma	I	New
'PBT_06':	[9	,M	Low grade glioma	Subependymal giant cell astrocytoma	I	New
'PBT_07':	[4	,F	Low grade glioma	Pilocytic astrocytoma	I	New
'PBT_08':	[5	,M	Low grade glioma	Pilocytic astrocytoma	I	New
'PBT_09':	[8	,M	Low grade glioma	Ganglioglioma	I	New
'PBT_10':	[11,	F	Low grade glioma	Pilocytic astrocytoma	I	New
'PBT_11':	[1.,5	F	Low grade glioma	Ganglioglioma	I	New
'PBT_12':	[17,	F	Choroid plexus papilloma	NA	I	New
'PBT_13':	[2	,F	Craniopharyngioma	NA	I	New
'PBT_14':	[13,	F	Craniopharyngioma	NA	I	Recurrent
'PBT_15':	[14,	F	Anaplastic meningioma	NA	III	New
'PBT_16':	[2	,F	Embryonal tumor with multilayered rosettes (ETMR)	ETMR, C19MC altered	IV	New
'PBT_17':	[1.,5	M	Anaplastic ependymoma	NA	III	New
'PBT_18':	[2	,F	Anaplastic ependymoma	NA	III	New
'PBT_19':	[6	,F	Anaplastic ependymoma	NA	III	New
'PBT_20':	[9	,F	Anaplastic ependymoma	NA	IV	Recurrent
'PBT_21':	[1.,75	M	Anaplastic ependymoma	NA	III	Recurrent
'PBT_22':	[8	,M	High grade glioma	NA	IV	New
'PBT_23':	[13,	F	High grade glioma	NA	III	New
'PBT_24':	[12,	M	High grade glioma	Diffuse midline glioma	IV	New
'PBT_25':	[0.,2	F	High grade glioma	NA	IV	New
'PBT_26':	[10,	F	High grade glioma	Diffuse midline glioma	III	New
'PBT_27':	[7	,M	High grade glioma		IV	New
'PBT_28':	[5	,M	High grade glioma	Infant-type hemispheric glioma	IV	New
'PBT_29':	[2	,M	Medulloblastoma	NON-WNT	IV	New
'PBT_30':	[4	,F	Medulloblastoma	NON-WNT, NON-SHH	IV	Recurrent
'PBT_31':	[7	,M	Medulloblastoma	NON-WNT, NON-SHH	IV	New
'PBT_32':	[6	,F	Medulloblastoma	NON-WNT, NON-SHH	IV	New
'PBT_33':	[13,	F	Medulloblastoma	NON-WNT, NON-SHH	IV	New
'PBT_34':	[5	,M	Medulloblastoma	NON-WNT, NON-SHH	IV	New
'PBT_35':	[11,	F	Medulloblastoma	WNT-activated	IV	New
'PBT_36':	[9	,M	Medulloblastoma	NON-WNT, NON-SHH	IV	New
'PBT_37':	[17,	F	Medulloblastoma	SHH-activated	IV	New
'PBT_38':	[12,	F	Medulloblastoma	WNT-activated	IV	New
'PBT_39':	[5	,M	Medulloblastoma	NON-WNT, NON-SHH	IV	New
'PBT_40':	[5	,M	Medulloblastoma		IV	New
}

for input, annotation in zip(inputs, annotations):

    # add more info in .obs
    annotation['dataset'] = 'GSE221776'
    annotation['cancer_type'] = annotation['Diagnosis'] + annotations['Diagnosis.subclass']
    annotation['uni_barcode'] = annotation['dataset'] + '_' + annotation.index.astype(str)
    annotation['sample_barcode'] = annotation['dataset'] + '_' + annotation['donor_publicationID']
    annotation['cell_type_from_paper'] = annotation['cell_classification_deconv'] + '_' + annotation['Cell.Type']
    	

    # rename/drop the columns for consistency
    annotation.rename(columns={'site': 'tissue'}, inplace=True)

    matrix = scipy.sparse.csr_matrix(input.values.T)
    obs_name = annotation
    var_name = pd.DataFrame(input.index)
    var_name.rename(columns={'Gene': 'gene_symbols'}, inplace=True)

    sample = anndata.AnnData(X=matrix, obs=obs_name, var=var_name)
    print(sample)

    # Create an observation metric info to store related features


    sample.obs = obs_metrics
    sample.obs.set_index("uni_barcode", drop=False, inplace=True)
    print(sample)

    # save the anndata object
    sample.write_h5ad('/scratch/user/s4543064/xiaohan-john-project/write/GSE102130/GSE102130_K27Mproject.RSEM.vh20170621_uni.h5ad', compression="gzip")



AnnData object with n_obs × n_vars = 4058 × 23686
    var: 'gene_symbols'
AnnData object with n_obs × n_vars = 4058 × 23686
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_symbols'


In [11]:
sample.obs

Unnamed: 0_level_0,cancer_type,dataset,tissue,uni_barcode
uni_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GSE102130_MUV1-P04-B12,H3K27M-glioma,GSE102130,brain,GSE102130_MUV1-P04-B12
GSE102130_MUV1-P04-C08,H3K27M-glioma,GSE102130,brain,GSE102130_MUV1-P04-C08
GSE102130_MUV1-P04-D09,H3K27M-glioma,GSE102130,brain,GSE102130_MUV1-P04-D09
GSE102130_MUV1-P04-D10,H3K27M-glioma,GSE102130,brain,GSE102130_MUV1-P04-D10
GSE102130_MUV1-P04-E03,H3K27M-glioma,GSE102130,brain,GSE102130_MUV1-P04-E03
...,...,...,...,...
GSE102130_Oligo-P22-H03,H3K27M-glioma,GSE102130,brain,GSE102130_Oligo-P22-H03
GSE102130_Oligo-P22-H05,H3K27M-glioma,GSE102130,brain,GSE102130_Oligo-P22-H05
GSE102130_Oligo-P22-H06,H3K27M-glioma,GSE102130,brain,GSE102130_Oligo-P22-H06
GSE102130_Oligo-P22-H08,H3K27M-glioma,GSE102130,brain,GSE102130_Oligo-P22-H08


### 3. Confirmation of created AnnData object

In [13]:
output = '/scratch/user/s4543064/xiaohan-john-project/write/GSE102130/GSE102130_K27Mproject.RSEM.vh20170621_uni.h5ad'
sample = anndata.read_h5ad(output)
print(sample)

AnnData object with n_obs × n_vars = 4058 × 23686
    obs: 'cancer_type', 'dataset', 'tissue', 'uni_barcode'
    var: 'gene_symbols'


### 4. Convert AnnData objects to SingleCellExperiment objects

In [14]:
from pathlib import Path

import anndata2ri
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.conversion import localconverter

# Specify directory paths
write_directory = Path('/scratch/user/s4543064/xiaohan-john-project/write/GSE102130')

# Loop through all files in the directory
for file in write_directory.iterdir():
    sample_name = file.stem
    if "_uni.h5ad" in file.name:
        sample_anndata = anndata.read_h5ad(file)
        sample_sce_file = sample_name + ".rds"

        with localconverter(anndata2ri.converter):
            sample_sce = anndata2ri.py2rpy(sample_anndata)
        print(sample_sce)
        
        # Save the sce object in .rds file
        robjects.globalenv["sample_sce"] = sample_sce
        sample_sce_path = write_directory / sample_sce_file
        robjects.r("saveRDS(sample_sce, file='{}')".format(sample_sce_path))