### This is the notebook for analyzing LUAD data

##### Step 1: Organize the data into an AnnData object

In [1]:
from pathlib import Path

import anndata as ad
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
# Show the installed anndata version so it is captured in the cell output.
print(ad.__version__)

0.10.9


In [None]:

cancer_type = "LUAD"
omic_type = "proteomics"

# The tumor and normal tables follow one naming scheme and differ only in the
# sample-type token, so each path is built from the same template.
abundance_files = {}
for sample_type in ("Tumor", "Normal"):
    abundance_files[sample_type] = f"../../MultiOmics_cancer_landscape_data/{cancer_type}/Data/{cancer_type}_{omic_type}_gene_abundance_log2_reference_intensity_normalized_{sample_type}.txt"

file_tumor = abundance_files["Tumor"]
file_normal = abundance_files["Normal"]

# Load both tab-separated tables.
df_tumor = pd.read_csv(file_tumor, sep="\t")
df_normal = pd.read_csv(file_normal, sep="\t")

In [20]:

# Tag every sample column with its tissue of origin *before* merging.
# pd.merge only applies `suffixes` to column names that occur in both frames,
# so relying on it leaves samples present in just one table without a suffix,
# and the SampleType / Sample parsing below would then slice characters out of
# the raw sample ID instead.
normal_tagged = df_normal.rename(columns=lambda col: col if col == "idx" else f"{col}_N")
tumor_tagged = df_tumor.rename(columns=lambda col: col if col == "idx" else f"{col}_T")

# Keep only the rows whose "idx" value occurs in both tables, keyed by "idx".
merged_df = pd.merge(normal_tagged, tumor_tagged, on="idx", how="inner").set_index("idx")

# Transpose so that samples are rows and the "idx" entries are columns.
transposed_df = merged_df.T

# Per-sample annotations, parsed from the "<sample>_<N|T>" row labels.
obs = pd.DataFrame(
    index=transposed_df.index,
    data={
        'SampleType': transposed_df.index.str[-1],  # trailing "N" or "T"
        'Sample': transposed_df.index.str[:-2],     # label without the "_N" / "_T" suffix
    },
)
# Every row label was suffixed above, so no other sample type can appear.
assert set(obs['SampleType']) <= {"N", "T"}, "unexpected sample-type suffix"

# Per-feature annotations.
var = pd.DataFrame(
    index=transposed_df.columns,
    data={
        'Gene': transposed_df.columns,
        'GeneID': transposed_df.columns,  # currently a copy of 'Gene'
    },
)

adata = ad.AnnData(
    X=transposed_df,
    obs=obs,
    var=var,
)

print(adata)



AnnData object with n_obs × n_vars = 211 × 12431
    obs: 'SampleType', 'Sample'
    var: 'Gene', 'GeneID'


In [24]:
# Only a cell's final expression is rendered automatically, so the first two
# objects are shown with display(), which Jupyter/IPython provides without an
# import.
display(adata.var_names)
display(adata.var.head())
adata.obs


Unnamed: 0,SampleType,Sample
C3N-01799_N,N,C3N-01799
C3L-01890_N,N,C3L-01890
C3N-00572_N,N,C3N-00572
C3N-02423_N,N,C3N-02423
C3N-02729_N,N,C3N-02729
...,...,...
C3N-02003_T,T,C3N-02003
C3N-00175_T,T,C3N-00175
C3N-01823_T,T,C3N-01823
C3L-02549_T,T,C3L-02549


In [None]:
object_file = f"../../Object/MultiOmics_cancer_landscape_data/{cancer_type}/{cancer_type}_{omic_type}.h5ad"

# Make sure the output folder exists so the write below cannot fail on a
# missing directory (e.g. on a fresh checkout).
Path(object_file).parent.mkdir(parents=True, exist_ok=True)

adata.write(object_file, compression="gzip")