In [87]:
import warnings
import pandas as pd
import scanpy as sc
warnings.filterwarnings("ignore", category=FutureWarning)

In [72]:
# Location of the SEA-AD MTG MERFISH AnnData file on the local machine.
MERFISH_H5AD_PATH = "/Users/a1234/Desktop/SEAAD_MTG_MERFISH.2024-12-11.h5ad"
merfish_data = sc.read_h5ad(filename=MERFISH_H5AD_PATH)

In [88]:
# Display the AnnData summary (dimensions and the available annotation fields).
merfish_data

AnnData object with n_obs × n_vars = 1887729 × 180
    obs: 'Donor ID', 'Sex', 'Gender', 'Age at Death', 'Race (choice=White)', 'Race (choice=Black/ African American)', 'Race (choice=Asian)', 'Race (choice=American Indian/ Alaska Native)', 'Race (choice=Native Hawaiian or Pacific Islander)', 'Race (choice=Unknown or unreported)', 'Race (choice=Other)', 'specify other race', 'Hispanic/Latino', 'Highest level of education', 'Years of education', 'PMI', 'Fresh Brain Weight', 'Brain pH', 'Overall AD neuropathological Change', 'Thal', 'Braak', 'CERAD score', 'Overall CAA Score', 'Highest Lewy Body Disease', 'Total Microinfarcts (not observed grossly)', 'Total microinfarcts in screening sections', 'Atherosclerosis', 'Arteriolosclerosis', 'LATE', 'Cognitive Status', 'Last CASI Score', 'Interval from last CASI in months', 'Last MMSE Score', 'Interval from last MMSE in months', 'Last MOCA Score', 'Interval from last MOCA in months', 'APOE4 Status', 'Primary Study Name', 'Secondary Study Name', 

Subclasses to study:

In [73]:
# Cell subclasses whose layer distribution is analysed below.
subclasses = [
    "L2/3 IT",
    "L4 IT",
    "L5 IT",
    "Oligodendrocyte",
    "Vip",
]

Layers to study:

In [74]:
# Distinct, non-missing layer labels; empty-string labels are excluded and the
# remainder is sorted.
annotated_layers = merfish_data.obs['Layer annotation'].unique().dropna()
layers = sorted(label for label in annotated_layers if label != '')

Because the number of samples is limited, only 27 donors are available; the same donors were also used in the machine-learning analysis:

In [75]:
# Distinct donor identifiers found in the cell-level metadata.
donor_ids = merfish_data.obs['Donor ID']
donor_list = donor_ids.unique()

We then split the cells into two subsets according to the donor's cognitive status: `patient_data` (with dementia) and `normal_data` (without dementia):

In [70]:
# Subset the cells by the 'Cognitive Status' recorded in obs.
cognitive_status = merfish_data.obs['Cognitive Status']
patient_data = merfish_data[cognitive_status == "Dementia"]
normal_data = merfish_data[cognitive_status == "No dementia"]

In [76]:
def layer_dist_percent(data_input, target_subclasses, all_layers, donors=None):
    """Compute, per donor, the percentage of a subclass's cells found in each layer.

    Parameters
    ----------
    data_input
        Object exposing an ``obs`` DataFrame with the columns ``"Subclass"``,
        ``"Donor ID"`` and ``"Layer annotation"``.
    target_subclasses
        Iterable of subclass labels to summarise.
    all_layers
        Layer labels used as the columns of every returned table. Cells whose
        layer is not in this list do not contribute to the percentages.
    donors
        Optional donor identifiers used as the row index of every returned
        table. When omitted, the donors present in ``data_input.obs`` are used,
        so the result no longer depends on a notebook-level variable and does
        not contain rows for donors that are absent from ``data_input``.

    Returns
    -------
    dict
        Maps each subclass label to a DataFrame (rows: donors, columns:
        ``all_layers``) of percentages rounded to two decimals. A donor with no
        annotated cells of the subclass gets a row of zeros.
    """
    obs = data_input.obs

    if donors is None:
        donors = list(obs["Donor ID"].dropna().unique())
    else:
        donors = list(donors)

    # Cells without a usable layer label (missing or empty string) are ignored.
    layer_labels = obs["Layer annotation"]
    has_layer = layer_labels.notna() & (layer_labels != "")

    result_dict = {}

    for sub in target_subclasses:
        # Only the two grouping columns are needed, so avoid copying the rest.
        sub_data = obs.loc[
            (obs["Subclass"] == sub) & has_layer,
            ["Donor ID", "Layer annotation"],
        ]

        if sub_data.empty:
            print(f"Warning: No data for the subclass '{sub}' was found")
            # Float zeros keep the dtype consistent with the non-empty case.
            result_dict[sub] = pd.DataFrame(0.0, index=donors, columns=all_layers)
            continue

        # observed=True counts only donor/layer pairs that actually occur when
        # the grouping columns are categorical; missing pairs are filled with 0
        # by the unstack/reindex steps below.
        layer_count = (
            sub_data.groupby(["Donor ID", "Layer annotation"], observed=True)
            .size()
            .unstack(fill_value=0)
            .reindex(index=donors, columns=all_layers, fill_value=0)
        )

        # Replace zero totals with 1 so donors without cells yield 0% rather
        # than a division by zero.
        row_sums = layer_count.sum(axis=1).replace(0, 1)
        result_dict[sub] = (layer_count.div(row_sums, axis=0) * 100).round(2)

    return result_dict

In [79]:
# Per-donor layer percentages for each cohort (dementia first, then no dementia).
patient_percentage, normal_percentage = (
    layer_dist_percent(cohort, subclasses, layers)
    for cohort in (patient_data, normal_data)
)

Export the data from Python for analysis in R:

In [89]:
# Destination of the CSV handed over to the R analysis. Defined once so that
# the file written and the confirmation message below cannot drift apart.
export_path = '/Users/a1234/Desktop/merfish_for_r_analysis.csv'

# Cell-level metadata columns carried over to the export.
export_columns = [
    'Donor ID',
    'Cognitive Status',
    'Age at Death',
    'Sex',
    'Subclass',
    'Layer annotation',
    'Cell ID',
]
selected_obs = merfish_data.obs[export_columns]

# Spatial coordinates, aligned to the obs index so they can be joined column-wise.
# NOTE(review): assumes obsm['spatial'] has exactly two columns (x, y) — confirm.
spatial_coords = pd.DataFrame(
    merfish_data.obsm['spatial'],
    index=merfish_data.obs.index,
    columns=['x', 'y'],
)

export_data = pd.concat([selected_obs, spatial_coords], axis=1)

# Keep only cells that carry a usable layer label (neither missing nor empty).
has_layer = (
    export_data['Layer annotation'].notna()
    & (export_data['Layer annotation'] != '')
)
export_data = export_data[has_layer]

print(f"The exported data contains {len(export_data)} cells")
print("The first five lines of the data:")
print(export_data.head())
print("\nDistribution of subclasses of cells:")
print(export_data['Subclass'].value_counts())

# index=False: the obs index is not written; 'Cell ID' is exported as a column.
export_data.to_csv(export_path, index=False)
print(f"\nThe data has been successfully exported to '{export_path}'")


导出数据包含 341595 个细胞
数据前5行:
           Donor ID Cognitive Status Age at Death     Sex         Subclass  \
145223   H20.33.035      No dementia          90+  Female  Oligodendrocyte   
1018941  H21.33.038      No dementia           84  Female        Astrocyte   
144812   H20.33.035      No dementia          90+  Female        Astrocyte   
77633    H21.33.025      No dementia           88  Female          L5/6 NP   
1539497  H20.33.004         Dementia           86    Male  Oligodendrocyte   

        Layer annotation              Cell ID            x            y  
145223                L4  1110221018101970170  4984.735369  5514.470155  
1018941               L6  1209194515101310169  2798.627149  4612.727961  
144812                L6  1110221018101950128  4159.048633  5665.305649  
77633                 L6  1207112238104530037  5590.851466  8056.754418  
1539497               L5  1028044108102130336  4439.330726  6003.751796  

细胞子类分布:
Subclass
Oligodendrocyte    47522
L2/3 IT            