In [4]:
import scanpy as sc
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

# 1. Load the proteomics table (sheet "Table S1_MS2 quantification"),
#    using the first spreadsheet column as the row index.
protein_data = pd.read_excel(
    "/data/work/final/08_protein_integrate/mmc3.xls",
    sheet_name="Table S1_MS2 quantification",
    index_col=0,
)

# 2. Preprocess the proteomics table in a single chain:
#    - drop columns that are entirely empty,
#    - drop rows that are entirely empty,
#    - replace every remaining missing value with 0.
# NOTE(review): zero-filling makes "not measured" indistinguishable from a
# measured value of 0 in downstream steps -- confirm this is intended.
protein_data = (
    protein_data
    .dropna(how='all', axis=1)
    .dropna(how='all', axis=0)
    .fillna(0)
)

# 3. Collect the sample columns of each group by their column-name prefix.
#    The prefixes are case-sensitive and are matched exactly as written.
healthy_samples, early_oa_samples, late_oa_samples = (
    [name for name in protein_data.columns if name.startswith(prefix)]
    for prefix in ("Healthy_", "Early-Stage OA_", "Late-stage OA_")
)

# 4. Load the single-cell data (assumes it has already been loaded as `adata`).
# NOTE(review): a later cell reads `adata`, but the load below is commented
# out, so that cell depends on `adata` already existing in the kernel.
#adata = sc.read_h5ad("/data/work/final/03_anno/anno1_anno2.h5ad")

In [5]:
# Display the preprocessed proteomics table (rows are indexed by "proteins";
# columns are the peptide count followed by the per-sample quantification values).
protein_data

Unnamed: 0_level_0,number of peptides used for quantification,Healthy_1,Healthy_2,Healthy_3,Healthy_4,Healthy_5,Healthy_6,Healthy_7,Healthy_8,Healthy_9,...,Late-stage OA_2,Late-stage OA_3,Late-stage OA_4,Late-stage OA_5,Late-stage OA_6,Late-stage OA_7,Late-stage OA_8,Late-stage OA_9,Late-stage OA_10,Late-stage OA_11
proteins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A1B0GUI7,1,7962.180176,10566.505859,3511.363525,6733.663574,7323.731934,5695.518555,8743.236328,0.000000,7289.207031,...,0.000000,9946.881836,4068.889648,6525.944336,0.000000,0.000000,7646.911621,5426.996094,6113.855957,5210.149414
A0A1B0GUS4,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1L4H1,6,6111.037598,4188.682129,15615.686523,45500.335938,7990.219238,11910.249023,4277.672363,80598.570312,7900.782715,...,41631.019531,31939.869141,29715.738281,41243.214844,43315.953125,21427.468750,27247.117188,46034.699219,23845.689453,12225.129883
A6NEF3,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
B9A064,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y5Y7,4,427271.187500,204890.984375,185712.812500,436143.031250,613632.437500,231939.937500,338469.718750,342488.531250,506472.125000,...,69931.546875,96127.890625,62028.707031,40602.628906,50789.140625,132159.375000,63475.656250,69587.570312,91847.640625,67949.859375
Q9Y5Z4,5,10900.745117,7047.351074,16897.816406,16475.476562,15771.883789,11831.222656,10618.879883,16256.193359,6314.768555,...,2590.014160,4275.327148,3789.836426,3788.661377,2751.048096,5724.907227,3987.331299,3706.851807,4634.523926,1869.755371
Q9Y624,1,2084.883301,0.000000,17251.208984,0.000000,4116.647461,1972.275879,4249.358887,3084.951904,0.000000,...,0.000000,0.000000,1137.091064,0.000000,0.000000,2877.667480,0.000000,0.000000,0.000000,0.000000
Q9Y646,3,100293.593750,81999.843750,196756.000000,104035.398438,78530.250000,76460.046875,96051.343750,149002.000000,93800.281250,...,53431.015625,72068.484375,54602.476562,51520.207031,80229.343750,68777.234375,64550.480469,56089.605469,107325.320312,55382.902344


In [6]:
# Display the column names that were selected by the "Healthy_" prefix.
healthy_samples

['Healthy_1',
 'Healthy_2',
 'Healthy_3',
 'Healthy_4',
 'Healthy_5',
 'Healthy_6',
 'Healthy_7',
 'Healthy_8',
 'Healthy_9',
 'Healthy_10',
 'Healthy_11',
 'Healthy_12',
 'Healthy_13']

In [None]:
# 1. Match the proteomics data against the transcriptomics data.
# Derive a gene name from every protein index label by keeping the text before
# the first underscore (example conversion -- adjust to the actual data).
# NOTE(review): the index labels shown in the table output above contain no
# underscore and look like accession IDs rather than gene symbols, so this
# conversion appears to return them unchanged -- confirm whether a real
# ID-to-gene mapping is needed.
protein_genes = [label.partition('_')[0] for label in protein_data.index]

# 2. Keep the gene names that occur in both data sets.
# NOTE(review): `adata` is not defined in any cell shown here (its load is
# commented out above); it must already exist in the kernel.
common_genes = set(protein_genes).intersection(adata.var_names)

# 3. Assign each protein to the cell type(s) it originates from.
def assign_protein_to_celltype(protein_data, celltype_specific_genes, gene_mapping):
    """Group protein identifiers by the cell types whose marker genes they map to.

    Parameters
    ----------
    protein_data : pandas.DataFrame
        Only ``protein_data.index`` is read; every index label is treated as a
        protein identifier.
    celltype_specific_genes : dict
        Maps a cell-type label to a collection of gene names. Any iterable is
        accepted (list, tuple, set, array, ``pandas.Index``, ``pandas.Series``);
        a bare string is treated as one gene name.
    gene_mapping : dict
        Maps a protein identifier to its gene name. Identifiers missing from
        the mapping are used as the gene name themselves.

    Returns
    -------
    dict
        Cell-type label -> list of protein identifiers, in index order. Cell
        types without any matching protein are omitted, and a protein is
        listed under every cell type whose marker set contains its gene.
    """
    # Normalise every marker collection to a set exactly once. Besides turning
    # each lookup into a hash probe, this makes membership mean "is one of the
    # gene names" for every container type: `x in pandas.Series` tests the
    # index labels, and `x in "some string"` is a substring test, neither of
    # which is what is wanted here.
    marker_sets = {}
    for ct, genes in celltype_specific_genes.items():
        if isinstance(genes, str):
            genes = [genes]
        marker_sets[ct] = set(genes)

    protein_origin = {}
    for protein in protein_data.index:
        gene = gene_mapping.get(protein, protein)  # fall back to the identifier itself
        for ct, markers in marker_sets.items():
            if gene in markers:
                protein_origin.setdefault(ct, []).append(protein)
    return protein_origin

# Assume a simple protein -> gene name mapping: the text before the first
# underscore of each index label (adjust to the actual data).
gene_mapping = {
    protein_id: protein_id.partition('_')[0]
    for protein_id in protein_data.index
}
# NOTE(review): `celltype_specific_genes` is not defined in any cell shown
# here; it must already exist in the kernel before this cell runs.
protein_origin = assign_protein_to_celltype(protein_data, celltype_specific_genes, gene_mapping)

# 4. Visualise the number of cell type-specific proteins per cell type
#    (similar to Figure 2C of the article).
cell_types = list(protein_origin)
protein_counts = [len(protein_origin[cell_type]) for cell_type in cell_types]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(cell_types, protein_counts)
ax.tick_params(axis="x", labelrotation=90)  # vertical labels so long names do not overlap
ax.set_ylabel("Number of specific proteins")
ax.set_title("Cell type-specific proteins in liquid biopsy")
fig.tight_layout()
plt.show()