In [2]:
import pandas as pd
# 读取CSV文件
df1 = pd.read_csv('~/biatnovo/train-data/ftp.peptideatlas.org/biatNovo/train_dataset_unique.csv')
print(len(df1))
df2 = pd.read_csv('/root/biatnovo/DeepNovo-DIA/oc/oc_test.feature.csv')
print(len(df2))

# 提取两个数据框中唯一的seq值
unique_seqs_df1 = set(df1['seq'].unique())
unique_seqs_df2 = set(df2['seq'].unique())

# 找到公共的seq
common_seqs = unique_seqs_df1.intersection(unique_seqs_df2)

# 计算df2中属于公共seq的条目占df2整体的比例
# 首先，创建一个指示列，标记每个条目的seq是否属于公共seq
df2['is_common_seq'] = df2['seq'].apply(lambda x: x in common_seqs)

# 然后，统计属于公共seq的条目数
common_seq_count_in_df2 = df2['is_common_seq'].sum()

# 计算比例
common_seq_proportion_in_df2 = common_seq_count_in_df2 / len(df2)

# 输出结果
print(f"在第二个CSV文件中，属于公共seq的条目数量: {common_seq_count_in_df2}")
print(f"这些条目占第二个CSV文件总条目的比例: {common_seq_proportion_in_df2:.2%}")

474527
61585
在第二个CSV文件中，属于公共seq的条目数量: 53977
这些条目占第二个CSV文件总条目的比例: 87.65%


In [4]:
# 找到两个数据框中公共的seq
common_seqs = set(df1['seq']).intersection(df2['seq'])

# 筛选出属于公共seq的行，并添加前缀到spec_group_id
df1_common = df1[df1['seq'].isin(common_seqs)].copy()
df1_common['spec_group_id'] = 'training_' + df1_common['spec_group_id'].astype(str)

df2_common = df2[df2['seq'].isin(common_seqs)].copy()
df2_common['spec_group_id'] = 'test_' + df2_common['spec_group_id'].astype(str)

# 合并数据
df_combined = pd.concat([df1_common, df2_common])

# 按seq列和spec_group_id列对合并后的数据进行排序
# 这确保了具有相同seq的行首先按照seq值排序，然后在相同seq内部按照training和test排序
df_combined_sorted = df_combined.sort_values(by=['seq', 'spec_group_id'])

# 写入新文件，确保seq一样的放在一块
df_combined_sorted.to_csv('combined_common_seqs.csv', index=False)

In [26]:
import pandas as pd

# 读取文件biatnovo-sb
df = pd.read_csv('/root/predict_bi_in_dir/accuracy.txt', sep='\t')
#df = pd.read_csv('/root/biatnovo/DeepNovo-DIA/oc/oc_test.feature.csv.deepnovo_denovo.accuracy', sep='\t')
print(common_seqs)
# 定义一个函数，用于修改feature_id，并处理target_sequence
def process_row(row):
    # 移除逗号
    processed_seq = row['target_sequence'].replace(",", "")
    # 替换C(Carbamidomethylation)为C(+57.02)和M(Oxidation)为M(+15.99)
    processed_seq = processed_seq.replace("C(Carbamidomethylation)", "C(+57.02)")
    processed_seq = processed_seq.replace("M(Oxidation)", "M(+15.99)")
    # 更新target_sequence
    row['target_sequence'] = processed_seq
    # 添加common_前缀（如果适用）
    if processed_seq in common_seqs:
        return 'common_' + row['feature_id']
    else:
        return row['feature_id']

# 应用上面的函数到每一行的feature_id列
df['feature_id'] = df.apply(process_row, axis=1)

# 统计 target_sequence 出现次数
sequence_counts = df['target_sequence'].value_counts()

# 将出现次数映射到每行
df['sequence_count'] = df['target_sequence'].map(sequence_counts)
# 计算recall_AA与target_len的比例，并添加为新列
df['recall_ratio'] = df['recall_AA'] / df['target_len']

# 首先按target_sequence列排序，然后按新计算的recall_ratio列排序
df_sorted = df.sort_values(by=['sequence_count','target_sequence', 'recall_ratio'], ascending=[False, True, False])

# 调整recall_ratio列的位置，使其放在predicted_sequence列之后
# 获取所有列名为一个列表
columns = list(df_sorted.columns)

# 移除'sequence_count'列（稍后将其重新插入）
columns.remove('sequence_count')

# 找到'feature_id'的索引
feature_id_index = columns.index('feature_id')

# 在'feature_id'之后插入'sequence_count'
columns.insert(feature_id_index + 1, 'sequence_count')

# 找到'predicted_sequence'的索引
predicted_sequence_index = columns.index('predicted_sequence')

# 移除'recall_ratio'列
columns.remove('recall_ratio')

# 在'predicted_sequence'之后插入'recall_ratio'
columns.insert(predicted_sequence_index + 1, 'recall_ratio')

# 使用新的列顺序重排数据框
df_sorted = df_sorted[columns]

# 显示排序和重排列后的DataFrame头部（可选）
#print(df_sorted.head())

# 将排序和重排列后的数据写入新文件
#df_sorted.to_csv('deepnovo_sorted_accuracy.csv', sep='\t', index=False)
df_sorted.to_csv('sorted_accuracy.csv', sep='\t', index=False)

{'LLEYSGLK', 'LLPAQLPAEK', 'LGVAGQWR', 'DTDTGALLFIGK', 'IAC(+57.02)VVPSEC(+57.02)EK', 'SSVLGFAC(+57.02)K', 'IPIFSAAGLPHNEIAAQIC(+57.02)R', 'TFVPGC(+57.02)QPGEFTLGNIK', 'IFWSDATQGK', 'TFYETPLQLLEK', 'DEFTNTC(+57.02)PSDK', 'ATSIVAWLAK', 'APDFVFYAPR', 'PGSC(+57.02)PIILIR', 'NWGLGGHAFC(+57.02)R', 'INVNEIFYDLVR', 'C(+57.02)LTAIVK', 'LQTSSVLVSGLR', 'ESHVTLASPEETR', 'DNNGGC(+57.02)EQVC(+57.02)VLSHR', 'ITLDNAYM(+15.99)EK', 'GWPEVWAGSVGR', 'C(+57.02)YFQEGR', 'LYIFQASPADAGQYVC(+57.02)R', 'STGGISVPGPM(+15.99)GPSGPR', 'IVVAGM(+15.99)LLR', 'ATPQHTVSFTC(+57.02)ESHGFSPR', 'IVLQIDNAR', 'SLTLEVC(+57.02)QC(+57.02)DNR', 'GGGHILPYDQPLR', 'DC(+57.02)IGGC(+57.02)SDLVSLQQSGELLTR', 'IVAEEFLK', 'FLQDYFDGNLKR', 'GVGYETILK', 'C(+57.02)VALEASGEHR', 'AFGFSHLEALLDDSKELQR', 'VNEALLQR', 'YNPTWHC(+57.02)IVGR', 'YLEQQEGQLK', 'VYTVDLGR', 'TC(+57.02)QDIDEC(+57.02)LEQNVHC(+57.02)GPNR', 'QNC(+57.02)GFPGVTPSQC(+57.02)ANKGC(+57.02)C(+57.02)FDDTVR', 'TMQALPYSTVGNSNNYLHLSVLR', 'EVVLQWFTENSK', 'FAGNYDLVYLHC(+57.02)EVYLC(+57.02)

In [23]:

df = pd.read_csv('deepnovo_sorted_accuracy.csv', sep='\t')

# 添加新列，用于存放处理后的分桶标签
df['recall_ratio_bin'] = pd.cut(df['recall_ratio'], bins=[0, 0.25, 0.5, 0.75, 0.999, 1], 
                                labels=['0-0.25', '0.25-0.5', '0.5-0.75', '0.75-0.999', '1.0'], 
                                right=False, include_lowest=True)

# 对于 recall_ratio 等于 1 的情况，直接分配标签 '1.0'
df.loc[df['recall_ratio'] == 1, 'recall_ratio_bin'] = '1.0'

# 根据是否有 'common' 前缀分组统计 recall_ratio_bin 的比例
# 筛选出有 'common' 前缀的 feature_id
df_common = df[df['feature_id'].str.startswith('common')]
# 筛选出没有 'common' 前缀的 feature_id
df_not_common = df[~df['feature_id'].str.startswith('common')]

# 对 'common' 的 recall_ratio_bin 进行比例统计
bin_counts_common = df_common['recall_ratio_bin'].value_counts(normalize=True).sort_index()

# 对非 'common' 的 recall_ratio_bin 进行比例统计
bin_counts_not_common = df_not_common['recall_ratio_bin'].value_counts(normalize=True).sort_index()

# 输出结果
print("有 'common' 前缀的 feature_id 的 recall_ratio 分桶比例:")
print(bin_counts_common)
print("\n没有 'common' 前缀的 feature_id 的 recall_ratio 分桶比例:")
print(bin_counts_not_common)

有 'common' 前缀的 feature_id 的 recall_ratio 分桶比例:
0-0.25        0.372143
0.25-0.5      0.094100
0.5-0.75      0.102371
0.75-0.999    0.089152
1.0           0.342233
Name: recall_ratio_bin, dtype: float64

没有 'common' 前缀的 feature_id 的 recall_ratio 分桶比例:
0-0.25        0.520345
0.25-0.5      0.126441
0.5-0.75      0.127767
0.75-0.999    0.086017
1.0           0.139430
Name: recall_ratio_bin, dtype: float64


In [28]:
# 步骤2: 对DataFrame按target_sequence进行分组，并计算recall_ratio为1.0的比例
# 创建一个新的DataFrame来存储结果
# 步骤1: 读取CSV文件
df = pd.read_csv('sorted_accuracy.csv', sep='\t')
result_df = pd.DataFrame()

# 对每个target_sequence分组
grouped = df.groupby('target_sequence')

# 计算每组recall_ratio为1.0的比例并保留sequence_count
result_df['sequence_count'] = grouped['sequence_count'].first()  # 每个组的sequence_count是相同的，取第一个即可
result_df['recall_ratio_1.0_proportion'] = grouped.apply(lambda x: (x['recall_ratio'] == 1.0).mean())

# 重置索引，确保target_sequence也被包括在最终的DataFrame中
result_df.reset_index(inplace=True)

# 步骤3: 输出结果
print(result_df)

                                        target_sequence  sequence_count  \
0                                 A,A,A,A,A,A,A,L,Q,A,K               5   
1                               A,A,A,A,T,G,T,I,F,T,F,R              21   
2                                 A,A,A,T,T,A,Q,E,Y,L,K               6   
3                     A,A,A,V,S,E,A,E,A,D,F,Y,E,Q,N,S,R              10   
4     A,A,C(Carbamidomethylation),C(Carbamidomethyla...               7   
...                                                 ...             ...   
7191                                      Y,Y,T,V,F,D,R              10   
7192                                      Y,Y,V,D,S,V,K              10   
7193                            Y,Y,V,T,I,I,D,A,P,G,H,R              12   
7194                          Y,Y,Y,D,G,K,D,Y,I,E,F,N,K               9   
7195  Y,Y,Y,V,C(Carbamidomethylation),Q,Y,C(Carbamid...              10   

      recall_ratio_1.0_proportion  
0                        0.000000  
1                        0.