In [None]:
import pandas as pd
import numpy as np

def sample_tsv(input_file, output_file, percentage):
    """Sample a percentage of rows from a tab-separated file and rewrite its sequence columns.

    A fixed-seed random sample of the input rows is taken, then the
    ``predicted_forward_sequence`` and ``predicted_backward_sequence`` columns
    are rewritten: the modification names listed in ``modifications`` are
    replaced by their mass-shift notation and all commas are removed. Rows in
    which either of the two columns is not a string (for example an empty
    cell, which pandas reads as NaN) are reported on stdout and dropped. The
    remaining rows are written to ``output_file`` as a tab-separated file
    without the index.

    Args:
        input_file: Path of the tab-separated input file. It must contain the
            columns ``feature_id``, ``predicted_forward_sequence`` and
            ``predicted_backward_sequence``.
        output_file: Path the sampled, cleaned table is written to.
        percentage: Share of rows to sample, between 0 and 100 inclusive.

    Raises:
        ValueError: If ``percentage`` is outside the range 0..100.
    """
    # Validate first so a bad argument fails fast, before the file is read.
    if percentage < 0 or percentage > 100:
        raise ValueError("Percentage must be between 0 and 100")

    # Read the tab-separated input file.
    df = pd.read_csv(input_file, sep='\t')

    # Multiply before dividing: len(df) * (percentage / 100) can land just
    # below a whole number (100 * 0.29 == 28.999999999999996) and the int()
    # truncation would then silently drop one row from the sample.
    sample_size = int(len(df) * percentage / 100)

    # Fixed seed keeps the sample reproducible between runs.
    df_sample = df.sample(n=sample_size, random_state=42)

    # Modification name -> mass-shift notation.
    modifications = {
        "C(Carbamidomethylation)": "C(+57.02)",
        "M(Oxidation)": "M(+15.99)"
    }

    def replace_modifications(sequence, feature_id):
        """Return the rewritten sequence, or None (after logging) for a non-string value."""
        if not isinstance(sequence, str):
            print(f"Feature ID {feature_id} has a non-string value: {sequence}")
            return None
        for original, replacement in modifications.items():
            sequence = sequence.replace(original, replacement)
        # Strip the comma separators between residues.
        return sequence.replace(',', '')

    sequence_columns = ['predicted_forward_sequence', 'predicted_backward_sequence']

    # Walk each column together with feature_id instead of using a row-wise
    # DataFrame.apply: apply(axis=1) returns a DataFrame (not a Series) when
    # the sample is empty, which made the column assignment raise. Building
    # the Series explicitly works for zero rows and is faster for many rows.
    for column in sequence_columns:
        cleaned = [
            replace_modifications(sequence, feature_id)
            for sequence, feature_id in zip(df_sample[column], df_sample['feature_id'])
        ]
        df_sample[column] = pd.Series(cleaned, index=df_sample.index, dtype=object)

    # Count the rows in which at least one of the two sequences is missing.
    has_missing = df_sample[sequence_columns].isna().any(axis=1)
    none_count = int(has_missing.sum())
    print(f"共有 {none_count} 行包含None值")

    # Drop those rows.
    df_sample = df_sample.dropna(subset=sequence_columns)

    # Save the processed sample as a new tab-separated file.
    df_sample.to_csv(output_file, sep='\t', index=False)

    print(f"{percentage}% 的数据已保存到 {output_file}")

# Example invocation
percentage = 10  # keep 10% of the rows
input_file = '/root/v2/sb_transformer_independent_multheadapi/train_dataset_unique.csv1725206105.deepnovo_denovo'  # path of the input file
output_file = '/root/v2/sb_transformer_independent_multheadapi/train_dataset_10.deepnovo_denovo'  # path of the output file

sample_tsv(
    input_file=input_file,
    output_file=output_file,
    percentage=percentage,
)

In [8]:
import pandas as pd
def filter_rows_by_feature_id(b_file, c_file, d_file):
    """Keep the rows of ``c_file`` whose ``spec_group_id`` occurs as a ``feature_id`` in ``b_file``.

    ``b_file`` is read as a tab-separated table and ``c_file`` as a
    comma-separated table. The header of ``c_file`` and its ``spec_group_id``
    column are printed, and the matching rows are written to ``d_file`` as a
    comma-separated file without the index.

    Args:
        b_file: Path of the tab-separated file providing the ``feature_id`` values.
        c_file: Path of the comma-separated file to filter.
        d_file: Path the filtered rows are written to.
    """
    # Load both tables.
    id_table = pd.read_csv(b_file, sep='\t')
    spectra = pd.read_csv(c_file, sep=',')

    column_names = spectra.columns.tolist()
    print(f"文件 {c_file} 的表头: {column_names}")

    # Distinct identifiers present in the b file.
    wanted_ids = id_table['feature_id'].unique()

    spec_ids = spectra['spec_group_id']
    print(spec_ids)

    # Boolean mask of the c-file rows whose identifier is among the wanted ones.
    keep = spec_ids.isin(wanted_ids)
    matched = spectra.loc[keep]

    # Write the selection to the d file.
    matched.to_csv(d_file, sep=',', index=False)

    print(f"已保存包含相同feature_id的行到 {d_file}")

# Example invocation
d_file = '/root/v2/sb_transformer_independent_multheadapi/train_dataset_10.csv'  # path of the output d file
c_file = '/root/biatnovo/train-data/ftp.peptideatlas.org/biatNovo/train_dataset_unique.csv'  # path of the c file
b_file = '/root/v2/sb_transformer_independent_multheadapi/train_dataset_10.deepnovo_denovo'  # path of the b file

filter_rows_by_feature_id(
    b_file=b_file,
    c_file=c_file,
    d_file=d_file,
)

文件 /root/biatnovo/train-data/ftp.peptideatlas.org/biatNovo/train_dataset_unique.csv 的表头: ['spec_group_id', 'm/z', 'z', 'rt_mean', 'seq', 'scans', 'profile', 'feature area']
0             F1:1
1             F1:2
2             F1:3
3             F1:4
4             F1:5
            ...   
474522    F9:18694
474523    F9:18695
474524    F9:18696
474525    F9:18697
474526    F9:18698
Name: spec_group_id, Length: 474527, dtype: object
已保存包含相同feature_id的行到 /root/v2/sb_transformer_independent_multheadapi/train_dataset_10.csv
