In [2]:
import pandas as pd
import numpy as np

def sample_tsv(input_file, output_file, percentage, max_reported=20):
    """Sample a tab-separated prediction file and clean its sequence columns.

    A fixed-seed random sample of ``percentage`` percent of the rows is drawn.
    In ``predicted_forward_sequence`` and ``predicted_backward_sequence`` the
    substrings ``C(Carbamidomethylation)`` and ``M(Oxidation)`` are replaced by
    ``C(+57.02)`` and ``M(+15.99)``, and all commas are removed. Rows in which
    either column is not a string (for example NaN) are dropped, and the result
    is written to ``output_file`` as a tab-separated file without the index.

    Args:
        input_file: Path of the tab-separated input. It must contain the
            columns ``feature_id``, ``predicted_forward_sequence`` and
            ``predicted_backward_sequence``.
        output_file: Path the processed sample is written to.
        percentage: Share of rows to sample, between 0 and 100 inclusive.
        max_reported: Maximum number of offending feature IDs printed per
            sequence column (non-negative); ``None`` prints all of them.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 100]``.
    """
    # Validate before reading so a bad argument fails fast, without loading the file.
    if percentage < 0 or percentage > 100:
        raise ValueError("Percentage must be between 0 and 100")

    # Read the tab-separated input.
    df = pd.read_csv(input_file, sep='\t')

    # Multiply before dividing: len(df) * (percentage / 100) can fall just
    # short of the intended whole number (100 * 0.57 == 56.99999999999999),
    # which int() would then truncate to one row too few.
    sample_size = int(len(df) * percentage / 100)

    # Fixed seed keeps the sample reproducible between runs.
    df_sample = df.sample(n=sample_size, random_state=42)

    # Substring replacements applied to both sequence columns.
    modifications = {
        "C(Carbamidomethylation)": "C(+57.02)",
        "M(Oxidation)": "M(+15.99)"
    }
    sequence_columns = ['predicted_forward_sequence', 'predicted_backward_sequence']

    def replace_modifications(sequence):
        """Return the cleaned sequence, or None when the value is not a string."""
        if not isinstance(sequence, str):
            return None
        for original, replacement in modifications.items():
            sequence = sequence.replace(original, replacement)
        # Strip the commas.
        return sequence.replace(',', '')

    for column in sequence_columns:
        # Column-wise map instead of a row-wise DataFrame.apply: less work per
        # row, and the result is a Series even when the sample is empty.
        cleaned = df_sample[column].map(replace_modifications)
        rejected = cleaned.isna()

        # Report the offending rows, capped so a large file cannot flood the output.
        offenders = df_sample.loc[rejected, ['feature_id', column]]
        shown = offenders if max_reported is None else offenders.head(max_reported)
        for feature_id, value in zip(shown['feature_id'], shown[column]):
            print(f"Feature ID {feature_id} has a non-string value: {value}")
        if len(offenders) > len(shown):
            print(f"... {len(offenders) - len(shown)} more non-string values in {column} not shown")

        df_sample[column] = cleaned

    # Count the rows that lost at least one of the two sequences.
    missing = df_sample[sequence_columns].isna().any(axis=1)
    none_count = int(missing.sum())
    print(f"共有 {none_count} 行包含None值")

    # Drop those rows.
    df_sample = df_sample.dropna(subset=sequence_columns)

    # Save the processed sample as a new tab-separated file.
    df_sample.to_csv(output_file, sep='\t', index=False)

    print(f"{percentage}% 的数据已保存到 {output_file}")

# Example invocation: sample the prediction file and clean its sequence columns.
input_file = '/root/v2/sb_transformer_independent_multheadapi/train_dataset_unique.csv1725206105.deepnovo_denovo'  # path of the tab-separated input file
output_file = '/root/v2/sb_transformer_independent_multheadapi_finetune_100/train_dataset_100.deepnovo_denovo'  # path the processed sample is written to
percentage = 100  # sample 100% of the rows

sample_tsv(input_file, output_file, percentage)

Feature ID F43:376699 has a non-string value: nan
Feature ID F13:114770 has a non-string value: nan
Feature ID F32:276088 has a non-string value: nan
Feature ID F36:244790 has a non-string value: nan
Feature ID F55:467624 has a non-string value: nan
Feature ID F39:344468 has a non-string value: nan
Feature ID F58:488163 has a non-string value: nan
Feature ID F11:99728 has a non-string value: nan
Feature ID F32:276864 has a non-string value: nan
Feature ID F11:102761 has a non-string value: nan
Feature ID F64:520037 has a non-string value: nan
Feature ID F49:423350 has a non-string value: nan
Feature ID F24:202008 has a non-string value: nan
Feature ID F25:212984 has a non-string value: nan
Feature ID F26:219528 has a non-string value: nan
Feature ID F37:308974 has a non-string value: nan
Feature ID F16:148320 has a non-string value: nan
Feature ID F2:19094 has a non-string value: nan
Feature ID F9:18462 has a non-string value: nan
Feature ID F45:325681 has a non-string value: nan
Featu

In [4]:
import pandas as pd
def filter_rows_by_feature_id(b_file, c_file, d_file):
    """Keep the rows of ``c_file`` whose ``spec_group_id`` appears in ``b_file``.

    Args:
        b_file: Tab-separated file providing the ``feature_id`` column.
        c_file: Comma-separated file to filter; matched on ``spec_group_id``.
        d_file: Path the matching rows are written to (comma-separated,
            without the index).
    """
    # Load the reference (tab-separated) and candidate (comma-separated) tables.
    reference = pd.read_csv(b_file, sep='\t')
    candidates = pd.read_csv(c_file, sep=',')

    headers = list(candidates.columns)
    print(f"文件 {c_file} 的表头: {headers}")

    # Distinct feature IDs present in the reference table.
    wanted_ids = pd.unique(reference['feature_id'])

    group_ids = candidates['spec_group_id']
    print(group_ids)

    # Select the candidate rows whose spec_group_id is one of the wanted IDs.
    matches = candidates.loc[group_ids.isin(wanted_ids)]

    # Persist the selection.
    matches.to_csv(d_file, sep=',', index=False)

    print(f"已保存包含相同feature_id的行到 {d_file}")

# Example invocation: keep the rows of c_file whose spec_group_id occurs as a feature_id in b_file.
b_file = '/root/v2/sb_transformer_independent_multheadapi_finetune_100/train_dataset_100.deepnovo_denovo'  # tab-separated file providing the feature_id values
c_file = '/root/biatnovo/train-data/ftp.peptideatlas.org/biatNovo/train_dataset_unique.csv'  # comma-separated file to filter
d_file = '/root/v2/sb_transformer_independent_multheadapi_finetune_100/train_dataset_100.csv'  # path the matching rows are written to

filter_rows_by_feature_id(b_file, c_file, d_file)

文件 /root/biatnovo/train-data/ftp.peptideatlas.org/biatNovo/train_dataset_unique.csv 的表头: ['spec_group_id', 'm/z', 'z', 'rt_mean', 'seq', 'scans', 'profile', 'feature area']
0             F1:1
1             F1:2
2             F1:3
3             F1:4
4             F1:5
            ...   
474522    F9:18694
474523    F9:18695
474524    F9:18696
474525    F9:18697
474526    F9:18698
Name: spec_group_id, Length: 474527, dtype: object
已保存包含相同feature_id的行到 /root/v2/sb_transformer_independent_multheadapi_finetune_100/train_dataset_100.csv


In [9]:
import csv
import pandas as pd

def get_feature_ids_with_eos(a_file):
    """Collect the feature IDs of rows whose predicted sequences contain ``_EOS``.

    Args:
        a_file: Path of a tab-separated file with the columns ``feature_id``,
            ``predicted_forward_sequence`` and ``predicted_backward_sequence``.

    Returns:
        A list with the ``feature_id`` of every row, in file order, where
        ``_EOS`` occurs in the forward or the backward predicted sequence.

    Raises:
        KeyError: If one of the required columns is missing from the header.
    """
    sequence_columns = ('predicted_forward_sequence', 'predicted_backward_sequence')

    # newline='' leaves newline handling to the csv module, as its documentation requires.
    with open(a_file, 'r', newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        return [
            row['feature_id']
            for row in reader
            # DictReader fills fields missing from a short row with None;
            # treat those as empty instead of failing on `'_EOS' in None`.
            if any('_EOS' in (row[column] or '') for column in sequence_columns)
        ]

def remove_rows_from_b(b_file, feature_ids_with_eos, output_file):
    """Write a copy of ``b_file`` without the rows listed in ``feature_ids_with_eos``.

    Args:
        b_file: Comma-separated file with a ``spec_group_id`` column.
        feature_ids_with_eos: IDs whose rows are removed (matched against
            ``spec_group_id``); the collection is also printed.
        output_file: Path the remaining rows are written to, without the index.
    """
    # Load the table to be filtered.
    table = pd.read_csv(b_file)
    print(feature_ids_with_eos)

    # Flag the rows whose spec_group_id is in the exclusion list, then keep the rest.
    flagged = table['spec_group_id'].isin(feature_ids_with_eos)
    remaining = table.loc[~flagged]

    # Save the filtered table.
    remaining.to_csv(output_file, index=False)


# File paths: a_file is the tab-separated prediction file scanned for _EOS,
# b_file is the comma-separated table to filter, output_file receives the result.
a_file = '/root/v2/sb_transformer_independent_multheadapi_finetune_100/train_dataset_100.deepnovo_denovo'
b_file = '/root/v2/sb_transformer_independent_multheadapi_finetune_100/train_dataset_100.csv'
output_file = '/root/v2/sb_transformer_independent_multheadapi_finetune_100/filter_train_dataset_100.csv'

# Collect the feature IDs whose predicted sequences contain _EOS.
feature_ids_with_eos = get_feature_ids_with_eos(a_file)

# Drop the rows of b_file whose spec_group_id is among those feature IDs.
remove_rows_from_b(b_file, feature_ids_with_eos, output_file)

print(f"Filtered b.csv saved to {output_file}")


['F14:129477', 'F1:682', 'F2:25717', 'F14:131899', 'F21:91318', 'F60:444008', 'F7:64214', 'F47:404482', 'F34:301771', 'F23:192994', 'F25:212959', 'F51:435151', 'F41:356296', 'F18:161262', 'F55:473337', 'F55:474509', 'F3:31521', 'F26:224397', 'F11:103115', 'F16:150030', 'F23:194103', 'F47:403057', 'F35:237235', 'F58:493941', 'F24:196355', 'F26:217551', 'F11:98753', 'F35:237425', 'F18:161707', 'F63:511207', 'F27:232411', 'F31:268794', 'F30:256374', 'F61:446722', 'F1:701', 'F23:190568', 'F22:166960', 'F12:109612', 'F21:95219', 'F37:306623', 'F33:285009', 'F37:306060']
Filtered b.csv saved to /root/v2/sb_transformer_independent_multheadapi_finetune_100/filter_train_dataset_100.csv
