# 通过 t-SNE 算法对模型的输出进行降维分析与可视化

## 1. 处理模型输出的文件

In [2]:
# 导入必要的库
import numpy as np
import pandas as pd
from Bio import SeqIO



# Families to process in this run. The full set is
# ['PF00959', 'PF01832', 'PF05838', 'PF06737', 'PF16754']; only one is enabled here.
family_list = ["PF00959"]

# Integer code of each family (its position in the full family set).
family_dict = {
    family: index
    for index, family in enumerate(
        ("PF00959", "PF01832", "PF05838", "PF06737", "PF16754")
    )
}

# Only a limited number of sequences is used for t-SNE: running the
# dimensionality reduction on a large amount of data gives very poor results.
natural_sample_num = {
    "PF00959": 190,
    "PF01832": 280,
    "PF05838": 70,
    "PF06737": 130,
    # By proportion PF16754 would get 4, but that worked poorly, so the
    # count was increased.
    "PF16754": 40,
}

# Every family contributes the same number of generated sequences.
generate_sample_num = {family: 250 for family in family_dict}


处理模型输出的序列

In [None]:
# Select the generated sequences with the lowest model perplexity:
# the lower the perplexity, the more plausible the sequence.
import os

sample_input_dir = "original_model_output/"
sample_output_dir = "data/generation_samples/"
# Only used to build the input file name below.
# NOTE(review): presumably the sampling parameter used at generation time — confirm.
p = 0.5

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
os.makedirs(sample_output_dir, exist_ok=True)

for fn in family_list:
    code = family_dict[fn]
    input_filename = f"{sample_input_dir}samples_lora_code_{code}_{p}.txt"
    # The file is read as headerless CSV with two columns: sequence, score.
    df = pd.read_csv(input_filename, header=None, names=['sequence', 'score'])
    # Make sure the score column is numeric before sorting on it.
    df['score'] = df['score'].astype(float)
    # Sort ascending first so that drop_duplicates (which keeps the first
    # occurrence) retains the lowest-scoring copy of each sequence, then keep
    # the configured number of best sequences for this family.
    df_top = (
        df.sort_values(by='score', ascending=True)
        .drop_duplicates(subset='sequence')
        .head(generate_sample_num[fn])
    )
    # Save the selection as "<family>_samples.txt", without index or header.
    output_filename = sample_output_dir + fn + "_samples.txt"
    df_top.to_csv(output_filename, index=False, header=False)


处理自然序列

In [3]:
# Directory holding the natural sequences, one "<family>.fasta" file per family.
# NOTE: these two names are also assigned by the generated-sequence cell, so
# their values depend on which cell ran last.
sample_input_dir = "../dataset/lysozyme_dataset/"
# Directory that receives the selected natural sequences.
sample_output_dir = "data/natural_samples/"

def read_seq_from_fasta(input_filename):
    """Return every sequence of a FASTA file as a plain string.

    Args:
        input_filename: Path of the FASTA file to read.

    Returns:
        A list with one string per record, in the order the records are
        yielded by the parser.
    """
    with open(input_filename, 'r') as handle:
        return [str(entry.seq) for entry in SeqIO.parse(handle, 'fasta')]

import os

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing ("." covers an empty directory prefix).
os.makedirs(sample_output_dir or ".", exist_ok=True)

for fn in family_list:
    # Load all natural sequences of this family from "<family>.fasta".
    input_filename = sample_input_dir + fn + ".fasta"
    sequences = read_seq_from_fasta(input_filename)
    # Keep only the first N sequences configured for this family. The slice is
    # taken once so the written file and the reported count always agree.
    selected = sequences[:natural_sample_num[fn]]
    # Write the selection to "<family>_samples.txt", one sequence per line.
    output_filename = sample_output_dir + fn + "_samples.txt"
    with open(output_filename, 'w') as file:
        file.writelines(sequence + '\n' for sequence in selected)
    print(f"{fn}家族选择的序列已经保存到文件{output_filename}中，共有{len(selected)}条序列。")

PF00959家族选择的序列已经保存到文件data/natural_samples/PF00959_samples.txt中，共有190条序列。


## 2. 使用 t-SNE 进行可视化分析

In [None]:
# 利用训练好的分类模型，将已经选择的序列嵌入为向量
