# Sampling without replacement, multi-motif concatenation


1. The final concatenated sequence is expected to have a total length between 400 and 800 amino acids. 
   Since each motif is approximately 20 residues long, the number of motifs to be concatenated should range from 20 to 40.

2. Each motif initially has an equal probability of being selected. 
   However, as certain motifs are chosen, their probability of being selected again should gradually decrease.

3. For the resulting sequences, discard any that contain fewer than 16 unique motifs to maintain overall diversity.

In [None]:
import random
import numpy as np


def sample(candidates_id, motif_num, long_motifs: list, times=100) -> set:
    """Draw ``times`` distinct motif concatenations of ``motif_num`` slots each.

    Every candidate ID is placed in the draw pool ``motif_num`` times and IDs
    are drawn without replacement, so an ID becomes less likely to be drawn
    again each time it is picked.  An ID listed in ``long_motifs`` occupies two
    slots instead of one and is therefore never placed in the final slot.

    Args:
        candidates_id: Motif IDs to draw from.
        motif_num: Number of slots to fill in each concatenation.
        long_motifs: IDs of the motifs that occupy two slots.
        times: Number of distinct concatenations to return.

    Returns:
        A set of ``times`` strings, each the drawn IDs joined with ``"-"``.

    Raises:
        ValueError: If the final slot has to be filled but only long motifs
            are left in the pool (previously this looped forever).

    Note:
        Sampling continues until ``times`` *distinct* strings exist, so
        ``times`` must not exceed the number of distinct concatenations that
        can actually be formed.
    """
    pool_template = np.repeat(candidates_id, motif_num).tolist()
    long_ids = set(long_motifs)
    samples = set()

    while len(samples) < times:
        picked, slots_left, pool = [], motif_num, list(pool_template)
        while slots_left > 0:
            if slots_left >= 2:
                motif_id = random.choice(pool)
            else:
                # Only one slot left: a long motif would not fit, so draw
                # uniformly from the remaining short motifs (same distribution
                # as redrawing until a short motif comes up).
                short_pool = [c for c in pool if c not in long_ids]
                if not short_pool:
                    raise ValueError(
                        "cannot fill the last slot: only long motifs remain"
                    )
                motif_id = random.choice(short_pool)
            slots_left -= 2 if motif_id in long_ids else 1
            pool.remove(motif_id)
            picked.append(str(motif_id))
        samples.add("-".join(picked))
    return samples

In [None]:
import pandas as pd

# Load the candidate motifs: the 'ID' column identifies a motif, 'Sequence' holds its residues.
df_origin = pd.read_csv("/home/wangyu/projects/synmask/data/peptide/top20/peptide.csv")
motif_id_candidates = df_origin['ID'].to_list()
motif_seqs_candidates = df_origin['Sequence'].to_list()

# Lookup table: motif ID -> amino-acid sequence
motif_candidates = dict((x, y) for x, y in zip(motif_id_candidates, motif_seqs_candidates))

# print(motif_candidates)
# Target total length range (in residues) of the concatenated sequences
range1, range2 = 400, 800
# IDs of motifs that occupy two slots when sampled
long_motifs = [76]
# One motif count per 100-residue step of the target range, i.e. [20, 25, 30, 35, 40]
motif_nums = [int(i / 20) for i in range(range1, range2 + 100, 100)]
# Number of distinct concatenations to draw for each entry of motif_nums (same order)
times = [60000, 5000, 5000, 3000, 3000]
samples = {}  # key is motif_num, value is the set of concatenations drawn for that motif_num
# NOTE: the loop variable `motif_num` stays bound to the last value (an int) after this loop.
for i, motif_num in enumerate(motif_nums):
    samples[motif_num] = sample(motif_id_candidates, motif_num, long_motifs, times[i])

print(motif_nums)
# Show one example concatenation for the smallest motif count
for i in samples[motif_nums[0]]:
    print(i)
    break

Filter out sequences that contain fewer than 16 unique motifs

In [None]:
# Count the number of repeated motifs in a single sequence
from collections import Counter

def diversity_repeat(samples, v, min_unique=16):
    """Keep sufficiently diverse sequences and describe their repeat pattern.

    Not every sequence sampled for ``v`` slots contains ``v`` motifs, because
    a long motif occupies more than one slot; ``v`` is only used as the lookup
    key and as the length of the repeat vector.

    Args:
        samples: Mapping from motif count to an iterable of ``"-"``-joined
            motif-ID strings.
        v: Motif count whose sequences are screened.
        min_unique: Minimum number of distinct motifs a sequence must contain
            to be kept.  Sequences with fewer are discarded; a sequence with
            exactly ``min_unique`` distinct motifs is kept.

    Returns:
        A list with one row per kept sequence: the sequence string followed by
        ``v`` integers, where the integer at position ``k`` (1-based) is the
        number of distinct motifs that occur exactly ``k`` times.
    """
    screened_sample = []
    for seq in samples[v]:
        id_counts = Counter(seq.split("-"))
        if len(id_counts) < min_unique:
            continue
        repeat_num = [0] * v
        # e.g. {4: 3} means 3 distinct motifs each appeared 4 times
        for times_repeated, n_motifs in Counter(id_counts.values()).items():
            repeat_num[times_repeated - 1] = n_motifs
        screened_sample.append([seq, *repeat_num])
    return screened_sample

In [None]:
# Screen the sampled sequences of every motif count; key is the motif count,
# value is the list of rows returned by diversity_repeat().
all_info = {}
for v in motif_nums:
    all_info[v] = diversity_repeat(samples, v)
# Number of sequences that survived the screen for motif count 40
print(len(all_info[40]))

Save all of this data to CSV files, one per motif_num, apply multi-criteria sorting, and append the corresponding real amino acid sequences.

In [None]:
import os
import pandas as pd 

# For every motif count: keep at most 1000 screened sequences, attach the real
# amino-acid sequence, sort by the repeat-count columns and write one CSV.
new_samples = {}
output_path = "/home/wangyu/projects/synmask/data/sample"
for v in motif_nums:
    random_sample = all_info[v]
    if len(all_info[v]) > 1000:
        random_sample = random.sample(all_info[v],1000)
    # Columns 1..v hold how many distinct motifs were repeated 1x, 2x, ..., v times
    nums_col = [i for i in range(1,v+1)]
    ascend = [False for _ in range(v)]
    col = ['seq']+nums_col
    df = pd.DataFrame(random_sample, columns=col)
    # Translate each "-"-joined ID string into the concatenated amino-acid sequence
    seqs = df['seq'].tolist()
    aa_seqs = ["".join([motif_candidates[int(j)] for j in i.split("-")]) for i in seqs]
    df['aa_seq'] = aa_seqs
    # NOTE: recorded before sorting, so the order differs from the CSV row order
    new_samples[v] = seqs
    df.sort_values(nums_col, ascending=ascend,inplace=True)
    # Remove columns that contain only zeros
    df=df.loc[:, (df != 0).any(axis=0)]
    # The per-motif-count folder may not exist yet; to_csv does not create it.
    out_dir = os.path.join(output_path, f"m_{v}")
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f"sample_{v}.csv"),index=False)

In [None]:
# Build a long-format table with the average number of occurrences of each
# motif per sequence, for every motif count.
lines = []

for v in motif_nums:
    # Label used as the plot hue: approximate sequence length for this motif count
    classification = f"~{v*20}aa"
    tmp = []
    # Flatten all motif IDs used across the kept sequences of this motif count
    for i in new_samples[v]:
        tmp.extend([int(j) for j in i.split("-")])
    counter = Counter(tmp)
    # NOTE: `id` shadows the builtin of the same name for the rest of the session
    for id, count in counter.items():
        # Total occurrences divided by the number of sequences
        lines.append([id, count/len(new_samples[v]), classification])
df = pd.DataFrame(lines, columns =['id', 'count', 'type'])
df

In [None]:
import seaborn as sns
sns.set_theme(style="whitegrid")
# Draw a grouped bar plot: one group per motif id, one bar per sequence-length class ("type")
g = sns.catplot(
    data=df, kind="bar",
    x="id", y="count", hue="type", palette="dark", alpha=.6, height=5, aspect=2
)
g.despine(left=True)
g.set_axis_labels("","Frequency of motif in one sequence")
g.legend.set_title("")
g.fig.suptitle('Frequency of Motif in Different Length of Sequences')

# Secondary structure

In [None]:
import pandas as pd

# Reload the candidate motif table and display it (rebinds the global `df`)
df = pd.read_csv("/home/wangyu/projects/synmask/data/peptide/top20/peptide.csv")
df

## PSIPred


In [None]:
import pandas as pd
from tqdm import tqdm

def get_merged_fasta(file, fasta_file):
    """Write the ``aa_seq`` column of a CSV file as a FASTA file.

    Args:
        file: Path of a CSV file that has an ``aa_seq`` column.
        fasta_file: Path of the FASTA file to (over)write.  The header of each
            record is the zero-based row position of the sequence in the CSV.
    """
    sequences = pd.read_csv(file)['aa_seq'].to_list()

    with open(fasta_file, "w") as handle:
        for idx, sequence in tqdm(enumerate(sequences)):
            handle.write(f">{idx}\n{sequence}\n")

# Write one FASTA file per motif count from the sampled CSV.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}.csv"
    fasta_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}.fasta"
    get_merged_fasta(file, fasta_file)

Run PSIPred

Use the repository: https://github.com/psipred/s4pred

After installation, run the following command:

python /home/wangyu/gitlab/ss_predict/s4pred-update/run_model.py --device gpu --outfmt fas /home/wangyu/projects/synmask/data/sample/m_40/sample_40.fasta > /home/wangyu/projects/synmask/data/sample/m_40/s4pred.fasta


Get predicted results

In [None]:
from tqdm import tqdm
import pandas as pd

def extract_s4pred(s4pred_file,output_file):
    """Convert an s4pred output file into a CSV with the coil fraction per record.

    The input is read as consecutive groups of three lines; the second line of
    each group is stored as the sequence and the third as the predicted
    secondary-structure string.

    Args:
        s4pred_file: Path of the s4pred output to parse.
        output_file: Path of the CSV to write.  Columns are ``seq``,
            ``s4_pred_ss`` and ``s4_pred_percent`` (fraction of ``"C"``
            characters in the structure string); the index is named ``id``.
    """
    sequences, structures, coil_fractions = [], [], []
    with open(s4pred_file, "r") as handle:
        for line_no, raw in tqdm(enumerate(handle.readlines())):
            position = line_no % 3
            if position == 1:
                sequences.append(raw.strip())
            elif position == 2:
                predicted = raw.strip()
                structures.append(predicted)
                coil_fractions.append(predicted.count("C") / len(predicted))
    table = pd.DataFrame(
        {'seq': sequences, 's4_pred_ss': structures, 's4_pred_percent': coil_fractions}
    )
    table.index.name = 'id'
    table.to_csv(output_file)

# Parse the s4pred output of every motif count into a CSV.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    s4pred_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/s4pred.fasta"
    output_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/s4pred.csv"
    extract_s4pred(s4pred_file, output_file)

In [None]:
def ss_filter(file, sample_file, output, col, th):
    """Copy the FASTA records whose score reaches a threshold.

    Args:
        file: CSV with an ``id`` column and the score column ``col``.
        sample_file: FASTA file of header/sequence pairs; it is split on
            whitespace, so each header and each sequence must be one token.
        output: Path of the filtered FASTA file to (over)write.
        col: Name of the score column in ``file``.
        th: Records with ``col >= th`` are kept.

    A record is matched to the CSV through the integer written after ``>`` in
    its header.  Matching by header rather than by position keeps the filter
    correct when ``sample_file`` is itself the result of an earlier filtering
    step and its ids are no longer ``0..n-1``.  A header that is not an integer
    falls back to the zero-based position of the record in ``sample_file``.

    When no row reaches the threshold an empty output file is written.
    """
    df = pd.read_csv(file)
    keep_ids = set(df.loc[df[col] >= th, "id"].tolist())

    with open(sample_file, "r") as f:
        tokens = f.read().split()

    kept = []
    for position, (header, sequence) in enumerate(zip(tokens[0::2], tokens[1::2])):
        try:
            record_id = int(header[1:])
        except ValueError:
            record_id = position
        if record_id in keep_ids:
            kept.append(f"{header}\n{sequence}\n")

    with open(output, "w") as f:
        f.writelines(kept)

# Minimum coil fraction predicted by s4pred for a sequence to be kept
th = 0.99

# Used to filter FASTA sequences for ProtBert input.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    s4pred_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/s4pred.csv"
    sample_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}.fasta"
    output_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_s4_1.fasta"
    ss_filter(s4pred_file, sample_file, output_file, 's4_pred_percent', th)

## ProtBert


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
import re
import numpy as np
import pandas as pd
import os
from collections import Counter

def read_csv(csv_file):
    """Load ``csv_file`` into a DataFrame, print its column index and return it."""
    frame = pd.read_csv(csv_file)
    print(frame.columns)
    return frame

def save_csv(res, csv_file):
    """Write every entry of ``res`` to ``csv_file`` verbatim.

    No header and no separators are added; each entry is expected to already
    be a complete line including its trailing newline.
    """
    with open(csv_file, "w") as out:
        for row in res:
            out.write("".join(row))

def read_txt(txt_file):
    """Return the whitespace-separated tokens of ``txt_file`` as a list."""
    with open(txt_file, 'r') as handle:
        return handle.read().split()

def read_fasta(fasta_file):
    """Return the sequences of a FASTA file, dropping the header tokens.

    The file is split on whitespace and every token that does not start with
    ``">"`` is returned, in file order.
    """
    with open(fasta_file, 'r') as handle:
        tokens = handle.read().split()
    return [token for token in tokens if not token.startswith(">")]

def write_txt(seqs,txt_name):
    """Write each string of ``seqs`` on its own line of ``txt_name``."""
    with open(txt_name, 'w') as handle:
        handle.writelines(item + '\n' for item in seqs)

def read_numpy(file_name):
    """Load an array with ``np.load``, print its shape and return it."""
    array = np.load(file_name)
    print(array.shape)
    return array

def ss3_model(input_list):
    """Run the ``Rostlab/prot_bert_bfd_ss3`` token classifier on sequences.

    Args:
        input_list: Amino-acid sequences as plain strings.

    Returns:
        Whatever the ``TokenClassificationPipeline`` returns for the prepared
        sequences.

    The model and tokenizer are loaded on every call.
    """
    # The tokenizer input needs a space between every residue
    spaced = [' '.join(seq) for seq in input_list]

    pipeline = TokenClassificationPipeline(
        model=AutoModelForTokenClassification.from_pretrained("Rostlab/prot_bert_bfd_ss3"),
        tokenizer=AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd_ss3", skip_special_tokens=True),
    )
    # Replace the residue codes U, Z, O and B with X before prediction
    prepared = [re.sub(r"[UZOB]", "X", sequence) for sequence in spaced]
    return pipeline(prepared)

def count_ss3_each_seq(ss3_model_results):
    """Format each prediction as a ``"<labels>,<coil fraction>\\n"`` line.

    Args:
        ss3_model_results: One entry per sequence; each entry is an iterable of
            mappings with an ``'entity'`` key holding the predicted label.

    Returns:
        A list of strings, one per sequence: the concatenated labels, a comma,
        the fraction of ``"C"`` characters in that label string, and a newline.
    """
    rows = []
    for prediction in ss3_model_results:
        labels = ''.join(token['entity'] for token in prediction)
        coil_fraction = Counter(labels)["C"] / len(labels)
        rows.append(f"{labels},{coil_fraction}\n")
    return rows

def count_R_K_num(seqs):
    """Return, for each sequence, the combined number of ``R`` and ``K`` characters."""
    return [s.count('R') + s.count('K') for s in seqs]

def seq_sencond_struct(seqs, csv_name, R_K_num=False):
    """Predict secondary structure with ``ss3_model`` and save a per-sequence summary.

    Args:
        seqs: Amino-acid sequences as plain strings.
        csv_name: Path of the CSV file to write.
        R_K_num: When true, add an ``R+K`` column with the combined number of
            R and K residues of each sequence.

    The CSV has the columns ``sequence``, ``coil(%)``, ``helix(%)`` and
    ``strand(%)``.  Despite the column names the values are fractions in
    ``[0, 1]``, like the other "percent" columns produced in this file.

    The fractions are computed here directly from the pipeline output:
    ``count_ss3_each_seq`` returns one formatted line per sequence, so it
    cannot be unpacked into three per-class lists.

    NOTE(review): assumes the classifier labels are 'C' (coil), 'H' (helix)
    and 'E' (strand) -- confirm against the model's label map.
    """
    results = ss3_model(seqs)
    c_list, h_list, e_list = [], [], []
    for result in results:
        labels = [token['entity'] for token in result]
        total = len(labels)
        for label, column in (("C", c_list), ("H", h_list), ("E", e_list)):
            # An empty prediction gets 0.0 instead of dividing by zero
            column.append(labels.count(label) / total if total else 0.0)

    content = {
        'sequence': seqs,
        'coil(%)': c_list,
        'helix(%)': h_list,
        'strand(%)': e_list,
    }
    if R_K_num:
        content['R+K'] = count_R_K_num(seqs)
    pd.DataFrame(content).to_csv(csv_name, index=False)

# Predict secondary structure for each sequence that passed the s4pred filter.
# NOTE: only the motif-count-20 files are processed here; the paths are hard-coded.
seqs = read_fasta('/home/wangyu/projects/synmask/data/sample/m_20/sample_20_s4_1.fasta')
results = ss3_model(seqs)
# One "<labels>,<coil fraction>\n" line per sequence
res = count_ss3_each_seq(results)
# Written without a header row
save_csv(res, '/home/wangyu/projects/synmask/data/sample/m_20/prot_bert.csv')

In [None]:

# Minimum coil fraction predicted by ProtBert for a sequence to be kept
th = 0.97

# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    prot_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/prot_bert.csv"
    sample_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_s4_1.fasta"

    # Record ids, taken from the FASTA headers, in file order
    with open(sample_file, "r") as f:
        seq_list = []
        for i, line in enumerate(f.read().split()):
            if i%2 ==0:
                seq_list.append(int(line[1:]))

    # save_csv() writes prot_bert.csv without a header row, so the two columns
    # are named explicitly here; reading with the default header would consume
    # the first prediction as column names.  A file already annotated by an
    # earlier run of this cell starts with the header line instead.
    with open(prot_file, "r") as f:
        already_annotated = f.readline().startswith("prot_ss,")
    if already_annotated:
        df = pd.read_csv(prot_file)
    else:
        df = pd.read_csv(prot_file, header=None, names=['prot_ss', 'prot_percent'])
    df['id'] = seq_list
    df.to_csv(prot_file, index=False)
    output_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_prot_2.fasta"
    ss_filter(prot_file, sample_file, output_file, 'prot_percent', th)

## Alphafold2

Split fasta files

In [None]:
import pandas as pd
from tqdm import tqdm
import os
import shutil


def fasta2_split_fasta(file, fasta_folder):
    """Split a multi-record FASTA file into one file per record.

    Args:
        file: FASTA file of header/sequence pairs; it is split on whitespace,
            so each header and each sequence must be a single token.
        fasta_folder: Existing folder that receives the per-record files.

    For a record with header ``>ID`` the file ``seqID.fasta`` is written and
    its header is ``>seqID``, so the file name and the header inside it always
    refer to the same record.

    Raises:
        ValueError: If a header is the last token and has no sequence.
    """
    with open(file, "r") as f:
        tokens = f.read().split()

    for i, token in enumerate(tokens):
        if not token.startswith(">"):
            continue
        if i + 1 >= len(tokens):
            raise ValueError(f"header {token!r} in {file} has no sequence")
        record_id = token[1:]
        with open(os.path.join(fasta_folder, f"seq{record_id}.fasta"), "w") as out:
            out.write(f">seq{record_id}\n{tokens[i + 1]}\n")
            
# Prepare the AlphaFold2 input/output folders of every motif count.
# WARNING: existing input and output folders are deleted first.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_prot_2.fasta"
    fasta_folder = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/input"
    output = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/output"
    if os.path.exists(fasta_folder):
        shutil.rmtree(fasta_folder)
    if os.path.exists(output):
        shutil.rmtree(output)
    os.makedirs(fasta_folder)
    os.makedirs(output)
    fasta2_split_fasta(file, fasta_folder)

Run AF2

XLA_PYTHON_CLIENT_PREALLOCATE=false CUDA_VISIBLE_DEVICES=3 python3 /home/wangyu/gitlab/alphafold-dev/run_single_without_msa.py --input_type=dir --input=/home/wangyu/projects/synmask/data/sample/m_40/af/input --output_dir=/home/wangyu/projects/synmask/data/sample/m_40/af/output

Get the predicted results from AF2

In [None]:
import os
import shutil
from tqdm import tqdm
import shutil

def extract_rank0(raw_output_folder, rank0_folder):
    """Collect the ``ranked_0.pdb`` of every prediction into one folder.

    Args:
        raw_output_folder: Folder whose entries are named ``seq<ID>``; each is
            expected to contain a ``ranked_0.pdb`` file.
        rank0_folder: Destination folder.  It is deleted and recreated, then
            receives one ``<ID>.pdb`` per prediction.

    Entries whose name does not end in an integer after the first three
    characters, and predictions whose ``ranked_0.pdb`` cannot be copied, are
    reported on stdout and skipped.  The report names the folder being
    processed instead of relying on a global loop variable.
    """
    raw_files = os.listdir(raw_output_folder)

    if os.path.exists(rank0_folder):
        shutil.rmtree(rank0_folder)
    os.makedirs(rank0_folder)

    for file in tqdm(raw_files):
        try:
            seq_id = int(file[3:])
        except ValueError:
            print(f"{raw_output_folder}: skipping unexpected entry {file!r}")
            continue
        rank0_pdb = os.path.join(raw_output_folder, file, "ranked_0.pdb")
        try:
            shutil.copy(rank0_pdb, os.path.join(rank0_folder, f"{seq_id}.pdb"))
        except OSError as err:
            print(f"{raw_output_folder}: could not copy ranked_0.pdb for {seq_id}: {err}")
            
        
# Collect the top-ranked AlphaFold2 model of every sequence, per motif count.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    raw_output_folder = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/output"
    rank0_folder = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/output_rank0"
    extract_rank0(raw_output_folder, rank0_folder)

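The last two lines of the following cell analyze the AF results with DSSP: the first generates a shell script of DSSP commands, and the second (commented out until that script has been run) converts the DSSP output files into a single summary file.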

In [None]:
from collections import defaultdict
import os
import pathlib
import csv
import shutil

def generate_ssfile(pdb_path,ssfile_path,output_sh):
    """Append one ``dssp`` command per file of ``pdb_path`` to a shell script.

    Args:
        pdb_path: Folder whose files are the DSSP inputs.
        ssfile_path: Folder in which DSSP should write ``<stem>.txt`` outputs.
        output_sh: Shell script to append to (created if missing).

    The script is opened in append mode, so calling this twice for the same
    ``output_sh`` lists every command twice.
    """
    print("generating ssfile sh...")
    commands = [
        f"dssp {os.path.join(pdb_path,file)} {os.path.join(ssfile_path,pathlib.Path(file).stem+'.txt')}\n"
        for file in os.listdir(pdb_path)
    ]
    # The context manager closes the script even if writing fails
    with open(output_sh, "a") as f:
        f.writelines(commands)


def list2txt(output, file):
    """Write the rows of ``output`` to ``file`` in CSV format.

    Args:
        output: Iterable of rows, each an iterable of cell values.
        file: Path of the file to (over)write.
    """
    # newline="" lets the csv module control line endings itself, as the csv
    # documentation requires; without it extra blank lines appear on Windows.
    with open(file, 'w', newline="") as f:
        csv.writer(f).writerows(output)
    print("Saved to file.")


def convert_ssfile(ssfile_txt):
    """Extract the residue and secondary-structure strings from a DSSP output file.

    Lines are only considered after the first line whose stripped text starts
    with ``#``.  From every non-empty line whose second whitespace-separated
    field is numeric, the character at column 13 is appended to ``"aa"`` and
    the character at column 16 to ``"ss"`` (a blank is stored as ``"-"``).

    Returns:
        A ``defaultdict(str)`` with the keys ``"ss"`` and ``"aa"`` when at
        least one residue line was found.
    """
    parsed = defaultdict(str)
    in_residue_table = False
    with open(ssfile_txt, "r") as handle:
        for line in handle:
            if line.strip().startswith("#"):
                in_residue_table = True
                continue
            fields = line.split()
            if not (in_residue_table and fields):
                continue
            if fields[1].isnumeric():
                ss_code = line[16]
                parsed["ss"] += "-" if ss_code == " " else ss_code
                parsed["aa"] += line[13]
    return parsed


def all_ssfile2txt(ssfile_path, output_file):
    """Summarize every DSSP output file of a folder into one CSV-formatted file.

    Args:
        ssfile_path: Folder containing the DSSP output files.
        output_file: Path of the summary file to write.

    Each row holds the file stem, the residue string, the secondary-structure
    string, and the fraction of positions whose code is none of
    ``H``, ``G``, ``I``, ``E`` or ``B``.
    """
    print("starting convert ssfile to one file...")
    structured_codes = ('H', 'G', 'I', 'E', 'B')
    rows = []
    for name in tqdm(os.listdir(ssfile_path)):
        parsed = convert_ssfile(os.path.join(ssfile_path, name))
        ss = parsed["ss"]
        structured = sum(ss.count(code) for code in structured_codes)
        unstructured_fraction = (len(ss) - structured) / len(ss)
        rows.append([pathlib.Path(name).stem, parsed["aa"], ss, unstructured_fraction])

    list2txt(rows, output_file)
    
    
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    # Generate the shell script of DSSP commands
    pdb_path = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/output_rank0"
    ssfile_path = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/af_ss"
    if not os.path.exists(ssfile_path):
        os.makedirs(ssfile_path)
    output_sh = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/ssfile.sh"
    output_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/af_ss.txt"
    # Generate bash file
    generate_ssfile(pdb_path,ssfile_path,output_sh)
    # Convert to .txt (enable once the generated script has been run)
    # all_ssfile2txt(ssfile_path, output_file)

In [None]:
import pandas as pd

# Minimum AlphaFold2/DSSP coil fraction per motif count (same order as motif_nums)
filters = [1,1,0.99,0.96,0.94]
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
# NOTE(review): af_ss.csv must provide the columns 'id' and 'af_percent'; the
# code in this file only writes a headerless af_ss.txt -- confirm how the CSV
# is produced.
for i, n in enumerate(motif_nums):
    af_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/af_ss.csv"
    sample_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_prot_2.fasta"
    output_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_af_3.fasta"
    ss_filter(af_file, sample_file, output_file, 'af_percent', filters[i])

In [None]:
def get_id(fasta_file):
    """Return the integer ids written in the headers of a FASTA file.

    The file is split on whitespace and every even-positioned token is taken
    as a header; the text after its first character is parsed as an int.
    """
    with open(fasta_file, "r") as handle:
        tokens = handle.read().split()
    return [int(header[1:]) for header in tokens[::2]]

# Merge, for the sequences that passed all three filters, the sampling table
# with the three secondary-structure prediction tables.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    fasta_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_af_3.fasta"
    seq_list = get_id(fasta_file)
    df = pd.DataFrame()
    df['id'] = seq_list

    origin_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}.csv"
    af_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/af_ss.csv"
    s4pred_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/s4pred.csv"
    prot_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/prot_bert.csv"
    out = f"/home/wangyu/projects/synmask/data/sample/m_{n}/merged_ss.csv"

    df_af = pd.read_csv(af_file)
    df_s4 = pd.read_csv(s4pred_file)
    df_prot = pd.read_csv(prot_file)
    df_origin = pd.read_csv(origin_file)
    # The sampling CSV has no id column; its row position is the sequence id
    df_origin.index.name = "id"

    df = df.merge(df_origin,how='inner',on='id')
    df = df.merge(df_af,how='inner',on='id')
    df = df.merge(df_s4,how='inner',on='id')
    df = df.merge(df_prot,how='inner',on='id')
    # Drop the duplicated sequence columns and the raw structure strings
    ds = ["seq_y","af_ss","seq","s4_pred_ss","prot_ss"]
    for d in ds:
        del df[d]
    df.to_csv(out,index=False)

Summarize the prediction results of secondary structure

In [None]:
import pandas as pd


# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
# NOTE(review): this cell reads prot_bert.csv as a headerless two-column file
# and writes the same merged_ss.csv path as the merge cell above, with a
# different column layout -- confirm which of the two is meant to be run.
for n in motif_nums:

    df_prot = pd.read_csv(f"/home/wangyu/projects/synmask/data/sample/m_{n}/prot_bert.csv",header=None)
    df_prot.columns=['ss','percent']

    df_s4 = pd.read_csv(f"/home/wangyu/projects/synmask/data/sample/m_{n}/s4pred.csv")
    df_s4.columns=['id','seq','ss','percent']

    df_af = pd.read_csv(f"/home/wangyu/projects/synmask/data/sample/m_{n}/af/af_ss.txt",header=None)
    df_af.columns = ['id','seq','ss','percent']
    # The DSSP summary rows follow directory-listing order; sort them by id
    df_af = df_af.sort_values("id")

    seqs = df_s4['seq'].to_list()
    per_prot = df_prot['percent'].to_list()
    per_s4 = df_s4['percent'].to_list()
    per_af = df_af['percent'].to_list()
    # One row per sequence with the coil fraction reported by each predictor
    dict_percent = {'seq': seqs, 'af': per_af, 'prot_bert': per_prot, 'psi_pred': per_s4}
    df = pd.DataFrame(dict_percent)
    df.to_csv(f"/home/wangyu/projects/synmask/data/sample/m_{n}/merged_ss.csv",index=False)

# Immunogenicity

In [None]:
def get_common_allele(allele_file):
    """Return the lines of ``allele_file`` with surrounding whitespace stripped."""
    with open(allele_file, "r") as handle:
        return [entry.strip() for entry in handle]


# Allele list files, one allele name per line (class I and class II)
allele_i_file= f"/home/wangyu/projects/synmask/code/immuno/allele_i"
allele_ii_file= f"/home/wangyu/projects/synmask/code/immuno/allele_ii"

allele_i = get_common_allele(allele_i_file)
allele_ii = get_common_allele(allele_ii_file)
# Number of lines read from each allele file
print(len(allele_i),len(allele_ii))

Generate a script to run immunogenicity prediction.

Use the predictor provided by 
https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/ and https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/


In [40]:
import os
import shutil

def bash(mhc_type, input_fasta, allele, output_data_folder, output_bash_file, len):
    """Write a shell script that starts one tmux window per allele.

    Args:
        mhc_type: ``""`` or ``"II"``; inserted into the predictor path
            (``netMHC{mhc_type}pan``) and into the tmux session name.
        input_fasta: FASTA file passed to the predictor with ``-f``.
        allele: Iterable of allele names; one window and one command each.
        output_data_folder: Folder for the per-allele ``<allele>.csv`` files.
        output_bash_file: Path of the shell script to (over)write.
        len: Value appended to the tmux session name.  (The parameter name
            shadows the builtin inside this function.)

    When ``mhc_type`` is empty, `` -l 10`` is added to the predictor command.
    """
    session = f"mhc_{mhc_type}_{len}"
    predictor = f"/home/wangyu/gitlab/ImmunoPredictTool/NetMHCpan/netMHC{mhc_type}pan-4.1/netMHC{mhc_type}pan"
    if mhc_type:
        flag = ""
    else:
        flag = " -l 10"
    with open(output_bash_file, "w") as script:
        script.write(f"tmux new -d -s {session}\n")
        for idx, name in enumerate(allele):
            xls_path = os.path.join(output_data_folder, name + '.csv')
            script.write(f"tmux new-window -n window{idx} -t  {session}\n")
            script.write(
                f"tmux send -t  {session}:window{idx} \"{predictor} -f {input_fasta}"
                f" -xls -a {name}{flag} -xlsfile {xls_path}\n\"ENTER\n"
            )



# Generate the NetMHCpan / NetMHCIIpan launcher scripts of every motif count.
# WARNING: existing prediction folders are deleted first.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    fasta_file = f"/home/wangyu/projects/synmask/data/sample/m_{n}/sample_{n}_af_3.fasta"
    output_i = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/i/predict_data"
    output_ii = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/ii/predict_data"

    if os.path.exists(output_i):
        shutil.rmtree(output_i)
    if os.path.exists(output_ii):
        shutil.rmtree(output_ii)
    os.makedirs(output_i)
    os.makedirs(output_ii)
    output_bash_i = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/i/bash.sh"
    output_bash_ii = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/ii/bash.sh"

    # "" selects netMHCpan (class I), "II" selects netMHCIIpan (class II)
    bash("",fasta_file,allele_i,output_i,output_bash_i, n)
    bash("II",fasta_file,allele_ii,output_ii,output_bash_ii, n)


## Statistical analysis of immunogenicity prediction results and merging of secondary structure prediction results

In [None]:
import pandas as pd
import os
import pathlib
from tqdm import tqdm

def convert_predict_data(predict_folder, merged_csv, types="i"):
    """Sum the ``NB`` column per sequence for every prediction file of a folder.

    Args:
        predict_folder: Folder of tab-separated prediction files whose column
            names are on the second line and include ``ID`` and ``NB``.
        merged_csv: Path of the merged CSV to write.
        types: Suffix of the total column, which is named ``sum_<types>``.

    Returns:
        A DataFrame with one column per prediction file (named after the file
        stem) holding the per-``ID`` sums of ``NB``, plus the ``sum_<types>``
        column with the row-wise total over all files.
    """
    total_col = f"sum_{types}"
    per_allele = {}
    for file in os.listdir(predict_folder):
        table = pd.read_csv(os.path.join(predict_folder, file), header=1, sep="\t")
        binders = table.groupby(['ID'])['NB'].sum().reset_index(name=total_col)
        per_allele[pathlib.Path(file).stem] = binders[total_col].to_list()

    merged_df = pd.DataFrame(per_allele)
    merged_df[total_col] = merged_df[list(merged_df.columns)].sum(axis=1)
    merged_df.to_csv(merged_csv)
    return merged_df

# Combine the class I / class II binder counts with the secondary-structure table.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in tqdm(motif_nums):
    predict_i_folder = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/i/predict_data"
    predict_ii_folder = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/ii/predict_data"

    merged_i = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/merged_i.csv"
    merged_ii = f"/home/wangyu/projects/synmask/data/sample/m_{n}/immuno/merged_ii.csv"

    all_merged = f"/home/wangyu/projects/synmask/data/sample/m_{n}/merged.csv"
    # all_merged = f"/home/wangyu/projects/synmask/data/sample/m_{n}/merged_immuno.csv"

    merged_i_df = convert_predict_data(predict_i_folder, merged_i)
    merged_ii_df = convert_predict_data(predict_ii_folder, merged_ii,"ii")
    origin_df = pd.read_csv(f"/home/wangyu/projects/synmask/data/sample/m_{n}/merged_ss.csv")
    origin_df_columns = list(origin_df.columns)
    # Rows are aligned by position; keep the original columns plus the two totals
    result = pd.concat([origin_df,merged_i_df,merged_ii_df],axis=1)[origin_df_columns+["sum_i"]+["sum_ii"]]
    # Total binder count per residue of the sequence
    result['avg'] = (result["sum_i"] + result["sum_ii"])/result['aa_seq'].str.len()
    result.to_csv(all_merged, index=False)

In [14]:
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
for n in motif_nums:
    all_merged = f"/home/wangyu/projects/synmask/data/sample/m_{n}/merged.csv"
    df = pd.read_csv(all_merged)
    # The allowed repeat count scales with the motif count: for 20 motifs,
    # 20/20 = 1, so sequences in which any motif repeats more than 3 times
    # are deleted.
    # The repeat-count columns are named "1", "2", ...; the other 8 columns
    # are not repeat counts.
    nums_col = [str(i) for i in range(1,len(list(df.columns))-8)]
    for i in nums_col:
        if int(i) > n/20*3:
            df = df.drop(df[(df[i]>0)].index)
    df=df.loc[:, (df != 0).any(axis=0)]   # remove columns that contain only zeros
    # Lowest binder density first
    df = df.sort_values(['avg'], ascending=True)
    df.to_csv(all_merged,index=False)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from collections import Counter
import numpy as np

def not_show(current_df, peptide_csv="/home/wangyu/projects/synmask/data/peptide/top20/peptide.csv"):
    """Count how often every candidate motif occurs in the given sequences.

    Args:
        current_df: DataFrame with a ``seq_x`` column of ``"-"``-joined motif
            IDs.
        peptide_csv: CSV whose ``ID`` column lists all candidate motif IDs.
            Defaults to the project's candidate table.

    Returns:
        A dict, ordered by ascending motif ID, that maps every ID found in
        ``current_df`` or in ``peptide_csv`` to its number of occurrences in
        ``current_df``; candidates that never occur map to 0.
    """
    origin_ids = set(pd.read_csv(peptide_csv)['ID'].to_list())
    counter = Counter(
        int(motif_id)
        for spliced in current_df['seq_x']
        for motif_id in spliced.split("-")
    )
    # Candidates that never appear still get an explicit zero entry
    for missing in origin_ids - set(counter):
        counter[missing] = 0
    return dict(sorted(counter.items()))



# Motif usage among the first 20 rows of each merged table, one panel per motif count.
# Iterate the list `motif_nums`; `motif_num` is only the leftover int loop
# variable of the sampling cell and is not iterable.
sort_frames = []
for n in motif_nums:
    output = f"/home/wangyu/projects/synmask/data/sample/m_{n}/merged.csv"
    df = pd.read_csv(output)
    df=df.iloc[:20,:]
    sort = not_show(df)
    lst = [str(i) for i in sort.keys()]
    lst2 = [i for i in sort.values()]
    sort_df = pd.DataFrame(zip(lst,lst2,[n for _ in range(len(lst))]),columns=['id','count','motif_num'])
    sort_frames.append(sort_df)
# DataFrame.append was removed in pandas 2.0; concatenate the collected frames once
if sort_frames:
    merged_sort_df = pd.concat(sort_frames, ignore_index=True)
else:
    merged_sort_df = pd.DataFrame(columns=['id','count','motif_num'])
print(merged_sort_df)

# fig, ax =plt.subplots(2,2,constrained_layout=True, figsize=(8, 8))

# axesSub = sns.barplot(x='id',y='count', data=sort_df, ax=ax[0])
# axesSub.set_title(f'motif*{n}')

sea = sns.FacetGrid(merged_sort_df, col_wrap=5, aspect=1.5,col = "motif_num")
sea.map(sns.barplot, "id", "count", color='c')
# for axes in sea.axes.flat:
#     _ = axes.set_xticklabels(axes.get_xticklabels(), rotation=30)
# plt.tight_layout()