In [1]:
import warnings
from collections import defaultdict
from itertools import zip_longest

from Bio import SeqIO
from Bio.Seq import Seq

In [2]:

def process_fastq_pairs(r1_file, r2_file, linker="N" * 10):
    """Count identical merged read pairs from two parallel FASTQ files.

    Records are paired purely by position: the i-th record of ``r1_file`` is
    combined with the i-th record of ``r2_file``. For every pair the R1
    sequence, ``linker`` and the reverse complement of the R2 sequence are
    concatenated, and occurrences of each resulting string are tallied.

    Args:
        r1_file: Path to the FASTQ file holding the R1 reads.
        r2_file: Path to the FASTQ file holding the R2 reads.
        linker: Spacer inserted between R1 and the reverse-complemented R2.
            Defaults to ten ``N`` characters.

    Returns:
        A ``defaultdict(int)`` mapping each merged sequence to the number of
        read pairs that produced it.

    Warns:
        RuntimeWarning: If one file runs out of records before the other.
            Only the pairs read up to that point are counted; the surplus
            reads in the longer file are ignored.
    """
    pair_counts = defaultdict(int)
    exhausted = object()  # sentinel marking "this file has no more records"
    pairs_seen = 0

    # Read both files in lock-step.
    with open(r1_file) as f1, open(r2_file) as f2:
        r1_records = SeqIO.parse(f1, "fastq")
        r2_records = SeqIO.parse(f2, "fastq")

        # zip_longest (rather than zip) makes a record-count mismatch visible
        # instead of silently dropping the tail of the longer file.
        for r1, r2 in zip_longest(r1_records, r2_records, fillvalue=exhausted):
            if r1 is exhausted or r2 is exhausted:
                shorter = r1_file if r1 is exhausted else r2_file
                warnings.warn(
                    f"{shorter!r} ran out of records after {pairs_seen} pairs; "
                    "remaining reads in the other file were ignored",
                    RuntimeWarning,
                    stacklevel=2,
                )
                break

            r1_seq = str(r1.seq)
            # R2 is reverse-complemented before being appended to R1.
            r2_seq = str(r2.seq.reverse_complement())

            pair_counts[r1_seq + linker + r2_seq] += 1
            pairs_seen += 1

    return pair_counts


In [4]:

# Example usage: count merged read pairs for one R1/R2 file pair.
r1_file = "trim_IBR3_3_r1.fq"
r2_file = "trim_IBR3_3_r2.fq"
result = process_fastq_pairs(r1_file, r2_file)

# Order the (sequence, count) pairs from most to least frequent.
sorted_result = list(result.items())
sorted_result.sort(key=lambda item: item[1], reverse=True)

# Print one three-line entry per distinct merged sequence.
for seq, count in sorted_result:
    print(f"Count: {count}", f"Sequence: {seq}", "---", sep="\n")

Count: 26
Sequence: GCATCATCCGATGCGACAAAAATTTCATGCTCAATTTGCAAGTGAGTGATGACTAGGTCGTGTTGAACGTTTCTCATTAAGCCACCTGATGAGGAAAATGCCTGAAAATTTGTTGACATAGGGAAGGAAATACCTATGTTATTAATTTTCGAAAGTGGTGGATAAGCGGAGCAATGGATCCGAGATGCAAAGTCCTTATTGTGATGGTGAGATTTCCCAACTTGAAATGCATGTGAAGTCGGGAAACATTATGTTCTTCATACTGCTGTACTGATGTGNNNNNNNNNNGCATAACCAACAATCTATGATCTTGGTGGACATTGGTGCTTCTGGCGTTCGTATTGTACGACCACTGACCGTGTTTGGGTTTGATGATGCTCCTCATGGTCATGCAGAAATTGTTTTTGAAAACGTGCGTGTCCCAGTAGACAATCTCCTCGGTGAGGGGCGTGGCTTTGAGATCGCCCAGGTATGAGGTTTGGAAGTTGGAAATGTGACTTTCTGTCAAACTTGTCTTAATAACTATGGACGATGCTTGGTTTTTAATACAGGGAAGATTAGGCCCAGGAA
---
Count: 20
Sequence: GCATCATCCGATGCGACAAAAATTTCATGCTCAATTTGCAAGTGAGTGATGACTAGGTCGTGTTGAACGTTTCTCATTAAGCCACCTGATGAGGAAAATGCCTGAAAATTTGTTGACATAGGGAAGGAAATACCTATGTTATTAATTTTCGAAAGTGGTGGATAAGCGGAGCAATGGATCCGAGATGCAAAGTCCTTATTGTGATGGTGAGATTTCCCAACTTGAAATGCATGTGAAGTCGGGAAACATTATGTTCTTCATACTGCTGTACTGATGTTNNNNNNNNNNGCATAACCAACAATCTATGATCTTGGTGGACATTGGTGCTTCTGGCGTTCGTATTGTACGACCACTGACCGTGTTTGGGTTTGATGATGCTCCTCAT