In [1]:
import os
import re
import csv

import pandas as pd
from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment

# Directory that every input and output file in this notebook is joined onto.
SNP_DIR = "SNP"

# ID used as the key to look up the reference record in the indexed FASTA files;
# every other sequence is compared against that record position by position.
REFERENCE = "EPI_ISL_402124"

# Spike gene sequence

In [2]:
# Index the spike FASTA so records can be looked up by ID,
# then pull out the record every other sequence is compared against.
seqs = SeqIO.index(
    os.path.join(SNP_DIR, "spike_selected.fasta"),
    "fasta",
)
reference = seqs[REFERENCE]

In [3]:
trimmedRow = []  # NOTE(review): not read by any visible cell
repSeqs = []

# Walk the cluster file and
#   (1) collect one representative record per cluster into `repSeqs`,
#   (2) log every multi-member cluster (rep + members) to a tab-separated table,
#   (3) print the representative of the cluster that contains REFERENCE.
# newline='' hands line-ending control to the csv module, consistent with the
# other csv writers in this notebook.
with open(os.path.join(SNP_DIR, "trimmed.csv"), "w", newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, delimiter='\t', fieldnames = ["rep", "trimmed"])
    spamwriter.writeheader()
    with open(os.path.join(SNP_DIR, "Trim_S.fasta.clstr")) as f:
        repID = None
        accessions = []
        for line in f:
            if line.startswith('>'):
                # A header line closes the previous cluster: flush it.
                if repID:
                    repSeqs.append(seqs[repID])
                if len(accessions) > 1:
                    spamwriter.writerow({"rep": repID, "trimmed": ', '.join(accessions) })
                if REFERENCE in accessions:
                    print(repID)
                accessions = []
                # Reset so a cluster without a '*' line cannot silently reuse
                # the previous cluster's representative.
                repID = None
            else:
                # Member line: the ID is the word following '>'; a '*' on the
                # line marks the cluster representative.
                m = re.findall(r">(\w+)", line)[0]
                if '*' in line:
                    repID = m
                accessions.append(m)
        # Flush the final cluster with the same rules as inside the loop.
        # The representative is kept even when the cluster has a single member;
        # previously a trailing singleton cluster was dropped from `repSeqs`.
        if repID:
            repSeqs.append(seqs[repID])
        if len(accessions) > 1:
            spamwriter.writerow({"rep": repID, "trimmed": ', '.join(accessions) })
        if REFERENCE in accessions:
            print(repID)

EPI_ISL_402124


In [4]:
# Stack the cluster representatives into one alignment object.
aligned = MultipleSeqAlignment(repSeqs)

# One CSV row per representative: its ID followed by a 0/1 flag per position.
# A position is flagged 1 only when the base differs from the reference and is
# neither a gap ("-") nor an "N".
with open(os.path.join(SNP_DIR, "snp_trimmed.csv"), 'w', newline='') as f:
    spamwriter = csv.writer(f, delimiter=',')
    for record in aligned:
        snp = [
            1 if base != ref and base not in ("-", "N") else 0
            for ref, base in zip(reference, record)
        ]
        spamwriter.writerow([record.id] + snp)

# Spike protein AA

In [5]:
# Index the spike amino-acid FASTA so records can be looked up by ID,
# then pull out the record every other sequence is compared against.
# Note: this rebinds `seqs` and `reference` from the nucleotide section above.
seqs = SeqIO.index(
    os.path.join(SNP_DIR, "spike_AA_selected.fasta"),
    "fasta",
)
reference = seqs[REFERENCE]

In [6]:
trimmedRow = []  # NOTE(review): not read by any visible cell
repSeqs = []

# Walk the amino-acid cluster file and
#   (1) collect one representative record per cluster into `repSeqs`,
#   (2) log every multi-member cluster (rep + members) to a tab-separated table,
#   (3) print the representative of the cluster that contains REFERENCE.
# newline='' hands line-ending control to the csv module, consistent with the
# other csv writers in this notebook.
with open(os.path.join(SNP_DIR, "trimmed_AA.csv"), "w", newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, delimiter='\t', fieldnames = ["rep", "trimmed"])
    spamwriter.writeheader()
    with open(os.path.join(SNP_DIR, "Trim_S_AA.fasta.clstr")) as f:
        repID = None
        accessions = []
        for line in f:
            if line.startswith('>'):
                # A header line closes the previous cluster: flush it.
                if repID:
                    repSeqs.append(seqs[repID])
                if len(accessions) > 1:
                    spamwriter.writerow({"rep": repID, "trimmed": ', '.join(accessions) })
                if REFERENCE in accessions:
                    print(repID)
                accessions = []
                # Reset so a cluster without a '*' line cannot silently reuse
                # the previous cluster's representative.
                repID = None
            else:
                # Member line: the ID is the word following '>'; a '*' on the
                # line marks the cluster representative.
                m = re.findall(r">(\w+)", line)[0]
                if '*' in line:
                    repID = m
                accessions.append(m)
        # Flush the final cluster with the same rules as inside the loop.
        # The representative is kept even when the cluster has a single member;
        # previously a trailing singleton cluster was dropped from `repSeqs`.
        if repID:
            repSeqs.append(seqs[repID])
        if len(accessions) > 1:
            spamwriter.writerow({"rep": repID, "trimmed": ', '.join(accessions) })
        if REFERENCE in accessions:
            print(repID)

EPI_ISL_406798


In [17]:
# Load the variant table (no header row; first column becomes the index).
variant_AA = pd.read_csv(
    os.path.join(SNP_DIR, "variant_AA_S.csv"),
    header=None,
    index_col=0,
)

# Index labels, stringified, are reused as CSV column headers further down.
rowNames = tuple(map(str, variant_AA.index))

In [18]:
# Stack the amino-acid cluster representatives into one alignment object.
aligned = MultipleSeqAlignment(repSeqs)

# Header row + one row per representative that has at least one flagged site.
# A site is flagged 1 only when the residue differs from the reference and is
# neither a gap ("-") nor an "X".
with open(os.path.join(SNP_DIR, "snp_AA_trimmed.csv"), 'w', newline='') as f:
    spamwriter = csv.writer(f, delimiter=',')
    spamwriter.writerow(["gisaid_id", *rowNames])
    for record in aligned:
        snp = tuple(
            1 if base != ref and base not in ("-", "X") else 0
            for ref, base in zip(reference, record)
        )
        if not any(snp):
            # Identical to the reference at every compared site: skip.
            continue
        # NOTE(review): a constant trailing 0 column is appended — presumably to
        # match the number of entries in `rowNames`; confirm against the header.
        spamwriter.writerow([record.id, *snp, 0])

In [19]:
# Read every sequence in the amino-acid FASTA as a single alignment
# (not just the cluster representatives).
aligned = AlignIO.read(
    os.path.join(SNP_DIR, "spike_AA_selected.fasta"),
    "fasta",
)

# Header row + one row per sequence that has at least one flagged site.
# A site is flagged 1 only when the residue differs from the reference and is
# neither a gap ("-") nor an "X".
with open(os.path.join(SNP_DIR, "snp_AA.csv"), 'w', newline='') as f:
    spamwriter = csv.writer(f, delimiter=',')
    spamwriter.writerow(["gisaid_id", *rowNames])
    for record in aligned:
        snp = tuple(
            1 if base != ref and base not in ("-", "X") else 0
            for ref, base in zip(reference, record)
        )
        if not any(snp):
            # Identical to the reference at every compared site: skip.
            continue
        # NOTE(review): a constant trailing 0 column is appended — presumably to
        # match the number of entries in `rowNames`; confirm against the header.
        spamwriter.writerow([record.id, *snp, 0])