In [38]:
"""

 Created on 30-Jun-21
 @author: Kiril Zelenkovski

"""
import six.moves.urllib.request as urlreq
from six import PY3
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq 
import dash
import dash_bio as dashbio
import dash_html_components as html
from Bio import SeqIO
import os
import glob
import pandas as pd
import re

In [44]:
def get_mutations(S_protein, variants):
    var_aa = []
    for name in variants:
        print(name)
        mutations = name.split("_")
        S_copy = S_protein[:]
        for m in range(0, len(mutations)):
            mutations_list = re.split('(\d+)', mutations[m])
            if mutations_list[2] == 'del':
                continue
            else:
                codon_num = int(mutations_list[1]) - 1
                print("Mutating ", S_copy[codon_num], "into", mutations_list[2])
                S_copy[codon_num] = mutations_list[2]

        S_aa = "".join(S_copy)
        print(S_aa[:15] + "..."+S_aa[-1], "\n")
        var_aa.append(S_aa)
    return var_aa

def create_linage_peptides(S_protein, lineages):
    """

    :param S_protein:
    :param lineages:
    :return:
    """
    linage_aa = []
    for lineage in lineages:
        # Create copy for current linage
        S_copy = S_protein[:]

        # Opening JSON file
        f = open('C:\\Users\\zelen\\Desktop\\Sars-Cov-2-variants\\outbreakinfo\\' + lineage + '.json', )

        # json.load: returns JSON object as a dictionary
        data = json.load(f)

        # Mutations in linage
        print(lineage)
        for mutation in data:
            if mutation['gene'] == 'S':

                if mutation['type'] == 'substitution':
                    ref_aa = mutation['ref_aa']
                    codon_num = mutation['codon_num'] - 1
                    alt_aa = mutation['alt_aa']

                    print("Mutation is of type - " + mutation['type'], " :",
                          mutation['ref_aa'],
                          mutation['codon_num'],
                          mutation['alt_aa'])

                    # Check if aa on codon is equal to aa on referent genome
                    if S_copy[codon_num] == ref_aa:
                        # Mutate to alternative amino acid
                        S_copy[codon_num] = alt_aa
                        print("Mutated " + ref_aa + " into " + alt_aa)



        S_aa = "".join(S_copy)
        print(S_aa, "\n")
        linage_aa.append(S_aa)


    return linage_aa

def sequence_compare(seq_a, seq_b):
    len1 = len(seq_a)
    len2 = len(seq_b)
    mismatches = []
    mismatches_count = 0
    print("-^-^-^-^-^-^-^-^-^-^-^-^ MISMATCHING -^-^-^-^-^-^-^-^-^-^-^-^")
    for pos in range(0, min(len1, len2)):
        if seq_a[pos] != seq_b[pos]:
            mismatches.append('|')
            mismatches_count += 1
            print(f"Mismatch {mismatches_count}:", seq_b[pos], pos + 1, seq_a[pos])
        else:
            mismatches.append(' ')
    print("\n", f"Total # of mismatches: {mismatches_count}")
    print()
    print("Lineage")
    print(seq_b)
    print("".join(mismatches))
    print(seq_a)
    print("Variant")

In [22]:
for seq_record in SeqIO.parse("C:\\Users\\zelen\\Desktop\\Sars-Cov-2-variants\\data-fasta\\sars-cov-2-glycoprotein.fasta", "fasta"):
    print("ID: ", seq_record.id)
    print("Description: Surface glycoprotein [Sars-Cov-2]")
    print("Protein sequence: ", repr(seq_record.seq))
    print("Amino acid length: ", len(seq_record), "\n")

S_region = list(seq_record.seq)

ID:  YP_009724390.1
Description: Surface glycoprotein [Sars-Cov-2]
Protein sequence:  Seq('MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDL...HYT')
Amino acid length:  1273 



In [56]:
import os
import glob
import pandas as pd

# Read combined_csv_data from 0-phase
df = pd.read_csv("C:\\Users\\zelen\\Desktop\\Sars-Cov-2-variants\\combined_data-2.csv")

# List all lineages
all_lineages = []
for index, row in df.iterrows():
    all_lineages.append(row['#Top Lineage'].split(" ")[1])
    # if row['#Top Lineage'].split(" ")[1] == "B.1.1.70":
    #     print(row['Variant'])


import collections

counter = collections.Counter(all_lineages)
c = counter.most_common(5)
c

[('B.1.1.7', 2434),
 ('B.1.351', 1006),
 ('B.1', 441),
 ('B.1.429', 384),
 ('B.1.258', 312)]

In [59]:
top_lineages = []

for i in reversed(c):
    label = i[0]
    top_lineages.append(label)

seq_temp = create_linage_peptides(S_region, top_lineages)

B.1.258
Mutation is of type - substitution  : D 614 G
Mutated D into G
Mutation is of type - substitution  : N 439 K
Mutated N into K
MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSKNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLT

In [60]:
seq_temp[0]

'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSKNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITG

In [62]:
records = [{'id': top_lineages[0], 
            'temp': '1',
            'seq': seq_temp[0]}, 
           
           {'id': top_lineages[1], 
            'temp': '2',
            'seq': seq_temp[1]}, 
           
           {'id': top_lineages[2], 
            'temp': '3',
            'seq': seq_temp[2]}, 
           
           {'id': top_lineages[3], 
            'temp': '4',
            'seq': seq_temp[3]}, 
           
           {'id': top_lineages[4], 
            'temp': '5',
            'seq': seq_temp[4]}]

In [63]:
records[0]['id']

'B.1.258'

In [73]:
def make_protein_record(nuc_record):
    return SeqRecord(
        seq=Seq(nuc_record['seq']),
        id=" " + nuc_record['id'],
        description=" [Fasta file from linage analysis] ",
    )


sequences = (
    make_protein_record(nuc_rec)
    for nuc_rec in records
)

print(sequences)

<generator object <genexpr> at 0x00000217EC5B60C8>


In [74]:
SeqIO.write(sequences, "example2.fasta", "fasta")

5