<a href="https://colab.research.google.com/github/talgalper/Honours-2021/blob/main/nextstrain_data_treatment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup: system packages are installed with apt-get and the
# Python package with pip, via IPython shell escapes.
# NOTE(review): BLAST+, Exonerate and DSSP are presumably external tools that
# biostructmap relies on (alignment / solvent accessibility) — confirm against
# the biostructmap installation docs. No versions are pinned here.
!apt-get install ncbi-blast+
!apt-get install exonerate
!apt-get install dssp
!pip install biostructmap

In [14]:
import pandas as pd
from collections import defaultdict
import biostructmap

In [15]:
def nextstrain_data_processing(filename, seq_len, gene_id):
    """Return per-position entropy values for one gene as a dense list.

    Reads a tab-separated table containing ``gene``, ``position`` and
    ``entropy`` columns, keeps the rows whose ``gene`` equals ``gene_id``,
    and expands them into a list of length ``seq_len``.

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the
            tab-separated diversity table.
        seq_len: Number of positions to emit; positions are 1-based, so the
            list covers positions ``1 .. seq_len``.
        gene_id: Value of the ``gene`` column to select.

    Returns:
        A list where element ``i`` is the entropy recorded for position
        ``i + 1``, or ``0.0`` when the table has no row for that position.
    """
    table = pd.read_csv(filename, sep='\t')
    gene_rows = table[table.gene == gene_id]

    # Sparse lookup: position -> entropy for the rows that are present.
    entropy_by_position = dict(zip(gene_rows.position, gene_rows.entropy))

    dense_values = []
    for position in range(1, seq_len + 1):
        # Positions absent from the table are treated as zero entropy.
        dense_values.append(entropy_by_position.get(position, 0.0))
    return dense_values

In [16]:
# Build the dense entropy list for gene 'S' from the diversity TSV.
# The second argument is the number of positions to emit.
# NOTE(review): 1273 is presumably the length of the reference sequence loaded
# further down (ref_seq) — confirm the two agree, since the mapping step
# expects one value per residue.
s_protein_data = nextstrain_data_processing('nextstrain_ncov_global_diversity.tsv', 1273, 'S') #changed 1275 to 1273

In [17]:
# Print the whole entropy list (one value per position) for a visual check.
print(s_protein_data)

[0.0, 0.004, 0.008, 0.0, 0.132, 0.032, 0.0, 0.002, 0.019, 0.0, 0.0, 0.043, 0.063, 0.013999999999999999, 0.002, 0.005, 0.005, 0.28600000000000003, 0.175, 0.2, 0.013000000000000001, 0.02, 0.004, 0.0, 0.006, 0.19899999999999998, 0.064, 0.002, 0.015, 0.0, 0.0, 0.0, 0.009000000000000001, 0.0, 0.0, 0.004, 0.0, 0.0, 0.002, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0.0, 0.026000000000000002, 0.0, 0.006, 0.079, 0.0, 0.011000000000000001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.01, 0.002, 0.005, 0.102, 0.004, 0.628, 0.637, 0.013000000000000001, 0.013999999999999999, 0.006999999999999999, 0.005, 0.086, 0.073, 0.024, 0.0, 0.0, 0.254, 0.0, 0.002, 0.0, 0.002, 0.0, 0.0, 0.0, 0.004, 0.0, 0.002, 0.0, 0.0, 0.002, 0.018000000000000002, 0.165, 0.013000000000000001, 0.002, 0.052000000000000005, 0.0, 0.016, 0.0, 0.019, 0.0, 0.0, 0.002, 0.0, 0.0, 0.0, 0.0, 0.002, 0.004, 0.002, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004, 0.002, 0.002, 0.002, 0.0, 0.0, 0.004, 0.0, 0.022000000000000002, 0.0, 0.002, 0.002, 0.005, 0

In [30]:
#removes fasta file header
def ref_seq_fasta_format(fasta_filename):
    """Return the first sequence of a FASTA file as one unbroken string.

    The first line of the file is treated as the header and discarded.
    Every following line is stripped of surrounding whitespace (which also
    removes the trailing newline, so no separate newline replacement is
    needed) and the pieces are joined together.

    Reading stops at the next line that starts with ``>``, so a file holding
    several records yields only its first sequence instead of having later
    headers and sequences glued onto it.

    Args:
        fasta_filename: Path of the FASTA file to read.

    Returns:
        The sequence as a single string; ``''`` for an empty or header-only
        file.
    """
    with open(fasta_filename) as f:
        # Drop the header line; the default keeps an empty file from raising.
        next(f, None)

        pieces = []
        for raw_line in f:
            text = raw_line.strip()
            if text.startswith('>'):
                # Start of a second record: the first sequence is complete.
                break
            pieces.append(text)
    return ''.join(pieces)

In [38]:
# Load the reference sequence from the FASTA file as one string
# (header line dropped, remaining lines joined).
ref_seq = ref_seq_fasta_format('sars_cov_2_spike_ref_seq.fasta')

In [33]:
# Display the loaded reference sequence to check it was read correctly.
ref_seq

'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITG

In [50]:
#changed formatted_seq_lines to input_data which is the s_protein_data
def biostructmap_analysis(pdb_structure, gene_name, input_data, ref_sequence, output_filename,
                          chains=('A', 'B', 'C'), radius=15, rsa_range=(0.2, 1.0),
                          scale_factor=100):
    """Map per-residue values onto a structure and write them out as B-factors.

    Builds a ``biostructmap.Structure`` from ``pdb_structure``, maps the same
    ``input_data`` and ``ref_sequence`` onto every chain listed in ``chains``
    using ``method='default'``, and writes the result with
    ``write_data_to_pdb_b_factor``.

    Args:
        pdb_structure: Structure file passed to ``biostructmap.Structure``.
        gene_name: Name passed to ``biostructmap.Structure``.
        input_data: Ordered list of values; ``method='default'`` requires one
            value for each residue of the reference sequence.
        ref_sequence: Reference sequence the values are ordered against.
        output_filename: Destination passed as ``fileobj`` when writing.
        chains: Chain IDs that receive the data. Defaults to the three
            chains ``'A'``, ``'B'`` and ``'C'``.
        radius: ``radius`` argument forwarded to ``Structure.map``.
        rsa_range: ``rsa_range`` argument forwarded to ``Structure.map``.
        scale_factor: ``scale_factor`` argument forwarded when writing.

    Returns:
        None. The result is written to ``output_filename``.
    """
    # Initialise structure object
    structure = biostructmap.Structure(pdb_structure, gene_name)

    # Every chain shares the same values and the same reference sequence.
    data = {chain_id: input_data for chain_id in chains}
    ref_seqs = {chain_id: ref_sequence for chain_id in chains}

    # Note: Use method='default' for mapping an ordered list of values
    # (must contain values for each residue in the sequence.)
    results = structure.map(data, method='default', ref=ref_seqs,
                            radius=radius, rsa_range=rsa_range)

    results.write_data_to_pdb_b_factor(fileobj=output_filename, scale_factor=scale_factor)

In [51]:
# Map the entropy values onto the structure in '6xm0.pdb', using ref_seq as
# the reference sequence, and write the result to
# 'biostructmap_global_diversity.pdb'.
biostructmap_analysis('6xm0.pdb', 'sars_cov_2_spike', s_protein_data, ref_seq, 'biostructmap_global_diversity.pdb')



In [None]:
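# Viewer command for colouring the output structure by its B-factor column
# (NOTE: this looks like UCSF ChimeraX syntax — confirm before relying on it):
#   color bfactor protein range 0,15 palette rainbow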

In [None]:
 # Write one entropy value per line, formatted to three decimal places.
 # `list_of_values` exists only as a local variable inside
 # nextstrain_data_processing, so iterate over the list that function
 # returned (s_protein_data) instead of raising NameError on a fresh kernel.
 with open('process_diversity_data.txt', 'w') as f:
     for entropy in s_protein_data:
         f.write(f"{entropy:0.3f}\n")