<a href="https://colab.research.google.com/github/talgalper/Honours-2021/blob/main/nextstrain_biostructmap_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from collections import defaultdict
import biostructmap
import argparse


def nextstrain_data_processing(filename, seq_len, gene_id):
    '''Build a dense per-residue entropy list for one gene.

    The tab-separated file `filename` is read with pandas; only rows whose
    `gene` column equals `gene_id` are kept, and their `position` and
    `entropy` columns are paired up.

    Returns a list of length `seq_len` in which element `i` is the entropy
    recorded for position `i + 1` (positions are looked up 1-based), or 0.0
    when the file has no row for that position.
    '''
    table = pd.read_csv(filename, sep='\t')
    rows_for_gene = table[table.gene == gene_id]

    # A float-defaulting mapping makes every position absent from the file
    # read back as 0.0.
    entropy_by_position = defaultdict(float)
    entropy_by_position.update(zip(rows_for_gene.position, rows_for_gene.entropy))

    return [entropy_by_position[residue] for residue in range(1, seq_len + 1)]


def ref_seq_fasta_format(fasta_filename):
    '''Return the first sequence of a FASTA file as one unbroken string.

    The first line of the file is treated as the record header and is
    discarded. Each following line has surrounding whitespace (including the
    line ending) stripped and the pieces are joined with no separator.

    Reading stops at the next line that starts with '>', so when the file
    holds several records only the first one is returned instead of the
    later headers and sequences being glued onto the reference sequence.

    Returns an empty string for an empty or header-only file.
    '''
    sequence_parts = []
    with open(fasta_filename) as f:
        # Drop the header line; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            if line.startswith('>'):
                # Start of another record: it is not part of the reference.
                break
            sequence_parts.append(line.strip())
    return ''.join(sequence_parts)


def biostructmap_analysis(pdb_structure, gene_name, input_data, ref_sequence, output_filename, chains):
    '''Map per-residue data onto a structure and write it out via biostructmap.

    A `biostructmap.Structure` is created from `pdb_structure` and
    `gene_name`. Every chain id in `chains` is assigned the same
    `input_data` and the same `ref_sequence`, and the structure's `map` is
    called with method 'default', radius 15 and rsa_range (0.2, 1.0).

    The mapped result is written to `output_filename` through
    `write_data_to_pdb_b_factor` with a scale factor of 100. Nothing is
    returned.
    '''
    # The same values and reference sequence are shared by all listed chains.
    data_by_chain = dict.fromkeys(chains, input_data)
    reference_by_chain = dict.fromkeys(chains, ref_sequence)

    # NOTE(review): the units of `radius` and the exact meaning of
    # `rsa_range` are defined by biostructmap -- confirm against its docs.
    mapped = biostructmap.Structure(pdb_structure, gene_name).map(
        data_by_chain,
        method='default',
        ref=reference_by_chain,
        radius=15,
        rsa_range=(0.2, 1.0),
    )
    mapped.write_data_to_pdb_b_factor(fileobj=output_filename, scale_factor=100)


def process_monthly_data_file(monthly_data_filename, output_filename, ref_seq_file, nextstrain_id, pdb_file, chains):
    '''Run the pipeline for one data file: load, densify, map, and write.

    The reference sequence is read from `ref_seq_file`, the values for
    `nextstrain_id` are read from `monthly_data_filename` and padded to the
    length of that sequence, and both are handed to `biostructmap_analysis`
    together with `pdb_file`, `chains` and `output_filename`.
    '''
    reference = ref_seq_fasta_format(ref_seq_file)
    per_residue_values = nextstrain_data_processing(
        filename=monthly_data_filename,
        seq_len=len(reference),
        gene_id=nextstrain_id,
    )
    biostructmap_analysis(
        pdb_structure=pdb_file,
        # The structure name is a fixed placeholder string.
        gene_name="protein_name_placeholder",
        input_data=per_residue_values,
        ref_sequence=reference,
        output_filename=output_filename,
        chains=chains,
    )


def main():
    '''Parse the command line and run the analysis pipeline once.

    Six positional arguments are expected, in this order: the input TSV
    file, the input PDB file, the output PDB file, the reference FASTA
    file, the Nextstrain ID, and a comma-delimited list of protein chains.
    '''
    cli = argparse.ArgumentParser(description='Biostructmap Analysis Pipeline')

    # (name, extra keyword options) for each positional argument, in
    # command-line order.
    positional_specs = [
        ('diversity', {'metavar': 'INPUT', 'help': 'Input TSV file'}),
        ('input_pdb', {'metavar': 'INPUT_PDB', 'help': 'Input PDB file'}),
        ('output', {'metavar': 'PDB', 'help': 'Output PDB file'}),
        ('ref_fasta', {'metavar': 'ref_fasta', 'help': 'Reference FASTA file'}),
        ('nextstrain_id', {'help': 'Nextstrain ID'}),
        ('chains', {'help': 'Protein chains, comma delimited'}),
    ]
    for arg_name, options in positional_specs:
        cli.add_argument(arg_name, type=str, **options)

    parsed = cli.parse_args()
    process_monthly_data_file(
        monthly_data_filename=parsed.diversity,
        output_filename=parsed.output,
        ref_seq_file=parsed.ref_fasta,
        nextstrain_id=parsed.nextstrain_id,
        pdb_file=parsed.input_pdb,
        # Chain ids are split on commas and passed through unmodified.
        chains=parsed.chains.split(','),
    )


# Run the command-line pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
