In [3]:
%matplotlib inline

import shutil
import os
from joblib import dump, load
import urllib3
import certifi
from Bio import SeqIO
import Bio
from glob import glob
import json
from IPython import display
import pandas as pd
import numpy as np
import networkx as nx
from collections import OrderedDict

import matplotlib.pyplot as plt
import seaborn as sns
import re

In [4]:
# Identifiers shared by the download cells further down.

# Patient ids "p1" through "p11".
patients = [f"p{number}" for number in range(1, 12)]

# Region names that are requested from the haplotype API for every patient.
hiv_regions = [
    "V3", "PR", "psi", "vpr", "vpu",
    "p1", "p2", "p6", "p7", "p15", "p17",
    "RRE",
]

In [9]:
def download_hivevo_haplotype(patient, folder, hiv_region, http=None):
    '''
    Download the haplotypes of one region for one patient.

    The response body of ``<api>/<patient>/<hiv_region>`` is streamed into
    ``<folder>/hivevo_<patient>_<hiv_region>.fasta``. ``folder`` is created
    (including missing parents) when it does not exist yet.

    Parameters
    ----------
    patient : str
        Patient id used in the URL and in the file name, e.g. "p4".
    folder : str
        Directory the file is written to.
    hiv_region : str
        Region name used in the URL and in the file name, e.g. "V3".
    http : optional
        Object with a urllib3-style ``request(method, url, preload_content=...)``
        method. When omitted, a certificate-verifying ``urllib3.PoolManager``
        is created for this call; pass one in to reuse connections across
        many downloads.

    Notes
    -----
    The HTTP status is not inspected: whatever body the server returns is
    written to the file.
    '''
    if http is None:
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())

    # No trailing slash here: the join below supplies every separator, so the
    # URL no longer contains an accidental "//" after "haplotypes".
    api = "https://hiv.biozentrum.unibas.ch/api/data/haplotypes"
    url = "/".join((api, patient, hiv_region))

    # makedirs also creates missing parent directories (e.g. "../data").
    os.makedirs(folder, exist_ok=True)

    # The name is built from the hiv_region *parameter*, not from whatever
    # notebook-level variable called `region` happens to exist.
    file_name = os.path.join(
        folder, "_".join(("hivevo", patient, hiv_region)) + ".fasta"
    )

    # preload_content=False streams the body instead of buffering it in memory.
    with http.request('GET', url, preload_content=False) as res, \
            open(file_name, 'wb') as out_file:
        shutil.copyfileobj(res, out_file)

        

# Fetch the haplotypes of every region for every patient.
folder = "../data/regions/"
for patient in patients:
    for region in hiv_regions:
        # The function signature is (patient, folder, hiv_region). Passing
        # folder and region by keyword keeps them from being swapped, which
        # the previous positional call (patient, region, folder) did.
        download_hivevo_haplotype(patient, folder=folder, hiv_region=region)

In [10]:
def download_hivevo_references(folder, patient_ids=None, http=None):
    '''
    Download the reference sequence of each patient into ``folder``.

    For every patient id the response body of ``<api>/<patient>`` is streamed
    into ``<folder>/hivevo_reference_<patient>.fasta``. ``folder`` is created
    (including missing parents) when it does not exist yet.

    Parameters
    ----------
    folder : str
        Directory the files are written to.
    patient_ids : iterable of str, optional
        Patient ids to download. Defaults to the notebook-level ``patients``
        list.
    http : optional
        Object with a urllib3-style ``request(method, url, preload_content=...)``
        method. When omitted, a certificate-verifying ``urllib3.PoolManager``
        is created.

    Notes
    -----
    The HTTP status is not inspected: whatever body the server returns is
    written to the file.
    NOTE(review): the files get a ``.fasta`` suffix, but
    ``extracting_region_from_reference`` reads its reference with
    ``json.load`` - presumably the payload is JSON; confirm.
    '''
    if patient_ids is None:
        # Resolved at call time so a fresh-kernel run picks up the config cell.
        patient_ids = patients

    if http is None:
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())

    # makedirs also creates missing parent directories (e.g. "data").
    os.makedirs(folder, exist_ok=True)

    api = "https://hiv.biozentrum.unibas.ch/api/data/referenceSequence"

    for patient in patient_ids:
        url = "/".join((api, patient))
        file_name = os.path.join(
            folder, "_".join(("hivevo", "reference", patient)) + ".fasta"
        )

        # preload_content=False streams the body instead of buffering it.
        with http.request('GET', url, preload_content=False) as res, \
                open(file_name, 'wb') as out_file:
            shutil.copyfileobj(res, out_file)


# Store one reference file per patient under data/references/.
folder = "data/references/"
download_hivevo_references(folder=folder)

In [11]:
def extracting_region_from_reference(region, reference_path, folder):
    '''
    Cut one region out of a patient reference and save it as FASTA.

    ``reference_path`` is parsed with ``json.load``; the code reads the keys
    ``features`` (items with ``name`` and ``location``), ``seq`` and
    ``description``. The first ``location`` entry of the feature named
    ``region`` is used as a ``[start, end)`` slice into ``seq``.

    The result is written to ``<folder>/<region>/<file name>`` where the file
    name is the reference file name with ``_<region>`` inserted after the
    patient id (``hivevo_reference_p4.fasta`` ->
    ``hivevo_reference_p4_V3.fasta``).

    Parameters
    ----------
    region : str
        Feature name to extract, e.g. "V3".
    reference_path : str
        Path of the reference file; its file name must contain a patient id
        of the form ``p<digits>``.
    folder : str
        Parent directory; a sub-directory named after the region is created.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    ValueError
        If the region is not among the reference features, or the file name
        holds no patient id.
    '''
    with open(reference_path) as f:
        reference_info = json.load(f)

    # Look the region up explicitly so that a missing region is reported
    # clearly instead of surfacing later as an unbound local variable.
    loc = None
    for feature in reference_info['features']:
        if feature['name'] == region:
            loc = feature['location'][0]
            break
    if loc is None:
        raise ValueError(
            "region {!r} not found in {!r}".format(region, reference_path)
        )

    # Search only the file name and require at least one digit: searching the
    # whole path would match the first stray "p" of any directory name.
    file_name = os.path.basename(reference_path)
    patient_match = re.search(r'p\d+', file_name)
    if patient_match is None:
        raise ValueError(
            "no patient id (p<digits>) in file name {!r}".format(file_name)
        )
    patient = patient_match[0]

    sequence = reference_info['seq'][loc[0]:loc[1]]
    header = '>' + reference_info['description'].replace('genomewide', 'region=' + region)

    # Create the output directory only after all validation has passed.
    region_dir = os.path.join(folder, region)
    os.makedirs(region_dir, exist_ok=True)
    out_path = os.path.join(
        region_dir, file_name.replace(patient, "_".join((patient, region)))
    )

    with open(out_path, 'w') as write_file:
        write_file.write(header + '\n')
        # Terminate the record with a newline so the file can be appended to
        # another FASTA without gluing records together.
        write_file.write(sequence + '\n')

    return out_path


# Example call, disabled on purpose: uncomment to run it (it creates a directory and writes a file).
#folder = 'data/references/'
#region = 'V3'
#ref_path = 'data/references/hivevo_reference_p4.fasta'
#extracting_region_from_reference(region, ref_path, folder)

In [12]:
def json2fasta(folder, path_json):
    '''
    Convert a JSON list of haplotypes into a FASTA file.

    ``path_json`` must contain a JSON list whose items have the keys ``name``
    and ``sequence``; each item becomes a ``>name`` header line followed by
    one sequence line.

    The output path is ``path_json`` with every ``data/`` replaced by
    ``data/fasta/`` (so ``data/x.fasta`` -> ``data/fasta/x.fasta``); the file
    name itself, including its suffix, is kept. If ``path_json`` contains no
    ``data/`` the file is written to ``<folder>/fasta/<file name>`` instead,
    so the input is never overwritten.

    Parameters
    ----------
    folder : str
        Directory in which the ``fasta`` sub-directory is created.
    path_json : str
        Path of the JSON input file.

    Returns
    -------
    str
        Path of the FASTA file that was written.
    '''
    with open(path_json) as f:
        records = json.load(f)

    fasta_dir = os.path.join(folder, 'fasta')
    os.makedirs(fasta_dir, exist_ok=True)

    path = path_json.replace('data/', 'data/fasta/')
    if path == path_json:
        # Nothing was replaced: opening this path for writing would destroy
        # the input file, so fall back to the fasta directory.
        path = os.path.join(fasta_dir, os.path.basename(path_json))

    # The derived path may point into a sub-directory that does not exist yet
    # (e.g. data/fasta/regions/ for an input under data/regions/).
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(path, "w") as fasta_file:
        for obj in records:
            fasta_file.write('>' + obj['name'] + '\n')
            fasta_file.write(obj['sequence'] + '\n')

    return path
            
# Example call, disabled on purpose: uncomment to run it (it creates a directory and writes a file).
#json2fasta('data/','data/hivevo_p4_V3.fasta')

In [13]:
def add_ref_reg2fasta(path_fasta, path_ref_region):
    '''
    Append the reference region file to a FASTA file with haplotypes.

    The content of ``path_ref_region`` is appended unchanged to
    ``path_fasta`` (which is created if missing). When the existing
    ``path_fasta`` does not end with a newline, one is inserted first so the
    appended ``>`` header starts on its own line.

    Parameters
    ----------
    path_fasta : str
        FASTA file that is extended in place.
    path_ref_region : str
        File whose content is appended.
    '''
    # Read the source first: if it is missing, the target is left untouched.
    with open(path_ref_region) as ref:
        ref_text = ref.read()

    try:
        existing_size = os.path.getsize(path_fasta)
    except OSError:
        existing_size = 0  # target does not exist yet

    needs_separator = False
    if existing_size > 0:
        # Inspect only the final byte rather than reading the whole file.
        with open(path_fasta, 'rb') as existing:
            existing.seek(-1, os.SEEK_END)
            needs_separator = existing.read(1) != b'\n'

    with open(path_fasta, 'a') as fasta:
        if needs_separator:
            fasta.write('\n')
        fasta.write(ref_text)
            
#add_ref_reg2fasta('data/fasta/hivevo_p4_V3.fasta', 'data/references/V3/hivevo_reference_p4_V3.fasta')