In [4]:
import jpredapi
from Bio import Entrez, SeqIO
import numpy as np
import re
import time

In [62]:
def get_seq(id):
    handle = Entrez.efetch(db='protein', id='121949022', rettype='gb', retmode='text')
    record = SeqIO.read(handle, 'genbank')
    return record.seq, record.name
    
def get_permutaitons(seq, num):
    seq = np.array(list(seq))
    perms = []
    for _ in range(num):
        perms.append(np.random.permutation(seq))
    
    return ["".join(perm) for perm in perms]

def send_query(seq):
    res = jpredapi.submit(mode="single", user_format="raw", seq=seq)
    id = re.findall("chklog\?([a-zA-Z0-9_]*)</a>", res.text)[0]
    return id

def wait_until_finished_and_download(ids):
    all_finished = False
    while not all_finished:
        time.sleep(5)

        all_finished = True
        jobs_finished = []
        for id in ids:
            if id in jobs_finished:
                pass
            res = jpredapi.status(jobid=id)
            finished = len(re.findall(f"^Job {id} finished.", res.text)) != 0
            if not finished:
                all_finished = False
                break
            else:
                jobs_finished.append(id)

    # time.sleep(10)

    for id in ids:
        jpredapi.get_results(jobid=id, results_dir_path="jpred_sspred/results", extract=True)

        
def calc_num_len(id):
    file = f'jpred_sspred/results/{id}/{id}.jnet'
    for line in open(file):
        if line.startswith('jnetpred:'):
            break

    prediction = line.split(':')[1].strip().replace(',', '')
    
    structures = re.findall("(H+|E+)", prediction)
    structures_num = len(structures)
    structure_lengths = np.array(list(map(lambda x: len(x), structures)))


    return structures_num, np.mean(structure_lengths)
    

def experiment(seq, perm_num = 30):
    perms = get_permutaitons(seq, perm_num)

    original_id = send_query(seq)
    perm_ids = [send_query(perm) for perm in perms]

    wait_until_finished_and_download([original_id] + perm_ids)

    original_num, original_len_mean = calc_num_len(original_id)

    perm_nums = []
    perm_lens = []
    for id in perm_ids:
        num, len_mean = calc_num_len(id)
        perm_nums.append(num)
        perm_lens.append(len_mean)
    
    perm_nums = np.array(perm_nums)
    perm_lens = np.array(perm_lens)

    return original_num, original_len_mean, np.mean(perm_nums), np.mean(perm_lens)

protein_ids =  ['1409222234', '121949022']#, '1409222232', '1409222230', '1409222226']
messages = []
for protein_id in protein_ids:
    seq, name = get_seq(protein_id)
    original_num, original_len_mean, perm_num_mean, perm_len_mean = experiment(seq[:800], perm_num=5)

    messages.append(f'Originally in {name} are {original_num} structures. In permutations there are {perm_num_mean} on average')
    messages.append(f'Originally in {name} structure have {original_len_mean} length. In permutations they have {perm_len_mean} on average')



Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


Your job will be submitted with the following parameters:
format: seq
skipPDB: on
seq: MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKDEVSIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEISLDSAKKAACEFSETDVTNTEHHQPSNNDLNTTEKRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVEKAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPCSENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVDEYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTENLIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTEQNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNIHNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPVRHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKEFVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQESISLLEVSTLGKAKTEPNK
<h1>Created JPred job. Interactive access through:</h1><ul><li><a href="http://www.compbio.dundee.ac.uk/jpred4/c

In [68]:
!rm -r jpred_sspred

In [66]:
print(*messages, sep='\n')

Originally in Q3LRJ6_HUMAN are 26 structures. In permutations there are 41.2 on average
Originally in Q3LRJ6_HUMAN structure have 7.153846153846154 length. In permutations they have 5.529934669201891 on average
Originally in Q3LRJ6_HUMAN are 26 structures. In permutations there are 40.4 on average
Originally in Q3LRJ6_HUMAN structure have 7.153846153846154 length. In permutations they have 5.467908710674668 on average


## Opierając się na dane nie potwierdziłem że prawdizwe proteiny zawierają więcej struktuk niż randomowe permutacje.
## Natomiast długość struktur jest większa w prawdziwych proteinach.

### Wyniki mogą nie być zgodne z rzeczywistością ze wzgłędu na małą próbkę. Rozmiar próbki jest spowodowany długim czasem oczekiwania na zakończenie obliczeń jpred.