In [3]:
lines = open('pred.txt').readlines()
data = []
for line in lines:
    if line.startswith('>'):
        id = line.strip()[1:]
        data.append([id, []])
    else:
        data[-1][1].append(line.strip().split('\t')[1])
data = {d[0]: d[1] for d in data}
data = {k:''.join(v) for k,v in data.items()}

# read seq
lines = open('seq.fasta').readlines()
seqs = []
for line in lines:
    if line.startswith('>'):
        id, *_ = line.strip()[1:].split(' ')
        seqs.append([id, []])
    else:
        seqs[-1][1].append(line.strip())
seqs = {d[0]: d[1] for d in seqs}
seqs = {k:''.join(v) for k,v in seqs.items()}
# read disorder
lines = open('caid.fasta').readlines()
dis = []
for line in lines:
    if line.startswith('>'):
        id = line.strip().split('|')[1]
        dis.append([id, []])
    else:
        dis[-1][1].append(line.strip())
dis = {d[0]: d[1] for d in dis}
dis = {k:''.join(v) for k,v in dis.items()}

# check lengths
for k in data:
    if len(data[k]) != len(seqs[k]):
        print(k, len(data[k]), len(seqs[k]))
    if len(data[k]) != len(dis[k]):
        print(k, len(data[k]), len(dis[k]))

# write to file
with open('CAID.txt', 'w') as f:
    for k, v in data.items():
        f.write('>' + k + '\n')
        f.write(seqs[k] + '\n')
        f.write(dis[k].replace('-', '0').replace('D', '1') + '\n')


In [30]:
import pandas as pd
import numpy as np
from pathlib import Path
import requests
from tqdm import tqdm

def get_seqs(acc):
    url = f'https://rest.uniprot.org/uniprotkb/{acc}.fasta'
    r = requests.get(url).text
    return r

df = pd.read_csv('disprot2022dec.tsv', sep='\t')
acc = df['acc'].unique()
Path('DisProt2022Dec').mkdir(exist_ok=True, parents=True)
seqs = {}
for a in tqdm(acc):
    if not Path(f'DisProt2022Dec/{a}.fasta').exists():
        seqs[a] = get_seqs(a)
        with open(f'DisProt2022Dec/{a}.fasta', 'w') as f:
            f.write(f'{seqs[a]}')

100%|██████████| 2470/2470 [00:00<00:00, 114106.21it/s]


In [32]:
def read_fasta(acc):
    with open(f'DisProt2022Dec/{acc}.fasta') as f:
        lines = f.read().splitlines()
    return ''.join(lines[1:]).strip()

regions = {}
for i, row in df.iterrows():
    acc = row['acc']
    disprot_acc = row['disprot_id']
    if acc in regions:
        t = regions[(acc, disprot_acc)]
    else:
        t = []
    t.append((row['start']-1, row['end']))
    regions[(acc, disprot_acc)] = t

In [33]:
with open(f'DisProt.txt', 'w') as f:
    for (acc, disprot_acc), reg in regions.items():
        seq = read_fasta(acc)
        dis = np.zeros(len(seq))
        for r in reg:
            dis[r[0]:r[1]] = 1
        dis = ''.join([str(int(d)) for d in dis])
        f.write(f'>{disprot_acc}\n{seq}\n{dis}\n')

In [12]:
train_data = open('flDPnn/flDPnn_Training_Annotation.txt').read().splitlines()[10:]
val_data = open('flDPnn/flDPnn_Validation_Annotation.txt').read().splitlines()[10:]
train_ids = [s[1:] for s in train_data[::7]]
val_ids = [s[1:] for s in val_data[::7]]
train_ids = train_ids + val_ids

lines = open('DisProt_clean.txt').read().splitlines()
test_ids = [s[1:] for s in lines[::3]]

for i in test_ids:
    if i in train_ids:
        print(i)

In [17]:
fldpnn_test_data = open('flDPnn/flDPnn_DissimiTest_Annotation.txt').read().splitlines()[10:]
fldpnn_test = {}
for i in range(0, len(fldpnn_test_data), 3):
    fldpnn_test[fldpnn_test_data[i]] = fldpnn_test_data[i+1]
disprot_test_data = open('DisProt_clean.txt').read().splitlines()

overlap = 0
for i in range(0, len(disprot_test_data), 3):
    if disprot_test_data[i] in fldpnn_test:
        overlap += 1
        assert len(fldpnn_test[disprot_test_data[i]]) == len(disprot_test_data[i+2])
print(overlap, len(disprot_test_data)//3 - overlap)

6 118


In [19]:
lines = open('DisProt_clean.txt').read().splitlines()
with open('DisProt_clean.fasta', 'w') as f:
    for i in range(0, len(lines), 3):
        f.write(f'{lines[i]}\n{lines[i+1]}\n')


In [2]:
with open('flDPnn/flDPnn_DissimiTest_Annotation.txt') as f, open('fldpnn.fasta', 'w') as f2:
    lines = f.read().splitlines()[10:]
    for i in range(0, len(lines), 7):
        f2.write(f'{lines[i]}\n{lines[i+1]}\n')

In [1]:
lines = open('CAID.txt').read().splitlines()
with open('CAID.fasta', 'w') as f:
    for i in range(0, len(lines), 3):
        f.write(f'{lines[i]}\n{lines[i+1]}\n')