# SCOPのテキストをダウンロード

In [None]:
import urllib.request
import os

# Download the latest SCOP classification table and cache it on disk.
scop = "http://scop.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt"
with urllib.request.urlopen(scop) as content:
    contents = content.read()
    html = contents.decode("utf-8")

path = "./data/database"
# Create ./data on first run; open() would otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(path), exist_ok=True)
# Write with a fixed encoding so the cached file does not depend on the
# platform's default text encoding.
with open(path, "w", encoding="utf-8") as f:
    f.write(html)

# コンタクトマップ作成

In [None]:
import urllib.request, urllib.error
import sys
import os
import numpy as np
from scipy.spatial import distance
import matplotlib.pyplot as plt
from tqdm import tqdm

# Build 128x128 distance-map windows for every usable row of the SCOP table.
#
# For each row: parse the PDB ID and residue region, download the PDB file
# (cached under ./data/pdb/), collect one coordinate per residue (CB, or CA
# for glycine), compute the pairwise Euclidean distance matrix and save
# diagonal 128x128 windows of it, one every 30 residues.
scop = "./data/database"
PDB_DIR = "./data/pdb/"              # cache of downloaded PDB files
DISTMAP_DIR = "./data/distmap128/"   # output directory for the .npy windows
WINDOW = 128                         # side length of each saved window
STRIDE = 30                          # offset between consecutive windows
MAX_END = 1000                       # rows whose region ends beyond this are skipped


def _parse_region(region):
    """Split a region string such as ``A:12-140`` into (chain, start, end).

    Returns None for regions this pipeline skips: multi-segment regions
    (containing ",") and regions whose text from index 2 onward contains any
    of the letters A, B, S, I or P.
    Raises ValueError if ":" or the "-" separator is missing or the bounds
    are not integers.
    """
    # NOTE(review): the [2:] offset assumes a one-character chain ID; the
    # letter filter presumably rejects insertion-code style residue labels.
    if any(letter in region[2:] for letter in "ABSIP"):
        return None
    if "," in region:
        return None
    colon = region.index(":")
    # The separator is searched from index 3 onward, presumably so that the
    # sign of a negative start residue is not taken for the separator.
    dash = region[3:].index("-") + 3
    return region[:colon], int(region[colon + 1:dash]), int(region[dash + 1:])


def _read_cb_coords(pdb_path, chain, start, n):
    """Return a list of n per-residue coordinates read from a PDB file.

    Entry k holds the x, y, z fields (as strings) of the CB atom of residue
    ``start + k`` in ``chain``, or of the CA atom for glycine. Entries for
    residues that were not found stay None.
    """
    coords = [None] * n
    with open(pdb_path, encoding="utf-8") as f:
        for line in f:
            # NOTE(review): PDB records are fixed-column; splitting on
            # whitespace skips lines whose fields run together. Kept as in
            # the original so the set of accepted entries does not change.
            fields = line.split()
            if len(fields) != 12:
                continue
            if fields[0] not in ("ATOM", "HETATM") or fields[4] != chain:
                continue
            try:
                resnum = int(fields[5])
            except ValueError:
                # Non-integer residue number (e.g. "30A"): stop reading; the
                # caller then rejects the entry if any residue is still unset.
                break
            offset = resnum - start
            if offset < 0:
                continue
            if offset >= n:  # past the end of the region
                break
            atom, residue = fields[2], fields[3]
            if atom == "CB" or (residue == "GLY" and atom == "CA"):
                coords[offset] = fields[6:9]
    return coords


# Without these directories every download and save below would fail.
os.makedirs(PDB_DIR, exist_ok=True)
os.makedirs(DISTMAP_DIR, exist_ok=True)

with open(scop) as f:
    slst = f.read().splitlines()

last_window = None
for line in tqdm(slst):
    if line.startswith("#"):
        continue
    fields = line.split()
    if len(fields) < 3:  # blank or truncated row
        continue
    name = fields[1]  # PDB ID
    parsed = _parse_region(fields[2])
    if parsed is None:
        continue
    reg, st, end = parsed  # chain ID, first and last residue number
    if end > MAX_END:
        continue
    n = end - st + 1  # number of residues in the region

    # Download the PDB file unless it is already cached.
    url = "https://files.rcsb.org/download/" + name + ".pdb"
    path_p = PDB_DIR + name + ".pdb"
    if not os.path.exists(path_p):
        try:
            with urllib.request.urlopen(url) as content:
                text = content.read().decode()
            with open(path_p, "w", encoding="utf-8") as f:
                f.write(text)
        except Exception:
            # Best effort: skip entries that cannot be fetched or stored, but
            # let KeyboardInterrupt / SystemExit stop the run.
            continue

    # Regions shorter than one window produce no output.
    if n < WINDOW:
        continue

    coords = _read_cb_coords(path_p, reg, st, n)
    if any(c is None for c in coords):  # at least one residue has no coordinate
        continue
    coords = np.array(coords, dtype=float)
    dist = distance.cdist(coords, coords, metric="euclidean")

    # NOTE(review): the file name holds only the PDB ID and the window offset,
    # so two table rows with the same PDB ID write to the same paths and the
    # later row wins - confirm whether that is intended.
    for j in range(0, n - WINDOW + 1, STRIDE):
        window = dist[j:j + WINDOW, j:j + WINDOW]
        np.save(DISTMAP_DIR + name + str(j), window)
        last_window = window

# Preview only the last saved window. Drawing every window inside the loop
# kept adding images to the same axes while only the last one stayed visible.
if last_window is not None:
    plt.imshow(-last_window)

# データに対しての適切な平均・標準偏差を確認

In [None]:
import glob
import numpy as np
import torch
from torchvision import transforms
# Inspect one saved distance map to choose the mean / standard deviation
# passed to Normalize.
import matplotlib.pyplot as plt  # imported here so this cell also runs on its own

trans = transforms.ToTensor()
trans2 = transforms.Normalize(10, 10)  # edit these: first is the mean, second the std

files = glob.glob("./data/distmap128/*")
if not files:
    raise FileNotFoundError("no distance maps found in ./data/distmap128/")
a = np.load(files[0])
print(a)
print(a.mean())  # call the method; printing a.mean only shows the bound method
print(np.std(a))
aa = trans(a)
aa = trans2(aa)
aa = aa.to(torch.float32)
print(aa)  # check that the values fall within the range -1 to 1
# aa[0] selects the first (presumably the only) channel of the tensor.
plt.imshow(-aa[0])
