#### UniProtに関するコード
ファイルの名称について
- UniProtから全てのsubcellular locationsを取り出す*
    - ac2locs.json
- リストcompartmentに入っているものを取り出す（元ac2subcellularlocations_up_mini.json）
    - ac2locs_classified.json
- 複数局在を考慮
    - ac2locs_classified_multi.json

In [None]:
# UniprotからSUBCELLULAR LOCATIONを抜き出してACをKeyにしたjsonをつくるコード
import re
import json
from collections import defaultdict

# Regular-expression definitions.
# Captures the location that precedes an evidence tag, e.g. "Cytoplasm" in
# "Cytoplasm {ECO:0000269|PubMed:17081065}."
pattern = r'([\w\.\,\;\-\s]+)\s\{[^\}]+\}[\.\;]'
repatter = re.compile(pattern)
# Captures a trailing location that has no evidence tag, e.g. "Secreted" in
# "Cytoplasm. Nucleus {ECO:0000250}. Secreted. "
pattern = r'[\w\.\,\;\-\s]+\s\{[^\}]+\}[\.\;] ([\w\s]+)\.\s*$'
repatter2 = re.compile(pattern)

SL_PREFIX = "CC   -!- SUBCELLULAR LOCATION:"
CC_CONTINUATION = "CC       "


def _extract_locations(slline):
    """Return the location names found in one SUBCELLULAR LOCATION block.

    ``slline`` is the list of text fragments of a single block (line prefixes
    already removed).  Names are returned in first-seen order, without
    duplicates and without empty strings.
    """
    # The free-text note is not a location, so drop it.
    sl = re.sub("Note=.+$", "", " ".join(slline))
    loc = repatter.findall(sl) + repatter2.findall(sl)
    if not loc:
        # No evidence tags at all: plain text, locations separated by periods.
        loc = sl.rstrip().rstrip(".").split(".")
    names = []
    for chunk in loc:
        # A captured chunk may itself contain several period-separated names.
        for piece in chunk.split("."):
            piece = piece.strip()
            if piece and piece not in names:
                names.append(piece)
    return names


def parse_subcellular_locations(lines):
    """Map every accession to the subcellular locations of its entry.

    ``lines`` is any iterable of flat-file lines.  Returns a
    ``defaultdict(list)`` keyed by accession; an accession whose entry has no
    location and that has not been seen before gets ``["NoData"]``.

    Differences from the previous inline loop:
    * accessions are accumulated over all AC lines of an entry instead of
      being reset by each AC line;
    * the location text is parsed when the entry ends ("//"), so a block that
      is not followed by another "-!-" topic is no longer dropped;
    * each SUBCELLULAR LOCATION block is parsed separately, so a "Note=" in
      one block cannot swallow the following block;
    * empty names are discarded and the order is first-seen (reproducible).
    """
    proteins = defaultdict(list)
    acs = []
    blocks = []
    in_sl = False
    for line in lines:
        if line.startswith("AC"):
            # An entry can carry several accessions, possibly on several lines.
            acs.extend(a.replace(";", "") for a in line.rstrip().split()[1:])
        elif line.startswith(SL_PREFIX):
            in_sl = True
            blocks.append([line.rstrip().replace(SL_PREFIX + " ", "")])
        elif in_sl and line.startswith(CC_CONTINUATION):
            blocks[-1].append(line.rstrip().replace(CC_CONTINUATION, ""))
        elif line.startswith("//"):
            locations = []
            for block in blocks:
                for name in _extract_locations(block):
                    if name not in locations:
                        locations.append(name)
            for ac in acs:
                if locations:
                    proteins[ac].extend(locations)
                elif ac not in proteins:
                    proteins[ac].append("NoData")
            # Start the next entry from a clean state.
            acs = []
            blocks = []
            in_sl = False
        else:
            # Any other line ends the current SUBCELLULAR LOCATION block.
            in_sl = False
    return proteins


# __name__ is "__main__" both in a Jupyter kernel and when run as a script;
# the guard only keeps the helpers importable without touching data files.
if __name__ == "__main__":
    with open("../uniprot_sprot_human.dat") as data:
        proteins = parse_subcellular_locations(data)
    # Write the result as JSON.
    with open('../ac2locs.json', 'w') as f:
        json.dump(proteins, f, indent=4)

In [None]:
import json
from collections import defaultdict

compartment = ["Nucleus","Mitochondrion","Cytoplasm","Membrane","Secreted","Endoplasmic reticulum","Golgi","Lysosome","Others","NoData"]


def classify_locations(subcels, compartments=None):
    """Reduce raw location strings to the coarse compartment labels.

    A compartment is selected when its name occurs (case-insensitively) as a
    substring of any string in ``subcels``.  When nothing matches, "Others"
    is used.  "All" is always appended last.  Labels are returned in the
    order of ``compartments`` so the output is reproducible.

    ``compartments`` defaults to the module-level ``compartment`` list.
    """
    if compartments is None:
        compartments = compartment
    lowered = [s.lower() for s in subcels]
    labels = [c for c in compartments if any(c.lower() in s for s in lowered)]
    # "All" must be added only after this check; adding it earlier made the
    # "Others" fallback unreachable.
    if not labels:
        labels.append("Others")
    labels.append("All")
    return labels


# __name__ is "__main__" both in a Jupyter kernel and when run as a script;
# the guard only keeps the helper importable without touching data files.
if __name__ == "__main__":
    with open('../ac2locs.json', 'r') as json_open:
        ac2sub = json.load(json_open)

    proteins = defaultdict(list)
    for ac, subcels in ac2sub.items():
        proteins[ac] = classify_locations(subcels)

    with open('../ac2locs_classified.json', 'w') as f:
        json.dump(proteins, f, indent=4)

In [None]:
import json
from collections import defaultdict

# Priority rules: the first rule whose required compartments are all present
# decides the class.  Order matters.
_CLASS_RULES = (
    ({"Nucleus", "Cytoplasm"}, "Cytoplasm,Nucleus"),
    ({"Nucleus"}, "Nucleus"),
    ({"Mitochondrion", "Cytoplasm"}, "Cytoplasm,Mitochondrion"),
    ({"Mitochondrion"}, "Mitochondrion"),
    ({"Endoplasmic reticulum", "Golgi"}, "ER,Golgi"),
    ({"Lysosome"}, "Lysosome"),
    ({"Membrane"}, "Membrane"),
    ({"Cytoplasm"}, "Cytoplasm"),
    # Golgi alone or ER alone are both reported as the combined class.
    ({"Golgi"}, "ER,Golgi"),
    ({"Endoplasmic reticulum"}, "ER,Golgi"),
    ({"NoData"}, "NoData"),
)


def assign_location_class(subcels):
    """Return the multi-localisation class labels for one protein.

    ``subcels`` is the list of compartment labels of the protein.
    "Secreted" is reported independently of the other rules; after that the
    first matching entry of ``_CLASS_RULES`` is added.  When no rule matches
    and "Secreted" was not present either, "Others" is used.  "All" is always
    appended last.
    """
    present = set(subcels)
    labels = []
    if "Secreted" in present:
        labels.append("Secreted")
    for required, label in _CLASS_RULES:
        if required <= present:
            labels.append(label)
            break
    else:
        if not labels:
            labels.append("Others")
    labels.append("All")
    return labels


# __name__ is "__main__" both in a Jupyter kernel and when run as a script;
# the guard only keeps the helper importable without touching data files.
if __name__ == "__main__":
    with open('../ac2locs_classified.json', 'r') as json_open:
        ac2sub = json.load(json_open)

    proteins = {}
    for ac, subcels in ac2sub.items():
        proteins[ac] = assign_location_class(subcels)

    with open('../ac2locs_classified_multi.json', 'w') as f:
        json.dump(proteins, f, indent=4)

In [None]:
import json
from collections import defaultdict
import re

# Evidence tags such as "{ECO:0000269|PubMed:123}".  The match stops at the
# closing brace so that text following the tag on the same line is kept.
pattern_del = r"\s*\{ECO:[^}]*\}"
repatter_del = re.compile(pattern_del)

# GN tokens whose values are collected as gene identifiers.
_GN_KEYS = ("Name", "Synonyms", "ORFNames")


def _gene_ids_from_gn(gn_text):
    """Return the gene identifiers found in the joined GN text of one entry.

    ``gn_text`` is the content of all GN lines of the entry joined by spaces.
    Values of Name=, Synonyms= and ORFNames= are split on "," and "/" and
    returned in order of appearance.
    """
    cleaned = repatter_del.sub("", gn_text)
    ids = []
    for token in cleaned.split(";"):
        key, sep, value = token.strip().partition("=")
        if not sep or key not in _GN_KEYS:
            continue
        # Slash-separated names are treated like comma-separated ones.
        for name in value.replace("/", ",").split(","):
            name = name.strip()
            if name:
                ids.append(name)
    return ids


def build_gene2ac(lines):
    """Map upper-cased gene identifiers to the first accession of their entry.

    ``lines`` is any iterable of flat-file lines.  All GN lines of an entry
    are joined before parsing, so wrapped GN lines and several genes
    separated by a "GN   and" line are all collected.  When an identifier
    occurs in more than one entry, the first entry wins.
    """
    mapping = {}
    acs = []
    gn_parts = []
    for line in lines:
        if line.startswith("AC"):
            acs.extend(b.replace(";", "") for b in line.rstrip().split()[1:])
        elif line.startswith("GN   "):
            content = line[5:].strip()
            # "and" only separates two genes of the same entry.
            if content != "and":
                gn_parts.append(content)
        elif line.startswith("//"):
            if acs:
                for gene_id in _gene_ids_from_gn(" ".join(gn_parts)):
                    mapping.setdefault(gene_id.upper(), acs[0])
            # Start the next entry from a clean state.
            acs = []
            gn_parts = []
    return mapping


gene2ac = defaultdict(list)

# __name__ is "__main__" both in a Jupyter kernel and when run as a script;
# the guard only keeps the helpers importable without touching data files.
if __name__ == "__main__":
    with open("../uniprot_human.dat") as data:
        gene2ac.update(build_gene2ac(data))

    with open('../gene2ac.json', 'w') as f:
        json.dump(gene2ac, f, indent=4)

In [None]:
from collections import defaultdict

# Invert gene2ac (gene name -> accession) into accession -> gene name.
# Each accession keeps a single gene name: when several names point to the
# same accession, the one that comes last in gene2ac overwrites the others.
ac2gene = defaultdict(list)
ac2gene.update((accession, gene) for gene, accession in gene2ac.items())

# Write the inverted mapping as JSON.
with open('../ac2gene.json', 'w') as f:
    json.dump(ac2gene, f, indent=4)