In [1]:
import json

In [2]:
# Read the cache-building log and tally the problems it reports.
cache_log_path = "mmcif_cache_v3.log"
with open(cache_log_path, "r") as log_file:
    cache_log = log_file.read().splitlines()

# Lines that mention a parse failure.
error_num = sum(1 for entry in cache_log if "Failed to parse" in entry)
print(f"Number of failed to parse errors: {error_num}")

# Lines that mention "Hybrid"; counted over the whole log, independently of
# the parse-failure test above.
hybrid_num = sum(1 for entry in cache_log if "Hybrid" in entry)
print(f"Number of hybrid errors: {hybrid_num}")

# NOTE(review): this subtraction assumes every "Hybrid" line is also a
# "Failed to parse" line — confirm against the log format.
other_error_num = error_num - hybrid_num
print(f"Number of other errors: {other_error_num}")

Number of failed to parse errors: 625
Number of hybrid errors: 595
Number of other errors: 30


In [3]:
def deduplicate_dict(input_dict):
    """Return a copy of ``input_dict`` without repeated values.

    Entries are visited in the dictionary's iteration order. The first key
    seen for each distinct value is kept; later keys that map to a value
    already seen are dropped. Values are stored in a set, so they must be
    hashable.

    Args:
        input_dict: Mapping to deduplicate. It is not modified.

    Returns:
        A new dict holding the first key/value pair for every distinct value.
    """
    unique_entries = {}
    already_seen = set()

    for key in input_dict:
        value = input_dict[key]
        if value in already_seen:
            # A previous key already carries this value; skip the repeat.
            continue
        already_seen.add(value)
        unique_entries[key] = value

    return unique_entries

In [4]:
# Load the parsed-structure cache (a JSON object keyed by file id).
cache_path = "mmcif_cache_v3.json"
with open(cache_path, "r") as cache_file:
    cache = json.load(cache_file)

# Total = files that failed to parse (counted from the log in an earlier
# cell) plus the files that made it into the cache.
print(f"Number of total mmcif files: {error_num + len(cache)}")
print(f"Number of parsed mmcif files: {len(cache)}")

Number of total mmcif files: 223532
Number of parsed mmcif files: 222907


In [5]:
# has rna
# Keep only the cache entries that report at least one RNA chain.
new_cache = {
    file_id: info
    for file_id, info in cache.items()
    if info["no_chains"]["rna"] > 0
}
print(f"Number of mmcif files with RNA: {len(new_cache)}")

Number of mmcif files with RNA: 7744


In [6]:
# resolution <= 4.5
# NOTE(review): a stored resolution of 0.0 also satisfies this test; the
# sample cache record for a solution-NMR structure stores 0.0 — confirm
# that keeping such entries is intended.
tmp = {
    file_id: info
    for file_id, info in new_cache.items()
    if info["header"]["resolution"] <= 4.5
}
new_cache = tmp
print(f"Number of mmcif files with resolution <= 4.5: {len(new_cache)}")

Number of mmcif files with resolution <= 4.5: 7211


In [7]:
# release date < 2021-01-01
# The dates are compared as plain strings.
# NOTE(review): this assumes every release_date is a zero-padded
# "YYYY-MM-DD" string (like the sample record's '1998-10-07'), so that
# string order matches chronological order — confirm.
tmp = {
    file_id: info
    for file_id, info in new_cache.items()
    if info["header"]["release_date"] < "2021-01-01"
}
new_cache = tmp
print(f"Number of mmcif files with release date before 2021-01-01: {len(new_cache)}")

Number of mmcif files with release date before 2021-01-01: 4725


In [8]:
# extract rna chains
# Flatten the per-file RNA chains into one dict keyed "<file_id>_<chain_id>".
# Within each file, deduplicate_dict keeps only the first chain of every
# group of identical sequences.
rna_chains = {
    f"{file_id}_{chain_key}": sequence
    for file_id, info in new_cache.items()
    for chain_key, sequence in deduplicate_dict(info["rna"]).items()
}
print(f"Number of RNA chains: {len(rna_chains)}")

Number of RNA chains: 8575


In [9]:
# length >= 15
# Drop chains shorter than 15 residues and report how many were removed.
tmp = {
    chain_id: seq
    for chain_id, seq in rna_chains.items()
    if len(seq) >= 15
}
print(f"Number of RNA chains with length >= 15: {len(tmp)}, remove {len(rna_chains) - len(tmp)}")
rna_chains = tmp

Number of RNA chains with length >= 15: 6468, remove 2107


In [10]:
# unk to X
# Rewrite every sequence in place so that any character other than
# A, U, C or G becomes 'X'. Only existing keys are reassigned, so the
# dictionary keeps its size while it is being iterated.
for chain_id in rna_chains:
    rna_chains[chain_id] = ''.join(
        base if base in 'AUCG' else 'X'
        for base in rna_chains[chain_id]
    )

In [11]:
# < 90% identical
# Keep only chains in which the most frequent symbol makes up less than 90%
# of the sequence, and report how many chains were removed.
tmp = {}
for chain_id, seq in rna_chains.items():
    # Count each distinct symbol once. Using seq.count as the key of
    # max(seq, ...) would rescan the sequence for every position, which is
    # quadratic in the chain length; the resulting proportion is the same.
    top_count = max(seq.count(symbol) for symbol in set(seq))
    prop = top_count / len(seq)
    if prop < 0.9:
        tmp[chain_id] = seq
print(f"Number of RNA chains with <90% identical nucleotides: {len(tmp)}, remove {len(rna_chains) - len(tmp)}")
rna_chains = tmp

Number of RNA chains with <90% identical nucleotides: 6420, remove 48


In [12]:
# < 5% X
# Keep only chains whose fraction of 'X' symbols is below 5%, and report
# how many chains were removed.
tmp = {
    chain_id: seq
    for chain_id, seq in rna_chains.items()
    if seq.count("X") / len(seq) < 0.05
}
print(f"Number of RNA chains with <5% X: {len(tmp)}, remove {len(rna_chains) - len(tmp)}")
rna_chains = tmp

Number of RNA chains with <5% X: 6341, remove 79


In [13]:
# Preview only the first few chains: displaying the whole dictionary would
# dump every remaining sequence (thousands of entries) into the notebook
# output.
dict(list(rna_chains.items())[:5])

{'6sqq_D': 'AAUCCAUUGCACUCCGGAUUU',
 '6g51_l': 'UACCUGGUUGAUCCUGCCAGUAGCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUACGCACGGCCGGUACAGUGAAACUGCGAAUGGCUCAUUAAAUCAGUUAUGGUUCCUUUGGUCGCUCGCUCCUCUCCUACUUGGAUAACUGUGGUAAUUCUAGAGCUAAUACAUGCCGACGGGCGCUGACCCCCUUCGCGGGGGGGAUGCGUGCAUUUAUCAGAUCAAAACCAACCCGGUCAGCCCCUCUCCGGCCCCGGCCGGGGGGCGGGCGCCGGCGGCUUUGGUGACUCUAGAUAACCUCGGGCCGAUCGCACGCCCCCCGUGGCGGCGACGACCCAUUCGAACGUCUGCCCUAUCAACUUUCGAUGGUAGUCGCCGUGCCUACCAUGGUGACCACGGGUGACGGGGAAUCAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGGCUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCACUCCCGACCCGGGGAGGUAGUGACGAAAAAUAACAAUACAGGACUCUUUCGAGGCCCUGUAAUUGGAAUGAGUCCACUUUAAAUCCUUUAACGAGGAUCCAUUGGAGGGCAAGUCUGGUGCCAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUAAAGUUGCUGCAGUUAAAAAGCUCGUAGUUGGAUCUUGGGAGCGGGCGGGCGGUCCGCCGCGAGGCGAGCCACCGCCCGUCCCCGCCCCUUGCCUCUCGGCGCCCCCUCGAUGCUCUUAGCUGAGUGUCCCGCGGGGCCCGAAGCGUUUACUUUGAAAAAAUUAGAGUGUUCAAAGCAGGCCCGAGCCGCCUGGAUACCGCAGCUAGGAAUAAUGGAAUAGGACCGCGGUUCUAUUUUGUUGGUUUUCGGAACUGAGGCCAUGAUUAAGAGGGACGGCCGGGGGCAUUCGUAUUGCGCCG

In [4]:
# Show the record stored for the first file id in the cache, as an example
# of the per-file layout.
first_file_id = list(cache.keys())[0]
cache[first_file_id]

{'header': {'structure_method': 'solution nmr',
  'release_date': '1998-10-07',
  'resolution': 0.0},
 'protein': {},
 'rna': {},
 'dna': {'A': 'CGCATXGTTACC', 'B': 'GGTAACAATGCG'},
 'mmcif_to_custom_mapping': {'A': 'A', 'B': 'B'},
 'no_chains': {'protein': 0, 'rna': 0, 'dna': 2}}