In [1]:
import pandas as pd
from os.path import join as path_join

In [2]:
# Load the three input tables used for the NSC-number coverage comparison.
GI50 = pd.read_csv(path_join("data", "GI50.csv"))

# The chem-names dump is pipe-delimited and has no header row, so the column
# names are supplied here. `sep` must be passed by keyword: pandas 2.0 made
# every read_csv argument after the path keyword-only, and the former
# positional "|" raises a TypeError there.
chem_names = pd.read_csv(
    path_join("data", "chemnames_Aug2013.txt"),
    sep="|",
    header=None,
    names=["NSC", "Name", "Name Type"],
)
pubchem_chem_names = pd.read_csv(path_join("nsc_from_pubchem", "results", "nsc2synonym.csv"))

# Report (rows, columns) of each table as a quick sanity check on the load.
print(f"GI50 has shape {GI50.shape}")
print(f"Chem names has shape {chem_names.shape}")
print(f"PubChem NSC {pubchem_chem_names.shape}")

GI50 has shape (4547757, 14)
Chem names has shape (251887, 3)
PubChem NSC (3729858, 3)


In [3]:
# Distinct NSC numbers per source, in order of first appearance in each table.
all_gi50_nsc, all_chem_names_nsc, all_pubchem_nsc = (
    table["NSC"].unique()
    for table in (GI50, chem_names, pubchem_chem_names)
)

In [4]:
# One line per source: how many distinct NSC numbers it contains.
print(
    f"GI50 has {all_gi50_nsc.size} unique NSC number",
    f"Chem names 2013 has {all_chem_names_nsc.size} unique NSC number",
    f"Pubchem has {all_pubchem_nsc.size} unique NSC number",
    sep="\n",
)

GI50 has 56220 unique NSC number
Chem names 2013 has 67534 unique NSC number
Pubchem has 296385 unique NSC number


Comparing the NSC numbers in the chem names file with those in GI50 gives 13,159 NSC numbers that appear in both. Out of the 56,220 NSC numbers in GI50, this means 76.6% (43,061) have no name mapping, so the experiments for those compounds cannot be used.

In [5]:
# Build the GI50 set once and reuse it for both set differences.
gi50_nsc = set(all_gi50_nsc)

# NSC numbers present in GI50 but absent from the chem names file.
only_gi50 = list(gi50_nsc - set(all_chem_names_nsc))
print(f"GI50 has {len(only_gi50)} number not in chem names")

# Everything in GI50 that is not "GI50 only" is shared with chem names.
gi50_chem_names = list(gi50_nsc - set(only_gi50))
print(f"A total of {len(gi50_chem_names)} NSC numbers are in both")

GI50 has 43061 number not in chem names
A total of 13159 NSC numbers are in both


But comparing against the PubChem NSC numbers instead, only 2.3% of the GI50 NSC numbers are missing.

In [6]:
# Build the GI50 set once and reuse it for both set differences.
gi50_nsc = set(all_gi50_nsc)

# NSC numbers present in GI50 but absent from the PubChem synonym table.
only_gi50 = list(gi50_nsc - set(all_pubchem_nsc))
print(f"GI50 has {len(only_gi50)} number not in pubchem")

# Despite the variable name, this holds the GI50 numbers also found in PubChem.
gi50_chem_names = list(gi50_nsc - set(only_gi50))
print(f"A total of {len(gi50_chem_names)} NSC numbers are in both")

GI50 has 1269 number not in pubchem
A total of 54951 NSC numbers are in both


Combining the two lists reduces the missing share even further, to 1.3%.

In [7]:
# Build the GI50 set once and reuse it for both set differences.
gi50_nsc = set(all_gi50_nsc)

# NSC numbers in GI50 that appear in neither the chem names file nor PubChem.
only_gi50 = list(gi50_nsc - set(all_chem_names_nsc) - set(all_pubchem_nsc))
print(f"GI50 has {len(only_gi50)} number not in chem names or pubchem")

# The remainder is covered by at least one of the two sources; "in both" in
# the message therefore means "in GI50 and in chem names and/or PubChem".
gi50_chem_names = list(gi50_nsc - set(only_gi50))
print(f"A total of {len(gi50_chem_names)} NSC numbers are in both")

GI50 has 736 number not in chem names or pubchem
A total of 55484 NSC numbers are in both


In [8]:
# Peek at the first ten unmatched NSC numbers. The list was built from a set,
# so the order shown is arbitrary rather than sorted.
only_gi50[0:10]

[811013,
 811014,
 811015,
 114696,
 782346,
 378895,
 378896,
 641053,
 641054,
 641055]