# Notebook walking through constructing a Dataset with DatasetBuilder

In [1]:
import sys
import pandas as pd
sys.path.append("..")
import matplotlib.pyplot as plt
from statistics import median, mean, stdev


from chemspace.Dataset.DatasetBuilder import DatasetBuilder

  from .autonotebook import tqdm as notebook_tqdm


# Step 1: Instantiate a DatasetBuilder object

The `DatasetBuilder` class can be instantiated in one of three ways:
1. From a `.csv` file containing CIDs as the indices of the file (Recommended)
2. From a previously constructed DataFrame that has CIDs of interest as the indices
3. From a Chemical Structures Records `.json.gz` file downloaded from PubChem's search function
    - Not Recommended: if using this method, CIDs must be added from other sources as well for a complete dataset

Add CIDs that were missing from the shengchao group

Add CIDs from S2R corpus

### Method 1: 
Instantiate a DatasetBuilder object from a `.csv` or `.csv.gz` file containing CIDs as the indices of the file

In [2]:
# Build the DatasetBuilder from the previously generated CSV of CIDs.
cid_csv_path = '../chemspace/Dataset/Data/CIDs.csv'
DB = DatasetBuilder(compound_file_path=cid_csv_path)

# Only the final expression of a cell is rendered, so `DB.dataset` is what gets
# displayed; the bare `DB.CIDs` access before it produces no output.
DB.CIDs
DB.dataset

Unnamed: 0,CID
0,1
1,3
2,4
3,5
4,6
...,...
379755,168265162
379756,168265381
379757,168265990
379758,168265998


### Method 2: 
Instantiate a DatasetBuilder object from a previously constructed DataFrame that has CIDs of interest as the indices

In [None]:
# Read the previously saved CID table into a DataFrame.
cid_csv_path = '../chemspace/Dataset/Data/CIDs.csv'
df = pd.read_csv(cid_csv_path)

# Hand the in-memory DataFrame to the builder instead of a file path.
DB = DatasetBuilder(compound_df=df)

# Show the CIDs held by the builder.
DB.CIDs

### Method 3: 
Instantiate a DatasetBuilder object from a Chemical Structures Records `.json.gz` file downloaded from PubChem's search function

In [None]:
# Build the DatasetBuilder from the records JSON file downloaded from PubChem.
pubchem_records_path = '../chemspace/Dataset/Data/PubChem_compound_list_records.json.gz'
DB = DatasetBuilder(compound_file_path=pubchem_records_path)

# Write the CIDs out as a CSV, leaving out the index column.
cid_csv_path = '../chemspace/Dataset/Data/CIDs.csv'
DB.CIDs.to_csv(cid_csv_path, index=False)

# Show the CIDs held by the builder.
DB.CIDs

In [None]:
# CID-to-SMILES table; must be downloaded separately from HuggingFace (requires access).
sc = pd.read_csv('../chemspace/Dataset/Data/CID2SMILES-shengchao.csv')
w = pd.read_csv('../chemspace/Dataset/Data/original_CIDs.csv')

# Keep the CIDs from `sc` that do not occur in `w`, save them, and report how many there are.
already_present = sc['CID'].isin(w['CID'])
missing = sc.loc[~already_present, 'CID']
missing.to_csv('../chemspace/Dataset/Data/missing_CIDs.csv', index=False)
print(len(missing))

In [None]:
# Outer-merge the two CID sources on the shared `CID` column, sorted by key.
DB.CIDs = pd.merge(
    w,
    sc['CID'],
    how='outer',
    left_on='CID',
    right_on='CID',
    sort=True,
)

# Sanity check: prints True when the merged table has exactly one extra row per missing CID.
expected_row_count = len(w) + len(missing)
print(len(DB.CIDs) == expected_row_count)

# Overwrite the CID CSV with the merged table (index column omitted).
DB.CIDs.to_csv('../chemspace/Dataset/Data/CIDs.csv', index=False)

In [None]:
# Requires the file output from github.com/whitead/chem-matcher.
DB.add_s2r_text()

# The S2R text CSV must be downloaded separately from HuggingFace (requires access).
s2r_text_path = '../chemspace/Dataset/Data/s2rtext.csv'
sr = pd.read_csv(s2r_text_path)

# Number of rows in the S2R text table.
len(sr)

In [None]:
# Merge the CIDs found in the S2R text table into the builder's CID table and
# report how many rows the outer merge added.
org_len = len(DB.CIDs)
DB.CIDs = pd.merge(DB.CIDs, sr['CID'], left_on='CID', right_on='CID', how='outer', sort=True)
print(len(sr))
print(len(DB.CIDs) - org_len)

# The directory is spelled `chemspace` (lowercase) in every other path in this
# notebook and in the package import; a capitalised `Chemspace` only resolves
# on case-insensitive filesystems.
DB.CIDs.to_csv('../chemspace/Dataset/Data/CIDs.csv', index=False)

# Step 2: Add Data

Data can be added to the dataset in any order  
The cells below add different kinds of data to the dataset

### Data type 1: SMILES 
Add SMILES data to Dataset by using the `add_SMILES()` method

In [4]:
# Add SMILES data to the dataset.
DB.add_SMILES()

# Persist the updated dataset as a CSV (index column omitted).
dataset_csv_path = '../chemspace/Dataset/Data/Dataset.csv'
DB.dataset.to_csv(dataset_csv_path, index=False)

0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115


[14:09:05] Explicit valence for atom # 1 Cl, 7, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Br, 3, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Br, 5, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Cl, 3, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:09:05] Explicit valence for atom # 3 Si, 8, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Cl, 5, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:09:05] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:09:05] Explicit valence for atom # 11 Br, 3, is greater than permitted
[14:09:05] Explicit valence for atom 

### Data type 2: PubChem Textual Descriptions
Add PubChem Text to Dataset by using the `add_pubchem_text()` method

In [None]:
# Add the PubChem textual descriptions to the dataset.
DB.add_pubchem_text()

# Persist the updated dataset as a CSV (index column omitted).
dataset_csv_path = '../chemspace/Dataset/Data/Dataset.csv'
DB.dataset.to_csv(dataset_csv_path, index=False)

### Data type 3: List of possible synonyms

Add a list of synonyms extracted from PubChem by using the `add_synonyms()` method

In [2]:
# Re-create the builder from the dataset CSV that the earlier "Add Data" cells write to.
DB = DatasetBuilder('../chemspace/Dataset/Data/Dataset.csv')

In [3]:
# Add the synonym lists extracted from PubChem to the dataset.
DB.add_synonyms()

# Persist the updated dataset as a CSV (index column omitted).
dataset_csv_path = '../chemspace/Dataset/Data/Dataset.csv'
DB.dataset.to_csv(dataset_csv_path, index=False)

In [4]:
# Display the dataset currently held by the builder.
DB.dataset

Unnamed: 0,CID,SMILES,NumAtoms,Synonyms,Number_of_Synonyms
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,14.0,(+/-)-acetylcarnitine; (3-CARBOXY-2-HYDROXYPRO...,38.0
1,3,C1=CC(C(C(=C1)C(=O)O)O)O,11.0,"100459-00-5; 2,3-dihydro-2,3-dihydroxybenzoic ...",9.0
2,4,CC(CN)O,5.0,(+-)-1-amino-2-propanol; (+/-)-1-Amino-2-propa...,122.0
3,5,C(C(=O)COP(=O)(O)O)N,10.0,(3-amino-2-oxopropyl) dihydrogen phosphate; 1-...,11.0
4,6,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,13.0,"1 - Chloro - 2,4 - dinitrobenzene; 1,3-Dinitro...",101.0
...,...,...,...,...,...
58080574,168268190,,,tris-[(1-methyl-2-ethyl-3-hydroxy-4(1H)-pyridi...,1.0
58080575,168268197,,,(7~{S})-2'-azanyl-3-[2-[(2~{S})-2-methylpipera...,2.0
58080576,168268199,,,2-chloro-1-[(5R)-3-phenyl-5-(quinoxalin-5-yl)-...,2.0
58080577,168268200,,,"2-chloro-1-[(4R,5R)-3,4,5-triphenyl-4,5-dihydr...",2.0


### Data type 4: S2ORC Textual Descriptions

> Work in progress

In [None]:
# Run the builder's S2R text step (see the work-in-progress caveat on the call itself).
DB.add_s2r_text() # Work in progress, incomplete. Just generates a csv of concatenated text for now

# Step 3: Clean the Dataset

Remove the rows that have no description from the dataset by calling the `clean_dataset()` method

In [None]:
# Report the dataset size on either side of the cleaning step.
rows_before = len(DB.dataset)
print(f"{rows_before} rows in dataset before cleaning")

DB.clean_dataset()

rows_after = len(DB.dataset)
print(f"{rows_after} rows in dataset after cleaning")

# Dataset Metrics

Number of compounds represented

In [None]:
def _count_space_separated_tokens(text):
    """Return how many pieces `text` splits into on single spaces; 0 for non-strings.

    Splitting on a literal ' ' means an empty string yields one piece and runs of
    consecutive spaces yield empty pieces, all of which are counted.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split(' '))


# Store the per-row token count of the `AllText` column as `TextLength`.
DB.dataset['TextLength'] = DB.dataset['AllText'].apply(_count_space_separated_tokens)

In [None]:
# CSV to stream from; column names are supplied explicitly and only `CID` is kept.
path = "../chemspace/Dataset/Data/out.csv"

# Because `chunksize` is given, `read_csv` returns an iterator over DataFrame
# chunks (up to one million rows each) rather than a single DataFrame.
df = pd.read_csv(
    path,
    names=['Name', 'CID', 'Description', 'PaperID'],
    usecols=['CID'],
    chunksize=1_000_000,
)

Metrics for length of text descriptions gathered

In [None]:
text_lengths = DB.dataset['TextLength']

# Summary statistics of the description lengths: max, min, median, mean.
print(max(text_lengths))
print(min(text_lengths))
print(median(text_lengths))
print(mean(text_lengths))

# Number of rows whose description length exceeds each threshold.
for threshold in (5, 10, 15, 20, 30):
    print(f"Count over {threshold} {sum(text_lengths > threshold)}")

In [None]:
# Select the rows whose TextLength is exactly 1357, then print the CID stored
# under index 540 of that selection.
# NOTE(review): both 1357 and 540 are hard-coded — confirm they still match the
# current dataset before relying on this cell.
rows_with_1357_words = DB.dataset.loc[DB.dataset['TextLength'] == 1357]
cids_with_1357_words = rows_with_1357_words['CID']
print(cids_with_1357_words[540])

In [None]:
# Fraction of dataset rows whose TextLength is greater than 30.
rows_over_30_words = sum(DB.dataset['TextLength'] > 30)
total_rows = len(DB.dataset)
rows_over_30_words / total_rows

In [None]:
# Histogram of description lengths using 5-wide bins with edges from 0 to 195.
histogram = plt.hist(DB.dataset['TextLength'], bins=range(0, 200, 5))

# Axis label and title spelling corrected ("Compounds", "Description").
plt.xlabel('Description Word Count')
plt.ylabel('Number of Compounds')
plt.title('Distribution of Description Lengths')

Metrics for complexity of compounds represented: number of atoms

In [None]:
# Drop missing atom counts once and use the NaN-free series for every statistic.
# The original only did this for `stdev`; `statistics.median` sorts its input and
# NaN values make that ordering (and therefore the result) unreliable, and the
# built-in `max`/`min` comparisons are likewise order-dependent when NaN is present.
num_atoms = DB.dataset['NumAtoms'].dropna()

# Histogram of atom counts using 5-wide bins with edges from 0 to 295.
histogram = plt.hist(num_atoms, bins=range(0, 300, 5))
plt.xlabel('Number of atoms in compound')
plt.ylabel('Count of Compounds')
plt.title('Distribution of Compound Number of Atoms')

# Summary statistics of the atom counts: max, min, median, sample standard deviation.
print(max(num_atoms))
print(min(num_atoms))
print(median(num_atoms))
print(stdev(num_atoms))

# Number of compounds whose atom count exceeds each threshold. NaN never
# compared greater than a threshold, so these counts are unchanged by dropna().
for threshold in (15, 30, 45, 60, 75):
    print(f"Count over {threshold} {(num_atoms > threshold).sum()}")

In [None]:
# Exclusive lower bounds used by the filter below.
# NOTE(review): `min_unique_atoms` is compared against the `NumAtoms` column —
# confirm whether "unique" is intended.
min_text_words = 10
min_unique_atoms = 2

# Count the rows that exceed both bounds.
has_enough_text = DB.dataset['TextLength'] > min_text_words
has_enough_atoms = DB.dataset['NumAtoms'] > min_unique_atoms
len(DB.dataset.loc[has_enough_text & has_enough_atoms])