# Notebook walking through constructing a Dataset with DatasetBuilder

In [None]:
import sys
import pandas as pd
sys.path.append("..")
import matplotlib.pyplot as plt
from statistics import median, mean, stdev


from chemspace.Dataset.DatasetBuilder import DatasetBuilder

# Step 1: Instantiate a DatasetBuilder object

The `DatasetBuilder` class can be instantiated in one of three ways:
1. From a Chemical Structures Records `.json.gz` file downloaded from PubChem's search function
2. From a `.csv` or `.csv.gz` file containing CIDs as the indices of the file
3. From a previously constructed DataFrame that has CIDs of interest as the indices

### Method 1: 
Instantiate a DatasetBuilder object from a Chemical Structures Records `.json.gz` file downloaded from PubChem's search function

In [None]:
# Build the DatasetBuilder from the PubChem records file (.json.gz)
DB = DatasetBuilder(
    compound_file_path='../chemspace/Dataset/Data/PubChem_compound_list_records.json.gz'
)

# Write the builder's CID table to CSV, leaving out the index column
DB.CIDs.to_csv(
    '../chemspace/Dataset/Data/CIDs.csv',
    index=False,
)

# Final expression of the cell: the notebook renders the CID table
DB.CIDs

Add the CIDs from the Shengchao group's list (`CID2SMILES-shengchao.csv`) that are missing from the original CID list

In [None]:
# Load the Shengchao CSV and the original CID list
sc = pd.read_csv('../chemspace/Dataset/Data/CID2SMILES-shengchao.csv')
w = pd.read_csv('../chemspace/Dataset/Data/original_CIDs.csv')

# Keep the CIDs that appear in `sc` but not in `w`
already_known = sc['CID'].isin(w['CID'])
missing = sc.loc[~already_known, 'CID']

# Save them (no index column) and report how many there are
missing.to_csv('../chemspace/Dataset/Data/missing_CIDs.csv', index=False)
print(len(missing))

In [None]:
# Outer-join the original CID table with the Shengchao CIDs on the shared
# 'CID' column; sort=True orders the result by the join key
DB.CIDs = pd.merge(w, sc['CID'], on='CID', how='outer', sort=True)

# Sanity check: prints True when the merged table has exactly one extra row
# per missing CID
expected_rows = len(w) + len(missing)
print(len(DB.CIDs) == expected_rows)

# Overwrite the saved CID table with the merged result
DB.CIDs.to_csv('../chemspace/Dataset/Data/CIDs.csv', index=False)

Add CIDs from S2R corpus

In [None]:
# Load the S2R text table
sr = pd.read_csv('../chemspace/Dataset/Data/s2rtext.csv')

# Number of rows in the table (rendered as the cell output)
len(sr.index)

In [None]:
# Merge the S2R CIDs into the working CID table (outer join on 'CID',
# sorted by the join key)
org_len = len(DB.CIDs)
DB.CIDs = pd.merge(DB.CIDs, sr['CID'], on='CID', how='outer', sort=True)

# Rows in the S2R table, then rows actually gained by the merge
print(len(sr))
print(len(DB.CIDs) - org_len)

# Save with the lowercase 'chemspace' directory used by every other cell and
# by the package import; the former '../Chemspace/...' spelling resolves to a
# different (usually non-existent) directory on case-sensitive filesystems.
DB.CIDs.to_csv('../chemspace/Dataset/Data/CIDs.csv', index=False)

### Method 2: 
Instantiate a DatasetBuilder object from a `.csv` or `.csv.gz` file containing CIDs as the indices of the file

In [None]:
# Instantiate class from a previously saved, gzip-compressed dataset CSV
DB = DatasetBuilder(compound_file_path='../chemspace/Dataset/Data/Dataset.csv.gz')
# Display dataset
# NOTE(review): a notebook renders only the final expression of a cell, so
# `DB.CIDs` below produces no output here and only `DB.dataset` is shown.
# Confirm whether the CID table was meant to be displayed as well.
DB.CIDs
DB.dataset

### Method 3: 
Instantiate a DatasetBuilder object from a previously constructed DataFrame that has CIDs of interest as the indices

In [None]:
# Read the saved CID table back into a DataFrame
cid_table_path = '../chemspace/Dataset/Data/CIDs.csv'
df = pd.read_csv(cid_table_path)

# Hand the in-memory DataFrame to the builder
DB = DatasetBuilder(compound_df=df)

# Final expression of the cell: the notebook renders the CID table
DB.CIDs

# Step 2: Add Data

Data can be added to the dataset in any order  
The cells below add different kinds of data to the dataset

### Data type 1: SMILES 
Add SMILES data to Dataset by using the `add_SMILES()` method

In [None]:
# Add data
# Adds SMILES data to the dataset through the builder's add_SMILES() method.
DB.add_SMILES()

# Save as csv
# Written without the index column. The PubChem-text cell saves to this same
# path, so each save replaces the previous file.
DB.dataset.to_csv('../chemspace/Dataset/Data/Dataset.csv', index=False)

### Data type 2: PubChem Textual Descriptions
Add PubChem Text to Dataset by using the `add_pubchem_text()` method

In [None]:
# Add data
# Adds PubChem textual descriptions through the builder's add_pubchem_text() method.
DB.add_pubchem_text()

# Save as csv
# Written without the index column. The SMILES cell saves to this same path,
# so each save replaces the previous file.
DB.dataset.to_csv('../chemspace/Dataset/Data/Dataset.csv', index=False)

In [None]:
# NOTE(review): presumably adds text from the S2R corpus to the dataset, by
# analogy with add_pubchem_text(); the method is defined outside this
# notebook, so confirm. Unlike the other "add data" cells, the dataset is not
# saved to CSV afterwards.
DB.add_s2r_text()

# Step 3: Clean the Dataset

Remove the rows that have no description from the dataset by calling the `clean_dataset()` method

In [None]:
# Row count going in
rows_before = len(DB.dataset)
print(f"{rows_before} rows in dataset before cleaning")

# Clean the dataset through the builder
DB.clean_dataset()

# Row count coming out
rows_after = len(DB.dataset)
print(f"{rows_after} rows in dataset after cleaning")

# Dataset Metrics

Number of compounds represented

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated words in *text*.

    Non-string values (for example NaN for a missing description) count as 0.
    """
    # str.split() with no argument splits on any run of whitespace and drops
    # empty strings, so "" -> 0 and "a  b\nc" -> 3. The previous split(' ')
    # reported 1 word for an empty string, counted an extra "word" for every
    # repeated space, and did not split on newlines or tabs.
    return len(text.split()) if isinstance(text, str) else 0


# Word count of the 'AllText' column, stored as 'TextLength'.
# NOTE: counts can differ from those produced with split(' '); any cell that
# hard-codes a specific TextLength value should be re-checked.
DB.dataset['TextLength'] = DB.dataset['AllText'].apply(_count_words)

In [None]:
path = "../chemspace/Dataset/Data/out.csv"
# `names` supplies the four column labels and `usecols` keeps only 'CID'.
# NOTE(review): because `chunksize` is set, pd.read_csv returns an iterator of
# DataFrame chunks (up to 10**6 rows each) rather than a DataFrame, so `df`
# holds a reader here and nothing is consumed in this cell. Confirm the
# intended follow-up (e.g. iterating the chunks to count CIDs).
df = pd.read_csv(path, chunksize = 10 ** 6, names=['Name','CID','Description','PaperID'], usecols=['CID'])

Metrics for length of text descriptions gathered

In [None]:
# Bind the column once; every statistic below reads from it
lengths = DB.dataset['TextLength']

# Largest, smallest, median and mean word count, one per line
for summarise in (max, min, median, mean):
    print(summarise(lengths))

# Number of descriptions longer than each threshold
print(f"Count over 5 {sum(lengths > 5)}")
print(f"Count over 10 {sum(lengths > 10)}")
print(f"Count over 15 {sum(lengths > 15)}")
print(f"Count over 20 {sum(lengths > 20)}")
print(f"Count over 30 {sum(lengths > 30)}")

In [None]:
# Print the 'CID' value at index 540 among the rows whose TextLength is 1357.
# NOTE(review): both numbers are hard-coded. Presumably 1357 is the maximum
# length reported by the metrics cell and 540 is that row's index label; the
# lookup fails if no matching row carries that label. Confirm against the
# current data before relying on this cell.
print(DB.dataset.loc[DB.dataset['TextLength']==1357]['CID'][540])

In [None]:
# Fraction of rows whose TextLength exceeds 30
rows_over_30 = sum(DB.dataset['TextLength'] > 30)
total_rows = len(DB.dataset)
rows_over_30 / total_rows

In [None]:
# Histogram of description word counts, in bins of width 5 with edges from 0 to 195
histogram = plt.hist(DB.dataset['TextLength'], bins = range(0,200,5))

# Axis labels and title (spelling corrected: "Compounds", "Description")
plt.xlabel('Description Word Count')
plt.ylabel('Number of Compounds')
plt.title('Distribution of Description Lengths')

Metrics for complexity of compounds represented: number of atoms

In [None]:
# Drop missing atom counts once so the histogram and every statistic below
# see the same data. Previously only stdev() dropped NaN, while builtin
# max()/min() and statistics.median() give order-dependent results when NaN
# values are present.
num_atoms = DB.dataset['NumAtoms'].dropna()

# Histogram of atom counts, in bins of width 5 with edges from 0 to 295
histogram = plt.hist(num_atoms, bins = range(0,300,5))
plt.xlabel('Number of atoms in compound')
plt.ylabel('Count of Compounds')
plt.title('Distribution of Compound Number of Atoms')


# Largest, smallest, median and sample standard deviation of the atom counts
print(max(num_atoms))
print(min(num_atoms))
print(median(num_atoms))
print(stdev(num_atoms))

# Number of compounds above each threshold (NaN never compared greater, so
# these counts are unchanged by dropping NaN)
print(f"Count over 15 {sum(num_atoms > 15)}")
print(f"Count over 30 {sum(num_atoms > 30)}")
print(f"Count over 45 {sum(num_atoms > 45)}")
print(f"Count over 60 {sum(num_atoms > 60)}")
print(f"Count over 75 {sum(num_atoms > 75)}")

In [None]:
# Thresholds: rows must strictly exceed both to be counted
min_text_words = 10
min_unique_atoms = 2

# One boolean mask per criterion
long_enough = DB.dataset['TextLength'] > min_text_words
enough_atoms = DB.dataset['NumAtoms'] > min_unique_atoms

# Number of rows satisfying both criteria (rendered as the cell output)
len(DB.dataset.loc[long_enough & enough_atoms])