### The purpose of this notebook is to complete a data cleaning workflow from start to finish in order to validate the core functionality of our package

In [None]:
# imports

import pandas as pd
from core import *
from create_cpd_info import *
from mol_sim import *
# import gzip
# 
# from Bio.KEGG import REST
# from Bio.KEGG import Enzyme
# import re
# from Bio.KEGG import Compound
# import numpy as np

In [None]:
# Debugging aid: list the current working directory; the relative '../datasets/...' paths used below resolve against it.
!ls

## Step 1
#### Generate dataframe of all current kegg enzymes from zipped text file

In [None]:
# Step 1: build the master enzyme dataframe with create_kegg_df().

# Gzipped KEGG enzyme dump, located relative to the notebook's working directory.
kegg_enzyme_archive = '../datasets/KEGG_enzymes_all_data.gz'

enzyme_df = create_kegg_df(kegg_enzyme_archive, 'enzyme')
print(enzyme_df.shape)
enzyme_df.head()

## Step 2
#### Down select promiscuous enzymes from master enzyme dataframe

In [None]:
# select_promiscuous_enzymes()
# Derives prom_df from the full enzyme dataframe (enzyme_df); the shape and the first
# rows are shown as a quick sanity check on the down-selection.

prom_df = select_promiscuous_enzymes(enzyme_df)
print(prom_df.shape)
prom_df.head()

## Step 3
#### Combine substrates and products to account for reversible reactions

In [None]:
# combine_substrates_products()
# Produces combo_df from the promiscuous-enzyme dataframe (prom_df); the shape and the
# first rows are shown as a quick sanity check.
    
combo_df = combine_substrates_products(prom_df)
print(combo_df.shape)
combo_df.head()

## Step 4

#### Expand dataframe so that each row is a unique enzyme-product pair
There are multiple compounds in the 'product' field of the dataframe. This function parses each of those, and for each provides a new enzyme-product pair.

In [None]:
# explode_dataframe()
# parse_compound_ids is passed as a callable together with the 'product' column name and
# the ['entry'] column list. Presumably it extracts the compound ids from each 'product'
# value while 'entry' is carried onto every resulting row — TODO confirm against
# explode_dataframe()'s signature.

exploded_df = explode_dataframe(combo_df, parse_compound_ids, 'product', ['entry'])
print(exploded_df.shape)
exploded_df.head()

## Step 5
#### Remove cofactors from dataframe
We have curated a list of 37 common cofactors and reactant molecules that participate in reactions, but are not products that we care to train our model on. This step removes cofactor data from our master dataset.

In [None]:
# Step 5: strip cofactors out of the exploded dataset with remove_cofactors().

# Curated cofactor list, located relative to the notebook's working directory.
cofactor_list_path = '../datasets/cofactor_list.csv'
cofactors_df = pd.read_csv(cofactor_list_path)

# 'product' names the compound column of exploded_df; 'CPD' is presumably the matching
# id column of the cofactor list — TODO confirm against remove_cofactors().
clean_df = remove_cofactors(exploded_df, 'product', cofactors_df, 'CPD')
print(clean_df.shape)
clean_df.head()

## Step 6
#### Get SMILES strings for each product from the PubChem database
The RDKit chemistry package requires SMILES strings as an input. The KEGG database does not store SMILES strings for compounds. This step joins a previously curated dataset of SMILES strings into our master dataset.

In [None]:
# Step 6: attach SMILES strings to every enzyme-product pair.

# Previously curated SMILES lookup, keeping a single row per distinct SMILES string.
smiles_df = (
    pd.read_csv('../datasets/df_cleaned_kegg_with_smiles.csv')
    .drop_duplicates(subset='SMILES')
)

# Inner-join clean_df['product'] against smiles_df['KEGG'], drop the 'Unnamed: 0',
# 'entry_y', 'KEGG' and 'CID' columns, discard rows whose SMILES is the literal
# placeholder 'none', then rename to the master-dataset column names.
master_df = (
    clean_df
    .merge(smiles_df, how='inner', left_on='product', right_on='KEGG')
    .drop(columns=['Unnamed: 0', 'entry_y', 'KEGG', 'CID'])
    .loc[lambda frame: frame['SMILES'] != 'none']
    .rename(columns={'entry_x': 'kegg_enzyme',
                     'product': 'kegg_compound',
                     'PubChem': 'pubchem_compound'})
    .reset_index(drop=True)
)
print(master_df.shape)
master_df.head()

## Step 7 
#### Get dummy variables to represent enzyme class

In [None]:
# binarize_enzyme_class()
# NOTE(review): master_df is rebound to the function's result, so re-running this cell
# feeds the already-processed dataframe back in; re-run the Step 6 cell first to start
# from a fresh master_df.

master_df = binarize_enzyme_class(master_df, 'kegg_enzyme')
print(master_df.shape)
master_df.head()

## Step 8 

#### Pre-process negative and positive datasets to remove enzymes that are paired with only 1 compound

In [None]:
# NOTE(review): disabled reference copy of remove_single_cpd_rows(). The notebook calls
# remove_single_cpd_rows() below without defining it, so the live implementation presumably
# comes from one of the star imports at the top — TODO confirm, then delete this cell.
# def remove_single_cpd_rows(dataframe, enzyme_col, smiles_col):
#     """
#     remove_single_cpd_rows() is meant to be a pre-processing function prior to passing a dataframe to the
#         calculate_dist() function
        
#     Args:
#         dataframe (pandas.Dataframe): input dataset
#         enzyme_col (str): name for column that contains kegg enzyme ids
#         smiles_col (str): name for column that contains smiles string
    
#     Returns:
#         pandas.Dataframe: output dataframe with rows removed in which there was only one product paired with 
#             the enzyme entry, enzyme_col renamed 'entry', and smiles_col renamed 'SMILES'
#     """
#     dataframe = dataframe.rename(columns={enzyme_col:'entry', smiles_col:'SMILES'})
#     counts_df = dataframe.groupby('entry').count()
#     singles_df = counts_df[counts_df['SMILES'] == 1]
#     singles = singles_df.index.tolist()
#     bool_mask = [False if row['entry'] in singles else True for _, row in dataframe.iterrows()]
#     clean_df = dataframe[bool_mask]
#     return clean_df


In [None]:
# remove_single_cpd_rows()
# NOTE(review): per the commented-out reference copy of this helper kept in this notebook,
# the result has 'kegg_enzyme' renamed to 'entry' and enzymes with a single SMILES row
# removed — TODO confirm the packaged implementation behaves the same way.
# master_df is rebound here, so later cells see the filtered dataframe.

master_df = remove_single_cpd_rows(master_df, 'kegg_enzyme', 'SMILES')
print(master_df.shape)
master_df.head()

In [None]:
# NOTE(review): disabled scratch code — lists the 'entry' values that have exactly one
# non-null 'kegg_compound' value. TODO: delete if no longer needed.
# counts_df = master_df.groupby('entry').count()
# singles_df = counts_df[counts_df['kegg_compound'] == 1]
# singles = singles_df.index.tolist()
# print(singles)

In [None]:
# NOTE(review): disabled scratch code — the only assignment of clean_master_df visible in
# this part of the notebook (rows of master_df whose 'entry' is not in `singles`). Because
# it is commented out, clean_master_df does not exist at runtime.
# bool_mask = [False if row['entry'] in singles else True for _, row in master_df.iterrows()]
# clean_master_df = master_df[bool_mask]
# print(clean_master_df.shape)
# clean_master_df.head()

## Step 9
#### Calculate molecular distances between products of the same enzyme

In [None]:
# calculate_dist()

# Input is master_df as returned by remove_single_cpd_rows() in the previous step.
# The earlier argument, clean_master_df, is only ever assigned inside a commented-out
# scratch cell, so the call raised NameError on a fresh kernel (Restart & Run All).
dist_master_df = calculate_dist(master_df)
print(dist_master_df.shape)
dist_master_df.head()

## Step 10
#### Curate negative dataset
So far our curated dataset includes only examples of enzyme-product pairs that are known to react. In order to train our model, we need to include negative examples of enzyme-product pairs not expected to react. This function artificially pairs enzymes and products that are not known to react, and selects a subsample of these negative pairs to include in the master dataset.

In [None]:
# create_negative_matches() - good to go
# Returns two dataframes, unpacked as pos_df and neg_df.
# NOTE(review): the input is clean_df (the Step 5 output with 'entry'/'product' columns),
# not the SMILES-joined master_df — confirm this is intended.

pos_df, neg_df = create_negative_matches(clean_df, 'entry', 'product')

In [None]:
# Shape of pos_df, the first dataframe returned by create_negative_matches().
pos_df.shape

In [None]:
# Shape of neg_df, the second dataframe returned by create_negative_matches().
neg_df.shape

In [None]:
# Stack the positive and negative dataframes row-wise (pd.concat's default axis is 0).
# NOTE(review): this rebinds master_df, replacing the dataframe built in the earlier steps.

master_df = pd.concat([pos_df, neg_df])

In [None]:
# Shape of the concatenated positive + negative dataframe.
master_df.shape

## Step 11
#### Add in compound features with RDKit
This step uses the RDKit packages to generate descriptive features of the compounds