In [1]:
import wx
import wx.grid
%gui wx
import wx.lib.scrolledpanel as scrolled
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sqlite3 import Error
import os
import chemml
from chemml.datasets import load_cep_homo
from chemml.datasets import load_organic_density
from chemml.datasets import load_xyz_polarizability
from chemml.datasets import load_comp_energy
from chemml.datasets import load_crystal_structures
import csv
from rdkit import Chem
from rdkit.Chem import AllChem,Draw
from chemml.chem import Molecule
import pubchempy as pcp
from chembl_webresource_client.new_client import new_client
from rdkit import Chem,DataStructs
from RAscore import RAscore_NN
import sascorer
import npscorer
from syba.syba import SybaClassifier
import pickle
import random
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler,normalize
from chemml.models import MLP
from chemml.utils import regression_metrics
from matplotlib import ticker
from rdkit.Chem import QED
from rdkit.Chem import Descriptors
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# connect the database file
def create_connection(filename):
    conn = None
    try:
        conn = sqlite3.connect(filename)
    except Error as error:
        print(error)
    return conn

#creat table within the database
def create_table(conn, query):
    try:
        cur = conn.cursor()
        cur.execute(query)
    except Error as error:
        print(error)
#execute sql query
def execute_query(conn, query):
    cur = conn.cursor()
    cur.execute(query)
    rows = cur.fetchall()
    return rows
#insert record into table
def insert_table(conn,query,values):
    cur = conn.cursor()
    cur.execute(query,values)
    return cur.lastrowid

In [3]:
#loading the models for predicting RA, SA, SYBA, and NP scores of the molecules
#This may take extra time to load the models
ra_nn_scorer = RAscore_NN.RAScorerNN()
syba = SybaClassifier()
syba.fitDefaultScore()
model_file = open('publicnp.model','rb')
fs = pickle.load(model_file)




In [4]:
#create the main frame of the GUI
class proj_frame(wx.Frame):
    def __init__(self):
        wx.Frame.__init__(self,parent = None,id = 1, title = 'Chemical Molecules Analyzer')
        self.panel = wx.Panel(self,1)
        self.pnb = wx.Notebook(self.panel)
        data_tab = data_tab_content(self.pnb)
        reaction_tab = reaction_tab_content(self.pnb)
        score_tab = score_tab_content(self.pnb)
        rs_tab = rs_tab_content(self.pnb)
        self.pnb.AddPage(data_tab, "Data")
        self.pnb.AddPage(score_tab,'Score')
        self.pnb.AddPage(reaction_tab, "Reaction")
        self.pnb.AddPage(rs_tab,"Recommender System")
        
        s1 = wx.BoxSizer()
        s1.Add(self.pnb,1,wx.EXPAND)
        self.panel.SetSizer(s1)
        self.Show()

        

        

    
    
    


In [5]:

#create buttons on the panels
#Each button is binded with event handler
class data_tab_content(scrolled.ScrolledPanel):
    def __init__(self,parent):
        scrolled.ScrolledPanel.__init__(self,parent)
        s = wx.BoxSizer(wx.VERTICAL)
        h = wx.BoxSizer(wx.HORIZONTAL)
        self.v = wx.BoxSizer(wx.VERTICAL)
        button_sample_data = wx.Button(self,label = 'Sample Data')
        button_random_data = wx.Button(self,label = 'Generate Random Data')
        button_own_data = wx.Button(self,label = 'Enter Data')
        button_own_data.Bind(wx.EVT_BUTTON,self.click_own_data)
        button_sample_data.Bind(wx.EVT_BUTTON,self.click_sample_data)
        button_random_data.Bind(wx.EVT_BUTTON,self.click_random_data)
        h.Add(button_sample_data)
        h.Add(button_own_data)
        h.Add(button_random_data)
        s.Add(h)
        s.Add(self.v)
        self.SetSizer(s)
        self.SetupScrolling()
        self.Layout()
    
    #if the user clicks 'Enter Data' button, the interface that allows the user to create new database or open existing database will show up
    def click_own_data(self,event):
        self.v.Clear(True)
        self.dbname = None
        dh1 = wx.BoxSizer(wx.HORIZONTAL)
        self.database_choice = wx.ComboBox(self,choices = ["create new database","open existing database"])
        confirm_button = wx.Button(self,label = "confirm")
        confirm_button.Bind(wx.EVT_BUTTON,self.click_confirm_create_database)
        dh1.Add(self.database_choice)
        dh1.Add(confirm_button)
        self.v.Add(dh1)
        self.v2 = wx.BoxSizer(wx.VERTICAL)
        self.v.Add(self.v2)
        self.Layout()
        
   # A message dialog will pop up for user to enter the name of new database if the user chooses to create a new database
    def click_confirm_create_database(self,event):
        action = self.database_choice.GetValue()
        if action == "create new database":
            enter_dbname = wx.TextEntryDialog(self,'Name:','Enter name of new database')
            if enter_dbname.ShowModal() == wx.ID_OK:
                if enter_dbname.GetValue() == '':
                    wx.MessageBox("Please enter a valid name!","Info",wx.OK|wx.ICON_INFORMATION)
                else:
                    self.dbname = enter_dbname.GetValue()+'.db'
            enter_dbname.Destroy()
            self.new_database()
        if action == "open existing database":
            self.open_database()
        self.Layout()
    # The interface will be exchanged to another mode if the user choose to open the database file
    # A list of avilable tables in the database will be shown in the combo box for the user to choose
    def open_database(self):
        try:
            self.v2.Clear(True)
            file_dialog = wx.FileDialog(self,"Choose DB File", wildcard = "DB files (*.db)|*.db",style = wx.FD_OPEN|wx.FD_FILE_MUST_EXIST)
            if file_dialog.ShowModal() == wx.ID_CANCEL:
                return
            self.dbname = file_dialog.GetPath()   
            c = create_connection(self.dbname)
            query = "select name from sqlite_master where type = 'table' and name not like 'sqlite_%'"
            df = pd.read_sql_query(query,c)
            table_name_dict = df.to_dict('list')
            dh5 = wx.BoxSizer(wx.HORIZONTAL)
            label = wx.StaticText(self,label = "table:")
            self.table_created = wx.ComboBox(self,choices = list(table_name_dict.values())[0])
            open_button = wx.Button(self,label = "open")
            open_button.Bind(wx.EVT_BUTTON,self.click_open_table)
            dh5.Add(label)
            dh5.Add(self.table_created)
            dh5.Add(open_button)
            self.dh6 = wx.BoxSizer(wx.HORIZONTAL)
            self.dh7 = wx.BoxSizer(wx.VERTICAL)
            self.v2.Add(dh5)
            self.v2.Add(self.dh6)
            self.v2.Add(self.dh7)
            self.Layout()
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    # After the user open the existing table, the features about deleting and updating will be shown on the interface
    def click_open_table(self,event):
        try:
            self.dh6.Clear(True)
            self.dh7.Clear(True)
            delete_table_button = wx.Button(self,label = "delete table")
            delete_table_button.Bind(wx.EVT_BUTTON,self.click_delete_table)
            delete_feature_button = wx.Button(self,label = "delete feature")
            delete_feature_button.Bind(wx.EVT_BUTTON,self.click_delete_feature2)
            add_feature_button = wx.Button(self,label = "add feature")
            add_feature_button.Bind(wx.EVT_BUTTON,self.click_add_features2)
            add_record_button = wx.Button(self,label = "add record")
            add_record_button.Bind(wx.EVT_BUTTON,self.click_add_record)
            delete_record_button = wx.Button(self,label = "delete record")
            delete_record_button.Bind(wx.EVT_BUTTON,self.click_delete_record)
            save_table_button = wx.Button(self,label =  "save table(.csv)")
            save_table_button.Bind(wx.EVT_BUTTON,self.click_save_table)
            self.dh6.Add(delete_table_button)
            self.dh6.Add(delete_feature_button)
            self.dh6.Add(add_feature_button)
            self.dh6.Add(add_record_button)
            self.dh6.Add(delete_record_button)
            self.dh6.Add(save_table_button)
            if self.table_created.GetValue() == '':
                wx.MessageBox("No table is selected.","Info",wx.OK|wx.ICON_INFORMATION)
            else:
                tname = self.table_created.GetValue()
                query = "select * from {}".format(tname)
                c = create_connection(self.dbname)
                cur = c.cursor()
                df = pd.read_sql_query(query,c)
                self.current_end_index = df.shape[0]
                self.dtable = wx.grid.Grid(self,1)
                self.dtable.CreateGrid(df.shape[0],df.shape[1])
                for index,col in zip(range(len(df.columns)),df.columns):
                    self.dtable.SetColLabelValue(index,col)
                for col in range(df.shape[1]):
                    col_list = df.iloc[:,col]
                    for row in range(df.shape[0]):
                        self.dtable.SetCellValue(row,col,str(col_list[row]))
                self.dh7.Add(self.dtable)        
                self.Layout()
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION) 
    # delete the record based on the line number of the records in table
    def click_delete_record(self,event):
        if self.table_created.GetValue() == '':
            wx.MessageBox("No table is selected.","Info",wx.OK|wx.ICON_INFORMATION)
        else:
            try:
                c = create_connection(self.dbname)
                tname = self.table_created.GetValue()
                query = "select count(*) from {}".format(tname)
                df2 = pd.read_sql_query(query,c)        
                record_count = df2.iloc[:,0].to_list()[0]
                enter_record_deleted = wx.TextEntryDialog(self,'line number of record:','Enter line number to delete record')
                if enter_record_deleted.ShowModal() == wx.ID_OK:
                    if enter_record_deleted.GetValue() == '' or int(enter_record_deleted.GetValue().strip()) > record_count or int(enter_record_deleted.GetValue().strip()) <= 0:
                        wx.MessageBox("Please enter a valid line number!","Info",wx.OK|wx.ICON_INFORMATION)
                    else:
                        index_record = int(enter_record_deleted.GetValue().strip())
                        query = "select * from {}".format(tname)
                        cur = c.cursor()
                        df = pd.read_sql_query(query,c)
                        df_dict = df.to_dict('list')
                        q = []
                        check = False
                        for col in df_dict.keys():
                            temp = ''
                            val = list(df_dict[col])[index_record-1]
                            if val == None:
                                temp = col + ' is null'
                            else:
                                temp = col + ' = ' + str(val)
                                check = True
                            q.append(temp)
                        q_combined = ' and '.join(q)
                        print(q_combined)
                        query = "delete from {} where {};".format(tname,q_combined)
                        print(query)
                        execute_query(c, query)
                        c.commit()
                        query = "select * from {}".format(tname)
                        cur = c.cursor()
                        df = pd.read_sql_query(query,c)
                        display(df)
                        self.current_end_index = df.shape[0]
                        self.dh7.Clear(True)
                        self.dtable = wx.grid.Grid(self,1)
                        self.dtable.CreateGrid(df.shape[0],df.shape[1])
                        for index,col in zip(range(len(df.columns)),df.columns):
                            self.dtable.SetColLabelValue(index,col)
                        for col in range(df.shape[1]):
                            col_list = df.iloc[:,col]
                            for row in range(df.shape[0]):
                                self.dtable.SetCellValue(row,col,str(col_list[row]))
                        self.dh7.Add(self.dtable) 
                        self.Layout()
                enter_record_deleted.Destroy()
            except Error as error:
                wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    #delete the whole table from the database            
    def click_delete_table(self,event):
        if self.table_created.GetValue() == '':
            wx.MessageBox("No table exists in database","Info",wx.OK|wx.ICON_INFORMATION)
        else:    
            try:
                tname = self.table_created.GetValue()
                query = "drop table {};".format(tname)
                c = create_connection(self.dbname)
                cur = c.cursor()
                cur.execute(query)
                cur = c.cursor()
                query = "select name from sqlite_master where type = 'table' and name not like 'sqlite_%'"
                df = pd.read_sql_query(query,c)
                self.table_created.Clear()
                self.table_created.SetItems(list(df.to_dict('list').values())[0])
                self.dh7.Clear(True)    
                self.Layout()
            except Error as error:
                wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)

    # delete the feature of the existing table according to the feature name entered by the user
    def click_delete_feature2(self,event):
        if self.table_created.GetValue() == '':
            wx.MessageBox("No table is selected.","Info",wx.OK|wx.ICON_INFORMATION)
        else:
            try:
                tname = self.table_created.GetValue()
                query = "select * from {}".format(tname)
                c = create_connection(self.dbname)
                cur = c.cursor()
                df = pd.read_sql_query(query,c)
                enter_feature_deleted = wx.TextEntryDialog(self,'Feature:','Enter feature to delete')
                if enter_feature_deleted.ShowModal() == wx.ID_OK:
                    if enter_feature_deleted.GetValue() == '' or enter_feature_deleted.GetValue().strip() not in list(df.columns):
                        wx.MessageBox("Please enter a valid feature!","Info",wx.OK|wx.ICON_INFORMATION)
                    else:
                        cur = c.cursor()
                        query = "alter table {} drop column {};".format(tname,enter_feature_deleted.GetValue())
                        cur.execute(query)
                        query = "select * from {}".format(tname)
                        c = create_connection(self.dbname)
                        cur = c.cursor()
                        cur.execute(query)
                        df = pd.read_sql_query(query,c)
                        self.current_end_index = df.shape[0]
                        self.dh7.Clear(True)
                        self.dtable = wx.grid.Grid(self,1)
                        self.dtable.CreateGrid(df.shape[0],df.shape[1])
                        for index,col in zip(range(len(df.columns)),df.columns):
                            self.dtable.SetColLabelValue(index,col)
                        for col in range(df.shape[1]):
                            col_list = df.iloc[:,col]
                            for row in range(df.shape[0]):
                                self.dtable.SetCellValue(row,col,str(col_list[row]))
                        self.dh7.Add(self.dtable)        
                enter_feature_deleted.Destroy()
                self.Layout()
            except Error as error:
                wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    # add a new feature to the existing table according to the feature and data type entered by the user
    def click_add_features2(self,event):
        if self.table_created.GetValue() == '':
            wx.MessageBox("No table is selected.","Info",wx.OK|wx.ICON_INFORMATION)
        else:
            try:
                tname = self.table_created.GetValue()
                query = "select * from {}".format(tname)
                c = create_connection(self.dbname)
                cur = c.cursor()
                df = pd.read_sql_query(query,c)
                enter_feature_added = wx.TextEntryDialog(self,'Feature and type(varchar,integer,decimal),separated by comma:','Enter feature and type')
                if enter_feature_added.ShowModal() == wx.ID_OK:
                    if enter_feature_added.GetValue() == '' or ',' not in enter_feature_added.GetValue():
                        wx.MessageBox("Please enter a valid feature!","Info",wx.OK|wx.ICON_INFORMATION)
                    else:
                        feature_and_type = ''
                        fea = enter_feature_added.GetValue().split(',')[0]
                        ctype = enter_feature_added.GetValue().split(',')[1]
                        cur = c.cursor()
                        if ctype == 'varchar':
                            feature_and_type = fea + ' '+ 'varchar(255)'
                        elif ctype == 'integer':
                            feature_and_type = fea + ' '+ 'int'
                        elif ctype == 'decimal':
                            feature_and_type = fea + ' '+ 'decimal(9,2)'
                        query = "alter table {} add {};".format(tname,feature_and_type)
                        cur.execute(query)
                        query = "select * from {}".format(tname)
                        cur = c.cursor()
                        cur.execute(query)
                        df = pd.read_sql_query(query,c)
                        self.current_end_index = df.shape[0]
                        self.dh7.Clear(True)
                        self.dtable = wx.grid.Grid(self,1)
                        self.dtable.CreateGrid(df.shape[0],df.shape[1])
                        for index,col in zip(range(len(df.columns)),df.columns):
                            self.dtable.SetColLabelValue(index,col)
                        for col in range(df.shape[1]):
                            col_list = df.iloc[:,col]
                            for row in range(df.shape[0]):
                                self.dtable.SetCellValue(row,col,str(col_list[row]))
                        self.dh7.Add(self.dtable)        
                enter_feature_added.Destroy()
                self.Layout()
            except Error as error:
                wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)

    # add one row below the existing table on the interface for user to enter the values       
    def click_add_record(self,event):
        if self.table_created.GetValue() == '':
            wx.MessageBox("No table is selected.","Info",wx.OK|wx.ICON_INFORMATION)
        else:
            try:
                self.dh7.Clear(True)
                tname = self.table_created.GetValue()
                query = "select * from {}".format(tname)
                c = create_connection(self.dbname)
                cur = c.cursor()
                df = pd.read_sql_query(query,c)

                self.current_end_index = self.current_end_index+1

                self.dtable = wx.grid.Grid(self,1)
                self.dtable.CreateGrid(self.current_end_index,df.shape[1])
                for index,col in zip(range(len(df.columns)),df.columns):
                    self.dtable.SetColLabelValue(index,col)
                for col in range(df.shape[1]):
                    col_list = df.iloc[:,col]
                    for row in range(df.shape[0]):
                        self.dtable.SetCellValue(row,col,str(col_list[row]))
                self.dh7.Add(self.dtable) 
                insert_button = wx.Button(self,label = "insert")
                self.dh7.Add(insert_button)
                insert_button.Bind(wx.EVT_BUTTON,self.click_insert_table2)
                self.current_table_name2 = tname
                self.Layout()
            except Error as error:
                wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    #insert the values that the user entered on the cells of the new row(s) to the existing table
    def click_insert_table2(self,event):
        try:
            c = create_connection(self.dbname)
            cur = c.cursor()
            df = pd.read_sql_query("select * from %s" %self.current_table_name2, c)
            display(df)
            num_q = '?,'*len(df.columns)
            num_q = num_q[:-1]
            tname = self.table_created.GetValue()
            query = "select count(*) from {}".format(tname)
            cur = c.cursor()
            df2 = pd.read_sql_query(query,c)        
            self.current_start_index = df2.iloc[:,0].to_list()[0]
            for row in range(self.current_start_index,self.current_end_index):
                row_values = []
                for col in range(len(df.columns)):
                    val = self.dtable.GetCellValue(row,col)
                    if val == '':
                        val = None
                    row_values.append(val)
                print(row_values)
                with c:
                    query = "insert or ignore into {}({}) values ({});".format(self.current_table_name2,','.join(df.columns),num_q)
                    print(query)
                    insert_table(c,query,row_values)
            wx.MessageBox("The record(s)","Info",wx.OK|wx.ICON_INFORMATION)
            self.dh7.Clear(True) 
            tname = self.table_created.GetValue()
            query = "select * from {}".format(tname)
            cur = c.cursor()
            df = pd.read_sql_query(query,c)
            self.current_end_index = df.shape[0]
            self.dtable = wx.grid.Grid(self,1)
            self.dtable.CreateGrid(df.shape[0],df.shape[1])
            for index,col in zip(range(len(df.columns)),df.columns):
                self.dtable.SetColLabelValue(index,col)
            for col in range(df.shape[1]):
                col_list = df.iloc[:,col]
                for row in range(df.shape[0]):
                    self.dtable.SetCellValue(row,col,str(col_list[row]))
            self.dh7.Add(self.dtable)        
            self.Layout()        
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    #save the existing table as CSV file        
    def click_save_table(self,event):
        tname = self.table_created.GetValue()
        query = "select * from {}".format(tname)
        c = create_connection(self.dbname)
        cur = c.cursor()
        df = pd.read_sql_query(query,c)
        file_dialog = wx.FileDialog(self,"Save csv File", wildcard = "CSV files (*.csv)|*.csv",style = wx.FD_SAVE)
        if file_dialog.ShowModal() == wx.ID_CANCEL:
            return
        filename = file_dialog.GetPath()
        try:
            df.to_csv(filename)
        except IOError:
            wx.MessageBox("This file can't be saved successfully.","Info",wx.OK|wx.ICON_INFORMATION)        
    # create interface for creating new database and new tables    
    def new_database(self):
        self.v2.Clear(True)
        label = wx.StaticText(self,label = 'Database:'+self.dbname)
        dh5 = wx.BoxSizer(wx.HORIZONTAL)
        label1 = wx.StaticText(self,label = 'Table Name:')
        self.enter_table_name = wx.TextCtrl(self)
        self.v2.Add(label)
        dh5.Add(label1)
        dh5.Add(self.enter_table_name)
        self.v2.Add(dh5)
        dh2 = wx.BoxSizer(wx.HORIZONTAL)
        label2 = wx.StaticText(self,label = "Number of Records:")
        self.enter_num_records = wx.TextCtrl(self)
        dh2.Add(label2)
        dh2.Add(self.enter_num_records)
        dh3 = wx.BoxSizer(wx.HORIZONTAL)
        label3 = wx.StaticText(self,label = "Features:")
        self.enter_features = wx.TextCtrl(self)
        label4 = wx.StaticText(self,label = "type:")
        self.feature_type_choices = wx.ComboBox(self,choices = ["varchar","integer","decimal"])
        add_feature_button = wx.Button(self,label = "add")
        add_feature_button.Bind(wx.EVT_BUTTON,self.click_add_features)
        dh3.Add(label3)
        dh3.Add(self.enter_features)
        dh3.Add(label4)
        dh3.Add(self.feature_type_choices)
        dh3.Add(add_feature_button)
        self.dh4 = wx.BoxSizer(wx.HORIZONTAL)
        label5 = wx.StaticText(self,label = "Features created:")
        create_table_button = wx.Button(self,label = "create table")
        create_table_button.Bind(wx.EVT_BUTTON,self.click_create_table)
        self.dh4.Add(label5)
        self.v2.Add(dh2)
        self.v2.Add(dh3)
        self.v2.Add(self.dh4)
        self.v2.Add(create_table_button)
        self.dh6 = wx.BoxSizer(wx.VERTICAL)
        self.v2.Add(self.dh6)
        self.current_table = {}
        self.current_table_name = ''
        self.num_records = 0
        self.Layout()
    #add buttons labeled with features entered by the user
    #the user could drop the feature by clicking the buttons
    def click_add_features(self,event):
        feature_button = wx.Button(self,label = self.enter_features.GetValue())
        feature_button.Bind(wx.EVT_BUTTON,self.click_delete_feature)
        self.dh4.Add(feature_button)
        self.current_table[self.enter_features.GetValue()] = self.feature_type_choices.GetValue()
        self.Layout()
    #The user clicks the button labeled with feature name, this feature will not be added to the new table
    def click_delete_feature(self,event):
        feature = event.GetEventObject().GetLabel()
        children = self.dh4.GetChildren()
        count = 0
        for child in children:
            if isinstance(child.GetWindow(),wx.Button) and child.GetWindow().GetLabel() == feature:
                self.dh4.Hide(count)
                self.dh4.Remove(count)
            count += 1
        del self.current_table[feature]
        self.Layout()  
    
    #clear the text fields
    def click_clear_option(self):
        self.current_table = {}
        self.current_table_name = ''
        self.num_records = 0
        self.enter_table_name.SetValue('')
        self.enter_num_records.SetValue('')
        self.enter_features.SetValue('')
        self.feature_type_choices.SetValue('')
        children = self.dh4.GetChildren()
        count = 0
        for child in children:
            if isinstance(child.GetWindow(),wx.Button):
                self.dh4.Hide(count)
                self.dh4.Remove(count)
                count = 0
            count += 1
        self.Layout()
    # create a new table to the database using SQLITE queries
    def click_create_table(self,event):
        self.dh6.Clear(True)
        self.current_table2 = self.current_table
        sl = wx.StaticLine(self,size = (500,1),style = wx.LI_HORIZONTAL)
        self.dh6.Add(sl)
        self.current_table_name2 = self.enter_table_name.GetValue()
        self.num_records2 = int(self.enter_num_records.GetValue())
        self.click_clear_option()
        c = create_connection(self.dbname)
        attr_str = []
        for item in self.current_table2.keys():
            if self.current_table2[item] == 'varchar':
                attr_str.append(item + ' '+ 'varchar(255)')
            elif self.current_table2[item] == 'integer':
                attr_str.append(item + ' '+ 'int')
            elif self.current_table2[item] == 'decimal':
                attr_str.append(item + ' '+ 'decimal(9,2)')    
        table_query = "create table {} ({});".format(self.current_table_name2,','.join(attr_str))
        print(table_query)
        try:
            create_table(c,table_query)
            self.dtable = wx.grid.Grid(self,1)
            self.dtable.CreateGrid(self.num_records2,len(list(self.current_table2.keys())))
            for count,item in zip(range(len(list(self.current_table2.keys()))),self.current_table2.keys()):
                self.dtable.SetColLabelValue(count,item)
            self.dtable.AutoSize()
            dh7 = wx.BoxSizer(wx.HORIZONTAL)
            insert_button = wx.Button(self,label = "insert")
            insert_button.Bind(wx.EVT_BUTTON, self.click_insert_table)

            dh7.Add(insert_button)
            self.dh6.Add(dh7)
            self.dh6.Add(self.dtable)
            self.Layout()
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    # insert the values entered by the user into the new table 
    def click_insert_table(self,event):
        try:
            c = create_connection(self.dbname)
            cur = c.cursor()
            df = pd.read_sql_query("select * from %s" %self.current_table_name2, c)
            display(df)
            num_q = '?,'*len(df.columns)
            num_q = num_q[:-1]
            for row in range(self.num_records2):
                row_values = []
                for col in range(len(df.columns)):
                    val = self.dtable.GetCellValue(row,col)
                    if val == '':
                        val = None
                    row_values.append(val)
                print(row_values)
                with c:
                    query = "insert or ignore into {}({}) values ({});".format(self.current_table_name2,','.join(df.columns),num_q)
                    print(query)
                    insert_table(c,query,row_values)

        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
            
    #create a new interface for user to choose sample data
    def click_sample_data(self,event):
        self.v.Clear(True)
        h1 = wx.BoxSizer(wx.HORIZONTAL)
        self.v1 = wx.BoxSizer(wx.VERTICAL)
        samples = ['HOMO Energies','Density','composition entries and formation energies','crystal structure entries','Smiles','Molecular Properties']
        self.sample_choices = wx.ComboBox(self,choices = samples)
        choose_button = wx.Button(self,label = "choose")
        choose_button.Bind(wx.EVT_BUTTON,self.click_choose_sample)
        self.samplename = None
        self.sample_data = None        
        h1.Add(self.sample_choices)
        h1.Add(choose_button)
        self.v.Add(h1)
        self.v.Add(self.v1)
        self.Layout()
    #generate the corresponding sample data chosen by the user 
    #use functions from ChemML library to generate first four sample data
    #use ChEMBL wrapper to generate the last two sample data
    def click_choose_sample(self,event):
        self.v1.Clear(True)
        self.sample_name = self.sample_choices.GetValue()
        if self.sample_name == '':                   
            wx.MessageBox("No sample data is chosen!","Info",wx.OK|wx.ICON_INFORMATION)
        elif self.sample_name == 'HOMO Energies':
            smi,homo = load_cep_homo()
            self.table = wx.grid.Grid(self,1)
            self.table.CreateGrid(10,2)
            self.table.SetColLabelValue(0,smi.columns[0])
            self.table.SetColLabelValue(1,homo.columns[0])
            smi_list = smi.iloc[:,0].to_list()[:10]
            homo_list = homo.iloc[:,0].to_list()[:10]
            for row in range(10):
                self.table.SetCellValue(row,0,smi_list[row])
                self.table.SetCellValue(row,1,str(homo_list[row]))
            save_button = wx.Button(self,label = 'save(.csv)')
            save_button.Bind(wx.EVT_BUTTON,self.click_save_sample)
            self.table.AutoSize()
            self.v1.Add(self.table)
            self.v1.Add(save_button)
        elif self.sample_name == 'Density':
            smi,density,features = load_organic_density()
            self.table = wx.grid.Grid(self,1)
            self.table.CreateGrid(10,202)
            self.table.SetColLabelValue(0,smi.columns[0])
            self.table.SetColLabelValue(1,density.columns[0])
            for index, colname in zip(range(2,202),features.columns):
                self.table.SetColLabelValue(index,colname)
            smi_list = smi.iloc[:,0].to_list()[:10]
            density_list = density.iloc[:,0].to_list()[:10] 
            for row in range(10):
                self.table.SetCellValue(row,0,smi_list[row])
                self.table.SetCellValue(row,1,str(density_list[row]))
            count = 2    
            for col in range(200):
                col_list = features.iloc[:,col].to_list()[:10]
                for row in range(10):
                    self.table.SetCellValue(row,count,str(col_list[row]))
                count += 1
            save_button = wx.Button(self,label = 'save(.csv)')
            save_button.Bind(wx.EVT_BUTTON,self.click_save_sample)
            self.table.AutoSize()
            self.v1.Add(self.table)
            self.v1.Add(save_button)
            
        elif self.sample_name == 'composition entries and formation energies':
            entries, df = load_comp_energy()
            self.table = wx.grid.Grid(self,1)
            self.table.CreateGrid(10,2)
            self.table.SetColLabelValue(0,"composition entries")
            self.table.SetColLabelValue(1,df.columns[0])
            df_list = df.iloc[:,0].to_list()[:10] 
            for row in range(10):
                self.table.SetRowSize(row,200)
                self.table.SetCellValue(row,0,str(entries[row]))
                self.table.SetCellValue(row,1,str(df_list[row]))
            save_button = wx.Button(self,label = 'save(.csv)')
            save_button.Bind(wx.EVT_BUTTON,self.click_save_sample)
            self.table.AutoSize()
            self.v1.Add(self.table)
            self.v1.Add(save_button)
        elif self.sample_name == 'crystal structure entries':
            entries = load_crystal_structures()
            self.table = wx.grid.Grid(self,1)
            self.table.CreateGrid(10,1)
            self.table.SetColLabelValue(0,'crystal structure entries')
            for row in range(10):
                self.table.SetCellValue(row,0,str(entries[row]))    
            save_button = wx.Button(self,label = 'save(.csv)')
            save_button.Bind(wx.EVT_BUTTON,self.click_save_sample)
            self.table.AutoSize()
            self.v1.Add(self.table)
            self.v1.Add(save_button)
        
        elif self.sample_name == 'Smiles':
            id_list = new_client.molecule.only(['molecule_chembl_id'])
            chembl_id_list = [list(id_list[i].values())[0] for i in  range(100)]
            mol_list =  [new_client.molecule.get(i) for i in chembl_id_list]
            results = []
            for mol in mol_list:
                results.append(mol['molecule_structures']['canonical_smiles'])
            self.table = wx.grid.Grid(self,1)
            self.table.CreateGrid(10,1)
            self.table.SetColLabelValue(0,'Smiles')
            for row,mol in zip(range(10),results):
                self.table.SetCellValue(row,0,str(mol))  
            self.sample_data = results       
            save_button = wx.Button(self,label = 'save(.csv)')
            save_button.Bind(wx.EVT_BUTTON,self.click_save_sample)
            self.table.AutoSize()
            self.v1.Add(self.table)
            self.v1.Add(save_button)
        elif self.sample_name == 'Molecular Properties':
            properties_dict = {'Smiles':'canonical_smiles','ALogP':'alogp', 'Aromatic Rings':'aromatic_rings', 'CX LogD pH7.4':'cx_logd','CX LogP':'cx_logp', 'CX Acidic pKa':'cx_most_apka', 'CX Basic pKa':'cx_most_bpka', 'Full Molecular Formula':'full_molformula', 'Molecular Weight':'full_mwt', 'HBA':'hba', 'HBA(Lipinski)':'hba_lipinski', 'HBD(Lipinski)':'hbd', 'HBD(Lipinski)':'hbd_lipinski', 'Heavy Atoms':'heavy_atoms', 'Molecular species':'molecular_species', 'Monoisotopic Molecular Weight':'mw_monoisotopic', '#Ro5 Violations (Lipinski)':'num_lipinski_ro5_violations', '#Ro5 Violations':'num_ro5_violations', 'Polar Surface Area':'psa','#Rotatable Bonds':'rtb','Molecular Type':'molecule_type','Name':'pref_name','Standard InChI':'standard_inchi','QED Score':'qed_weighted'}
            id_list = new_client.molecule.only(['molecule_chembl_id'])
            chembl_id_list = [list(id_list[i].values())[0] for i in  range(100)]
            mol_list =  [new_client.molecule.get(i) for i in chembl_id_list]
            results = {}
            for mol in mol_list:
                temp = {}
                for pro in properties_dict.values():
                    if pro == 'canonical_smiles' or pro == 'standard_inchi':
                        temp[pro] = mol['molecule_structures'][pro]
                    elif pro == 'molecule_type' or pro == 'pref_name':
                        temp[pro] = mol[pro]
                    else:
                        temp[pro] = mol['molecule_properties'][pro]
                results[mol['molecule_structures']['canonical_smiles']] = temp
            self.table = wx.grid.Grid(self,1)
            self.table.CreateGrid(10,len(list(properties_dict.values())))
            for num,pro in zip(range(len(list(properties_dict.values()))),properties_dict.values()):
                self.table.SetColLabelValue(num,pro)
            for row,mol in zip(range(10),list(results.keys())[:10]):
                for col,pro in zip(range(len(list(results[mol].keys()))),results[mol].values()):
                    self.table.SetCellValue(row,col,str(pro))   
            self.sample_data = results       
            save_button = wx.Button(self,label = 'save(.csv)')
            save_button.Bind(wx.EVT_BUTTON,self.click_save_sample)
            self.table.AutoSize()
            self.v1.Add(self.table)
            self.v1.Add(save_button)
        self.Layout()
    
    #The user could save the sample data as CSV file for further analysis
    def click_save_sample(self,event):
        samples = ['HOMO Energies','Density','composition entries and formation energies','crystal structure entries','Smiles','Molecular Properties']
        df = None
        if self.sample_name == 'HOMO Energies':
            smi,homo = load_cep_homo()     
            smi_list = smi.iloc[:,0].to_list()
            homo_list = homo.iloc[:,0].to_list()       
            d = {}
            d[smi.columns[0]] = smi_list
            d[homo.columns[0]] = homo_list
            df = pd.DataFrame(d)
        elif self.sample_name == 'Density':
            smi,density,features = load_organic_density()
            d = {}
            smi_list = smi.iloc[:,0].to_list()
            den_list = density.iloc[:,0].to_list()
            d[smi.columns[0]] = smi_list
            d[density.columns[0]] = den_list
            for num,item in zip(range(len(list(features.columns))),features.columns):
                d[item] = features.iloc[:,num].to_list()
            df = pd.DataFrame(d)
        elif self.sample_name == 'composition entries and formation energies':
            entries, df = load_comp_energy()
            d = {}
            df_list = df.iloc[:,0].to_list()
            d["composition entries"] = entries
            d[df.columns[0]] = df_list
            df = pd.DataFrame(d)
        elif self.sample_name == 'crystal structure entriescs':
            entries = load_crystal_structures()
            d = {}
            d["crystal structure entries"] = entries
            df = pd.DataFrame(d)
        elif self.sample_name == 'Smiles':
            d = {}
            d['smiles'] = self.sample_data
            df = pd.DataFrame(d)
        elif self.sample_name == 'Molecular Properties':
            d = {}
            for item in list(self.sample_data.values())[0].keys():
                d[item] = []
            for item in self.sample_data.values():
                for item2 in item.keys():
                    d[item2].append(item[item2])
            df = pd.DataFrame(d)
        file_dialog = wx.FileDialog(self,"Save csv File", wildcard = "CSV files (*.csv)|*.csv",style = wx.FD_SAVE)
        if file_dialog.ShowModal() == wx.ID_CANCEL:
            return
        filename = file_dialog.GetPath()
        try:
            df.to_csv(filename)
        except IOError:
            wx.MessageBox("This file can't be saved successfully.","Info",wx.OK|wx.ICON_INFORMATION)
    
    # generating the random data is based on the number of records entered and the features chosen by the user
    # The ChEMBL wrapper is used in this function for retreiving the properties of the chemical molecules
    #it may take longer for generating large dataset
    def click_random_data(self,event):
        self.v.Clear(True)
        label1 = wx.StaticText(self,label = "Generate random chemical molecules")
        label2 = wx.StaticText(self,label = "Number:")
        self.enternum = wx.TextCtrl(self)
        label_limit = wx.StaticText(self,label = '<= 500')
        label3 = wx.StaticText(self,label = "Feature:")
        generate_button = wx.Button(self, label = "generate")
        generate_button.Bind(wx.EVT_BUTTON,self.click_generate)
        add_button = wx.Button(self,label = 'Add')
        add_button.Bind(wx.EVT_BUTTON,self.click_add)
        self.choices_dict = {'Smiles':'canonical_smiles','ALogP':'alogp', 'Aromatic Rings':'aromatic_rings', 'CX LogD pH7.4':'cx_logd','CX LogP':'cx_logp', 'CX Acidic pKa':'cx_most_apka', 'CX Basic pKa':'cx_most_bpka', 'Full Molecular Formula':'full_molformula', 'Molecular Weight':'full_mwt', 'HBA':'hba', 'HBA(Lipinski)':'hba_lipinski', 'HBD(Lipinski)':'hbd', 'HBD(Lipinski)':'hbd_lipinski', 'Heavy Atoms':'heavy_atoms', 'Molecular species':'molecular_species', 'Monoisotopic Molecular Weight':'mw_monoisotopic', '#Ro5 Violations (Lipinski)':'num_lipinski_ro5_violations', '#Ro5 Violations':'num_ro5_violations', 'Polar Surface Area':'psa','#Rotatable Bonds':'rtb','Molecular Type':'molecule_type','Name':'pref_name','Standard InChI':'standard_inchi','RA Score':'RA','QED Score':'qed_weighted','SYBA Score':'SYBA','NP Score':'NP','SA Score':'SA'}
        self.features_chosen = []
        self.features_left = list(self.choices_dict.keys())
        self.features_box = wx.ComboBox(self,choices = list(self.choices_dict.keys()))
        self.result_dict = {}
        h1 = wx.BoxSizer(wx.HORIZONTAL)
        self.h3 = wx.BoxSizer(wx.HORIZONTAL)
        label4 = wx.StaticText(self,label = "Feature(s) Chosen:")
        self.h3.Add(label4)
        self.h4 = wx.BoxSizer(wx.VERTICAL)
        h1.Add(label2)
        h1.Add(self.enternum)
        h1.Add(label_limit)
        self.h2 = wx.BoxSizer(wx.HORIZONTAL)
        self.h2.Add(label3)
        self.h2.Add(self.features_box)
        self.h2.Add(add_button)
        self.v.Add(label1)
        self.v.Add(h1)
        self.v.Add(self.h2)
        self.v.Add(self.h3)  
        sl = wx.StaticLine(self,size = (5,1),style = wx.LI_HORIZONTAL)
        self.v.Add(generate_button)
        self.v.Add(sl)
        self.h5 = wx.BoxSizer(wx.HORIZONTAL)
        self.v.Add(self.h5)
        self.h7 = wx.BoxSizer(wx.HORIZONTAL)
        self.h8 = wx.BoxSizer(wx.HORIZONTAL)
        self.h9 = wx.BoxSizer(wx.VERTICAL)
        self.v.Add(self.h4)
        self.v.Add(self.h7)
        self.v.Add(self.h8)
        self.v.Add(self.h9)
        self.Layout()
    # add the button labeled with the chosen feature
    #the user could drop the feature by click the button
    def click_add(self,event):
        children = self.h2.GetChildren()
        for child in children:
            if isinstance(child.GetWindow(),wx.ComboBox):
                if child.GetWindow().GetValue() != '':
                    self.features_chosen.append(child.GetWindow().GetValue())
                    label_button = wx.Button(self,label = child.GetWindow().GetValue())
                    label_button.Bind(wx.EVT_BUTTON,self.click_delete)
                    self.h3.Add(label_button)
                    self.features_left.remove(child.GetWindow().GetValue())
                    self.features_box.Set(self.features_left)
                else:
                    wx.MessageBox("No feature is chosen!","Info",wx.OK|wx.ICON_INFORMATION)
        self.Layout()
        #delete the chosen feature, and add it back to the combo box for user to choose
    def click_delete(self,event):
        feature = event.GetEventObject().GetLabel()
        self.features_chosen.remove(feature)
        self.features_left.append(feature)
        self.features_box.Set(self.features_left)
        children = self.h3.GetChildren()
        count = 0
        for child in children:
            if isinstance(child.GetWindow(),wx.Button) and child.GetWindow().GetLabel() == feature:
                self.h3.Hide(count)
                self.h3.Remove(count)
            count += 1
        self.Layout()  

     #generate random data with ChEMBL wrapper
    #the model prediction for the scores may be applied here if user choose them
    # obtain corresponding molecular properties by the keys in the resulting dictionary
    #if the number of records is less than 10, this amount of records will be displayed on the interface as preview
    #if the number of records is large than 10, the first 10 records will be display on the interface as preview
    def click_generate(self,event):
        self.h4.Clear(True)
        self.h5.Clear(True)
        self.h7.Clear(True)
        self.h8.Clear(True)
        self.h9.Clear(True)
        self.num = int(self.enternum.GetValue())
        random_index = random.sample(range(0,500),self.num)
        id_list = new_client.molecule.only(['molecule_chembl_id'])
        score_dict = {'RA Score':'RA','SYBA Score':'SYBA','NP Score':'NP','SA Score':'SA'}
        random_id_list = [list(id_list[i].values())[0] for i in random_index]
        mol_list =  [new_client.molecule.get(random_id) for random_id in random_id_list]    
        id_list = new_client.molecule.only(['molecule_chembl_id'])
        chembl_id_list = [list(id_list[i].values())[0] for i in  range(self.num)]
        mol_list =  [new_client.molecule.get(i) for i in chembl_id_list]  
        ra_nn_scorer = None
        syba = None
        fs = None
        if 'RA Score' in self.features_chosen:
            ra_nn_scorer = RAscore_NN.RAScorerNN()
        if 'SYBA Score' in  self.features_chosen:
            syba = SybaClassifier()
            syba.fitDefaultScore()
        if 'SYBA Score' in  self.features_chosen:        
            model_file = open('publicnp.model','rb')
            fs = pickle.load(model_file)
        self.result_dict = {}
        for item in self.features_chosen:
            temp = []
            for m in mol_list:
                feature = self.choices_dict[item]
                if feature not in score_dict.values():
                    if feature == 'canonical_smiles' or feature == 'standard_inchi':
                        if m['molecule_structures'][feature] == None:
                            temp.append('None')
                        else:
                            temp.append(m['molecule_structures'][feature])
                    elif feature == 'molecule_type' or feature == 'pref_name':
                        if m[feature] == None:
                            temp.append('None')
                        else:
                            temp.append(m[feature])     
                    else:
                        if m['molecule_properties'] == None:
                            temp.append('None')
                        elif m['molecule_properties'][feature] == None:
                            temp.append('None')
                        else:
                            temp.append(m['molecule_properties'][feature])   
                else:
                    mol_str = m['molecule_structures']['canonical_smiles']
                    mol_chem = Chem.MolFromSmiles(mol_str)
                    if feature == 'RA':
                        temp.append(ra_nn_scorer.predict(mol_str))
                    elif feature == 'SYBA':
                        temp.append(syba.predict(mol_str))
                    elif feature == 'SA':
                        temp.append(sascorer.calculateScore(mol_chem))
                    elif feature == 'NP':
                        temp.append(npscorer.scoreMol(mol_chem,fs))
                    
            self.result_dict[feature] = temp 
        
        self.table = wx.grid.Grid(self,1)
        num2 = 0
        if self.num <= 10:
            num2 = self.num
        else:
            num2 = 10
        self.table.CreateGrid(num2,len(list(self.result_dict.keys())))
        count = 0
        for item in self.result_dict.keys():
            self.table.SetColLabelValue(count,item)
            row = 0
            for val in self.result_dict[item]:
                self.table.SetCellValue(row,count,str(val))
                row += 1
                if row == 10:
                    break
            count += 1
        print(self.result_dict)
        self.table.AutoSize()
        save_button = wx.Button(self,label = "save(.csv)")
        save_button.Bind(wx.EVT_BUTTON,self.click_save_data)
        visual_button = wx.Button(self,label = "visualization")
        visual_button.Bind(wx.EVT_BUTTON,self.click_visualize)
        cluster_button = wx.Button(self,label = "clustering")
        cluster_button.Bind(wx.EVT_BUTTON,self.click_run_cluster)
        pca_button = wx.Button(self,label = 'PCA')
        pca_button.Bind(wx.EVT_BUTTON,self.click_run_pca)
        nn_button = wx.Button(self,label = 'Neural Network')
        nn_button.Bind(wx.EVT_BUTTON,self.click_run_nn)
        self.h5.Add(save_button)
        self.h5.Add(visual_button)
        self.h5.Add(cluster_button)
        self.h5.Add(pca_button)
        self.h5.Add(nn_button)
        self.h4.Add(self.table)
        df1 = pd.DataFrame(self.result_dict)
        df = df1.describe(include = 'all')
        l = ['count','unique','top','freq','mean','std','min','25%','50%','75%','max']
        self.table2 = wx.grid.Grid(self,1)
        self.table2.CreateGrid(len(l),df.shape[1]+1)
        self.table2.SetColLabelValue(0,'Description')
        for index,col in zip(range(1,len(df.columns)+1),df.columns):
            self.table2.SetColLabelValue(index,col)

        for row in range(len(l)):
            self.table2.SetCellValue(row,0,l[row])        
        
        for col in range(1,df.shape[1]+1):
            col_list = df.iloc[:,col-1]
            for row in range(df.shape[0]):
                self.table2.SetCellValue(row,col,str(col_list[row]))
        self.table2.AutoSize()
        label = wx.StaticText(self,label = 'Summary:')
        self.h4.Add(label)
        self.h4.Add(self.table2)         
        self.Layout()    
    # run PCA analysis based on the random dataset generated 
    #three types of combination of the parameters will be applied on PCA analysis
    # the resulting principal components and the corrsponding pie charts will be diplayed on the interface
    def click_run_pca(self,event):
        self.h7.Clear(True)
        self.h8.Clear(True)
        self.h9.Clear(True)
        resultd = self.result_dict
        df = pd.DataFrame(self.result_dict)
        strd = ['canonical_smiles','full_molformula','molecular_species','molecule_type','pref_name','standard_inchi']
        for item in self.result_dict.keys():
            if item in strd:
                codes,uniques = pd.factorize(self.result_dict[item])
                resultd[item] = codes
        df = pd.DataFrame(resultd)
        arr = df.to_dict('split')['data']
        for i in range(len(arr)):
            for j in range(len(arr[i])):
                if arr[i][j] == 'None':
                    arr[i][j] = 0
                else:
                    arr[i][j] = float(arr[i][j])
        mle_result = []
        full_result = []
        full_result_percent_comp = []
        arpack_result = []
        arr_scale = StandardScaler()
        arr = arr_scale.fit_transform(arr)
        try:
            if len(arr) >= len(arr[0]):
                pca = PCA(n_components = 'mle',svd_solver = 'full')
                pca.fit(arr)
                mle_result.append(pca.explained_variance_ratio_)
            for i in np.arange(0.1,1,0.1):
                pca = PCA(n_components = i,svd_solver = 'full')
                pca.fit(arr)
                full_result.append(pca.explained_variance_ratio_)
                full_result_percent_comp.append(i*100)
            for i in range(1,min(len(arr),len(arr[0])),1):
                pca = PCA(n_components = i,svd_solver = 'arpack')
                pca.fit(arr)
                arpack_result.append(pca.explained_variance_ratio_)

            if len(mle_result) > 0:
                self.table1 = wx.grid.Grid(self,1)
                self.table1.CreateGrid(len(mle_result[0]),2)
                self.table1.SetColLabelValue(0,'Principal Component')
                self.table1.SetColLabelValue(1,'Explained Variance Ratio(MLE)')
                for row in range(len(mle_result[0])):
                    self.table1.SetCellValue(row,0,'PC'+str(row+1))
                for row in range(len(mle_result[0])):
                    self.table1.SetCellValue(row,1,str(mle_result[0][row])) 
                self.table1.AutoSize()
                label1 = wx.StaticText(self,label = "LAPACK solver + Minka's MLE:" )
                vchart1 = wx.BoxSizer(wx.VERTICAL)
                vchart1.Add(label1)
                vchart1.Add(self.table1)
                plt.pie(mle_result[0],labels = ['PC'+str(i+1) for i in range(len(mle_result[0]))])
                plt.title('Pie Chart of '+ 'Explained Variance Ratio(MLE)')
                plt.savefig('pie1.png')
                plt.close()
                pie_image_b = wx.Image("pie1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                pie_image_bmp = wx.StaticBitmap(self,-1,pie_image_b)
                vchart1.Add(pie_image_bmp) 
                self.h7.Add(vchart1)
            if len(full_result) > 0:
                self.table2 = wx.grid.Grid(self,1)
                self.table2.CreateGrid(len(full_result[len(full_result)-1]),len(full_result)+1)
                self.table2.SetColLabelValue(0,'Principal Component')
                for col in range(1,len(full_result)+1):
                    self.table2.SetColLabelValue(col,'Explained variance Ratio(Components:'+str(full_result_percent_comp[col-1])+'%)')
                for row in range(len(full_result[len(full_result)-1])):
                    self.table2.SetCellValue(row,0,'PC'+str(row+1))
                for col in range(1,len(full_result)+1):
                    col_list = full_result[col-1]
                    for row in range(len(full_result[col-1])):
                        self.table2.SetCellValue(row,col,str(col_list[row])) 
                self.table2.AutoSize()
                label2 = wx.StaticText(self,label = 'LAPACK solver:')
                vchart2 = wx.BoxSizer(wx.VERTICAL)
                vchart2.Add(label2)
                vchart2.Add(self.table2)
                hchart2 = wx.BoxSizer(wx.HORIZONTAL)
                for i in range(len(full_result_percent_comp)):
                    plt.pie(full_result[i],labels = ['PC'+ str(i) for i in range(1,len(full_result[i])+1)])
                    plt.title('Pie Chart of '+ 'Explained Variance Ratio(Components:'+str(full_result_percent_comp[i])+'%)')
                    plt.savefig('pie1.png')
                    plt.close()
                    pie_image_b = wx.Image("pie1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                    pie_image_bmp = wx.StaticBitmap(self,-1,pie_image_b)
                    hchart2.Add(pie_image_bmp)
                vchart2.Add(hchart2)             
                self.h7.Add(vchart2)


            if len(arpack_result) > 0:
                self.table3 = wx.grid.Grid(self,1)
                self.table3.CreateGrid(len(arpack_result[len(arpack_result)-1]),len(arpack_result)+1)
                self.table3.SetColLabelValue(0,'Principal Component')
                for col in range(1,len(arpack_result)+1):
                    self.table3.SetColLabelValue(col,'Explained variance Ratio(components:'+str(len(arpack_result[col-1]))+')')
                for row in range(len(arpack_result[len(arpack_result)-1])):
                    self.table3.SetCellValue(row,0,'PC'+str(row+1))
                for col in range(1,len(arpack_result)+1):
                    col_list = arpack_result[col-1]
                    for row in range(len(arpack_result[col-1])):
                        self.table3.SetCellValue(row,col,str(col_list[row])) 
                self.table3.AutoSize()
                label3 = wx.StaticText(self,label = 'ARPACK solver:')
                vchart3 = wx.BoxSizer(wx.VERTICAL)
                vchart3.Add(label3)
                vchart3.Add(self.table3)
                hchart3 = wx.BoxSizer(wx.HORIZONTAL)
                for i in range(len(arpack_result)):
                    plt.pie(arpack_result[i],labels = ['PC'+ str(i) for i in range(1,len(arpack_result[i])+1)])
                    plt.title('Pie Chart of '+ 'Explained Variance Ratio(Components:'+str(len(arpack_result[i]))+')')
                    plt.savefig('pie1.png')
                    plt.close()
                    pie_image_b = wx.Image("pie1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                    pie_image_bmp = wx.StaticBitmap(self,-1,pie_image_b)
                    hchart3.Add(pie_image_bmp)
                vchart3.Add(hchart3)             
                self.h7.Add(vchart3)


            self.SetupScrolling()    
            self.Layout()
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    
    #the user could choose different combinations of the parameters for building the neural network
    #in addition, the user could choose certain feature of the random dataset as prediction feature
    
    def click_run_nn(self,event):
        self.h7.Clear(True)
        self.h8.Clear(True)
        self.h9.Clear(True)
        label = wx.StaticText(self,label = "Prediction Feature:" )
        self.pred_fea_choices =wx.ComboBox(self,choices = list(self.result_dict.keys()))
        self.choose_hidden_layers = wx.ComboBox(self,choices = [str(i) for i in range(1,5)])
        label2 = wx.StaticText(self,label = 'Parameters:')
        parameters = ['1 hidden layer + 32 neurons', '2 hidden layers + 32 and 64 neurons', '3 hidden layers + 32, 64, 128 neurons','4 hidden layers + 32,64,128,256 neurons']
        self.parameters_choices = wx.ComboBox(self,choices = parameters)
        label3 = wx.StaticText(self,label = 'Activation:')
        activations = ['relu','sigmoid','tanh']
        self.activations_choices = wx.ComboBox(self,choices = activations)
        label4 = wx.StaticText(self,label = 'Number of epochs:')
        self.num_epochs_choices = wx.ComboBox(self,choices = [str(i) for i in range(10,310,10)])
        label5 = wx.StaticText(self,label = 'batch size:')
        self.size_batch_choices = wx.ComboBox(self,choices = [str(i) for i in range(10,110,10)])
        run_button = wx.Button(self,label = 'run')
        run_button.Bind(wx.EVT_BUTTON,self.click_run_nn2)
        self.h7.Add(label)
        self.h7.Add(self.pred_fea_choices)
        self.h7.Add(label2)
        self.h7.Add(self.parameters_choices)
        self.h7.Add(label3)
        self.h7.Add(self.activations_choices)
        self.h7.Add(label4)
        self.h7.Add(self.num_epochs_choices)
        self.h7.Add(label5)
        self.h7.Add(self.size_batch_choices)
        self.h7.Add(run_button)
        self.SetupScrolling()
        self.Layout()
   #build a neural network based on the random dataset using MLP function from ChemML library 
  #the results will be shown as the mean squared errors and the line plot of actual values versus the predicted values
    def click_run_nn2(self,event):
        self.h8.Clear(True)
        self.h9.Clear(True)
        prediction_feature = self.pred_fea_choices.GetValue()
        para = self.parameters_choices.GetValue()
        act  = self.activations_choices.GetValue()
        num_epochs = self.num_epochs_choices.GetValue()
        size_batch = self.size_batch_choices.GetValue()
        if prediction_feature == '' or para == '' or act == '' or num_epochs == '' or size_batch == '':
            wx.MessageBox("Please choose all items!","Info",wx.OK|wx.ICON_INFORMATION)
        try:
            resultd = self.result_dict.copy()
            strd = ['canonical_smiles','full_molformula','molecular_species','molecule_type','pref_name','standard_inchi']
            for item in self.result_dict.keys():
                if item in strd:
                    codes,uniques = pd.factorize(self.result_dict[item])
                    resultd[item] = codes
            targetd = {}
            targetd[prediction_feature] = []
            print(resultd)
            print(self.result_dict)
            for item in list(resultd[prediction_feature]):
                if item == 'None':
                    targetd[prediction_feature].append(0)
                else:
                    targetd[prediction_feature].append(float(item))
            target_df = pd.DataFrame(targetd)
            del resultd[prediction_feature]        
            df = pd.DataFrame(resultd)
            arr = df.to_dict('split')['data']
            for i in range(len(arr)):
                for j in range(len(arr[i])):
                    if arr[i][j] == 'None':
                        arr[i][j] = 0
                    else:
                        arr[i][j] = float(arr[i][j])
            arr_scale = StandardScaler()
            arr = arr_scale.fit_transform(arr)
            print(target_df)
            print(df)
            num_hidden_layer = int(para[0])
            num_neurons = []
            if num_hidden_layer == 1:
                num_neurons = [32]
            elif num_hidden_layer == 2:
                num_neurons = [32,64]

            elif num_hidden_layer == 3:
                num_neurons = [32,64,128]

            elif num_hidden_layer == 4:
                num_neurons = [32,64,128,256]

            xtrain,xtest,ytrain,ytest = train_test_split(arr,target_df,test_size = 0.3,random_state = 40)
            yscale = StandardScaler()        
            ytrain = yscale.fit_transform(ytrain)
            m = MLP(nhidden = num_hidden_layer,nneurons = num_neurons,activations = [act]*num_hidden_layer,nepochs = int(num_epochs) ,batch_size = int(size_batch),loss = 'mean_squared_error',regression = True,nclasses = None,layer_config_file = None,opt_config_file = None)
            m.fit(X = xtrain, y = ytrain)
            predy = m.predict(xtest)
            predy = yscale.inverse_transform(predy)
            testy = ytest[prediction_feature]
            metricsd = regression_metrics(testy,list(predy))

            self.table3 = wx.grid.Grid(self,1)
            error_keys_list = list(metricsd.keys())[4:]
            self.table3.CreateGrid(1,len(error_keys_list))
            
            for col,item in zip(range(len(error_keys_list)),error_keys_list):
                self.table3.SetColLabelValue(col,item)
                self.table3.SetCellValue(0,col,str(metricsd[item]))
            self.table3.AutoSize()
            self.h8.Add(self.table3)
            plt.scatter(predy,testy,color = 'green')
            plt.title('Actual vs. Predicted ({})'.format(prediction_feature))
            plt.xlabel('Predicted Value')
            plt.ylabel('Actual Value')
            plt.savefig('scatter1.png')
            plt.close()
            scatter_image_b = wx.Image("scatter1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
            scatter_image_bmp = wx.StaticBitmap(self,-1,scatter_image_b)
            self.h9.Add(scatter_image_bmp)            
            
            self.SetupScrolling()
            self.Layout()
        except ValueError as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    # perform K-means clustering algorithm to the random dataset 
    #random method and k-means++ method are involved
    #the results shown as the line plots of inertia values versus the number of cluster and the scatter plots of the largest cluster group versus the number of the clusters 
    def click_run_cluster(self,event):
        self.h7.Clear(True)
        self.h8.Clear(True)
        self.h9.Clear(True)
        resultd = self.result_dict
        df = pd.DataFrame(self.result_dict)
        strd = ['canonical_smiles','full_molformula','molecular_species','molecule_type','pref_name','standard_inchi']
        for item in self.result_dict.keys():
            if item in strd:
                codes,uniques = pd.factorize(self.result_dict[item])
                resultd[item] = codes
        df = pd.DataFrame(resultd)
        arr = df.to_dict('split')['data']
        for i in range(len(arr)):
            for j in range(len(arr[i])):
                if arr[i][j] == 'None':
                    arr[i][j] = 0
                else:
                    arr[i][j] = float(arr[i][j])                    
        result_inertia1 = []
        result_inertia2 = []
        result_labels1 = []
        result_labels2 = []
        arr_scale = StandardScaler()
        arr = arr_scale.fit_transform(arr)
        try:
            for i in range(1,11):
                kmeans_cluster1 = KMeans(init = 'random',n_clusters = i,n_init = 10,max_iter = 300,random_state = 30)
                kmeans_cluster2 = KMeans(init = 'k-means++',n_clusters = i,n_init = 10,max_iter = 300,random_state = 30)
                kmeans_cluster1.fit(arr)
                kmeans_cluster2.fit(arr)
                result_inertia1.append(kmeans_cluster1.inertia_) 
                result_inertia2.append(kmeans_cluster2.inertia_)
                result_labels1.append(kmeans_cluster1.labels_)
                result_labels2.append(kmeans_cluster2.labels_)
            plt.plot(range(1,11),result_inertia1,color = 'black')
            plt.xlabel('number of clusters')
            plt.ylabel('Inertia value')
            plt.title('Inertia value vs. number of clusters(random method)')
            plt.savefig('kmeans1.png')
            k_image_b = wx.Image('kmeans1.png',wx.BITMAP_TYPE_PNG).ConvertToBitmap()
            k_image_bmp = wx.StaticBitmap(self,-1,k_image_b)
            self.h7.Add(k_image_bmp)
            plt.close()
            plt.plot(range(1,11),result_inertia1,color = 'black')
            plt.xlabel('number of clusters')
            plt.ylabel('Inertia value')
            plt.title('Inertia value vs. number of clusters(k-means++ method)')
            plt.savefig('kmeans2.png')
            k_image_b = wx.Image('kmeans2.png',wx.BITMAP_TYPE_PNG).ConvertToBitmap()
            k_image_bmp = wx.StaticBitmap(self,-1,k_image_b)
            self.h7.Add(k_image_bmp)
            plt.close()
            most_labels1 = []
            most_labels2 = []
            for item1,item2 in zip(result_labels1,result_labels2):
                most_labels1.append(np.bincount(item1).argmax())
                most_labels2.append(np.bincount(item2).argmax())
            arr1 = np.array(range(1,11))
            arr2 = np.array(most_labels1)
            plt.scatter(arr1,arr2,color = 'green')
            plt.title('largest group in clusters(random method)')
            plt.xlabel('number of clusters')
            plt.ylabel('largest group')
            plt.savefig('scatter1.png')
            plt.close()
            scatter_image_b = wx.Image("scatter1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
            scatter_image_bmp = wx.StaticBitmap(self,-1,scatter_image_b)
            self.h8.Add(scatter_image_bmp)
            arr2 = np.array(most_labels2)
            plt.scatter(arr1,arr2,color = 'green')
            plt.title('largest group in clusters(k-means++ method)')
            plt.xlabel('number of clusters')
            plt.ylabel('largest group')
            plt.savefig('scatter2.png')
            plt.close()
            scatter_image_b = wx.Image("scatter2.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
            scatter_image_bmp = wx.StaticBitmap(self,-1,scatter_image_b)
            self.h8.Add(scatter_image_bmp)

            self.SetupScrolling()
            self.Layout()                
        except ValueError as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
            


   
   #save the resulting random dataset as CSV file     
    def click_save_data(self,event):
        df = pd.DataFrame(self.result_dict)
        file_dialog = wx.FileDialog(self,"Save csv File", wildcard = "CSV files (*.csv)|*.csv",style = wx.FD_SAVE)
        if file_dialog.ShowModal() == wx.ID_CANCEL:
            return
        filename = file_dialog.GetPath()
        try:
            df.to_csv(filename)
        except IOError:
            wx.MessageBox("This file can't be saved successfully.","Info",wx.OK|wx.ICON_INFORMATION)    
    # display histogram, pie chart, line plot, and scatter plot based on the feature(s) of random dataset selected by the user
    def click_visualize(self,event):
        self.h7.Clear(True)
        self.h8.Clear(True)
        self.h9.Clear(True)        
        self.choice1 = wx.ComboBox(self,choices = list(self.result_dict.keys()))
        self.choice2 = wx.ComboBox(self,choices = list(self.result_dict.keys()))
        self.choice3 = wx.ComboBox(self,choices = list(self.result_dict.keys()))
        self.image_bitmap = wx.StaticBitmap(self)
        label = wx.StaticText(self,label = 'vs.')
        hist_button = wx.Button(self,label = 'histogram')
        hist_button.Bind(wx.EVT_BUTTON,self.click_histogram)
        pie_chart_button = wx.Button(self,label = 'pie chart')
        pie_chart_button.Bind(wx.EVT_BUTTON,self.click_pie_chart)
        line_button = wx.Button(self,label = 'line plot')
        line_button.Bind(wx.EVT_BUTTON,self.click_line_plot)
        scatter_button = wx.Button(self,label = 'scatter plot')
        scatter_button.Bind(wx.EVT_BUTTON,self.click_scatter_plot)
        self.h7.Add(self.choice1)
        self.h7.Add(hist_button)
        self.h7.Add(pie_chart_button)
        self.h8.Add(self.choice2)
        self.h8.Add(label)
        self.h8.Add(self.choice3)
        self.h8.Add(line_button)
        self.h8.Add(scatter_button)
        self.h9.Add(self.image_bitmap)
        self.SetupScrolling()
        self.Layout()
    # generate and display the histogram of the feature
    def click_histogram(self,event):
        self.h9.Clear(True)
        fea = self.choice1.GetValue()
        arr = np.array(self.result_dict[fea])
        plt.figure(figsize = (15,7)) 
        plt.hist(arr,color = 'yellow',bins = 10)
        plt.xticks(rotation = 45)
        plt.xlabel(fea)
        plt.ylabel("frequency")
        plt.title('Distribution of '+fea)        
        plt.savefig('hist1.png')
        plt.close()
        hist_image_b = wx.Image("hist1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
        hist_image_bmp = wx.StaticBitmap(self,-1,hist_image_b)
        self.h9.Add(hist_image_bmp)
        self.SetupScrolling()
        
        self.Layout()
    # generate and display the pie chart of the feature
    # the data will be divided into serveral range for delivering the straingt view to the user
    def click_pie_chart(self,event):
        self.h9.Clear(True)
        feature = self.choice1.GetValue() 
        features_str = ['cx_logd', 'alogp','cx_logp', 'cx_most_apka', 'cx_most_bpka', 'full_mwt', 'mw_monoisotopic',  'psa','qed_weighted']
        features_num = ['aromatic_rings', 'cx_logd', 'alogp', 'cx_logp', 'cx_most_apka', 'cx_most_bpka', 'full_mwt', 'hba', 'hba_lipinski', 'heavy_atoms', 'hbd_lipinski','mw_monoisotopic', 'num_lipinski_ro5_violations', 'num_ro5_violations', 'psa', 'rtb']
        bins_result = None
        if feature in features_num:
            arr = []
            if feature in features_str:
                for item in self.result_dict[feature]:
                    if item != 'None':
                        arr.append(float(item))
                    else:
                        arr.append(np.nan)
            else:
                for item in self.result_dict[feature]:
                    if item != 'None':
                        arr.append(item)
                    else:
                        arr.append(np.nan)
            df = pd.DataFrame()
            df[feature] = arr
            print(arr)
            df.fillna(0)
            df['partition'] = pd.qcut(df[feature],q = 5, duplicates= 'drop')
            bins_result = df['partition'].value_counts()               
            print(bins_result.index)
            display(bins_result)               
        if feature not in features_num:                    
            df = pd.DataFrame()
            df[feature] = self.result_dict[feature]
            bins_result = df[feature].value_counts()
            print(bins_result.index)
            display(bins_result) 
            print(type(bins_result))     
        plt.pie(bins_result,labels = bins_result.index)
        plt.title('pie chart of '+feature)
        plt.savefig('pie1.png')
        plt.close()
        pie_image_b = wx.Image("pie1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
        pie_image_bmp = wx.StaticBitmap(self,-1,pie_image_b)
        self.h9.Add(pie_image_bmp)
        self.SetupScrolling()
        
        self.Layout()
    # generate and display the line plot to indicate the relation between two features chosen
    # the x axis will be set to large scale if the size of data is large
    def click_line_plot(self,event):
        self.h9.Clear(True)
        features_num = ['RA','SYBA','NP','SA','aromatic_rings', 'cx_logd', 'alogp', 'cx_logp', 'cx_most_apka', 'cx_most_bpka', 'full_mwt', 'hba', 'hba_lipinski', 'heavy_atoms', 'hbd_lipinski','mw_monoisotopic', 'num_lipinski_ro5_violations', 'num_ro5_violations', 'psa', 'rtb']
        feature1 = self.choice3.GetValue() 
        feature2 = self.choice2.GetValue() 
        arr = np.array(self.result_dict[feature1])
        arr2 = np.array(self.result_dict[feature2])
        plt.figure(figsize = (15,7))         
        plt.plot(arr,arr2,color = 'black')
        
        if feature1 in features_num:
            plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(15))
        if feature2 in features_num:    
            plt.gca().yaxis.set_major_locator(ticker.MaxNLocator(15))                    
        plt.title(self.choice2.GetValue() + ' vs. '+ self.choice3.GetValue())
        plt.xlabel(self.choice2.GetValue())
        plt.ylabel(self.choice3.GetValue())
        plt.savefig('line1.png')
        plt.close()
        line_image_b = wx.Image("line1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
        line_image_bmp = wx.StaticBitmap(self,-1,line_image_b)
        self.h9.Add(line_image_bmp)
        self.SetupScrolling()
        
        self.Layout()
    # generate and display the scatter plots to indicate the relation between two features chosen
    # the x axis will be set to large scale if the size of data is large
    def click_scatter_plot(self,event):
        self.h9.Clear(True)
        features_num = ['RA','SYBA','NP','SA','aromatic_rings', 'cx_logd', 'alogp', 'cx_logp', 'cx_most_apka', 'cx_most_bpka', 'full_mwt', 'hba', 'hba_lipinski', 'heavy_atoms', 'hbd_lipinski','mw_monoisotopic', 'num_lipinski_ro5_violations', 'num_ro5_violations', 'psa', 'rtb']
        feature1 = self.choice3.GetValue() 
        feature2 = self.choice2.GetValue() 
        arr = np.array(self.result_dict[feature1])
        arr2 = np.array(self.result_dict[feature2])
        plt.figure(figsize = (15,7))         
        plt.scatter(arr,arr2,color = 'green')
        
        if feature1 in features_num:
            plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(15))
        if feature2 in features_num:    
            plt.gca().yaxis.set_major_locator(ticker.MaxNLocator(15))        

        plt.title(self.choice2.GetValue() + ' vs. '+ self.choice3.GetValue())
        plt.xlabel(self.choice2.GetValue())
        plt.ylabel(self.choice3.GetValue())
        plt.savefig('scatter.png')
        plt.close()
        scatter_image_b = wx.Image("scatter.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
        scatter_image_bmp = wx.StaticBitmap(self,-1,scatter_image_b)
        self.h9.Add(scatter_image_bmp)
        self.SetupScrolling()
        
        self.Layout()
    
    
   
        

In [6]:
#create button for user to upload the dataset of chemical molecules in SMILES format
class score_tab_content(scrolled.ScrolledPanel):
    def __init__(self,parent):
        scrolled.ScrolledPanel.__init__(self,parent)
        s = wx.BoxSizer(wx.VERTICAL)
        label = wx.StaticText(self,label = "Calculate scores for molecules:")
        label1 = wx.StaticText(self,label = "Molecules:")
        upload_button = wx.Button(self,label = "upload csv file")
        upload_button.Bind(wx.EVT_BUTTON,self.click_run_score)
        label2 = wx.StaticText(self,label = 'Results:')
        h = wx.BoxSizer(wx.HORIZONTAL)
        h.Add(label1)
        h.Add(upload_button)
        s.Add(label)
        s.Add(h)
        self.hsizer1 = wx.BoxSizer(wx.HORIZONTAL)
        self.hsizer2 = wx.BoxSizer(wx.HORIZONTAL)
        self.hsizer3 = wx.BoxSizer(wx.HORIZONTAL)
        sl1 = wx.StaticLine(self,size = (4500,1),style = wx.LI_HORIZONTAL)
        sl2 = wx.StaticLine(self,size = (4500,1),style = wx.LI_HORIZONTAL)
        self.v = wx.BoxSizer(wx.VERTICAL)
        self.v.Add(label2)
        self.v.Add(self.hsizer1)
        self.v.Add(sl1)
        self.v.Add(self.hsizer2)
        self.v.Add(sl2)
        self.v.Add(self.hsizer3)
        s.Add(self.v)
        self.SetSizer(s)
        self.Layout()
    # calculate 5 types of scores of each molecules
    # sort the resulting scores into different range
    #display the results as histograms for user to have the clear view of classifying the molecules as easy-to-synthesized, hard-to-synthesized. etc
    #display the smiles of the molecules that is located on the extreme level like "very easy"
    def click_run_score(self,event):
        self.hsizer1.Clear(True)
        self.hsizer2.Clear(True)
        self.hsizer3.Clear(True)
        file_dialog = wx.FileDialog(self,"Choose CSV File", wildcard = "csv files (*.csv)|*.csv",style = wx.FD_OPEN|wx.FD_FILE_MUST_EXIST)
        if file_dialog.ShowModal() == wx.ID_CANCEL:
            return
        filename = file_dialog.GetPath()
        try:
            df = pd.read_csv(filename)
            mol_list = df.iloc[:,0]
#             ra_nn_scorer = RAscore_NN.RAScorerNN()
#             syba = SybaClassifier()
#             syba.fitDefaultScore()
#             model_file = open('publicnp.model','rb')
#             fs = pickle.load(model_file)
            score_list = {'ra':[],'syba':[],'sa':[],'np':[],'qed':[]}
            for mol_str in mol_list:
                mol_chem = Chem.MolFromSmiles(mol_str)
                score_list['ra'].append(ra_nn_scorer.predict(mol_str))
                score_list['syba'].append(syba.predict(mol_str))
                score_list['sa'].append(sascorer.calculateScore(mol_chem))
                score_list['np'].append(npscorer.scoreMol(mol_chem,fs))
                score_list['qed'].append(QED.qed(mol_chem))
            ra_list = []
            syba_list = []
            sa_list = []
            np_list = []
            qed_list = []
            level1 = ['very easy','easy','average','difficult','very difficult']
            level2 = ['easy','difficult']
            level3 = ['very unlikely','unlikely','neutral','likely','very likely']
            for item in score_list.keys():
                if item == 'ra':
                    df = pd.DataFrame()
                    df['ra'] = score_list[item]
                    df['partition'] = pd.cut(df['ra'],bins = [0,0.2,0.4,0.6,0.8,1],labels = level1)
                    plt.figure(figsize = (15,7)) 
                    plt.hist(df['partition'],color = 'green')
                    plt.xlabel('level')
                    plt.ylabel("frequency")
                    plt.title('Retrosynthetic Accessibility Score')        
                    plt.savefig('hist1.png')
                    plt.close()
                    hist_image_b = wx.Image("hist1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                    hist_image_bmp = wx.StaticBitmap(self,-1,hist_image_b)
                    ra_score = list(score_list[item])
                    label1 = wx.StaticText(self,label = 'Easiest molecule to retrosynthesize in dataset: {}'.format(mol_list[ra_score.index(max(ra_score))]))
                    label2 = wx.StaticText(self,label = 'Most difficult molecule to retrosynthesize in dataset: {}'.format(mol_list[ra_score.index(min(ra_score))]))
                    h0 = wx.BoxSizer(wx.VERTICAL)
                    h0.Add(label1)
                    h0.Add(label2)
                    h0.Add(hist_image_bmp)
                    self.hsizer1.Add(h0)      
                if item == 'sa':
                    df = pd.DataFrame()
                    df['sa'] = score_list[item]
                    df['partition'] = pd.cut(df['sa'],bins = [1,3,5,7,9,11],labels = level1)
                    plt.figure(figsize = (15,7)) 
                    plt.hist(df['partition'],color = 'green')
                    plt.xlabel('level')
                    plt.ylabel("frequency")
                    plt.title('Synthetic Accessibility Score')        
                    plt.savefig('hist1.png')
                    plt.close()
                    hist_image_b = wx.Image("hist1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                    hist_image_bmp = wx.StaticBitmap(self,-1,hist_image_b)
                    sa_score = list(score_list[item])
                    label1 = wx.StaticText(self,label = 'Easiest molecule to synthesize in dataset: {}'.format(mol_list[sa_score.index(min(sa_score))]))
                    label2 = wx.StaticText(self,label = 'Most difficult molecule to synthesize in dataset: {}'.format(mol_list[sa_score.index(max(sa_score))]))
                    h1 = wx.BoxSizer(wx.VERTICAL)
                    h1.Add(label1)
                    h1.Add(label2)
                    h1.Add(hist_image_bmp)
                    self.hsizer1.Add(h1)
                
                if item == 'syba':
                    df = pd.DataFrame()
                    df['syba'] = score_list[item]
                    df['partition'] = pd.cut(df['syba'],bins = [-np.inf,0,np.inf],labels = level2)
                    plt.figure(figsize = (15,7)) 
                    plt.hist(df['partition'],color = 'green')
                    plt.xlabel('level')
                    plt.ylabel("frequency")
                    plt.title('Synthetic Bayesian Classifier Score')        
                    plt.savefig('hist1.png')
                    plt.close()
                    hist_image_b = wx.Image("hist1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                    hist_image_bmp = wx.StaticBitmap(self,-1,hist_image_b)
                    syba_score = list(score_list[item])
                    partition_list = df['partition'].to_list()
                    l1 = 'None'
                    l2 = 'None'
                    if 'easy'in partition_list:
                        l1 = mol_list[partition_list.index('easy')]
                    if 'difficult' in partition_list:
                        l2 = mol_list[partition_list.index('difficult')]
                    label1 = wx.StaticText(self,label = 'Easy molecule to synthesize in dataset:(One example) {}'.format(l1))
                    label2 = wx.StaticText(self,label = 'Difficult molecule to synthesize in dataset:(One example) {}'.format(l2))
                    
                    h2 = wx.BoxSizer(wx.VERTICAL)
                    h2.Add(label1)
                    h2.Add(label2)
                    h2.Add(hist_image_bmp)                    
                    self.hsizer2.Add(h2)
                
                if item == 'np':
                    df = pd.DataFrame()
                    df['np'] = score_list[item]
                    df['partition'] = pd.cut(df['np'],bins = [-5,-3,-1,1,3,5],labels = level3)
                    plt.figure(figsize = (15,7)) 
                    plt.hist(df['partition'],color = 'green')
                    plt.xlabel('level')
                    plt.ylabel("frequency")
                    plt.title('Natural Product-likeness Score')        
                    plt.savefig('hist1.png')
                    plt.close()
                    hist_image_b = wx.Image("hist1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                    hist_image_bmp = wx.StaticBitmap(self,-1,hist_image_b)
                    np_score = list(score_list[item])
                    label1 = wx.StaticText(self,label = 'Molecule that is most likely to be a natural product in dataset: {}'.format(mol_list[np_score.index(max(np_score))]))
                    label2 = wx.StaticText(self,label = 'Molecule that is least likely to be a natural product in dataset: {}'.format(mol_list[np_score.index(min(np_score))]))
                    h3 = wx.BoxSizer(wx.VERTICAL)
                    h3.Add(label1)
                    h3.Add(label2)
                    h3.Add(hist_image_bmp)
                    self.hsizer2.Add(h3)                              
                if item == 'qed':
                    df = pd.DataFrame()
                    df['qed'] = score_list[item]
                    df['partition'] = pd.cut(df['qed'],bins = [-5,-3,-1,1,3,5],labels = level3)
                    plt.figure(figsize = (15,7)) 
                    plt.hist(df['partition'],color = 'green')
                    plt.xlabel('level')
                    plt.ylabel("frequency")
                    plt.title('Quantitative Estimate of Drug-likeness Score')        
                    plt.savefig('hist1.png')
                    plt.close()
                    hist_image_b = wx.Image("hist1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                    hist_image_bmp = wx.StaticBitmap(self,-1,hist_image_b)
                    qed_score = list(score_list[item])
                    label1 = wx.StaticText(self,label = 'Molecule that is most likely to be a drug candidate in dataset: {}'.format(mol_list[qed_score.index(max(qed_score))]))
                    label2 = wx.StaticText(self,label = 'Molecule that is least likely to be a drug candidate in dataset: {}'.format(mol_list[qed_score.index(min(qed_score))]))
                    h4 = wx.BoxSizer(wx.VERTICAL)
                    h4.Add(label1)
                    h4.Add(label2)
                    h4.Add(hist_image_bmp)                    
                    self.hsizer3.Add(h4)                                
            self.SetupScrolling()
            self.Layout()

                
        except IOError:
            wx.MessageBox("This file can't be opened.","Info",wx.OK|wx.ICON_INFORMATION)
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)

    

In [7]:
# rating that the "user" molecule give to the "item" molecule will be calculated by the structural similarity between 
# the "user" and the "item", the average molecular weight between two molecules, the number of common substrcuts between two molecules.
# and average of five types of scores mentioned above
# this functions create the rating matrix
def process_rating(mol_list,user_list):
    df = pd.DataFrame()
    for mol_str in mol_list:
        mol1_smi = Chem.MolFromSmiles(mol_str)
        mol1_sma = Chem.MolFromSmarts(mol_str)
        rating = []
        for user in user_list:
            rating_content = []
            mol2_smi = Chem.MolFromSmiles(user)
            rating_content.append(len(list(mol2_smi.GetSubstructMatches(mol1_sma))))
            mol1_fp = Chem.RDKFingerprint(mol1_smi)
            mol2_fp = Chem.RDKFingerprint(mol2_smi)
            rating_content.append(DataStructs.TanimotoSimilarity(mol1_fp,mol2_fp))
            mol1_mw = Descriptors.ExactMolWt(mol1_smi)
            mol2_mw = Descriptors.ExactMolWt(mol2_smi)
            rating_content.append((mol1_mw + mol2_mw)/2)
            mol1_ra = ra_nn_scorer.predict(mol_str)
            mol2_ra = ra_nn_scorer.predict(user)
            rating_content.append((mol1_ra+mol2_ra)/2)
            mol1_syba = syba.predict(mol_str)
            mol2_syba = syba.predict(user)
            rating_content.append((mol1_syba+mol2_syba)/2)
            mol1_sa = sascorer.calculateScore(mol1_smi)
            mol2_sa = sascorer.calculateScore(mol2_smi)
            rating_content.append((mol1_sa+mol2_sa)/2)
            mol1_np = npscorer.scoreMol(mol1_smi,fs)
            mol2_np = npscorer.scoreMol(mol2_smi,fs)
            rating_content.append((mol1_np+mol2_np)/2)
            mol1_qed = QED.qed(mol1_smi)
            mol2_qed = QED.qed(mol2_smi)
            rating_content.append((mol1_qed+mol2_qed)/2)
            normalized_rating_content = normalize([np.array(rating_content)])
            rating.append(np.mean(normalized_rating_content))
        df[mol_str] = rating
        df.index = user_list
    return df
# this function calculates cosine similarities between users
def process_similarity(user_rating_df,user_list):
    df = pd.DataFrame()
    user_rating_list = user_rating_df.to_dict('split')['data']
    for user,item1 in zip(user_list,user_rating_list):
        similarity_val_list = []
        for item2 in user_rating_list:
            similarity_value = cosine_similarity([item1],[item2])[0][0]
            similarity_val_list.append(similarity_value)
        df[user] = similarity_val_list
    df.index = user_list
    return df
# the accuracy rate is calculated by the average of cosine_similarity between the ratings given from the test molecule to the top 10 molecules,
# and the ratings given by the most similar "user" to its top 10 molecules
def accuracy_rate(test_mol_rating,top10mols):
    ar_list = []
    testmolratings = []
    top10ratings = []
    
    for item in top10mols.index:
        similarity_value = cosine_similarity([[test_mol_rating[item]]],[[top10mols.loc[item]]])[0][0]
        testmolratings.append(test_mol_rating[item])
        top10ratings.append(top10mols.loc[item])
        ar_list.append(similarity_value)
    return np.mean(ar_list),testmolratings,top10ratings

In [8]:
#recommender system with collaborative filtering technique
#user need to upload two datasets of molecules in SMILES format
# the first dataset is treated as "item" dataset, the other one is treated as "user"  dataset
#the "user" will rate each "item" with different properties that the moldecules have such as molecular weight
#the rating matrix built from the "users"'ratings will be used to determine the top 10 molecules that the user will recommend
class rs_tab_content(scrolled.ScrolledPanel):
    def __init__(self,parent):
        wx.Panel.__init__(self,parent)
        s = wx.BoxSizer(wx.VERTICAL)
        label1 = wx.StaticText(self,label = "Molecules:")
        upload_button = wx.Button(self,label = "upload csv file: >= 10 molecules")
        upload_button.Bind(wx.EVT_BUTTON,self.click_upload_mols)
        label2 = wx.StaticText(self,label = 'Users:')
        upload2_button = wx.Button(self,label = 'upload csv file >= 10 molecules')
        upload2_button.Bind(wx.EVT_BUTTON,self.click_upload_users)
        run_button = wx.Button(self,label = 'run')
        run_button.Bind(wx.EVT_BUTTON,self.click_run_rs)

        label3 = wx.StaticText(self,label = 'Results:')
        self.mol_df = None
        self.user_df = None
        self.mol_list = []
        self.user_list = []
        self.user_rating_df = None
        self.user_similarity = None
        h = wx.BoxSizer(wx.HORIZONTAL)
        h.Add(label1)
        h.Add(upload_button)
        h.Add(label2)
        h.Add(upload2_button)
        h.Add(run_button)
        s.Add(h)
        s.Add(label3)
        self.v = wx.BoxSizer(wx.VERTICAL)
        s.Add(self.v)
        self.SetSizer(s)
    #upload the "item" dataset
    def click_upload_mols(self,event):
        file_dialog = wx.FileDialog(self,"Choose CSV File", wildcard = "csv files (*.csv)|*.csv",style = wx.FD_OPEN|wx.FD_FILE_MUST_EXIST)
        if file_dialog.ShowModal() == wx.ID_CANCEL:
            return
        filename = file_dialog.GetPath()
        try:
            self.mol_df = pd.read_csv(filename)
            display(self.mol_df)
        except IOError:
            wx.MessageBox("This file can't be opened.","Info",wx.OK|wx.ICON_INFORMATION)
    #upload the "user" dataset
    def click_upload_users(self,event):
        file_dialog = wx.FileDialog(self,"Choose CSV File", wildcard = "csv files (*.csv)|*.csv",style = wx.FD_OPEN|wx.FD_FILE_MUST_EXIST)
        if file_dialog.ShowModal() == wx.ID_CANCEL:
            return
        filename = file_dialog.GetPath()
        try:
            self.user_df = pd.read_csv(filename)
            display(self.user_df)
        except IOError:
            wx.MessageBox("This file can't be opened.","Info",wx.OK|wx.ICON_INFORMATION)  
    #build a recommender system  
    #creating rating matrix and similarity dataframe
    #display the top 10 molecules of the top user
    def click_run_rs(self,event):
        self.v.Clear(True)
        try:
            self.score_list = {'ra':[],'syba':[],'sa':[],'np':[],'qed':[]}
            self.mol_list = self.mol_df.iloc[:,0].to_list()
            self.user_list = self.user_df.iloc[:,0].to_list()
            self.user_rating_df = process_rating(self.mol_list,self.user_list)
            self.user_similarity = process_similarity(self.user_rating_df,self.user_list)
            average_df = self.user_similarity.mean(axis = 1).sort_values(ascending = False)
            top_user = average_df.index[0]
            top_10_molecules = self.user_rating_df.loc[top_user]
            top_10_molecules = top_10_molecules.squeeze()
            top_10_molecules = top_10_molecules.sort_values(ascending = False)
            results = top_10_molecules[:10].index.to_list()

            self.rstable = wx.grid.Grid(self,1)
            self.rstable.CreateGrid(len(results),1)
            self.rstable.SetColLabelValue(0,'Top 10 molecules:')
            for row in range(len(results)):
                self.rstable.SetCellValue(row,0,results[row])
            self.rstable.AutoSize()    
            self.v.Add(self.rstable)
            label = wx.StaticText(self,label = 'Recommend molecules:')
            h2 = wx.BoxSizer(wx.HORIZONTAL)
            label2 = wx.StaticText(self,label = 'Enter a chemical molecule(smiles): ')
            self.enter_mol = wx.TextCtrl(self,size = (500,50),style = wx.TE_MULTILINE) 
            run_rs_button = wx.Button(self,label = 'run')
            run_rs_button.Bind(wx.EVT_BUTTON,self.click_run_rs2)
            h2.Add(label2)
            h2.Add(self.enter_mol)
            h2.Add(run_rs_button)
            self.v2 = wx.BoxSizer(wx.VERTICAL)
            self.v.Add(label)
            self.v.Add(h2)
            self.v.Add(self.v2)
            self.SetupScrolling()        
            self.Layout()
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)
    #recommend the test "user" with the top 10 molecules that the most similar user has
    #display the line plot of the ratings given from test user to the top 10 molecules versus the ratings given from
    #the most similar user to the top 10 molecules
    # the accuracy rate of the recommendation will also be displayed
    def click_run_rs2(self,event):
        self.v2.Clear(True)
        try:
            mol = self.enter_mol.GetValue().strip()
            if mol in self.user_list:
                top_10_molecules = self.user_rating_df.loc[mol]
                top_10_molecules = top_10_molecules.squeeze()
                top_10_molecules = top_10_molecules.sort_values(ascending = False)
                results = top_10_molecules[:10].index.to_list()

                self.rstable2 = wx.grid.Grid(self,1)
                self.rstable2.CreateGrid(len(results),1)
                self.rstable2.SetColLabelValue(0,'Top 10 molecules:')
                for row in range(len(results)):
                    self.rstable2.SetCellValue(row,0,results[row])
                self.rstable2.AutoSize()
                accuracyrate, testmolratings, top10ratings = accuracy_rate(self.user_rating_df.loc[mol],top_10_molecules[:10])
                arr = np.array(['mol'+str(i) for i in range(1,11)])
                arr2 = np.array(testmolratings)
                arr3 = np.array(top10ratings)
                plt.figure(figsize = (15,7))         
                plt.plot(arr,arr2,color = 'black')
                plt.plot(arr,arr3,color = 'blue')  
                plt.title('Rating: '+ mol + ' vs. The most Similar User' )
                plt.xlabel('top 10 molecules')
                plt.ylabel('rating')
                plt.savefig('line1.png')
                plt.close()
                line_image_b = wx.Image("line1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                line_image_bmp = wx.StaticBitmap(self,-1,line_image_b)
                label = wx.StaticText(self,label = 'Accuracy Rate: '+str(accuracyrate*100)+'%')
                self.v2.Add(self.rstable2)
                self.v2.Add(line_image_bmp)
                self.v2.Add(label)
                self.SetupScrolling()
                self.Layout()
            else:
                u_rating_df = process_rating(self.mol_list,[mol])
                display(u_rating_df) 
                self.user_rating_df = pd.concat([self.user_rating_df,u_rating_df],ignore_index = True)
                self.user_list.append(mol)
                self.user_rating_df.index = self.user_list
                display(self.user_rating_df)
                self.user_similarity = process_similarity(self.user_rating_df,self.user_list)
                display(self.user_similarity)
                similar_users = self.user_similarity.loc[mol]
                similar_users = similar_users.squeeze()
                print(similar_users)
                top_similar_user = similar_users.sort_values(ascending = False).index[1]
                top_similar_user_rating = self.user_rating_df.loc[top_similar_user]
                top_similar_user_rating = top_similar_user_rating.squeeze()
                top_similar_user_rating = top_similar_user_rating.sort_values(ascending = False)
                display(top_similar_user_rating)
                results = top_similar_user_rating[:10].index.to_list()
                self.rstable2 = wx.grid.Grid(self,1)
                self.rstable2.CreateGrid(len(results),1)
                self.rstable2.SetColLabelValue(0,'Top 10 molecules:')
                for row in range(len(results)):
                    self.rstable2.SetCellValue(row,0,results[row])
                self.rstable2.AutoSize() 
                accuracyrate, testmolratings, top10ratings = accuracy_rate(u_rating_df.loc[mol],top_similar_user_rating[:10])
                arr = np.array(['mol'+str(i) for i in range(1,11)])
                arr2 = np.array(testmolratings)
                arr3 = np.array(top10ratings)
                plt.figure(figsize = (15,7))         
                plt.plot(arr,arr2,color = 'black')
                plt.plot(arr,arr3,color = 'blue')  
                plt.title('Rating: '+ mol + ' vs. The most Similar User' )
                plt.xlabel('top 10 molecules')
                plt.ylabel('rating')
                plt.savefig('line1.png')
                plt.close()
                line_image_b = wx.Image("line1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
                line_image_bmp = wx.StaticBitmap(self,-1,line_image_b)
                label = wx.StaticText(self,label = 'Accuracy Rate: '+str(accuracyrate*100)+'%')
                self.v2.Add(self.rstable2)
                self.v2.Add(line_image_bmp)
                self.v2.Add(label)
                self.SetupScrolling()
                self.Layout()
        except Error as error:
            wx.MessageBox(str(error),"Info",wx.OK|wx.ICON_INFORMATION)

In [9]:
#create interface for user to enter a chemical reaction and search the reactants with high structural similarities 
#if no possible reactants could be found, the orignial reactants will be returned as possible reactants
class reaction_tab_content(wx.Panel):
    def __init__(self,parent):
        wx.Panel.__init__(self,parent)
        self.s = wx.BoxSizer(wx.VERTICAL)
        self.s4 = wx.BoxSizer(wx.HORIZONTAL)
        label1 = wx.StaticText(self,label = "Reaction:")
        self.entertext1 = wx.TextCtrl(self,size = (500,100),style = wx.TE_MULTILINE)
        search_button = wx.Button(self,label = "Search with PubChem")
        search_button.Bind(wx.EVT_BUTTON, self.click_search)
        search_button2 = wx.Button(self,label = "Search with ChEMBL")
        search_button2.Bind(wx.EVT_BUTTON, self.click_search2)
        search_button3 = wx.Button(self,label = "Search with Dataset")
        search_button3.Bind(wx.EVT_BUTTON, self.click_search3)
        self.s4.Add(search_button)
        self.s4.Add(search_button2)
        self.s4.Add(search_button3)
        self.s_total = wx.BoxSizer(wx.VERTICAL)
        self.s.Add(label1)
        self.s.Add(self.entertext1)
        self.s.Add(self.s4)
        self.s2 = wx.BoxSizer(wx.VERTICAL)
        self.s3 = wx.BoxSizer(wx.VERTICAL)
        self.s_total.Add(self.s)
        self.s_total.Add(self.s2)
        self.s_total.Add(self.s3)
        self.SetSizer(self.s_total)
        self.current_reactants = []
        self.current_reactions = []
    # extract the reactants from the chemical reaction
    #search the similar reactants with the corresponding method chosen by the user
    #search with PubChemPy
    def click_search(self,event):
        self.s2.Clear(True)
        reaction = self.entertext1.GetValue().strip()
        reactants = reaction.split(">>")[0].split('.')
        reactants2 = []
        results = {}
        
        for reactant in reactants:
            mol = Chem.MolFromSmiles(reactant)
            [atom.SetAtomMapNum(0) for atom in mol.GetAtoms()]
            r = str(Chem.MolToSmiles(mol))
            reactants2.append(r)
            try:
                temp = pcp.get_compounds(r,'smiles',searchtype = 'similarity',listkey_count = 5)
                if len(temp) == 0:
                    results[r] = 'None'
                else:
                    results[r] = [item.isomeric_smiles for item in temp]
            except:
                results[r] = 'None'
        print(results)        
        results2 = {}
        #test reactions
        for reactant in reactants2:
            if results[reactant] == 'None':
                results2[reactant] = [reactant]
                continue
            else:
                rxn = AllChem.ReactionFromSmarts(reaction)
                reactant_index = reactants2.index(reactant)
                temp = [Chem.MolFromSmiles(r2) for r2 in reactants2]
                reactant_list = []
                for possible_reactant in results[reactant]:
                    temp[reactant_index] = Chem.MolFromSmiles(possible_reactant)
                    products = rxn.RunReactants(tuple(temp))
                    if len(products) > 0:
                        reactant_list.append(possible_reactant)
                        reaction_str = '.'.join([Chem.MolToSmiles(t) for t in temp]) + ">>" + '.'.join([Chem.MolToSmiles(prod) for prod in products[0]])                        
                        self.current_reactions.append(reaction_str)
                if len(reactant_list) == 0:
                    results2[reactant] = [reactant]
                else:
                    results2[reactant] = reactant_list
        self.current_reactants = results2
        self.h = wx.BoxSizer(wx.HORIZONTAL)
        count = 0
        for r in results2.keys():
            combo_box = wx.ComboBox(self,choices = results2[r])
            self.h.Add(combo_box)
            if count < len(reactants) - 1:
                plus_label = wx.StaticText(self,label = '+')
                self.h.Add(plus_label)
            count += 1
        produce_button = wx.Button(self,label = "Run")
        produce_button.Bind(wx.EVT_BUTTON,self.click_produce)
        label2 = wx.StaticText(self,label = "Results:")
        self.h.Add(produce_button)
        self.s2.Add(label2)
        self.s2.Add(self.h)
        self.Layout()
    #search with ChEMBL wrapper
    def click_search2(self,event):
        self.s2.Clear(True)
        reaction = self.entertext1.GetValue().strip()
        reactants = reaction.split(">>")[0].split('.')
        reactants2 = []
        results = {}
        
        for reactant in reactants:
            mol = Chem.MolFromSmiles(reactant)
            [atom.SetAtomMapNum(0) for atom in mol.GetAtoms()]
            r = str(Chem.MolToSmiles(mol))
            reactants2.append(r)
            similar_mols = new_client.similarity.filter(smiles = r,similarity = 80).only(['molecule_chembl_id'])
            if len(similar_mols)==0:
                results[r] = 'None' 
            else:
                results[r] = similar_mols
        results2 = {}
        #test reactions
        for reactant in reactants2:
            if results[reactant] == 'None':
                results2[reactant] = [reactant]
                continue
            else:
                rxn = AllChem.ReactionFromSmarts(reaction)
                reactant_index = reactants2.index(reactant)
                temp = [Chem.MolFromSmiles(r2) for r2 in reactants2]
                reactant_list = []
                for possible_reactant in results[reactant]:
                    m = new_client.molecule.get(possible_reactant['molecule_chembl_id'])
                    temp[reactant_index] = Chem.MolFromSmiles(m['molecule_structures']['canonical_smiles'])
                    products = rxn.RunReactants(tuple(temp))
                    if len(products) > 0:
                        reactant_list.append(m['molecule_structures']['canonical_smiles'])
                        reaction_str = '.'.join([Chem.MolToSmiles(t) for t in temp]) + ">>" + '.'.join([Chem.MolToSmiles(prod) for prod in products[0]])                        
                        self.current_reactions.append(reaction_str)                        
                if len(reactant_list) == 0:
                    results2[reactant] = [reactant]
                else:
                    results2[reactant] = reactant_list                
        self.current_reactants = results2

        self.h = wx.BoxSizer(wx.HORIZONTAL)
        count = 0
        for r in results2.keys():
            combo_box = wx.ComboBox(self,choices = results2[r])
            self.h.Add(combo_box)
            if count < len(reactants) - 1:
                plus_label = wx.StaticText(self,label = '+')
                self.h.Add(plus_label)
            count += 1
        produce_button = wx.Button(self,label = "Run")
        produce_button.Bind(wx.EVT_BUTTON,self.click_produce)
        label2 = wx.StaticText(self,label = "Results:")
        self.h.Add(produce_button)
        self.s2.Add(label2)
        self.s2.Add(self.h)
        self.Layout()
    #search with the uploaded dataset
    def click_search3(self,event):
        self.s2.Clear(True)
        reaction_input = self.entertext1.GetValue().strip()
        print(reaction_input)
        file_dialog = wx.FileDialog(self,"Choose Text File", wildcard = "TXT files (*.txt)|*.txt",style = wx.FD_OPEN|wx.FD_FILE_MUST_EXIST)
        if file_dialog.ShowModal() == wx.ID_CANCEL:
            return
        filename = file_dialog.GetPath()
        try:
            test_reactants = []
            fileupload = open(filename,"r")
            for line in fileupload:
                test_reactants.append(line.strip())
            reaction = self.entertext1.GetValue().strip()
            reactants = reaction.split(">>")[0].split('.')
            print(reaction)
            print(reactants)
            reactants2 = []
            results = {}
            
            for reactant in reactants:
                mol = Chem.MolFromSmiles(reactant)
                [atom.SetAtomMapNum(0) for atom in mol.GetAtoms()]
                r = str(Chem.MolToSmiles(mol))
                reactants2.append(r)
                mol1 = Chem.MolFromSmiles(r)
                temp = []
                for test_r in test_reactants:
                    mol2 = Chem.MolFromSmiles(test_r)
                    fp1 = Chem.RDKFingerprint(mol1)
                    fp2 = Chem.RDKFingerprint(mol2)
                    if DataStructs.TanimotoSimilarity(fp1,fp2) >= 0.8:
                        temp.append(test_r)
                if len(temp) == 0:
                    results[r] = 'None'
                else:
                    results[r] = temp

            results2 = {}
            #test reactions
            for reactant in reactants2:
                if results[reactant] == 'None':
                    results2[reactant] = [reactant]
                    continue
                else:
                    rxn = AllChem.ReactionFromSmarts(reaction)
                    reactant_index = reactants2.index(reactant)
                    temp = [Chem.MolFromSmiles(r2) for r2 in reactants2]
                    reactant_list = []
                    for possible_reactant in results[reactant]:
                        temp[reactant_index] = Chem.MolFromSmiles(possible_reactant)
                        products = rxn.RunReactants(tuple(temp))
                        if len(products) > 0:
                            reactant_list.append(possible_reactant)
                            reaction_str = '.'.join([Chem.MolToSmiles(t) for t in temp]) + ">>" + '.'.join([Chem.MolToSmiles(prod) for prod in products[0]])                        
                            self.current_reactions.append(reaction_str)                        

                    results2[reactant] = reactant_list
            self.current_reactants = results2
            self.h = wx.BoxSizer(wx.HORIZONTAL)
            count = 0
            for r in results2.keys():
                combo_box = wx.ComboBox(self,choices = results2[r])
                self.h.Add(combo_box)
                if count < len(reactants) - 1:
                    plus_label = wx.StaticText(self,label = '+')
                    self.h.Add(plus_label)
                count += 1
            produce_button = wx.Button(self,label = "Run")
            produce_button.Bind(wx.EVT_BUTTON,self.click_produce)
            label2 = wx.StaticText(self,label = "Results:")
            self.h.Add(produce_button)
            self.s2.Add(label2)
            self.s2.Add(self.h)
            self.Layout()        
            self.Show()      
        except IOError:
            wx.MessageBox("This file can't be opened.","Info",wx.OK|wx.ICON_INFORMATION)
     # use RDKit package to run the reaction with the possible reactants   
    # the image of chemical reaction will also be displayed on the interface
    def click_produce(self,event):
        self.s3.Clear(True)
        children = self.h.GetChildren()
        reactants_chosen = [] 
        for child in children:
            if isinstance(child.GetWindow(),wx.ComboBox):
                reactants_chosen.append(child.GetWindow().GetValue())
            else:
                continue
        rxn = AllChem.ReactionFromSmarts(self.entertext1.GetValue().strip())
        print(reactants_chosen)
        products = rxn.RunReactants(tuple([Chem.MolFromSmiles(r) for r in reactants_chosen]))
        reaction_str = '.'.join(reactants_chosen) + ">>" + '.'.join([Chem.MolToSmiles(prod) for prod in products[0]])
        print(reaction_str)
        rxn_img = Draw.ReactionToImage(AllChem.ReactionFromSmarts(reaction_str))
        display(rxn_img)
        rxn_img.save("rxn_image1.png")
        rxn_image_b = wx.Image("rxn_image1.png",wx.BITMAP_TYPE_PNG).ConvertToBitmap()
        rxn_image_bmp = wx.StaticBitmap(self,-1,rxn_image_b)
        reaction_label = wx.StaticText(self,label = reaction_str)
        save_pdf_button = wx.Button(self,label = "Save Reactants(.csv)")
        save_pdf_button.Bind(wx.EVT_BUTTON,self.click_save_reactants_csv)
        save_txt_button = wx.Button(self,label = "Save Reactants(.txt)")
        save_txt_button.Bind(wx.EVT_BUTTON,self.click_save_reactants_txt)
        save_reaction_csv_button = wx.Button(self,label = "Save Reactions(.csv)")
        save_reaction_csv_button.Bind(wx.EVT_BUTTON,self.click_save_reactions_csv)
        save_reaction_txt_button = wx.Button(self,label = "Save Reactions(.txt)")
        save_reaction_txt_button.Bind(wx.EVT_BUTTON,self.click_save_reactions_txt)
        h2 = wx.BoxSizer(wx.HORIZONTAL)
        h2.Add(save_pdf_button)
        h2.Add(save_txt_button)
        h2.Add(save_reaction_csv_button)
        h2.Add(save_reaction_txt_button)
        self.s3.Add(reaction_label)
        self.s3.Add(rxn_image_bmp)
        self.s3.Add(h2)
        self.Layout()
     # save the possible reactants as csv file    
    def click_save_reactants_csv(self,event):
        csv_f = open("reactants.csv","w")
        for r in self.current_reactants.keys():
            csv.writer(csv_f).writerow(self.current_reactants[r])
        csv_f.close()
        wx.MessageBox("This file is saved successfully. The filename is reactants.csv","Info",wx.OK|wx.ICON_INFORMATION)   
        
     # save the possible reactants as text file    
    def click_save_reactants_txt(self,event):
        txt_f = open("reactants.txt","a")
        for r in self.current_reactants.keys():
            s = ','.join(self.current_reactants[r])
            txt_f.write(s+'\n')
        txt_f.close()
        wx.MessageBox("This file is saved successfully. The filename is reactants.txt","Info",wx.OK|wx.ICON_INFORMATION) 
     # save the reactions generated by the possible reactants as csv file    
    def click_save_reactions_csv(self,event):
        df = pd.DataFrame()
        df["reactions"] = self.current_reactions
        df.to_csv("reactions.csv")
        wx.MessageBox("This file is saved successfully. The filename is reactions.csv","Info",wx.OK|wx.ICON_INFORMATION)   
     # save the reactions generated by the possible reactants as text file    
    def click_save_reactions_txt(self,event):   
        txt_f = open("reactions.txt","a")
        for r in self.current_reactions:
            txt_f.write(r+'\n')
        txt_f.close() 
        wx.MessageBox("This file is saved successfully. The filename is reactions.txt","Info",wx.OK|wx.ICON_INFORMATION)   


In [None]:
if __name__ == '__main__':
    app = wx.App()
    fr = proj_frame()
    app.MainLoop()

{'alogp': ['2.11', '1.33', '2.27', '1.46', '2.11', '1.46', '2.88', '3.11', '1.21', '9.29']}
[2.11, 1.33, 2.27, 1.46, 2.11, 1.46, 2.88, 3.11, 1.21, 9.29]
CategoricalIndex([(1.209, 1.434], (1.434, 1.85], (1.85, 2.174], (2.174, 2.926],
                  (2.926, 9.29]],
                 categories=[(1.209, 1.434], (1.434, 1.85], (1.85, 2.174], (2.174, 2.926], (2.926, 9.29]], ordered=True, dtype='category')


(1.209, 1.434]    2
(1.434, 1.85]     2
(1.85, 2.174]     2
(2.174, 2.926]    2
(2.926, 9.29]     2
Name: partition, dtype: int64

  kmeans_cluster1.fit(arr)
  kmeans_cluster2.fit(arr)
  kmeans_cluster1.fit(arr)
  kmeans_cluster2.fit(arr)
  plt.pie(mle_result[0],labels = ['PC'+str(i+1) for i in range(len(mle_result[0]))])


{'alogp': ['2.11', '1.33', '2.27', '1.46', '2.11', '1.46', '2.88', '3.11', '1.21', '9.29']}
{'alogp': ['2.11', '1.33', '2.27', '1.46', '2.11', '1.46', '2.88', '3.11', '1.21', '9.29']}


# References:

https://www.wxpython.org/

https://github.com/hachmannlab/chemml

https://github.com/reymond-group/RAscore

https://github.com/rdkit/rdkit/blob/master/Contrib/SA_Score/sascorer.py

https://www.rdkit.org/docs/source/rdkit.Chem.QED.html

https://github.com/rdkit/rdkit/blob/master/Contrib/NP_Score/npscorer.py

https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00439-2

https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

https://en.wikipedia.org/wiki/Elbow_method_(clustering)

https://towardsdatascience.com/intro-to-recommender-system-collaborative-filtering-64a238194a26

http://rdkit.blogspot.com/2015/01/chemical-reaction-notes-i.html