In [None]:
!pip install scikit-learn

In [None]:
import os
import statistics

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, RationalQuadratic, Matern
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, LeaveOneOut, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.utils import resample

In [None]:
def resampling(X_data, y_data, n_samples, noise):
    '''
    Bootstrap a dataset to n_samples rows and add Gaussian noise to the y data.

    Inputs: X_data = array of feature vectors, y_data = array of outputs,
    n_samples = desired final data size, noise = standard deviation of the
    zero-mean Gaussian distribution the noise is drawn from

    Outputs: resampled feature vectors and resampled, noise-perturbed outputs
    (the outputs are always returned as a NumPy float array)
    '''
    # resample picks the same random rows from X and y, so pairs stay aligned
    X_resample, y_resample = resample(X_data, y_data, n_samples=n_samples)
    # Convert explicitly so plain Python lists of outputs work as well as arrays
    y_resample = np.asarray(y_resample, dtype=float)
    # One independent noise draw per resampled output value
    noise_array = np.random.normal(0., noise, size=y_resample.shape)
    return X_resample, y_resample + noise_array

def scaleData(X_train,X_test,X_tot):
    '''
    Standardize three sets of feature vectors with one shared scaler.

    The scaler is fitted on X_train only and is then applied, unchanged, to
    X_train, X_test and X_tot (in that order).

    Returns: scaled X_train, scaled X_test, scaled X_tot
    '''
    fitted = preprocessing.StandardScaler()
    fitted.fit(X_train)  # statistics come from the training set alone
    return tuple(fitted.transform(block) for block in (X_train, X_test, X_tot))

def splitBootScale(x,y,bootSize = 800,trainSize = 0.8, noise = 0.5, randomState = 85):
    '''
    Splits data into train and test sets, bootstraps each set and scales the
    resulting feature vectors.

    Inputs: x = feature vectors, y = outputs, bootSize = total number of
    bootstrapped samples (train + test), trainSize = fraction of the data used
    for training, noise = standard deviation of the Gaussian noise added to the
    bootstrapped outputs, randomState = seed passed to train_test_split

    Outputs: scaled bootstrapped train features, scaled bootstrapped test
    features, bootstrapped train outputs, bootstrapped test outputs and the
    scaled version of the full input x

    The sizes of the split and of the two bootstrapped sets are printed.
    '''
    testSize = 1-trainSize
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, train_size = trainSize, test_size = testSize, random_state=randomState)
    print(len(X_train), len(X_test))

    # The bootstrapped sets keep the same train/test proportions as the split
    bootTrainSize = round(bootSize*trainSize)
    print(bootTrainSize)
    bootTestSize = round(bootSize*testSize)
    print(bootTestSize)

    # Train and test rows are bootstrapped separately so no test row leaks into training
    X_train_boot, y_train_boot = resampling(X_train, y_train, bootTrainSize, noise)
    X_test_boot, y_test_boot = resampling(X_test, y_test, bootTestSize, noise)

    X_train_bootScaled, X_test_bootScaled, x_scaled = scaleData(X_train_boot,X_test_boot,x)
    return X_train_bootScaled, X_test_bootScaled, y_train_boot, y_test_boot, x_scaled
    


In [None]:
'''
Three functions that are necessary to convert lists into strings and vice versa. For example,
converting the list [1,2,3] to the string "[1, 2, 3]". This is necessary in order to legibly store long
vectors in a .csv file using the Pandas module.

'''


def str2float(string):
    '''
    Parse a comma-separated string of numbers into a list of floats.

    Inputs: string such as "1.0, 2.5, 3"
    Outputs: list of floats, one per comma-separated field
    '''
    return [float(piece) for piece in string.split(',')]

def stringToList(vectors):
    '''
    Convert bracketed vector strings back into lists of floats.

    Inputs: sequence of strings such as "[1.0, 2.0, 3.0]"
    Outputs: list of lists of floats, in the same order as the input
    '''
    xList = []
    for text in vectors:
        # Drop the surrounding brackets, then parse the comma-separated numbers
        unwrapped = text.replace('[', '').replace(']', '')
        xList.append(str2float(unwrapped))
    return xList

def listToString(vectors):
    '''
    Convert each vector into a bracketed, comma-separated string.

    Inputs: sequence of vectors (each an iterable of values)
    Outputs: list of strings such as "[1, 2, 3]", one per input vector
    '''
    return ['[' + ', '.join(map(str, vectors[n])) + ']' for n in range(len(vectors))]

In [None]:
'''
Uploads information about detected species

Inputs: None
Outputs: List of feature vectors, log 10 column densities, column densities and SMILES strings
of each detected molecule
'''

def uploadIso():
    '''
    Loads the detected-species dataset from detectionDataset.csv in the current
    working directory.

    Inputs: None
    Outputs: list of feature vectors (parsed from the 'mol2vec' column),
    log 10 column densities, column densities (the 'N' column) and SMILES
    strings (the 'smiles' column) of each detected molecule
    '''
    fullPath = os.path.join(os.getcwd(), 'detectionDataset.csv')
    fullUpload = pd.read_csv(fullPath)

    # Vectors are stored as bracketed strings in the csv and must be parsed back
    xList = stringToList(list(fullUpload['mol2vec']))
    detectedSmiles = list(fullUpload['smiles'])

    # Column densities are read from the same file as the vectors and SMILES
    cd = np.asarray(fullUpload['N'])
    cdLog = np.log10(cd)

    return xList, cdLog, cd, detectedSmiles

In [None]:
'''
Function that performs K Means Clustering on the molecules in the dataset and returns the molecules 
that are in the cluster that contains the most detected species. 

Inputs: Feature vectors and corresponding smiles strings of entire dataset
Returns: Feature vectors and corresponding smiles strings of the molecules that are in the cluster
that contains the most detected species. Also saves these resulting molecules and vectors
in a file called commonCluster.csv
'''


def runClustering(smiles, vectors):
    '''
    Performs K Means Clustering (10 clusters) on the dataset and keeps the
    molecules of the cluster that contains the most detected species.

    Inputs: smiles = SMILES strings of the entire dataset, vectors = the
    corresponding feature vectors (same order as smiles)
    Returns: SMILES strings and feature vectors of the molecules in the cluster
    that contains the most detected species. The same molecules are also saved
    to commonCluster.csv in the current working directory.
    Raises: ValueError if a detected SMILES string from detectionDataset.csv
    does not occur in smiles
    '''
    clusterLabels = KMeans(n_clusters=10).fit(vectors).labels_

    fullPath = os.path.join(os.getcwd(), 'detectionDataset.csv')
    detectSmiles = list(pd.read_csv(fullPath)['smiles'])

    # Map every SMILES string to its first position so each lookup is O(1);
    # setdefault keeps the first occurrence, like list.index would return.
    firstIndex = {}
    for idx, smile in enumerate(smiles):
        firstIndex.setdefault(smile, idx)

    missing = [smile for smile in detectSmiles if smile not in firstIndex]
    if missing:
        raise ValueError(
            'detected SMILES not present in the clustered dataset: ' + repr(missing))

    # Cluster label of every detected molecule; the most frequent label wins
    clusterDetection = [int(clusterLabels[firstIndex[smile]]) for smile in detectSmiles]
    mostCommonCluster = statistics.mode(clusterDetection)

    members = [i for i, label in enumerate(clusterLabels) if label == mostCommonCluster]
    clusterVectors = [vectors[i] for i in members]
    clusterSmiles = [smiles[i] for i in members]

    # Vectors are written as "[a, b, c]" strings so stringToList can read them back
    clusterDF = pd.DataFrame({
        'smiles': clusterSmiles,
        'mol2vec': listToString(clusterVectors),
    })

    savePath = os.path.join(os.getcwd(), 'commonCluster.csv')
    clusterDF.to_csv(savePath)

    return clusterSmiles, clusterVectors
        
    

In [None]:
'''
Calculates the nearest neighbors to the detected molecules

Inputs: number of nearest neighbors to include
Returns: For each detected species, a list of triples describing its nearest neighbors. Each triple
contains the SMILES string of the neighbor, its distance to the detected species and the index of the
neighbor. Also saves the nearest neighbor SMILES strings and feature vectors in a file called closestNeighbors.csv

'''

def getClosestNeighbors(n_neighbors):
    '''
    Calculates the nearest neighbors (Euclidean distance between feature
    vectors) of every detected molecule among the molecules in commonCluster.csv.

    Inputs: n_neighbors = number of nearest neighbors to keep per detected molecule
    Returns: one list per detected molecule, each holding up to n_neighbors
    triples (SMILES string of the neighbor, distance to the detected molecule,
    row index of the neighbor in commonCluster.csv), sorted by distance.
    The rows of all neighbors (each row once) are also saved to
    closestNeighbors.csv in the current working directory.
    '''
    clusterPath = os.path.join(os.getcwd(), 'commonCluster.csv')
    clusters = pd.read_csv(clusterPath)
    clusterSmiles = list(clusters['smiles'])
    # Parse the stored vector strings once, outside the distance loops
    clusterArrays = [np.asarray(vec) for vec in stringToList(list(clusters['mol2vec']))]

    detectVectors, _, _, _ = uploadIso()

    completeList = []
    for detectVector in detectVectors:
        # The query point is the detected molecule itself
        testMol = np.asarray(detectVector)

        individualList = [
            (clusterSmiles[j], np.linalg.norm(testMol - clusterMol), j)
            for j, clusterMol in enumerate(clusterArrays)
        ]
        individualList.sort(key=lambda y: y[1])
        completeList.append(individualList[0:n_neighbors])

    # Collect neighbor row indices, keeping first-seen order and dropping repeats
    # (the same molecule can be a neighbor of several detected species)
    idxList = list(dict.fromkeys(
        tup[2] for subList in completeList for tup in subList))

    closestDF = clusters.iloc[idxList]

    savePath = os.path.join(os.getcwd(), 'closestNeighbors.csv')
    closestDF.to_csv(savePath)

    return completeList
        


In [None]:
'''
Function to delete multiple indices from a list
retrieved from https://thispointer.com/python-remove-elements-from-list-by-index/
'''
def deleteMultiple(list_object, indices):
    '''
    Delete the elements at the given positions from list_object, in place.

    Inputs: list_object = list to modify, indices = positions to remove
    (repeated positions are removed only once; positions at or beyond the end
    of the list are ignored)
    Outputs: None (list_object is modified in place)
    '''
    # Remove from the back so a pop never shifts a position that is still pending;
    # set() collapses repeats, which would otherwise pop extra, unintended elements.
    for idx in sorted(set(indices), reverse=True):
        if idx < len(list_object):
            list_object.pop(idx)


In [None]:
'''
Trains a Bayesian ridge regression model on all detected species, then returns column density
predictions for a list of new species

Inputs: feature vectors and SMILES strings of new molecules
Outputs: A triple containing the column density prediction information with the format:

(SMILES string, predicted column density, 1 sigma uncertainty of column density prediction)

'''


def newPredictions(X_test, testSmiles, bootSize = 800):
    '''
    Trains a BayesianRidge model on the bootstrapped detected species and
    predicts log 10 column densities for new molecules.

    Molecules whose SMILES string is already in the detected dataset are left
    out of the predictions. The input sequences are not modified.

    Inputs: X_test = feature vectors of the new molecules, testSmiles = their
    SMILES strings (same order), bootSize = number of bootstrapped training samples
    Outputs: list of triples
    (SMILES string, predicted log 10 column density, 1 sigma uncertainty of the prediction)
    sorted by increasing prediction; empty if every molecule is already detected
    '''
    xDetect, cdLog, cd, detectedSmiles = uploadIso()

    # Set membership keeps the filter linear in the number of candidates
    detectedSet = set(detectedSmiles)
    keep = [i for i, smile in enumerate(testSmiles) if smile not in detectedSet]
    if not keep:
        return []
    candidateSmiles = [testSmiles[i] for i in keep]
    candidateVectors = [X_test[i] for i in keep]

    X_train, y_train = resampling(xDetect, cdLog, bootSize, 0.5)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(candidateVectors)
    #BR_best_params = {'model__alpha_1': 100000.0, 'model__alpha_2': 100000.0, 'model__alpha_init': 100.0, 'model__lambda_1': 100000.0, 'model__lambda_2': 100000.0, 'model__lambda_init': 100.0, 'model__tol': 1e-07}
    model = BayesianRidge()
    # Fit on the scaled features: predictions below are made on scaled inputs,
    # so training and prediction must use the same feature scale
    model.fit(X_train_scaled, y_train)
    validationResults, valSD = model.predict(X_test_scaled, return_std = True)

    completeList = list(zip(candidateSmiles, validationResults, valSD))
    completeList.sort(key = lambda x: x[1])

    return completeList