In [None]:
!pip install scikit-learn

In [None]:
import os
import statistics

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, RationalQuadratic, Matern
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, LeaveOneOut, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.utils import resample

In [None]:
'''
Three functions that are necessary to convert lists into strings and vice versa. For example,
converting the list [1, 2, 3] to the string "[1, 2, 3]". This is necessary in order to legibly store long
vectors in a .csv file using the Pandas module.

'''


def str2float(string):
    '''
    Parses a comma-separated string of numbers into a list of floats.

    Input: String of comma-separated numeric fields (no surrounding brackets)
    Returns: List holding one float per comma-separated field
    '''
    return [float(field) for field in string.split(',')]

def stringToList(vectors):
    '''
    Converts bracketed vector strings back into lists of floats.

    Input: Sequence of strings such as "[1, 2, 3]"
    Returns: List with one list of floats per input string
    '''
    # First pass: drop every '[' and ']' character from each string.
    unbracketed = [text.replace('[', '').replace(']', '') for text in vectors]

    # Second pass: split on commas and convert each field to a float.
    return [str2float(text) for text in unbracketed]

def listToString(vectors):
    '''
    Converts each vector into a bracketed, comma-separated string.

    Input: Sequence of vectors (each an iterable of values)
    Returns: List of strings, one per vector, e.g. [1, 2, 3] -> "[1, 2, 3]"
    '''
    joined = [', '.join(map(str, vec)) for vec in vectors]
    return ['[' + body + ']' for body in joined]

In [None]:
'''
Uploads information about detected species

Inputs: None
Outputs: List of feature vectors, log 10 column densities, column densities and SMILES strings
of each detected molecule
'''

def uploadIso():
    '''
    Loads the detection dataset and returns the rows flagged as exact.

    Reads 'detectionDataset.csv' from the current working directory, keeps the rows whose
    'Exact' column equals "Y" and parses the bracketed strings of the 'mol2vecIsoSameNew'
    column into lists of floats.

    Inputs: None
    Returns: Tuple (xList, cdLog, cd, detectedSmiles)
        xList          - list of feature vectors (one list of floats per kept row)
        cdLog          - numpy array holding log10 of the 'N' column
        cd             - list of the raw 'N' values
        detectedSmiles - list of the 'smiles' values of the kept rows
    '''
    fullPath = os.path.join(os.getcwd(), 'detectionDataset.csv')
    fullUpload = pd.read_csv(fullPath)

    # Keep only the rows marked as exact; row order is preserved.
    newDataset = fullUpload[fullUpload['Exact'] == "Y"]

    mol2vec_strings = list(newDataset['mol2vecIsoSameNew'])
    detectedSmiles = list(newDataset['smiles'])

    # The helper is named stringToList; the previous spelling `stringToLisT`
    # raised a NameError as soon as this function was called.
    xList = stringToList(mol2vec_strings)

    cd = newDataset['N'].tolist()
    cdLog = np.array(np.log10(cd))

    return xList, cdLog, cd, detectedSmiles

In [None]:
'''
Uploads information about detected species (including whether their column density should be predicted)

Inputs: None
Outputs: List of feature vectors, log 10 column densities, column densities and SMILES strings
of each detected molecule and a list containing information on whether the column density for a molecule should
be predicted
'''


def uploadIsoVal():
    '''
    Loads the detection dataset, including the per-molecule 'Predict' flag.

    Reads 'all_files/Updated_Smiles/detectionDataset.csv' relative to the current working
    directory, keeps the rows whose 'Exact' column equals "Y" and parses the bracketed
    strings of the 'mol2vec' column into lists of floats.

    Inputs: None
    Returns: Tuple (xList, cdLog, cd, detectedSmiles, predictList)
        xList          - list of feature vectors (one list of floats per kept row)
        cdLog          - numpy array holding log10 of the 'N' column
        cd             - list of the raw 'N' values
        detectedSmiles - list of the 'smiles' values of the kept rows
        predictList    - list of the 'Predict' values of the kept rows
    '''
    fullPath = os.path.join(os.getcwd(), 'all_files/Updated_Smiles/detectionDataset.csv')
    fullUpload = pd.read_csv(fullPath)

    # Keep only the rows marked as exact; row order is preserved.
    newDataset = fullUpload[fullUpload['Exact'] == "Y"]

    mol2vec_strings = list(newDataset['mol2vec'])
    detectedSmiles = list(newDataset['smiles'])
    predictList = list(newDataset['Predict'])
    cd = list(newDataset['N'])

    # xList was returned without ever being assigned (NameError); build it from
    # the mol2vec strings read above.
    xList = stringToList(mol2vec_strings)

    cdLog = np.array(np.log10(cd))

    return xList, cdLog, cd, detectedSmiles, predictList

In [None]:

import numpy as np


'''
Removes a single feature vector and corresponding column density for LOOCV

Inputs: Total list of vectors, total list of column densities and index to be removed
Returns: Resulting complete vector and column density lists with one removed from each,
the feature vector and column density that were removed for cross validation

'''
def removeValidation(vectorList, cdList, idx):
    '''
    Splits off the sample at position idx for leave-one-out cross validation.

    Neither input sequence is modified: both are shallow-copied before the element is
    removed, so the same lists can be reused for the next left-out index.

    Inputs: Sequence of feature vectors, sequence of column densities, index to hold out
    Returns: Tuple (remaining vectors as a list, remaining column densities as a list,
    numpy array containing only the held-out vector, numpy array containing only the
    held-out column density)
    Raises: IndexError if idx is out of range
    '''
    # Copy first; popping from the caller's list would silently shrink it on every call.
    remainingVectors = list(vectorList)
    remainingCds = list(cdList)

    valVector = remainingVectors.pop(idx)
    valCd = remainingCds.pop(idx)

    # Wrap the held-out sample so it has a leading sample dimension of length 1.
    valVectorList = np.array([valVector])
    valCdList = np.array([valCd])

    return remainingVectors, remainingCds, valVectorList, valCdList


'''
Sort and return an array of tuples by their second value
'''
def sortTupleArray(tup):
    '''
    Sorts a list of tuples in place by each tuple's second element.

    Input: List of tuples (each with at least two elements)
    Returns: The same list object, now ordered by ascending second element
    '''
    def secondField(entry):
        return entry[1]

    tup.sort(key=secondField)
    return tup


'''
Standardizes feature vectors so that the individual features more
or less look like standard normally distributed data: Gaussian with zero mean and unit variance.

Input: Not yet scaled train and validation feature vectors
Returns: Scaled train and validation feature vectors
'''
def scaleDataVal(X_train,X_val):
    '''
    Standardizes the training and validation feature vectors with one shared scaler.

    The StandardScaler is fitted on X_train only and then applied to both sets, so the
    validation data is transformed with the training statistics.

    Inputs: Unscaled training feature vectors, unscaled validation feature vectors
    Returns: Tuple (scaled training vectors, scaled validation vectors)
    '''
    fittedScaler = preprocessing.StandardScaler().fit(X_train)
    return fittedScaler.transform(X_train), fittedScaler.transform(X_val)


'''
Bootstraps the training data to bootSize samples (800 by default) and scales the dataset

Input: Feature vectors and column densities in the training set as well as the feature vector used for 
cross validation

Returns: Bootstrapped feature vectors and column densities in the training set as well as
the scaled feature vector used for cross validation
'''
def splitBootScaleVal(x,y, xVal, bootSize = 800):
    '''
    Bootstraps the training set and scales it together with the held-out vector.

    Inputs: Training feature vectors, training column densities, the held-out feature
    vector(s) and the bootstrap size passed on to the resampling step (default 800)
    Returns: Tuple (scaled bootstrapped training vectors, bootstrapped column densities,
    scaled held-out vector(s))
    '''
    # NOTE(review): `resampling` is not defined in this part of the notebook; confirm
    # where it comes from and what its fourth argument (0.5) controls.
    xBoot, yBoot = resampling(x, y, bootSize, 0.5)

    # The scaler is fitted on the bootstrapped training data only.
    xBootScaled, xValScaled = scaleDataVal(xBoot, xVal)
    return xBootScaled, yBoot, xValScaled


'''
Training the model and predicting on the left-out sample using Gaussian process regression

Inputs: bootstrapped feature vectors and column densities in the training set as well as 
the left-out feature vector and corresponding column density for cross validation

Returns: The prediction error, column density prediction and prediction uncertainty (standard deviation)
for the left-out molecule
'''
def runGPRVal(X_train_boot, y_train_boot, X_val, y_val):
    '''
    Fits a Gaussian process regressor and predicts the held-out sample(s).

    The kernel is RBF + WhiteKernel + DotProduct. For every row of X_val the first
    feature value, the reported value, the prediction, the residual and the predictive
    standard deviation are printed.

    Inputs: Training feature vectors and targets, held-out feature vector(s) X_val and
    the corresponding reported target(s) y_val
    Returns: Tuple (residual, prediction, standard deviation) for the LAST row of X_val,
    where residual = prediction - reported value. With a single held-out sample this is
    simply that sample's result.
    '''
    kernel = (
        RBF(length_scale=7)
        + WhiteKernel(noise_level=0.3, noise_level_bounds=(1e-10, 1e5))
        + DotProduct(sigma_0=0.001, sigma_0_bounds=(1e-10, 1e5))
    )
    model = GaussianProcessRegressor(alpha=1e-10, kernel=kernel, normalize_y=True,
                                     random_state=55, n_restarts_optimizer=20)
    model.fit(X_train_boot, y_train_boot)

    # The training-set prediction and its MSE were computed but never used, so they
    # are no longer evaluated; only the held-out prediction is needed.
    validationResults, valSD = model.predict(X_val, return_std=True)

    for i in range(len(validationResults)):
        residual = validationResults[i] - y_val[i]
        print(X_val[i][0])
        print("Reported CD")
        print(y_val[i])
        print("Predicted CD")
        print(validationResults[i])
        print("residual")
        print(residual)
        print("Standard Deviation")
        print(valSD[i])
        sd = valSD[i]
        validationResult = validationResults[i]

    return residual, validationResult, sd



'''
Training the model and predicting on the left-out sample using Bayesian Ridge Regression

Inputs: bootstrapped feature vectors and column densities in the training set as well as 
the left-out feature vector and corresponding column density for cross validation

Returns: The prediction error, column density prediction and prediction uncertainty (standard deviation)
for the left-out molecule
'''
def runBRVal(X_train_boot, y_train_boot, X_val, y_val):
    '''
    Fits a Bayesian ridge regressor and predicts the held-out sample(s).

    BayesianRidge is used with its default settings. For every row of X_val the first
    feature value, the reported value, the prediction, the residual and the predictive
    standard deviation are printed.

    Inputs: Training feature vectors and targets, held-out feature vector(s) X_val and
    the corresponding reported target(s) y_val
    Returns: Tuple (residual, prediction, standard deviation) for the LAST row of X_val,
    where residual = prediction - reported value. With a single held-out sample this is
    simply that sample's result.
    '''
    model = BayesianRidge()
    model.fit(X_train_boot, y_train_boot)

    # The training-set prediction and its MSE were computed but never used, so they
    # are no longer evaluated; only the held-out prediction is needed.
    validationResults, valSD = model.predict(X_val, return_std=True)

    for i in range(len(validationResults)):
        residual = validationResults[i] - y_val[i]
        print(X_val[i][0])
        print("Reported CD")
        print(y_val[i])
        print("Predicted CD")
        print(validationResults[i])
        print("residual")
        print(residual)
        print("Standard Deviation")
        print(valSD[i])
        sd = valSD[i]
        validationResult = validationResults[i]

    return residual, validationResult, sd

In [None]:
import statistics

'''
Loops through all of the molecules in the dataset of detections and predicts their column density through 
leave-one-out cross-validation with a Gaussian process regression. 

Inputs: None
Outputs: A list of 4-tuples containing information about each column density prediction. 
The 4-tuples have the following format:

(SMILES string, column density prediction error, column density prediction, 1 sigma prediction uncertainty)
'''

def runValidationGPR():
    '''
    Runs leave-one-out cross validation over all detected molecules with the
    Gaussian process regression model.

    For each molecule the sample is held out, the remaining data is bootstrapped and
    scaled, the model is fitted and the held-out value is predicted. Progress, the
    sorted results, and the mean and median absolute residual are printed.

    Inputs: None
    Returns: List of 4-tuples
    (SMILES string, absolute prediction error, prediction, 1 sigma prediction uncertainty)
    sorted by ascending absolute prediction error
    '''
    totalResidualList = []
    residualNumList = []

    # Only the SMILES list of uploadIsoVal() is used; it drives and labels the loop.
    # NOTE(review): the training data comes from uploadIso(), which reads a different
    # path and feature column; confirm both loaders yield the same molecules in the
    # same order, otherwise index i labels the wrong row.
    _, _, _, smileList, _ = uploadIsoVal()

    # Load the training table once; it was previously re-read from disk on every iteration.
    xTot, yTot, _, _ = uploadIso()

    for i in range(len(smileList)):
        print("iteration" + " " + str(i+1))
        print(smileList[i])

        # Hand over a fresh shallow copy so the shared xTot keeps every sample
        # for the following iterations.
        xSet, ySet, xVal, y_val = removeValidation(list(xTot), yTot, i)
        ySet = np.array(ySet)
        X_train, y_train, X_val = splitBootScaleVal(xSet, ySet, xVal)

        residual, validationResult, sd = runGPRVal(X_train, y_train, X_val, y_val)
        totalResidualList.append((smileList[i], abs(residual), validationResult, sd))
        residualNumList.append(abs(residual))
        print("--------")

    print(len(totalResidualList))
    totalResidualList = sortTupleArray(totalResidualList)
    print(totalResidualList)
    print("AVERAGE RESIDUAL")
    averageResidual = sum(residualNumList) / len(residualNumList)
    print(averageResidual)
    print("MEDIAN RESIDUAL")
    # statistics.median orders the data itself, so no explicit sort is required.
    print(statistics.median(residualNumList))

    return totalResidualList


In [None]:
import statistics

'''
Loops through all of the molecules in the dataset of detections and predicts their column density through 
leave-one-out cross-validation with a Bayesian ridge regression. 

Inputs: None
Outputs: A list of 4-tuples containing information about each column density prediction. 
The 4-tuples have the following format:

(SMILES string, column density prediction error, column density prediction, 1 sigma prediction uncertainty)
'''

def runValidationBR():
    '''
    Runs leave-one-out cross validation over all detected molecules with the
    Bayesian ridge regression model.

    For each molecule the sample is held out, the remaining data is bootstrapped and
    scaled, the model is fitted and the held-out value is predicted. Progress, the
    sorted results, and the mean and median absolute residual are printed.

    Inputs: None
    Returns: List of 4-tuples
    (SMILES string, absolute prediction error, prediction, 1 sigma prediction uncertainty)
    sorted by ascending absolute prediction error
    '''
    totalResidualList = []
    residualNumList = []

    # Only the SMILES list of uploadIsoVal() is used; it drives and labels the loop.
    # NOTE(review): the training data comes from uploadIso(), which reads a different
    # path and feature column; confirm both loaders yield the same molecules in the
    # same order, otherwise index i labels the wrong row.
    _, _, _, smileList, _ = uploadIsoVal()

    # Load the training table once; it was previously re-read from disk on every iteration.
    xTot, yTot, _, _ = uploadIso()

    for i in range(len(smileList)):
        print("iteration" + " " + str(i+1))
        print(smileList[i])

        # Hand over a fresh shallow copy so the shared xTot keeps every sample
        # for the following iterations.
        xSet, ySet, xVal, y_val = removeValidation(list(xTot), yTot, i)
        ySet = np.array(ySet)
        X_train, y_train, X_val = splitBootScaleVal(xSet, ySet, xVal)

        residual, validationResult, sd = runBRVal(X_train, y_train, X_val, y_val)
        totalResidualList.append((smileList[i], abs(residual), validationResult, sd))
        residualNumList.append(abs(residual))
        print("--------")

    print(len(totalResidualList))
    totalResidualList = sortTupleArray(totalResidualList)
    print(totalResidualList)
    print("AVERAGE RESIDUAL")
    averageResidual = sum(residualNumList) / len(residualNumList)
    print(averageResidual)
    print("MEDIAN RESIDUAL")
    # statistics.median orders the data itself, so no explicit sort is required.
    print(statistics.median(residualNumList))

    return totalResidualList