In [2]:
import pandas as pd
import pickle
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
import numpy as np

### Data Preprocessing: Fingerprint generation

In [4]:
# Load the raw OLED dataset.
smi = pd.read_csv('./data/OLED_dataset_CSV.csv', encoding='windows-1252') # load your csv file

# Element symbols that disqualify a molecule from the dataset.
target_atom = ['Sn']

# Distinct chromophore SMILES that contain at least one of the target atoms.
Unnecessary_chromophores = []

# Parse each distinct SMILES once: the same chromophore appears on many rows,
# and the result is only used for membership testing below.
for smiles in smi['Chromophore'].unique():
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message that names the offending entry instead of an AttributeError.
        raise ValueError(f'RDKit could not parse chromophore SMILES: {smiles!r}')
    if any(atom.GetSymbol() in target_atom for atom in mol.GetAtoms()):
        Unnecessary_chromophores.append(smiles)

# Keep only the rows whose chromophore was not flagged.
filtered_smi = smi[~smi['Chromophore'].isin(Unnecessary_chromophores)]

In [5]:
# Lookup table: chromophore SMILES -> array of 0/1 integers, one per fingerprint bit.
Mfp = {}

# Fingerprint each distinct SMILES once; repeated rows would only overwrite
# the same dictionary entry with an identical array.
for smiles in filtered_smi['Chromophore'].unique():
    molecule = Chem.MolFromSmiles(smiles)
    # Morgan fingerprint with radius 3, folded to 1024 bits.
    # NOTE(review): newer RDKit releases reportedly deprecate this call in
    # favour of rdFingerprintGenerator - confirm before upgrading RDKit.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, 3, nBits=1024)
    # `bit` (not `i`) so the index is never confused with the SMILES key.
    Mfp[smiles] = np.array([int(fingerprint[bit]) for bit in range(fingerprint.GetNumBits())])

In [6]:
# Reload the dataset and keep only rows whose Solvent is not the literal 'gas'.
oled = (
    pd.read_csv('./data/OLED_dataset_CSV.csv', encoding='windows-1252')
    .loc[lambda frame: frame['Solvent'] != 'gas']
)

# One row per distinct solvent, written to disk for the next cell to read back.
oled_sol = oled['Solvent'].drop_duplicates()
oled_sol.to_csv('solvents.csv')

In [14]:
# Read back the solvent table written to 'solvents.csv'.
# NOTE(review): the name `os` would shadow the standard-library module if this
# notebook ever imports it - consider renaming once all users are known.
os = pd.read_csv('solvents.csv', encoding='windows-1252')

banned_symbols = set(target_atom)

# Solvent SMILES -> array of 0/1 integers (radius-3 Morgan fingerprint, 1024 bits).
Mfp_sol = {}
# Solvent SMILES that contain one of the target atoms.
Unnecessary_solvents = []

# Single pass: a solvent is either flagged for removal or fingerprinted.
for solvent_smiles in os['Solvent']:
    solvent_mol = Chem.MolFromSmiles(solvent_smiles)
    present_symbols = {atom.GetSymbol() for atom in solvent_mol.GetAtoms()}
    if banned_symbols & present_symbols:
        Unnecessary_solvents.append(solvent_smiles)
        continue
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(solvent_mol, 3, nBits=1024)
    Mfp_sol[solvent_smiles] = np.array(
        [int(bit_vector[position]) for position in range(bit_vector.GetNumBits())]
    )

# Solvent table without the flagged entries.
solvent_is_kept = ~os['Solvent'].isin(Unnecessary_solvents)
filtered_sol = os[solvent_is_kept]

# Dataset rows whose solvent was not flagged.
row_is_kept = ~filtered_smi['Solvent'].isin(Unnecessary_solvents)
filtered_smi_sol = filtered_smi[row_is_kept]

### Importing basic ML models from scikit-learn

In [8]:
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from numpy import genfromtxt
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

### Data preprocessing: setting X, y as inputs

In [18]:
# Target property to predict; rows without a value for it are dropped below.
feature = 'Absorption max (nm)'

# Fingerprint lookup tables (SMILES -> bit array) built in the earlier cells.
n_mfp = Mfp
n_sol_mfp = Mfp_sol


def _smiles_containing(smiles_values, symbols):
    """Return the distinct SMILES in `smiles_values` that contain an atom in `symbols`.

    Each distinct SMILES is parsed once, in order of first appearance.

    Parameters
    ----------
    smiles_values : iterable of str
        SMILES strings, possibly with repeats.
    symbols : iterable of str
        Element symbols to look for (e.g. ['Sn']).

    Returns
    -------
    list of str
        The distinct SMILES whose molecule has at least one matching atom.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    wanted = set(symbols)
    flagged = []
    for smiles in pd.unique(smiles_values):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None on a parse failure.
            raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
        if any(atom.GetSymbol() in wanted for atom in mol.GetAtoms()):
            flagged.append(smiles)
    return flagged


# Reload the dataset and drop rows whose Solvent is the literal 'gas'.
oled = pd.read_csv('./data/OLED_dataset_CSV.csv', encoding='windows-1252')
oled = oled[oled.Solvent != 'gas']

# Element symbols that disqualify a molecule.
target_atom = ['Sn']

# Remove rows whose chromophore contains a target atom.
Unnecessary_chromophores = _smiles_containing(oled['Chromophore'], target_atom)
filtered_smi = oled[~oled['Chromophore'].isin(Unnecessary_chromophores)]

# Remove rows whose solvent contains a target atom.
Unnecessary_solvents = _smiles_containing(filtered_smi['Solvent'], target_atom)
filtered_smi_sol = filtered_smi[~filtered_smi['Solvent'].isin(Unnecessary_solvents)]

# Keep only rows that have a value for the target property.
# A list is passed because older pandas versions reject a scalar `subset`.
oled_dropped = filtered_smi_sol.dropna(subset=[feature], axis=0)
print('Total data points after pre-processing: ', len(oled_dropped))

# Per-row feature pieces: chromophore bits, solvent bits, molecular weight.
chromophore = [n_mfp[smiles] for smiles in oled_dropped['Chromophore']]
solvent = [n_sol_mfp[smiles] for smiles in oled_dropped['Solvent']]
mw = oled_dropped['Molecular weight (g mol-1)'].tolist()
mw_reshaped = np.reshape(mw, (-1, 1))

# Target values as a float column vector. Non-numeric entries become NaN,
# matching what parsing the column back from text would have produced.
oled_dropped_prop = oled_dropped[feature]
prop_np = pd.to_numeric(oled_dropped_prop, errors='coerce').to_numpy(dtype=float)
prop_np_reshaped = np.reshape(prop_np, (-1, 1))

# Feature matrix columns: [chromophore fingerprint | solvent fingerprint | molecular weight].
a = np.concatenate((chromophore, solvent), axis=1)
b = np.concatenate((a, mw_reshaped), axis=1)

X = b
y = prop_np_reshaped

Total data points after pre-processing:  17275


In [20]:
# Sanity check: y was reshaped with (-1, 1), so it should be (n_samples, 1).
y.shape

(17275, 1)

### Train test split

In [21]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

### Basic linear regression model definition & Train and test example

In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# linear regression: fit ordinary least squares on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the held-out split, shared by all metrics below.
y_pred = lr.predict(X_test)

# r2 score
# Stored under `r2` so the imported `r2_score` function is not overwritten
# and stays callable in later cells.
r2 = r2_score(y_test, y_pred)
print('R2 score: ', r2)

# MAE
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

# MSE
mse = mean_squared_error(y_test, y_pred)
print('MSE: ', mse)


R2 score:  -4815359645012936.0
MAE:  919001893.7641113
MSE:  5.285872100442415e+19


### Advanced models such as RidgeCV (prerequisite: what is cross-validation?)

In [None]:
# Ridge regression; the regularisation strength is chosen from `alphas` by 3-fold CV.
rcv = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], cv=3).fit(X_train, y_train)
# Score the whole RidgeCV procedure with cross_val_score's default folds.
rcv_scores = cross_val_score(rcv, X_train, y_train)

print('RidgeCV train score =', round(rcv.score(X_train, y_train), 3))
print('RidgeCV test score =', round(rcv.score(X_test, y_test), 3))
print('RidgeCV cross validation score =', rcv_scores)
print('RidgeCV best alpha =', rcv.alpha_, '\n\n')

#save the rcv model
with open('RidgeCV_RDS_ALL.pkl', 'wb') as f:
    pickle.dump(rcv, f)


# LassoCV expects a 1-D target; passing the (n, 1) column vector makes
# scikit-learn flatten it with a DataConversionWarning on every fit.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

# Lasso regression with the same alpha grid and 3-fold CV.
lcv = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], cv=3, random_state=42, max_iter=10000).fit(X_train, y_train_1d)
lcv_scores = cross_val_score(lcv, X_train, y_train_1d)

print('LassoCV train score =', round(lcv.score(X_train, y_train_1d), 3))
print('LassoCV test score =', round(lcv.score(X_test, y_test_1d), 3))
print('LassoCV cross validation score =', lcv_scores)
print('LassoCV best alpha =', lcv.alpha_, '\n\n')

### HW: try to build a random forest model!

In [None]:
# your code here