## ML-powered Predictions on OLED Molecules

In [2]:
#Imports

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

In [5]:
#Load data and remove unwanted chromophores
smi = pd.read_csv('./data/OLED_dataset_CSV.csv', encoding='windows-1252') # load csv file (20236, 14)

target_atom = ['Sn']
Unnecessary_chromophores = []

smi["Molecules"] = smi["Chromophore"].apply(lambda x: Chem.MolFromSmiles(x)) #Add column of Molecular objects

for _, row in smi.iterrows():
    atoms = {atom.GetSymbol() for atom in row["Molecules"].GetAtoms()}
    if set(target_atom).intersection(atoms):
        Unnecessary_chromophores.append(row["Chromophore"])


filtered_smi = smi[~smi['Chromophore'].isin(Unnecessary_chromophores)]

In [31]:
#Create dictionary of SMILES: Morgan fingerprint

Mfp_Chrom = {}

for _, row in filtered_smi.iterrows():
    fp = AllChem.GetMorganFingerprintAsBitVect(row["Molecules"], 3, nBits=1024)
    nf = np.array(fp).tolist()
    Mfp_Chrom[row["Chromophore"]] = nf

In [131]:
#Get Unique Solvents
filtered_smi = filtered_smi[filtered_smi.Solvent != "gas"] #remove "gas" from solvents
filtered_smi_sol = filtered_smi.drop_duplicates(subset=["Solvent"]) #remove duplicates from solvents


#Create dictionary of SMILES: Morgan fingerprint (solvents)

Mfp_Sol = {}

for _, row in filtered_smi_sol.iterrows():
    mol = Chem.MolFromSmiles(row["Solvent"])
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024)
    nf = np.array(fp).tolist()
    Mfp_Sol[row["Solvent"]] = nf


Defining inputs from pre-processed data

In [132]:
feature = smi.columns[3] #Absorption max (nm)

oled_dropped = filtered_smi.dropna(subset=feature, axis=0) #(17275, 15) i.e. 17275 chromophores with reported abs

print("Data points before preprocessing ", len(smi))
print("Data points after preprocessing ", len(oled_dropped))

#Building MF arrays of chromophores and solvents
chromophore = [] #list of 17275 chromophore MFPs
solvent = [] #17275 solvents
mw = [] #17275 mws

for _, row in oled_dropped.iterrows():
    chromophore.append(Mfp_Chrom[row["Chromophore"]])
    solvent.append(Mfp_Sol[row["Solvent"]])
    mw.append(row["Molecular weight (g mol-1)"])

chromophore_reshaped = np.reshape(chromophore, (-1, 1024)) # (17275, 1024) matrix of chromophore MFP
solvent_reshaped = np.reshape(solvent, (-1, 1024)) # (17275, 1024) matrix of solvent MFP
mw_reshaped = np.reshape(mw, (-1,1)) # (17275, 1) matrix of molecular weights


Data points before preprocessing  20236
Data points after preprocessing  17275


In [133]:
a = np.concatenate((chromophore_reshaped, solvent_reshaped), axis = 1) # (17275, 2048) matrix of chromophore & solvent MFPs
b = np.concatenate((a, mw_reshaped), axis = 1) # (17275, 2049) matrix of MFPs & mws

feature_reshaped = np.reshape(oled_dropped[feature], (-1,1)) # (17275, 1) vector of features (absorption)

X = b
Y = feature_reshaped

### Machine Learning models

In [151]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=12)

In [152]:
from sklearn.preprocessing import StandardScaler, PowerTransformer

#Standardise mw data only after split to avoid data leakage
scaler_X = StandardScaler().fit(np.array([X_train[:,-1]]).T) #fit scaler to training mws

X_mw_train_standardized = scaler_X.transform(np.array([X_train[:,-1]]).T) #standardize to training mw
X_train_standardized = np.concatenate((X_train[:, :-1], X_mw_train_standardized), axis=1) #add standardized training mw to training MFPs

X_mw_test_standardized = scaler_X.transform(np.array([X_test[:,-1]]).T) #standardize test mw
X_test_standardized = np.concatenate((X_test[:, :-1], X_mw_test_standardized), axis=1) #add standardized test mw to testing MFPs

pt = PowerTransformer(method = "box-cox", standardize = True).fit(y_train)
y_train_transformed = pt.transform(y_train)
y_test_transformed = pt.transform(y_test)

In [153]:
print(f"X_train mean: {X_train.mean():.3f}, std: {X_train.std():.3f}")
print(f"X_test mean: {X_test.mean():.3f}, std: {X_test.std():.3f}")
print()
print(f"X_train_standardized mean: {X_train_standardized.mean():.3f}, std: {X_train_standardized.std():.3f}")
print(f"X_test_standardized mean: {X_test_standardized.mean():.3f}, std: {X_test_standardized.std():.3f}")
print()
print()
print(f"y_train mean: {y_train.mean():.3f}, std: {y_train.std():.3f}")
print(f"y_test mean: {y_test.mean():.3f}, std: {y_test.std():.3f}")
print()
print(f"y_train_transformed mean: {y_train_transformed.mean():.3f}, std: {y_train_transformed.std():.3f}")
print(f"y_test_transformed mean: {y_test_transformed.mean():.3f}, std: {y_test_transformed.std():.3f}")

X_train mean: 0.257, std: 11.488
X_test mean: 0.258, std: 11.656

X_train_standardized mean: 0.031, std: 0.176
X_test_standardized mean: 0.031, std: 0.176


y_train mean: 427.875, std: 105.856
y_test mean: 427.055, std: 105.034

y_train_transformed mean: 0.000, std: 1.000
y_test_transformed mean: -0.007, std: 0.994


In [163]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

# r2 score
from sklearn.metrics import r2_score
y_pred = lr.predict(X_test)
r2_score = r2_score(y_test, y_pred)
print('R2 score: ', r2_score)

# MAE
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

# MSE
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print('MSE: ', mse)

R2 score:  -1.8612837471370256e+16
MAE:  2074627859.577473
MSE:  2.0533765825912032e+20


In [155]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_standardized, y_train_transformed)

# r2 score
from sklearn.metrics import r2_score
y_pred = lr.predict(X_test_standardized)
r2_score = r2_score(y_test_transformed, y_pred)
print('R2 score: ', r2_score)

# MAE
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test_transformed, y_pred)
print('MAE: ', mae)

# MSE
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test_transformed, y_pred)
print('MSE: ', mse)

R2 score:  -6.224008093178616e+20
MAE:  3547563245.775686
MSE:  6.150942798166587e+20


In [178]:
import random

states = {}

for i in range(10):
    state = random.randint(1, 100)
    # Code
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=state)

    scaler_X = StandardScaler().fit(np.array([X_train[:,-1]]).T)
    X_mw_train_standardized = scaler_X.transform(np.array([X_train[:,-1]]).T)

    X_train_standardized = np.concatenate((X_train[:, :-1], X_mw_train_standardized), axis=1)



    X_mw_test_standardized = scaler_X.transform(np.array([X_test[:,-1]]).T)
    X_test_standardized = np.concatenate((X_test[:, :-1], X_mw_test_standardized), axis=1)

    pt = PowerTransformer(method = "box-cox", standardize = True).fit(y_train)
    y_train_transformed = pt.transform(y_train)
    y_test_transformed = pt.transform(y_test)

    #Without standardization
    lr1 = LinearRegression()
    lr1.fit(X_train, y_train)


    y_pred1 = lr.predict(X_test)
    score1 = lr.score(X_test, y_test)

    #With standardization
    lr2 = LinearRegression()
    lr2.fit(X_train_standardized, y_train_transformed)


    y_pred2 = lr.predict(X_test_standardized)
    score2 = lr.score(X_test_standardized, y_test_transformed)

    states[state] = (score1, score2)

    #Increment state
    #state += 10

In [179]:
states_df = pd.DataFrame.from_dict(states, orient="index", columns = ["R2 score w/out Standardization", "R2 score w Standardization"])
states_df.index.name = "Random_state"
states_df

Unnamed: 0_level_0,R2 score w/out Standardization,R2 score w Standardization
Random_state,Unnamed: 1_level_1,Unnamed: 2_level_1
15,-1289767000000000.0,-1.445306e+19
23,-1081449000000000.0,-1.256128e+19
81,-4811101000000000.0,-5.063567e+19
51,-4264195000000000.0,-4.826623e+19
88,-1463525000000000.0,-1.566885e+19
50,-1185482000000000.0,-1.264228e+19
69,-896699500000000.0,-9.206402e+18
20,-1907086000000000.0,-2.110657e+19
63,-2315057000000000.0,-2.643406e+19


In [None]:
#Plot of LR results

import matplotlib.pyplot as plt

#Limits of plot with offset
offset = 50
min_limit = min(min(y_test), min(y_pred)) - offset
max_limit = max(max(y_test), max(y_pred)) + offset

plt.figure(figsize=(6, 6))

plt.scatter(y_test, y_pred, color='blue', marker = 'x') # Data

plt.plot([min_limit, max_limit], [min_limit, max_limit], color='black')  # Diagonal line


plt.xlim(min_limit, max_limit)  # Set x-axis limits
plt.ylim(min_limit, max_limit)  # Set y-axis limits

plt.title('Linear regression')
plt.xlabel('Actual Absorption max / nm')
plt.ylabel('Predicted Absorption max / nm')
#plt.text(min_limit+25, max_limit-75, f" R2 score: {r2_score:.2f}\n MAE: {mae:.2f} nm")
plt.show()

## Non-linear ML models