# Multi-layer perceptron models on OLED features
This notebook uses MLP models to predict various OLED properties. Properties of interest are:
1. Absorption max /nm
2. Emission max /nm
3. Lifetime /ns
4. Quantum yield (PLQY)
5. Absorption full width at half maximum (FWHM) /cm<sup>-1</sup>
6. Emission full width at half maximum (FWHM) /cm<sup>-1</sup>  

The dataset was taken from DOI: [10.1021/jacsau.1c00035](https://pubs.acs.org/doi/10.1021/jacsau.1c00035)

### Data preprocessing

In [8]:
#Imports

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

#Load data and remove unwanted chromophores
smi = pd.read_csv('./data/OLED_dataset_CSV.csv', encoding='windows-1252') # load csv file (20236, 14)

target_atom = ['Sn']
Unnecessary_chromophores = []

smi["Molecules"] = smi["Chromophore"].apply(lambda x: Chem.MolFromSmiles(x)) #Add column of Molecular objects

for _, row in smi.iterrows():
    atoms = {atom.GetSymbol() for atom in row["Molecules"].GetAtoms()}
    if set(target_atom).intersection(atoms):
        Unnecessary_chromophores.append(row["Chromophore"])


filtered_smi = smi[~smi['Chromophore'].isin(Unnecessary_chromophores)]

#Create dictionary of SMILES: Morgan fingerprint

Mfp_Chrom = {}

for _, row in filtered_smi.iterrows():
    fp = AllChem.GetMorganFingerprintAsBitVect(row["Molecules"], 3, nBits=1024)
    nf = np.array(fp).tolist()
    Mfp_Chrom[row["Chromophore"]] = nf

#Get Unique Solvents
filtered_smi = filtered_smi[filtered_smi.Solvent != "gas"] #remove "gas" from solvents
filtered_smi_sol = filtered_smi.drop_duplicates(subset=["Solvent"]) #remove duplicates from solvents


#Create dictionary of SMILES: Morgan fingerprint (solvents)

Mfp_Sol = {}

for _, row in filtered_smi_sol.iterrows():
    mol = Chem.MolFromSmiles(row["Solvent"])
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024)
    nf = np.array(fp).tolist()
    Mfp_Sol[row["Solvent"]] = nf

##### Define inputs given feature

In [9]:
def data_processed(ft, print_size=False):
    oled_dropped = filtered_smi.dropna(subset=ft,axis=0)

    #Building MF arrays of chromophores and solvents from oled_dropped
    chromophore = [] 
    solvent = [] 
    mw = [] 

    for _, row in oled_dropped.iterrows():
        chromophore.append(Mfp_Chrom[row["Chromophore"]])
        solvent.append(Mfp_Sol[row["Solvent"]])
        mw.append(row["Molecular weight (g mol-1)"])

    chromophore_reshaped = np.reshape(chromophore, (-1, 1024))
    solvent_reshaped = np.reshape(solvent, (-1, 1024))
    mw_reshaped = np.reshape(mw, (-1,1))

    #Concatenate MFPs
    a = np.concatenate((chromophore_reshaped, solvent_reshaped), axis = 1) # (17275, 2048) matrix of chromophore & solvent MFPs
    b = np.concatenate((a, mw_reshaped), axis = 1) # (17275, 2049) matrix of MFPs & mws

    #Define inputs & features
    X = b
    Y = np.reshape(oled_dropped[ft], (-1,1))

    if print_size:
        print("Data points before preprocessing ", len(smi))
        print("Data points after preprocessing ", len(oled_dropped))

    else:
        return X, Y, len(oled_dropped)

## MLP models
#### Required functions

In [10]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer

#Evaluate models
def model_eval(y_test, y_pred, print_res=True):
    # R2 Score
    r2 = r2_score(y_test, y_pred)
    # MAE
    mae = mean_absolute_error(y_test, y_pred)
    # MSE
    mse = mean_squared_error(y_test, y_pred)

    if print_res:
        print('R2 score: ', r2)
        print('MAE: ', mae)
        print('MSE: ', mse)

    else:
        return r2, mae, mse
    
#Split data into training & test sets & standardise after split to avoid data leakage
def split_standard_transform(inputs, features, state):
    #Split data
    X_train, X_test, y_train, y_test = train_test_split(inputs, features, test_size=0.1, random_state=state)

    # X has format [---MFP of Chromophore---][---MFP of Solvent---][MW]
    # MFPs are binary -> only standardize MW
    scaler_X = StandardScaler().fit(np.array([X_train[:,-1]]).T) #fit scaler to training mws

    X_mw_train_standardized = scaler_X.transform(np.array([X_train[:,-1]]).T) #standardize to training mw
    X_train_standardized = np.concatenate((X_train[:, :-1], X_mw_train_standardized), axis=1) #add standardized training mw to training MFPs

    X_mw_test_standardized = scaler_X.transform(np.array([X_test[:,-1]]).T) #standardize test mw
    X_test_standardized = np.concatenate((X_test[:, :-1], X_mw_test_standardized), axis=1) #add standardized test mw to testing MFPs

    pt = PowerTransformer(standardize = True).fit(y_train)
    y_train_transformed = pt.transform(y_train).ravel()
    y_test_transformed = pt.transform(y_test).ravel()

    return X_train_standardized, X_test_standardized, y_train_transformed, y_test_transformed

# X_train, X_test, y_train, y_test = split_standard_transform(X, Y, 42)

### Feature = Absorption max /nm

In [11]:
from sklearn.neural_network import MLPRegressor

In [12]:
feature = "Absorption max (nm)"
data_processed(feature, True)

Data points before preprocessing  20236
Data points after preprocessing  17275


In [13]:
X, Y,_ = data_processed(feature)
X_train, X_test, y_train, y_test = split_standard_transform(X, Y, 42)

MLP = MLPRegressor(hidden_layer_sizes=(100,), max_iter = 200, random_state=23)
MLP.fit(X_train, y_train)

y_pred = MLP.predict(X_test)
model_eval(y_test, y_pred)

R2 score:  0.9424571904867128
MAE:  0.14246132189188312
MSE:  0.0585926777760493


### Feature = Emission max /nm


In [14]:
feature2 = "Emission max (nm)"
data_processed(feature2, True)

Data points before preprocessing  20236
Data points after preprocessing  18140


In [15]:
X2, Y2,_  = data_processed(feature2)
X_train2, X_test2, y_train2, y_test2 = split_standard_transform(X2, Y2, 42)

MLP = MLPRegressor(hidden_layer_sizes=(100,), max_iter = 200, random_state=23)
MLP.fit(X_train2, y_train2)

y_pred2 = MLP.predict(X_test2)
model_eval(y_test2, y_pred2)

R2 score:  0.9082105164000472
MAE:  0.20119742627519246
MSE:  0.09138516832704399


### Feature = Lifetime /ns

In [16]:
feature3 = "Lifetime (ns)"
data_processed(feature3, True)

Data points before preprocessing  20236
Data points after preprocessing  6960


In [17]:
X3, Y3,_  = data_processed(feature3)
X_train3, X_test3, y_train3, y_test3 = split_standard_transform(X3, Y3, 42)

MLP = MLPRegressor(hidden_layer_sizes=(100,), max_iter = 200, random_state=23)
MLP.fit(X_train3, y_train3)

y_pred3 = MLP.predict(X_test3)
model_eval(y_test3, y_pred3)

R2 score:  0.7309199256261549
MAE:  0.35456538524992137
MSE:  0.26857598480407574


### Feature = Quantum yield (PLQY)

In [18]:
feature4 = "Quantum yield"
data_processed(feature4, True)

Data points before preprocessing  20236
Data points after preprocessing  13836


In [19]:
X4, Y4,_  = data_processed(feature4)
X_train4, X_test4, y_train4, y_test4 = split_standard_transform(X4, Y4, 42)

MLP = MLPRegressor(hidden_layer_sizes=(100,), max_iter = 200, random_state=23)
MLP.fit(X_train4, y_train4)

y_pred4 = MLP.predict(X_test4)
model_eval(y_test4, y_pred4)

R2 score:  0.6934420183128704
MAE:  0.4054639654273137
MSE:  0.31489363994065556


### Feature = Absorption FWHM /cm<sup>-1</sup>

In [20]:
feature5 = "abs FWHM (nm)"
data_processed(feature5, True)

Data points before preprocessing  20236
Data points after preprocessing  3588


In [21]:
X5, Y5,_  = data_processed(feature4)
X_train5, X_test5, y_train5, y_test5 = split_standard_transform(X5, Y5, 42)

MLP = MLPRegressor(hidden_layer_sizes=(100,), max_iter = 200, random_state=23)
MLP.fit(X_train5, y_train5)

y_pred5 = MLP.predict(X_test5)
model_eval(y_test5, y_pred5)

R2 score:  0.6934420183128704
MAE:  0.4054639654273137
MSE:  0.31489363994065556


### Feature = Emission FWHM /cm<sup>-1</sup>

In [22]:
feature6 = "emi FWHM (nm)"
data_processed(feature6, True)

Data points before preprocessing  20236
Data points after preprocessing  7197


In [23]:
X6, Y6,_  = data_processed(feature6)
X_train6, X_test6, y_train6, y_test6 = split_standard_transform(X6, Y6, 42)

MLP = MLPRegressor(hidden_layer_sizes=(100,), max_iter = 200, random_state=23)
MLP.fit(X_train6, y_train6)

y_pred6 = MLP.predict(X_test6)
model_eval(y_test6, y_pred6)

R2 score:  0.6929802861128018
MAE:  0.36590055094832336
MSE:  0.2922739404477624


## Applying Model to all features

In [24]:
features = ['Absorption max (nm)', 'Emission max (nm)', 'Lifetime (ns)', 'Quantum yield', 'abs FWHM (nm)', 'emi FWHM (nm)']

mlp_res = {}

for feat in features:
    X, Y, data_points = data_processed(feat)
    X_train, X_test, y_train, y_test = split_standard_transform(X, Y, 42)

    MLP = MLPRegressor(hidden_layer_sizes=(100,), max_iter = 200, random_state=23)
    MLP.fit(X_train, y_train)

    y_pred = MLP.predict(X_test)
    r2, mae, mse = model_eval(y_test, y_pred, print_res = False)
    mlp_res[feat] = (r2, mae, mse, data_points)

In [25]:
results_table = pd.DataFrame.from_dict(mlp_res)
rows = ["R2 Score", "MAE", "MSE", "N"]
results_table.index = rows

results_table

Unnamed: 0,Absorption max (nm),Emission max (nm),Lifetime (ns),Quantum yield,abs FWHM (nm),emi FWHM (nm)
R2 Score,0.942457,0.908211,0.73092,0.693442,0.791628,0.69298
MAE,0.142461,0.201197,0.354565,0.405464,0.279734,0.365901
MSE,0.058593,0.091385,0.268576,0.314894,0.200608,0.292274
N,17275.0,18140.0,6960.0,13836.0,3588.0,7197.0
