In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from joblib import Parallel, delayed
from scipy.signal import savgol_filter as sgf
from scipy.optimize import curve_fit
from sklearn import linear_model
from sklearn import metrics

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
Synth = pd.read_csv('CsPbBr_synthesis.csv', index_col = 0)

In [3]:
def conc_factor(di=1,mw=1,density=1,conc=0):
    result = 0
    if conc != 0:
        return conc/500/di
    else:
        volume = 1 #in ul
        mass = (volume/1000000)/di*density #g
        amount = mass/mw #mol
        result = amount*1000*1000000/500
        return result #converting into mM
    
        
#Dilution ratio:
#Cs Pb 200,OLA 100, Br OA 80
SynData = pd.DataFrame()
#Cesium Concentration in mM
SynData['Cs'] = Synth['Cs']*conc_factor(di=200,conc=1000)
#Lead Concentration in mM
SynData['Pb'] = Synth['Pb']*conc_factor(di=200,conc=667)
#Oleylamine Concentration in mM
SynData['OLA'] = Synth['OLA']*conc_factor(di=100,mw=267.5,density=813)
#Oleic Acid Concentration in mM
SynData['OA'] = (Synth['OA']+Synth['Cs']+Synth['Pb'])*conc_factor(di=80,mw=282.5,density=895)
#Benzoyl Bromide Concentration in mM
SynData['Br'] = Synth['Br']*conc_factor(di=80,mw=185,density=1570)
#Temperature in Celcius
SynData['Temp'] = Synth['Temp'] + 273
#Normalized Concentration of Products
SynData['PbBr3-'] = Synth['PbBr3-']
SynData['Cs4PbBr6'] = Synth['Cs4PbBr6']
SynData['CsPb2Br5'] = Synth['CsPb2Br5']
SynData['PbBr2'] = Synth['PbBr2']
SynData['1ML'] = Synth['1 ML']
SynData['2ML'] = Synth['2 ML']
SynData['3ML'] = Synth['3 ML']
SynData['4ML'] = Synth['4 ML']
SynData['CsPbBr3'] = Synth['CsPbBr3']
    
#SynData.head()

In [5]:
SynData.shape

(1351, 15)

In [4]:
#Defining a function to create copy of a DataFrame
def dfcopy(DataFrame):
    result = pd.DataFrame()
    for i in DataFrame.columns:
        result[i] = DataFrame[i]
    return result 

#Defining a function for avg = 1:
def dfscale(DataFrame):
    result = pd.DataFrame()
    for i in DataFrame.columns:
        avg = DataFrame[i].mean()
        result[i] = DataFrame[i]/avg
    return result

In [5]:
#Defining a function to perform feature engineering
def addfeatures(train_a1):
    train_a1eus = dfcopy(train_a1)
    train_a1eus['OA/OLA ^2'] = (train_a1['OA']/train_a1['OLA'])**2
    train_a1eus['Pb/OLA ^2'] = (train_a1['Pb']/train_a1['OLA'])**2
    train_a1eus['Pb/OA ^2'] = (train_a1['Pb']/train_a1['OA'])**2
    train_a1eus['Cs/OLA ^0.5'] = (train_a1['Cs']/train_a1['OLA'])**(1/2)
    train_a1eus['Pb/L ^0.5'] = (train_a1['Pb']/(train_a1['OA']+train_a1['OLA']))**(1/2)
    train_a1eus['Cs ^2'] = train_a1['Cs']**2
    train_a1eus['OLA ^2'] = train_a1['OLA']**2
    train_a1eus['Br ^2'] = train_a1['Br']**2
    train_a1eus['Temp ^2'] = train_a1['Temp']**2
    return train_a1eus

In [22]:
#Def: Obtaining RMSE for each species
from sklearn.metrics import mean_squared_error

def rmsemodel(model,feature,label):
    prediction = model.predict(feature)
    prediction_df = pd.DataFrame(prediction,index = label.index,columns = label.columns)
    results = []
    for i in label.columns:
        val = mean_squared_error(label[i],prediction_df[i])
        results.append(val**0.5)
    return results

In [25]:
#Def: Obtaining relative RMSE for each species
def rrmsemodel(model,feature,label):
    prediction = model.predict(feature)
    prediction_df = pd.DataFrame(prediction,index = label.index,columns = label.columns)
    results = []
    for i in label.columns:
        val = mean_squared_error(label[i],prediction_df[i])
        results.append(val**0.5/label.mean()[i])
    return results

In [18]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(SynData, test_size=0.2, random_state=42)

labels = ['PbBr3-','Cs4PbBr6','CsPb2Br5','PbBr2','CsPbBr3','1ML','2ML','3ML','4ML']
attributes = ['Cs','Pb','OLA','OA','Br','Temp']
test_l = test_set[labels]
test_a = test_set[attributes]
train_l = train_set[labels]
train_a = train_set[attributes]

train_aeus = addfeatures(train_a)
train_ae = dfscale(train_aeus)
test_aeus = addfeatures(test_a)
test_ae = dfscale(test_aeus)

**Linear**

In [16]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(train_ae,train_l)

LinearRegression()

In [29]:
results = pd.DataFrame(index=labels)
results['Linear']=rmsemodel(linreg,test_ae,test_l)
rresults = pd.DataFrame(index=labels)
rresults['Linear']=rrmsemodel(linreg,test_ae,test_l)

**Decision Tree**

In [31]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

dtr = DecisionTreeRegressor()
depth = {'max_depth':[5,6,7,8,9,10]}
dtr_op = GridSearchCV(dtr,depth,cv=5,scoring='r2',return_train_score=True)
dtr_op.fit(train_ae,train_l)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [5, 6, 7, 8, 9, 10]},
             return_train_score=True, scoring='r2')

In [32]:
results['Decision Tree'] = rmsemodel(dtr_op,test_ae,test_l)
rresults['Decision Tree'] = rrmsemodel(dtr_op,test_ae,test_l)

**SVM(rbf)**

In [33]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

#Optimizing Gaussian Kernel
#Dictionary of gaussian regressors
G_SVR = {}
Gaussian = {}
ghyper = {'C':[0.1,0.5,1,2,4,6,8,10,100,120,150],'gamma':[0.1,0.5,1,2,4,6,8,10,12,15]}

for i in labels:
    G_SVR[i] = SVR(kernel='rbf')
    Gaussian[i] = GridSearchCV(G_SVR[i], ghyper, cv=5,scoring='r2',return_train_score=True)
    Gaussian[i].fit(train_ae, train_l[i])

In [35]:
for i in labels:
    print(i,Gaussian[i].best_params_)

PbBr3- {'C': 100, 'gamma': 0.1}
Cs4PbBr6 {'C': 100, 'gamma': 0.1}
CsPb2Br5 {'C': 2, 'gamma': 1}
PbBr2 {'C': 10, 'gamma': 0.1}
CsPbBr3 {'C': 150, 'gamma': 15}
1ML {'C': 8, 'gamma': 0.1}
2ML {'C': 6, 'gamma': 0.5}
3ML {'C': 2, 'gamma': 4}
4ML {'C': 4, 'gamma': 0.5}


In [37]:
gvalues = []
for i in labels:
    re = Gaussian[i].predict(test_ae)
    val = mean_squared_error(test_l[i],re)
    gvalues.append(val)
    
grvalues = []
for i in labels:
    re = Gaussian[i].predict(test_ae)
    val = mean_squared_error(test_l[i],re)
    grvalues.append(val/test_l.mean()[i])
    
results['SVM(rbf)'] = gvalues
rresults['SVM(rbf)'] = grvalues

**Random Forest**

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

rf = RandomForestRegressor(random_state=42)
rfspace = [Integer(5, 100, name='max_depth'),
          Integer(5, 15, name='max_features'),
          Integer(2, 100, name='min_samples_split'),
          Integer(1, 100, name='min_samples_leaf')]

@use_named_args(rfspace)
def objective(**params):
    rf.set_params(**params)
    results = cross_val_score(rf, train_ae, train_l, cv=5, n_jobs=-1,scoring="neg_root_mean_squared_error")
    return -np.mean(results)


rfop = gp_minimize(objective, rfspace, n_calls=50, random_state=0)



In [41]:
print("""Best parameters:
,max_depth=%d
,max_features=%d
,min_samples_split=%d
,min_samples_leaf=%d""" % (rfop.x[0], rfop.x[1],
                            rfop.x[2], rfop.x[3]))

Best parameters:
,max_depth=100
,max_features=5
,min_samples_split=2
,min_samples_leaf=1


In [42]:
from sklearn.ensemble import RandomForestRegressor

rf_opt=RandomForestRegressor(random_state=42,
                             max_depth=100,max_features=5,min_samples_split=2,min_samples_leaf=1
                             )
rf_opt.fit(train_ae,train_l)

RandomForestRegressor(max_depth=100, max_features=5, random_state=42)

In [43]:
results['RF']=rmsemodel(rf_opt,test_ae,test_l)
rresults['RF']=rrmsemodel(rf_opt,test_ae,test_l)

In [47]:
results

Unnamed: 0,Linear,Decision Tree,SVM(rbf),RF
PbBr3-,0.108883,0.090039,0.008557,0.059922
Cs4PbBr6,0.111681,0.089861,0.007306,0.063375
CsPb2Br5,0.123528,0.110516,0.012661,0.091431
PbBr2,0.088345,0.122326,0.009452,0.098334
CsPbBr3,0.080492,0.086778,0.014638,0.071432
1ML,0.33357,0.306693,0.103752,0.267364
2ML,0.292265,0.336492,0.076724,0.245823
3ML,0.311361,0.293156,0.072599,0.234048
4ML,0.152413,0.153056,0.024775,0.137317


In [45]:
rresults

Unnamed: 0,Linear,Decision Tree,SVM(rbf),RF
PbBr3-,1.558221,1.288539,0.122465,0.857539
Cs4PbBr6,1.587399,1.277247,0.103849,0.900788
CsPb2Br5,1.882526,1.684226,0.192955,1.393382
PbBr2,1.60798,2.22647,0.172042,1.789797
CsPbBr3,3.996246,4.308332,0.726768,3.546439
1ML,1.782971,1.639314,0.554566,1.429094
2ML,1.675398,1.928929,0.439816,1.409172
3ML,2.23241,2.101881,0.520526,1.678084
4ML,3.065163,3.078105,0.498251,2.76157
