#### Design pool generation for architecture-related features (Step 1)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle

In [2]:
# Current working directory; the input CSV path and the output pickle path are built from it.
pwd = os.getcwd()

In [3]:
# Load the cleaned dataset for Sn-based perovskites.
# The path is assembled with os.path.join instead of string concatenation so the
# separators are handled by the standard library.
df = pd.read_csv(os.path.join(pwd, "Sn_based", "Sn_cleaned.csv"))
df.head()

Unnamed: 0,Ref_DOI_number,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,...,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_Voc,JV_default_Jsc,JV_default_FF,JV_default_PCE
0,10.1002/adfm.201807696,pin,False,False,SLG | ITO,PCBM-60 | BCP,142.0,Spin-coating | Evaporation,True,False,...,PEDOT:PSS,114.0,Spin-coating,Ag,200.0,Evaporation,0.58,21.2,0.633,7.78
1,10.1002/adfm.201807696,pin,False,False,SLG | ITO,PCBM-60 | BCP,142.0,Spin-coating | Evaporation,True,False,...,PEDOT:PSS,114.0,Spin-coating,Ag,200.0,Evaporation,0.6,20.5,0.65,7.95
2,10.1002/adfm.201807696,pin,False,False,SLG | ITO,PCBM-60 | BCP,142.0,Spin-coating | Evaporation,True,False,...,PEDOT:PSS,114.0,Spin-coating,Ag,200.0,Evaporation,0.61,21.0,0.67,8.71
3,10.1002/adfm.201807696,pin,False,False,SLG | ITO,PCBM-60 | BCP,142.0,Spin-coating | Evaporation,True,False,...,PEDOT:PSS,114.0,Spin-coating,Ag,200.0,Evaporation,0.54,20.3,0.63,5.94
4,10.1021/acsenergylett.9b00954,pin,False,False,SLG | ITO,C60 | LiF,31.0,Evaporation | Evaporation,True,False,...,PEDOT:PSS,38.0,Spin-coating,Al,100.0,Evaporation,0.56,18.3,0.493,5.07


In [6]:
# Columns kept for this study: 29 descriptors followed by the target
# ('JV_default_PCE') as the last entry. The order of this list becomes the
# column order of `df`.
cols = [
    # Cell, substrate and electron transport layer (ETL)
    'Cell_architecture',
    'Cell_flexible',
    'Cell_semitransparent',
    'Substrate_stack_sequence',
    'ETL_stack_sequence',
    'ETL_thickness',
    'ETL_deposition_procedure',
    # Perovskite layer
    'Perovskite_dimension_2D',
    'Perovskite_dimension_2D3D_mixture',
    'Perovskite_dimension_3D',
    'Perovskite_dimension_3D_with_2D_capping_layer',
    'Perovskite_composition_perovskite_ABC3_structure',
    'Perovskite_composition_long_form',
    'Perovskite_thickness',
    'Perovskite_composition_inorganic',
    'Perovskite_band_gap',
    'Perovskite_band_gap_graded',
    'Perovskite_deposition_procedure',
    'Perovskite_deposition_solvents',
    'Perovskite_deposition_quenching_induced_crystallisation',
    'Perovskite_deposition_thermal_annealing_temperature',
    'Perovskite_deposition_thermal_annealing_time',
    'Perovskite_deposition_solvent_annealing',
    # Hole transport layer (HTL) and back contact
    'HTL_stack_sequence',
    'HTL_thickness_list',
    'HTL_deposition_procedure',
    'Backcontact_stack_sequence',
    'Backcontact_thickness_list',
    'Backcontact_deposition_procedure',
    # Target
    'JV_default_PCE',
]
df = df.loc[:, cols]  # keep only the listed columns, in the listed order
df.shape

(256, 30)

In [7]:
# Feature columns: everything except the target. The target is dropped by name
# rather than by position, so the selection stays correct even if the column
# order of `df` changes.
feats = df.drop(columns=["JV_default_PCE"])
feats.shape  # shape of the features

(256, 29)

In [9]:
# Separating the feature names into numerical, categorical and boolean groups by dtype
numerical = feats.select_dtypes(include="float64").columns.tolist()
categorical = feats.select_dtypes(include="object").columns.tolist()
boolean = feats.select_dtypes(include="bool").columns.tolist()

# Report the size of each group
for label, group in (("Numerical: ", numerical),
                     ("Categorical: ", categorical),
                     ("Boolean: ", boolean)):
    print(label, len(group))

Numerical:  5
Categorical:  13
Boolean:  11


In [10]:
# Features describing the device architecture (what the stack is made of).
# The order of this list is the order in which the features are reported below.
feats_arch = [
    # Cell and substrate
    'Cell_architecture',
    'Cell_flexible',
    'Cell_semitransparent',
    'Substrate_stack_sequence',
    # Electron transport layer
    'ETL_stack_sequence',
    'ETL_thickness',
    # Perovskite layer
    'Perovskite_dimension_2D',
    'Perovskite_dimension_2D3D_mixture',
    'Perovskite_dimension_3D',
    'Perovskite_dimension_3D_with_2D_capping_layer',
    'Perovskite_composition_perovskite_ABC3_structure',
    'Perovskite_composition_long_form',
    'Perovskite_composition_inorganic',
    'Perovskite_thickness',
    'Perovskite_band_gap',
    'Perovskite_band_gap_graded',
    # Hole transport layer
    'HTL_stack_sequence',
    'HTL_thickness_list',
    # Back contact
    'Backcontact_stack_sequence',
    'Backcontact_thickness_list',
]

# Features describing how each layer is deposited
feats_deposition = [
    'ETL_deposition_procedure',
    'Perovskite_deposition_procedure',
    'Perovskite_deposition_solvents',
    'Perovskite_deposition_quenching_induced_crystallisation',
    'Perovskite_deposition_thermal_annealing_temperature',
    'Perovskite_deposition_thermal_annealing_time',
    'Perovskite_deposition_solvent_annealing',
    'HTL_deposition_procedure',
    'Backcontact_deposition_procedure',
]

# Size of each group next to the total number of feature columns
print(f"Architecture features: {len(feats_arch)}\nDeposition features: {len(feats_deposition)}\nTotal features: {feats.shape[1]}")

Architecture features: 20
Deposition features: 9
Total features: 29


In [11]:
# Distinct values observed in the dataset for every architecture feature,
# keyed by feature name, followed by a count of those values per feature
archi_unique = {name: list(df[name].unique()) for name in feats_arch}
for name, labels in archi_unique.items():
    print(f"{name} : {len(labels)}")

Cell_architecture : 2
Cell_flexible : 1
Cell_semitransparent : 1
Substrate_stack_sequence : 2
ETL_stack_sequence : 19
ETL_thickness : 64
Perovskite_dimension_2D : 2
Perovskite_dimension_2D3D_mixture : 2
Perovskite_dimension_3D : 2
Perovskite_dimension_3D_with_2D_capping_layer : 1
Perovskite_composition_perovskite_ABC3_structure : 2
Perovskite_composition_long_form : 80
Perovskite_composition_inorganic : 2
Perovskite_thickness : 55
Perovskite_band_gap : 45
Perovskite_band_gap_graded : 1
HTL_stack_sequence : 20
HTL_thickness_list : 69
Backcontact_stack_sequence : 7
Backcontact_thickness_list : 18


In [12]:
# Most frequent value of each numerical feature.
# Series.mode()[0] is used because it is the same statistic the design pool is
# filled with further down; value_counts().idxmax() can pick a different value
# when several values are tied for the highest count.
for feature in numerical:
    most_frequent = df[feature].mode()[0]
    print("Most frequent "+feature+" : "+str(most_frequent))

Most frequent ETL_thickness : 26.0
Most frequent Perovskite_thickness : 350.0
Most frequent Perovskite_band_gap : 1.6
Most frequent HTL_thickness_list : 344.0
Most frequent Backcontact_thickness_list : 100.0


### Design pool for architecture-related features

In [None]:
"""feats_arch = [        
                  'Cell_architecture',   
                  'Cell_flexible',       
                  'Cell_semitransparent', 
                  'Substrate_stack_sequence',
                  'ETL_stack_sequence',
                  'ETL_thickness', 
                  'Perovskite_dimension_2D',  
                  'Perovskite_dimension_2D3D_mixture',
                  'Perovskite_dimension_3D', 
                  'Perovskite_dimension_3D_with_2D_capping_layer', 
                  'Perovskite_composition_perovskite_ABC3_structure', 
                  'Perovskite_composition_long_form', 
                  'Perovskite_composition_inorganic', 
                  'Perovskite_thickness', 
                  'Perovskite_band_gap', 
                  'Perovskite_band_gap_graded', 
                  'HTL_stack_sequence', 
                  'HTL_thickness_list', 
                  'Backcontact_stack_sequence', 
                  'Backcontact_thickness_list'
                ]
"""
def _cartesian_design_pool(levels):
    """Build a DataFrame containing every combination of the given value lists.

    Parameters
    ----------
    levels : dict
        Maps a column name to the list of values that column may take. The key
        order sets both the column order and the nesting order: the first key
        varies slowest and the last key varies fastest, exactly like a set of
        nested ``for`` loops written in that order.

    Returns
    -------
    pandas.DataFrame
        One row per combination and one column per key of ``levels``.
    """
    sizes = [len(values) for values in levels.values()]
    columns = {}
    for position, (name, values) in enumerate(levels.items()):
        # Consecutive rows that share one value of this column
        # (= number of combinations of all the columns nested inside it).
        run_length = int(np.prod(sizes[position + 1:], dtype=np.int64))
        # Times the full run of values is repeated
        # (= number of combinations of all the columns nested outside it).
        n_cycles = int(np.prod(sizes[:position], dtype=np.int64))
        picks = np.tile(np.repeat(np.arange(len(values)), run_length), n_cycles)
        # Going through a Series lets pandas infer the dtype (bool stays bool,
        # strings stay object) before the values are expanded.
        columns[name] = pd.Series(values).to_numpy()[picks]
    return pd.DataFrame(columns)


# Architecture features that are enumerated exhaustively, from the slowest- to the
# fastest-varying column. The thickness and band-gap features are left out here on
# purpose; they are added afterwards as constant columns.
pool_features = [
    'Cell_architecture',
    'Cell_flexible',
    'Cell_semitransparent',
    'Substrate_stack_sequence',
    'ETL_stack_sequence',
    'Perovskite_dimension_2D',
    'Perovskite_dimension_2D3D_mixture',
    'Perovskite_dimension_3D',
    'Perovskite_dimension_3D_with_2D_capping_layer',
    'Perovskite_composition_perovskite_ABC3_structure',
    'Perovskite_composition_long_form',
    'Perovskite_composition_inorganic',
    'Perovskite_band_gap_graded',
    'HTL_stack_sequence',
    'Backcontact_stack_sequence',
]

# Create a design pool of all possible combinations of architecture-related features.
# The product is built with vectorised numpy operations instead of nested Python
# loops, which avoids tens of millions of interpreted iterations and list appends.
dp_archi = _cartesian_design_pool({name: archi_unique[name] for name in pool_features})
dp_archi.shape # Initial shape of the design pool

(27238400, 15)

#### Adding the remaining columns

In [14]:
# Every feature that is not enumerated in the pool is held constant at its most
# frequent value in the dataset: first the thickness / band-gap features, then
# all deposition-related features.
constant_feats = ['ETL_thickness', 'Perovskite_thickness', 'Perovskite_band_gap',
                  'HTL_thickness_list', 'Backcontact_thickness_list'] + feats_deposition
for name in constant_feats:
    dp_archi[name] = df[name].mode()[0]

# Reorder the columns to match the feature table
dp_archi = dp_archi[feats.columns.tolist()]
dp_archi.shape # Final shape of the design pool of architecture related features

(27238400, 29)

In [15]:
# Preview the first rows of the completed design pool
dp_archi.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_quenching_induced_crystallisation,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure
0,pin,False,False,SLG | ITO,PCBM-60 | BCP,26.0,Evaporation | Evaporation,True,False,False,...,True,100,10,False,PEDOT:PSS,344.0,Spin-coating,Ag,100.0,Evaporation
1,pin,False,False,SLG | ITO,PCBM-60 | BCP,26.0,Evaporation | Evaporation,True,False,False,...,True,100,10,False,PEDOT:PSS,344.0,Spin-coating,Al,100.0,Evaporation
2,pin,False,False,SLG | ITO,PCBM-60 | BCP,26.0,Evaporation | Evaporation,True,False,False,...,True,100,10,False,PEDOT:PSS,344.0,Spin-coating,Au,100.0,Evaporation
3,pin,False,False,SLG | ITO,PCBM-60 | BCP,26.0,Evaporation | Evaporation,True,False,False,...,True,100,10,False,PEDOT:PSS,344.0,Spin-coating,Ag | Au,100.0,Evaporation
4,pin,False,False,SLG | ITO,PCBM-60 | BCP,26.0,Evaporation | Evaporation,True,False,False,...,True,100,10,False,PEDOT:PSS,344.0,Spin-coating,Cu,100.0,Evaporation


In [16]:
# Saving the design pool as a pickle file.
# The output folder is created when missing, and the `with` block guarantees the
# file handle is closed (and the data flushed) once the dump finishes.
out_dir = os.path.join(pwd, "design_pool")
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "design_pool_archi.pkl"), "wb") as fh:
    pickle.dump(dp_archi, fh)