In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import KNNImputer
from category_encoders import TargetEncoder
import pickle

In [2]:
# Current working directory of the running process; every dataset path in this
# notebook is built by appending a relative path to it.
pwd = os.getcwd()

In [3]:
# Load the cleaned perovskite dataset from disk.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
cleaned_csv_path = pwd+"/Perovskite_dataset/perovskite_cleaned_dataset.csv"
df = pd.read_csv(cleaned_csv_path, low_memory=False)
df.shape # (rows, columns) of the cleaned dataset

(26457, 34)

In [4]:
# Preview the first rows of the cleaned dataset
df.head()

Unnamed: 0,Ref_DOI_number,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,...,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_Voc,JV_default_Jsc,JV_default_FF,JV_default_PCE
0,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,,Spin-coating,Au,90.0,Evaporation,0.002,0.22,0.57,0.0
1,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,,Spin-coating,Au,90.0,Evaporation,0.12,0.49,0.25,0.0
2,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,,Spin-coating,Au,90.0,Evaporation,0.135,3.69,0.26,0.13
3,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,,Spin-coating,Au,90.0,Evaporation,0.227,1.32,0.41,0.12
4,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,,Spin-coating,Au,90.0,Evaporation,0.19,1.57,0.34,0.1


In [5]:
# Count missing entries per column to find the columns that need imputation
df.isna().sum()

Ref_DOI_number                                                 0
Cell_architecture                                              0
Cell_flexible                                                  0
Cell_semitransparent                                           0
Substrate_stack_sequence                                       0
ETL_stack_sequence                                             0
ETL_thickness                                              18553
ETL_deposition_procedure                                       0
Perovskite_dimension_2D                                        0
Perovskite_dimension_2D3D_mixture                              0
Perovskite_dimension_3D                                        0
Perovskite_dimension_3D_with_2D_capping_layer                  0
Perovskite_composition_perovskite_ABC3_structure               0
Perovskite_composition_long_form                               0
Perovskite_thickness                                       18245
Perovskite_composition_in

In [6]:
# Numeric columns that will be filled by the imputer
missing_value_cols = [
    'ETL_thickness',
    'Perovskite_thickness',
    'Perovskite_band_gap',
    'HTL_thickness_list',
    'Backcontact_thickness_list',
]

#### Filling the missing values using KNN imputer

In [7]:
# Work on a copy so the originally loaded frame (df) keeps its missing values.
df_new = df.copy()
# Each missing entry is filled from the 5 nearest rows, where distance is computed
# over the listed columns only.
# NOTE(review): the columns are not scaled before imputation, so columns with a
# larger numeric range presumably dominate the neighbour distance — confirm this is intended.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df_new[missing_value_cols])
df_new[missing_value_cols] = imputed_values

In [9]:
# Preview the first rows after imputation
df_new.head()

Unnamed: 0,Ref_DOI_number,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,...,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_Voc,JV_default_Jsc,JV_default_FF,JV_default_PCE
0,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,354.0,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,128.0,Spin-coating,Au,90.0,Evaporation,0.002,0.22,0.57,0.0
1,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,297.0,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,130.0,Spin-coating,Au,90.0,Evaporation,0.12,0.49,0.25,0.0
2,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,297.0,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,130.0,Spin-coating,Au,90.0,Evaporation,0.135,3.69,0.26,0.13
3,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,297.0,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,130.0,Spin-coating,Au,90.0,Evaporation,0.227,1.32,0.41,0.12
4,10.1021/jp5126624,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,192.4,Spray-pyrolys | Spin-coating,False,False,...,Spiro-MeOTAD,20.0,Spin-coating,Au,90.0,Evaporation,0.19,1.57,0.34,0.1


In [10]:
# Persist the imputed dataset; index=False keeps the row index out of the CSV
imputed_csv_path = pwd+"/Perovskite_dataset/perovskite_dataset_knn_imputed.csv"
df_new.to_csv(imputed_csv_path, index=False)

## Data Encoding

In [11]:
# Reload the imputed data, shuffle the rows with a fixed seed, and write the
# shuffled rows back to the same file.
# NOTE: because the file is overwritten in place, re-running only this cell
# shuffles the already-shuffled rows again; re-run from the imputation step to
# regenerate the file first.
shuffled_csv_path = pwd+"/Perovskite_dataset/perovskite_dataset_knn_imputed.csv"
df = (
    pd.read_csv(shuffled_csv_path)
    .sample(frac=1, random_state=0)  # frac=1 returns every row in random order
    .reset_index(drop=True)          # discard the pre-shuffle row labels
)
df.to_csv(shuffled_csv_path, index=False)
df.head()

Unnamed: 0,Ref_DOI_number,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,...,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_Voc,JV_default_Jsc,JV_default_FF,JV_default_PCE
0,10.1021/acs.chemmater.5b03902,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp | Al2O3-c,370.0,Spin-coating | Spin-coating | CBD,False,False,...,Spiro-MeOTAD,148.0,Spin-coating,Au,80.0,Evaporation,0.92,18.6,0.678,11.6
1,10.1039/c9ta08081c,pin,False,False,SLG | ITO,PCBM-60 | Bphen,279.6,Spin-coating | Spin-coating,False,False,...,PTAA,34.0,Spin-coating,Ag,100.0,Evaporation,1.08,21.33,0.783,18.2
2,10.1246/cl.150238,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,480.0,Spray-pyrolys | Spin-coating,False,False,...,CuSCN,218.0,Doctor blading,Au,50.0,Evaporation,0.689,15.3,0.436,4.6
3,10.1016/j.solener.2019.09.056,nip,False,False,SLG | ITO,TiO2-c,279.6,Spin-coating,False,False,...,Spiro-MeOTAD,34.0,Spin-coating,Ag,100.0,Evaporation,1.08,19.83,0.66,14.19
4,10.1021/acs.nanolett.8b03552,nip,False,False,SLG | FTO,TiO2-c | TiO2-mp,230.0,Spray-pyrolys | Spin-coating,True,False,...,Spiro-MeOTAD,20.0,Spin-coating,Au,80.0,Evaporation,0.813,11.296,0.708,6.52


In [14]:
# Select feature list and target variables by column position.
# .copy() gives X and y their own data, so the later column assignments on X
# (dtype conversion) do not act on a slice of df and raise SettingWithCopyWarning.
X = df.iloc[:,1:-4].copy() # Drop the first column (Ref_DOI_number) and the four target columns
y = df.iloc[:,-4:].copy()  # Last four columns are the target variables
# Positional slicing silently selects the wrong columns if the CSV layout changes, so verify it.
assert list(y.columns) == ['JV_default_Voc','JV_default_Jsc','JV_default_FF','JV_default_PCE']
assert 'Ref_DOI_number' not in X.columns
print(f"Shape of X: {X.shape}\nShape of y: {y.shape}")

Shape of X: (26457, 29)
Shape of y: (26457, 4)


In [15]:
# Show the dtype of every feature column; the dtype decides whether a column is
# later handled as numerical (float64), categorical (object) or boolean (bool).
print(X.dtypes)

Cell_architecture                                           object
Cell_flexible                                                 bool
Cell_semitransparent                                          bool
Substrate_stack_sequence                                    object
ETL_stack_sequence                                          object
ETL_thickness                                              float64
ETL_deposition_procedure                                    object
Perovskite_dimension_2D                                       bool
Perovskite_dimension_2D3D_mixture                             bool
Perovskite_dimension_3D                                       bool
Perovskite_dimension_3D_with_2D_capping_layer                 bool
Perovskite_composition_perovskite_ABC3_structure              bool
Perovskite_composition_long_form                            object
Perovskite_thickness                                       float64
Perovskite_composition_inorganic                              

In [16]:
# Show the dtype of every target column
print(y.dtypes)

JV_default_Voc    float64
JV_default_Jsc    float64
JV_default_FF     float64
JV_default_PCE    float64
dtype: object


In [17]:
# Separating feature names by datatype.
# NOTE: run this before the dtype-conversion cell; after that cell the bool and
# object columns have been cast to other dtypes and these lists would come back empty.
numerical = X.select_dtypes(include='float64').columns.tolist()
categorical = X.select_dtypes(include='object').columns.tolist()
boolean = X.select_dtypes(include='bool').columns.tolist()
print(f"No. of Numerical Features = {len(numerical)}")
print(f"No. of Categorical Features = {len(categorical)}")
print(f"No. of Boolean Features = {len(boolean)}")

No. of Numerical Features = 5
No. of Categorical Features = 13
No. of Boolean Features = 11


In [18]:
# Convert TRUE/FALSE columns to 1/0 integers and string columns to pandas 'category' dtype.
# A single astype() call with a per-column mapping returns a new frame, so nothing is
# assigned into a slice of df (no SettingWithCopyWarning) and re-running the cell is harmless.
dtype_map = {col: 'int' for col in boolean}
dtype_map.update({col: 'category' for col in categorical})
X = X.astype(dtype_map)

In [19]:
# Report how many distinct labels each categorical feature holds
print("Columns: No. of unique labels")
for col_name in categorical:
    n_labels = len(X[col_name].unique())
    print(f"{col_name}: {n_labels}")

Columns: No. of unique labels
Cell_architecture: 3
Substrate_stack_sequence: 115
ETL_stack_sequence: 1050
ETL_deposition_procedure: 237
Perovskite_composition_long_form: 1608
Perovskite_deposition_procedure: 67
Perovskite_deposition_solvents: 167
Perovskite_deposition_thermal_annealing_temperature: 511
Perovskite_deposition_thermal_annealing_time: 514
HTL_stack_sequence: 1520
HTL_deposition_procedure: 97
Backcontact_stack_sequence: 173
Backcontact_deposition_procedure: 69


In [20]:
# One independent copy of the feature frame per target, so each copy can be
# encoded against its own target without affecting the others.
X_Jsc = X.copy()
X_Voc = X.copy()
X_FF = X.copy()
X_PCE = X.copy()
# Target series, selected from y by column name
y_Jsc = y['JV_default_Jsc']
y_Voc = y['JV_default_Voc']
y_FF = y['JV_default_FF']
y_PCE = y['JV_default_PCE']

## Target Encoding

#### Target Encoding using PCE

In [21]:
# Target-encode the categorical features against PCE and persist the fitted encoder.
# NOTE(review): the encoder is fitted on every row; if a train/test split is made
# downstream, the encoded features already carry target information from the
# held-out rows — consider fitting on the training portion only.
TE_PCE = TargetEncoder() # Target encoder for PCE
TE_PCE.fit(X_PCE[categorical],y_PCE) # Fitting encoder with PCE column
X_PCE[categorical] = TE_PCE.transform(X_PCE[categorical]) # Replacing category labels with their numeric encodings
# Save the fitted encoder as a pickle file. The with-block closes the file handle
# deterministically, so the pickle is fully written before the cell finishes
# (the previous bare open() call never closed the handle).
with open(pwd+"/Encoded_dataset/TE_enc.pkl","wb") as encoder_file:
    pickle.dump(TE_PCE, encoder_file)

In [22]:
# Preview the features after encoding against PCE
X_PCE.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_quenching_induced_crystallisation,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure
0,12.809252,0,0,12.609307,12.026303,370.0,13.592894,0,0,1,...,0,12.385423,12.247283,0,13.136545,148.0,12.52143,13.046064,80.0,12.675014
1,12.009877,0,0,12.57157,13.768498,279.6,12.35796,0,0,1,...,1,13.963644,14.258854,0,14.255519,34.0,12.52143,12.43139,100.0,12.675014
2,12.809252,0,0,12.609307,12.113869,480.0,12.879183,0,0,1,...,0,9.911379,9.447088,0,9.837867,218.0,11.874105,13.046064,50.0,12.675014
3,12.809252,0,0,12.57157,12.7156,279.6,12.572449,0,0,1,...,1,12.848716,12.488134,0,13.136545,34.0,12.52143,12.43139,100.0,12.675014
4,12.809252,0,0,12.609307,12.113869,230.0,12.879183,1,0,0,...,0,7.410069,10.896159,0,13.136545,20.0,12.52143,13.046064,80.0,12.675014


In [24]:
# Append the PCE target as the last column of the encoded feature frame
pce_parts = [X_PCE, y_PCE]
df_encoded_PCE = pd.concat(pce_parts, axis="columns")
df_encoded_PCE.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_PCE
0,12.809252,0,0,12.609307,12.026303,370.0,13.592894,0,0,1,...,12.385423,12.247283,0,13.136545,148.0,12.52143,13.046064,80.0,12.675014,11.6
1,12.009877,0,0,12.57157,13.768498,279.6,12.35796,0,0,1,...,13.963644,14.258854,0,14.255519,34.0,12.52143,12.43139,100.0,12.675014,18.2
2,12.809252,0,0,12.609307,12.113869,480.0,12.879183,0,0,1,...,9.911379,9.447088,0,9.837867,218.0,11.874105,13.046064,50.0,12.675014,4.6
3,12.809252,0,0,12.57157,12.7156,279.6,12.572449,0,0,1,...,12.848716,12.488134,0,13.136545,34.0,12.52143,12.43139,100.0,12.675014,14.19
4,12.809252,0,0,12.609307,12.113869,230.0,12.879183,1,0,0,...,7.410069,10.896159,0,13.136545,20.0,12.52143,13.046064,80.0,12.675014,6.52


In [25]:
# Save the PCE-encoded dataset; index=False keeps the row index out of the CSV
encoded_pce_path = pwd+"/Encoded_dataset/df_encoded_PCE.csv"
df_encoded_PCE.to_csv(encoded_pce_path, index=False)

#### Target Encoding using Voc

In [None]:
# Target-encode the categorical features against Voc.
# NOTE(review): fitted on every row — if a train/test split follows, consider
# fitting on the training portion only to avoid target leakage.
categorical_Voc = X_Voc[categorical]  # snapshot of the raw categorical columns
TE_Voc = TargetEncoder()
TE_Voc.fit(categorical_Voc, y_Voc)
X_Voc[categorical] = TE_Voc.transform(categorical_Voc)

In [None]:
# Preview the features after encoding against Voc
X_Voc.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_quenching_induced_crystallisation,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure
0,0.984353,0,0,0.9769,0.946769,370.0,0.985721,0,0,1,...,0,0.946647,0.940956,0,0.9934,148.0,0.966546,0.985235,80.0,0.968357
1,0.933317,0,0,0.954397,0.964355,279.6,0.959158,0,0,1,...,1,0.981932,1.005764,0,0.997601,34.0,0.966546,0.954824,100.0,0.968357
2,0.984353,0,0,0.9769,0.95695,480.0,0.980591,0,0,1,...,0,0.922134,0.913764,0,0.87,218.0,0.969644,0.985235,50.0,0.968357
3,0.984353,0,0,0.954397,0.9971,279.6,0.976025,0,0,1,...,1,0.974922,0.967115,0,0.9934,34.0,0.966546,0.954824,100.0,0.968357
4,0.984353,0,0,0.9769,0.95695,230.0,0.980591,1,0,0,...,0,0.89265,0.965245,0,0.9934,20.0,0.966546,0.985235,80.0,0.968357


In [None]:
# Append the Voc target as the last column of the encoded feature frame
voc_parts = [X_Voc, y_Voc]
df_encoded_Voc = pd.concat(voc_parts, axis="columns")
df_encoded_Voc.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_Voc
0,0.984353,0,0,0.9769,0.946769,370.0,0.985721,0,0,1,...,0.946647,0.940956,0,0.9934,148.0,0.966546,0.985235,80.0,0.968357,0.92
1,0.933317,0,0,0.954397,0.964355,279.6,0.959158,0,0,1,...,0.981932,1.005764,0,0.997601,34.0,0.966546,0.954824,100.0,0.968357,1.08
2,0.984353,0,0,0.9769,0.95695,480.0,0.980591,0,0,1,...,0.922134,0.913764,0,0.87,218.0,0.969644,0.985235,50.0,0.968357,0.689
3,0.984353,0,0,0.954397,0.9971,279.6,0.976025,0,0,1,...,0.974922,0.967115,0,0.9934,34.0,0.966546,0.954824,100.0,0.968357,1.08
4,0.984353,0,0,0.9769,0.95695,230.0,0.980591,1,0,0,...,0.89265,0.965245,0,0.9934,20.0,0.966546,0.985235,80.0,0.968357,0.813


In [None]:
# Save the Voc-encoded dataset; index=False keeps the row index out of the CSV
encoded_voc_path = pwd+"/Encoded_dataset/df_encoded_Voc.csv"
df_encoded_Voc.to_csv(encoded_voc_path, index=False)

#### Target Encoding using Jsc

In [None]:
# Target-encode the categorical features against Jsc.
# NOTE(review): fitted on every row — if a train/test split follows, consider
# fitting on the training portion only to avoid target leakage.
categorical_Jsc = X_Jsc[categorical]  # snapshot of the raw categorical columns
TE_Jsc = TargetEncoder()
TE_Jsc.fit(categorical_Jsc, y_Jsc)
X_Jsc[categorical] = TE_Jsc.transform(categorical_Jsc)

In [None]:
# Preview the features after encoding against Jsc
X_Jsc.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_quenching_induced_crystallisation,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure
0,18.602188,0,0,18.410445,18.297473,370.0,19.51287,0,0,1,...,0,18.562972,18.155324,0,18.828537,148.0,18.377828,18.710466,80.0,18.510001
1,17.917749,0,0,18.455145,19.631051,279.6,18.271921,0,0,1,...,1,19.782775,19.386968,0,19.323493,34.0,18.377828,18.58448,100.0,18.510001
2,18.602188,0,0,18.410445,18.013051,480.0,18.304941,0,0,1,...,0,16.893973,16.617931,0,15.818751,218.0,17.257697,18.710466,50.0,18.510001
3,18.602188,0,0,18.455145,18.525106,279.6,18.494272,0,0,1,...,1,18.637209,18.37185,0,18.828537,34.0,18.377828,18.58448,100.0,18.510001
4,18.602188,0,0,18.410445,18.013051,230.0,18.304941,1,0,0,...,0,11.802628,15.885777,0,18.828537,20.0,18.377828,18.710466,80.0,18.510001


In [None]:
# Append the Jsc target as the last column of the encoded feature frame
jsc_parts = [X_Jsc, y_Jsc]
df_encoded_Jsc = pd.concat(jsc_parts, axis="columns")
df_encoded_Jsc.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_Jsc
0,18.602188,0,0,18.410445,18.297473,370.0,19.51287,0,0,1,...,18.562972,18.155324,0,18.828537,148.0,18.377828,18.710466,80.0,18.510001,18.6
1,17.917749,0,0,18.455145,19.631051,279.6,18.271921,0,0,1,...,19.782775,19.386968,0,19.323493,34.0,18.377828,18.58448,100.0,18.510001,21.33
2,18.602188,0,0,18.410445,18.013051,480.0,18.304941,0,0,1,...,16.893973,16.617931,0,15.818751,218.0,17.257697,18.710466,50.0,18.510001,15.3
3,18.602188,0,0,18.455145,18.525106,279.6,18.494272,0,0,1,...,18.637209,18.37185,0,18.828537,34.0,18.377828,18.58448,100.0,18.510001,19.83
4,18.602188,0,0,18.410445,18.013051,230.0,18.304941,1,0,0,...,11.802628,15.885777,0,18.828537,20.0,18.377828,18.710466,80.0,18.510001,11.296


In [None]:
# Save the Jsc-encoded dataset; index=False keeps the row index out of the CSV
encoded_jsc_path = pwd+"/Encoded_dataset/df_encoded_Jsc.csv"
df_encoded_Jsc.to_csv(encoded_jsc_path, index=False)

#### Target Encoding using FF

In [None]:
# Target-encode the categorical features against FF.
# NOTE(review): fitted on every row — if a train/test split follows, consider
# fitting on the training portion only to avoid target leakage.
categorical_FF = X_FF[categorical]  # snapshot of the raw categorical columns
TE_FF = TargetEncoder()
TE_FF.fit(categorical_FF, y_FF)
X_FF[categorical] = TE_FF.transform(categorical_FF)

In [None]:
# Preview the features after encoding against FF
X_FF.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_quenching_induced_crystallisation,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure
0,0.653286,0,0,0.652116,0.649305,370.0,0.675102,0,0,1,...,0,0.643467,0.64786,0,0.658181,148.0,0.658377,0.657498,80.0,0.660435
1,0.670308,0,0,0.671036,0.719703,279.6,0.660002,0,0,1,...,1,0.692515,0.693115,0,0.692365,34.0,0.658377,0.664057,100.0,0.660435
2,0.653286,0,0,0.652116,0.645703,480.0,0.662964,0,0,1,...,0,0.601828,0.596372,0,0.624823,218.0,0.646018,0.657498,50.0,0.660435
3,0.653286,0,0,0.671036,0.650048,279.6,0.652413,0,0,1,...,1,0.662327,0.65758,0,0.658181,34.0,0.658377,0.664057,100.0,0.660435
4,0.653286,0,0,0.652116,0.645703,230.0,0.662964,1,0,0,...,0,0.652178,0.675118,0,0.658181,20.0,0.658377,0.657498,80.0,0.660435


In [None]:
# Append the FF target as the last column of the encoded feature frame
ff_parts = [X_FF, y_FF]
df_encoded_FF = pd.concat(ff_parts, axis="columns")
df_encoded_FF.head()

Unnamed: 0,Cell_architecture,Cell_flexible,Cell_semitransparent,Substrate_stack_sequence,ETL_stack_sequence,ETL_thickness,ETL_deposition_procedure,Perovskite_dimension_2D,Perovskite_dimension_2D3D_mixture,Perovskite_dimension_3D,...,Perovskite_deposition_thermal_annealing_temperature,Perovskite_deposition_thermal_annealing_time,Perovskite_deposition_solvent_annealing,HTL_stack_sequence,HTL_thickness_list,HTL_deposition_procedure,Backcontact_stack_sequence,Backcontact_thickness_list,Backcontact_deposition_procedure,JV_default_FF
0,0.653286,0,0,0.652116,0.649305,370.0,0.675102,0,0,1,...,0.643467,0.64786,0,0.658181,148.0,0.658377,0.657498,80.0,0.660435,0.678
1,0.670308,0,0,0.671036,0.719703,279.6,0.660002,0,0,1,...,0.692515,0.693115,0,0.692365,34.0,0.658377,0.664057,100.0,0.660435,0.783
2,0.653286,0,0,0.652116,0.645703,480.0,0.662964,0,0,1,...,0.601828,0.596372,0,0.624823,218.0,0.646018,0.657498,50.0,0.660435,0.436
3,0.653286,0,0,0.671036,0.650048,279.6,0.652413,0,0,1,...,0.662327,0.65758,0,0.658181,34.0,0.658377,0.664057,100.0,0.660435,0.66
4,0.653286,0,0,0.652116,0.645703,230.0,0.662964,1,0,0,...,0.652178,0.675118,0,0.658181,20.0,0.658377,0.657498,80.0,0.660435,0.708


In [None]:
# Save the FF-encoded dataset; index=False keeps the row index out of the CSV
encoded_ff_path = pwd+"/Encoded_dataset/df_encoded_FF.csv"
df_encoded_FF.to_csv(encoded_ff_path, index=False)