In [1]:
import os

import numpy as np
import pandas as pd

In [3]:
# Summarise the dataset: column names, non-null counts and dtypes.
# NOTE(review): `df_initial` is not created in any cell visible here — presumably it is
# loaded in an earlier cell (In [2]); confirm the notebook survives Restart & Run All.
df_initial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DOI                     5000 non-null   object 
 1   Date                    4573 non-null   object 
 2   Journal                 4573 non-null   object 
 3   Title                   4573 non-null   object 
 4   Name                    4996 non-null   object 
 5   measurement_error       5000 non-null   float64
 6   measurement_wavelength  565 non-null    object 
 7   measurement_method      5000 non-null   object 
 8   normalised_name         2941 non-null   object 
 9   raw_value               5000 non-null   object 
 10  specifier               5000 non-null   object 
dtypes: float64(1), object(10)
memory usage: 429.8+ KB


In [4]:
# The substance names in this column are the lookup keys used to fetch descriptors.
df_initial["Name"]

0                   NaCl
1                 NaPh4B
2                   FITC
3                    CuO
4            Propan-1-ol
              ...       
4995                 ZnS
4996    tantalum nitride
4997           i-Butanol
4998         Cyclohexane
4999          Chloroform
Name: Name, Length: 5000, dtype: object

# Let's start our parsing with the PubChemPy API. Using it, we can collect 43 descriptors:

In [5]:
import pubchempy as pcp

In [6]:
# Descriptor columns returned by PubChemPy, discovered from a sample NaCl lookup.
sample_pcp_frame = pcp.get_compounds('NaCl', 'name', as_dataframe=True)
descriptors_pcp = sample_pcp_frame.columns.to_list()
for descriptor_name in descriptors_pcp:
    print(descriptor_name)

atom_stereo_count
atoms
bond_stereo_count
bonds
cactvs_fingerprint
canonical_smiles
charge
complexity
conformer_id_3d
conformer_rmsd_3d
coordinate_type
covalent_unit_count
defined_atom_stereo_count
defined_bond_stereo_count
effective_rotor_count_3d
elements
exact_mass
feature_selfoverlap_3d
fingerprint
h_bond_acceptor_count
h_bond_donor_count
heavy_atom_count
inchi
inchikey
isomeric_smiles
isotope_atom_count
iupac_name
mmff94_energy_3d
mmff94_partial_charges_3d
molecular_formula
molecular_weight
monoisotopic_mass
multipoles_3d
pharmacophore_features_3d
record
rotatable_bond_count
shape_fingerprint_3d
shape_selfoverlap_3d
tpsa
undefined_atom_stereo_count
undefined_bond_stereo_count
volume_3d
xlogp


In [7]:
# Create an empty DataFrame with one column per PubChemPy descriptor name.
df_pcp = pd.DataFrame(columns=descriptors_pcp)

In [None]:
# Look up every substance name in PubChem and keep the descriptors of the first hit.
# A name that cannot be resolved contributes an all-NaN row, so df_pcp keeps exactly
# one row per row of df_initial, in the same order.
records_pcp = []
for name in df_initial["Name"]:
    try:
        hits = pcp.get_compounds(name, 'name', as_dataframe=True)
        # iloc[0] raises IndexError on an empty result; that is handled below.
        record = hits.iloc[0].to_dict()
    except Exception:
        # Catch `Exception` rather than using a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) can still stop this long-running loop.
        record = {}
    records_pcp.append(record)

# Build the frame once instead of concatenating row by row (which is quadratic),
# and reuse df_initial's index so a later column-wise concat lines up row by row.
df_pcp = pd.DataFrame(records_pcp, columns=descriptors_pcp, index=df_initial.index)

In [9]:
# Inspect the collected PubChemPy descriptor table.
df_pcp

Unnamed: 0,atom_stereo_count,atoms,bond_stereo_count,bonds,cactvs_fingerprint,canonical_smiles,charge,complexity,conformer_id_3d,conformer_rmsd_3d,...,pharmacophore_features_3d,record,rotatable_bond_count,shape_fingerprint_3d,shape_selfoverlap_3d,tpsa,undefined_atom_stereo_count,undefined_bond_stereo_count,volume_3d,xlogp
0,0.0,"[{'aid': 1, 'number': 17, 'element': 'Cl', 'x'...",0.0,"[{'aid1': 1, 'aid2': 2, 'order': 7}]",0000000000000000000000000010000000000100000000...,[Na+].[Cl-],0.0,2.0,,,...,,"{'id': {'id': {'cid': 5234}}, 'atoms': {'aid':...",0.0,,,0.0,0.0,0.0,,
1,,,,,,,,,,,...,,,,,,,,,,
2,0.0,"[{'aid': 1, 'number': 16, 'element': 'S', 'x':...",0.0,"[{'aid1': 1, 'aid2': 28, 'order': 2}, {'aid1':...",1100000001111010001110000000000001000000000000...,C1=CC2=C(C=C1N=C=S)C(=O)OC23C4=C(C=C(C=C4)O)OC...,0.0,668.0,,,...,,"{'id': {'id': {'cid': 18730}}, 'atoms': {'aid'...",1.0,,,120.0,0.0,0.0,,4.8
3,0.0,"[{'aid': 1, 'number': 29, 'element': 'Cu', 'x'...",0.0,"[{'aid1': 1, 'aid2': 2, 'order': 2}]",0000000000000000001000000000000000000000000000...,O=[Cu],0.0,2.0,,,...,,"{'id': {'id': {'cid': 14829}}, 'atoms': {'aid'...",0.0,,,17.1,0.0,0.0,,
4,0.0,"[{'aid': 1, 'number': 8, 'element': 'O', 'x': ...",0.0,"[{'aid1': 1, 'aid2': 3, 'order': 1}, {'aid1': ...",1100000001000000001000000000000000000000000000...,CCCO,0.0,7.2,,,...,,"{'id': {'id': {'cid': 1031}}, 'atoms': {'aid':...",1.0,,,20.2,0.0,0.0,,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,"[{'aid': 1, 'number': 15, 'element': 'P', 'x':...",0.0,"[{'aid1': 1, 'aid2': 2, 'order': 1}, {'aid1': ...",1000000001100000001110000000001000000000000000...,C(CC(=O)OP(=O)(O)O)C(=O)O,0.0,228.0,,,...,,"{'id': {'id': {'cid': 54104351}}, 'atoms': {'a...",5.0,,,121.0,0.0,0.0,,-2.1
4996,0.0,"[{'aid': 1, 'number': 73, 'element': 'Ta', 'x'...",0.0,"[{'aid1': 1, 'aid2': 2, 'order': 3}]",0000000000000010000000000000000000000000000000...,N#[Ta],0.0,10.0,,,...,,"{'id': {'id': {'cid': 82832}}, 'atoms': {'aid'...",0.0,,,23.8,0.0,0.0,,
4997,0.0,"[{'aid': 1, 'number': 8, 'element': 'O', 'x': ...",0.0,"[{'aid1': 1, 'aid2': 3, 'order': 1}, {'aid1': ...",1100000001100000001000000000000000000000000000...,CC(C)CO,0.0,17.6,,,...,,"{'id': {'id': {'cid': 6560}}, 'atoms': {'aid':...",1.0,,,20.2,0.0,0.0,,0.8
4998,0.0,"[{'aid': 1, 'number': 6, 'element': 'C', 'x': ...",0.0,"[{'aid1': 1, 'aid2': 2, 'order': 1}, {'aid1': ...",1100000001100000000000000000000000000000000000...,C1CCCCC1,0.0,15.5,,,...,,"{'id': {'id': {'cid': 8078}}, 'atoms': {'aid':...",0.0,,,0.0,0.0,0.0,,3.4


# Next, we can use RDKit to compute 43 more descriptors:

In [10]:
import rdkit
from rdkit.Chem import rdMolDescriptors
import pubchempy as pcp

In [11]:
# Names of every property that RDKit's Properties calculator offers.
descriptors_rdkit = rdMolDescriptors.Properties.GetAvailableProperties()
for descriptor_name in descriptors_rdkit:
    print(descriptor_name)

exactmw
amw
lipinskiHBA
lipinskiHBD
NumRotatableBonds
NumHBD
NumHBA
NumHeavyAtoms
NumAtoms
NumHeteroatoms
NumAmideBonds
FractionCSP3
NumRings
NumAromaticRings
NumAliphaticRings
NumSaturatedRings
NumHeterocycles
NumAromaticHeterocycles
NumSaturatedHeterocycles
NumAliphaticHeterocycles
NumSpiroAtoms
NumBridgeheadAtoms
NumAtomStereoCenters
NumUnspecifiedAtomStereoCenters
labuteASA
tpsa
CrippenClogP
CrippenMR
chi0v
chi1v
chi2v
chi3v
chi4v
chi0n
chi1n
chi2n
chi3n
chi4n
hallKierAlpha
kappa1
kappa2
kappa3
Phi


In [12]:
# Create an empty DataFrame with one column per RDKit descriptor name.
df_rdkit = pd.DataFrame(columns=descriptors_rdkit)

In [24]:
# For every substance name: resolve it to a canonical SMILES via PubChem, parse the
# SMILES with RDKit and compute all RDKit descriptors.
# A name that fails at any step contributes an all-NaN row, so df_rdkit keeps exactly
# one row per row of df_initial, in the same order.
get_descriptors = rdMolDescriptors.Properties(descriptors_rdkit)
records_rdkit = []
for name in df_initial["Name"]:
    try:
        compound = pcp.get_compounds(name, 'name')
        smile = compound[0].canonical_smiles
        mol = rdkit.Chem.MolFromSmiles(smile)
        record = dict(zip(descriptors_rdkit, get_descriptors.ComputeProperties(mol)))
    except Exception:
        # Catch `Exception` rather than using a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) can still stop this long-running loop.
        record = {}
    records_rdkit.append(record)

# Build the frame once instead of concatenating row by row (which is quadratic),
# and reuse df_initial's index so a later column-wise concat lines up row by row.
df_rdkit = pd.DataFrame(records_rdkit, columns=list(descriptors_rdkit), index=df_initial.index)



In [14]:
# Inspect the collected RDKit descriptor table.
df_rdkit

Unnamed: 0,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,57.958622,58.443,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,...,1.377964,0.000000,0.000000,0.000000,0.000000,1.290000,10.367820,2.290000,0.065194,11.871154
1,,,,,,,,,,,...,,,,,,,,,,
2,389.035793,389.388,6.0,2.0,1.0,2.0,7.0,28.0,39.0,7.0,...,14.170786,8.333274,4.931788,4.931788,3.815680,-3.670000,16.499791,5.661360,2.113625,3.336116
3,78.924512,79.545,1.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,...,0.709760,0.123091,0.000000,0.000000,0.000000,0.319481,2.319481,1.319481,1.449562,1.530255
4,60.057515,60.096,1.0,1.0,1.0,1.0,1.0,4.0,12.0,1.0,...,2.861427,1.523335,0.223607,0.223607,0.000000,-0.040000,3.960000,2.960000,1.960000,2.930400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,197.992939,198.067,7.0,3.0,4.0,3.0,4.0,12.0,19.0,8.0,...,5.836061,2.808234,0.763725,0.763725,0.387274,-0.910000,11.090000,4.199494,7.200011,3.881032
4996,194.951070,194.955,1.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,...,0.894427,0.200000,0.000000,0.000000,0.000000,0.450260,2.450260,1.450260,0.671200,1.776757
4997,74.073165,74.123,1.0,1.0,1.0,1.0,1.0,5.0,15.0,1.0,...,3.731671,1.879177,0.365148,0.365148,0.000000,-0.040000,4.960000,2.212525,3.960000,2.194825
4998,84.093900,84.162,0.0,0.0,0.0,0.0,0.0,6.0,18.0,0.0,...,4.242641,3.000000,1.500000,1.500000,1.060660,0.000000,4.166667,2.222222,1.000000,1.543210


# Next, we can use pymatgen (the Materials Project API) to collect 74 more descriptors:

In [15]:
import pymatgen
from pymatgen.ext.matproj import MPRester

In [17]:
# Query the Materials Project summary endpoint for NaCl as a sample lookup.
# When a formula has several records, we always take the first one.
# The API key is read from the MP_API_KEY environment variable instead of being
# hardcoded, so the credential is not stored in the notebook.
from pymatgen.ext.matproj import MPRester
with MPRester(api_key=os.environ["MP_API_KEY"]) as m:
    results = m.summary.search(formula="NaCl")

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
#let's get the list of available descriptors
descriptors_pymatgen = []
for i in range(len(list(results[0]))):
    descriptors_pymatgen.append(list(results[0])[i][0])
    print(list(results[0])[i][0])

builder_meta
nsites
elements
nelements
composition
composition_reduced
formula_pretty
formula_anonymous
chemsys
volume
density
density_atomic
symmetry
property_name
material_id
deprecated
deprecation_reasons
last_updated
origins
structure
task_ids
uncorrected_energy_per_atom
energy_per_atom
formation_energy_per_atom
energy_above_hull
is_stable
equilibrium_reaction_energy_per_atom
decomposes_to
xas
grain_boundaries
band_gap
cbm
vbm
efermi
is_gap_direct
is_metal
es_source_calc_id
bandstructure
dos
dos_energy_up
dos_energy_down
is_magnetic
ordering
total_magnetization
total_magnetization_normalized_vol
total_magnetization_normalized_formula_units
num_magnetic_sites
num_unique_magnetic_sites
types_of_magnetic_species
k_voigt
k_reuss
k_vrh
g_voigt
g_reuss
g_vrh
universal_anisotropy
homogeneous_poisson
e_total
e_ionic
e_electronic
n
e_ij_max
weighted_surface_energy_EV_PER_ANG2
weighted_surface_energy
weighted_work_function
surface_anisotropy
shape_factor
has_reconstructed
possible_species
ha

In [19]:
# Create an empty DataFrame with one column per Materials Project summary field.
df_pymatgen = pd.DataFrame(columns=descriptors_pymatgen)

In [None]:
# Query the Materials Project summary endpoint with every substance name used as a
# formula, and keep the fields of the first returned record.
# A name with no usable result contributes an all-NaN row, so df_pymatgen keeps
# exactly one row per row of df_initial, in the same order.
records_pymatgen = []
# One client is opened for the whole loop instead of one per row. The API key is read
# from the MP_API_KEY environment variable so the credential is not stored in the notebook.
with MPRester(api_key=os.environ["MP_API_KEY"]) as m:
    for name in df_initial["Name"]:
        try:
            results = m.summary.search(formula=name)
            # Iterating a record yields (field, value) pairs; results[0] raises
            # IndexError when nothing was found, which is handled below.
            record = {entry[0]: entry[1] for entry in list(results[0])}
        except Exception:
            # Catch `Exception` rather than using a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) can still stop this long-running loop.
            record = {}
        records_pymatgen.append(record)

# Build the frame once instead of concatenating row by row (which is quadratic),
# and reuse df_initial's index so a later column-wise concat lines up row by row.
df_pymatgen = pd.DataFrame(records_pymatgen, columns=descriptors_pymatgen, index=df_initial.index)

In [21]:
#let's check everything is OK in the obtained database
df_pymatgen

Unnamed: 0,builder_meta,nsites,elements,nelements,composition,composition_reduced,formula_pretty,formula_anonymous,chemsys,volume,...,weighted_surface_energy,weighted_work_function,surface_anisotropy,shape_factor,has_reconstructed,possible_species,has_props,theoretical,database_IDs,fields_not_requested
0,emmet_version='0.38.6' pymatgen_version='2022....,12.0,"[Element Cl, Element Na]",2.0,Na6 Cl6,Na1 Cl1,NaCl,AB,Cl-Na,284.082319,...,,,,,,"['Na+', 'Cl-']","[<HasProps.oxi_states: 'oxi_states'>, <HasProp...",True,{},[]
1,,,,,,,,,,,...,,,,,,,,,,
2,emmet_version='0.38.6' pymatgen_version='2022....,16.0,"[Element F, Element H, Element Mg, Element O]",4.0,Mg4 H4 O4 F4,Mg1 H1 O1 F1,MgHOF,ABCD,F-H-Mg-O,151.014639,...,,,,,,"['F-', 'Mg2+', 'H+', 'O2-']","[<HasProps.oxi_states: 'oxi_states'>, <HasProp...",False,{<Database.ICSD: 'icsd'>: ['icsd-186501']},[]
3,emmet_version='0.38.6' pymatgen_version='2022....,4.0,"[Element Cu, Element O]",2.0,Cu2 O2,Cu1 O1,CuO,AB,Cu-O,44.213620,...,,,,,,"['O2-', 'Cu2+']","[<HasProps.thermo: 'thermo'>, <HasProps.materi...",True,{},[]
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,emmet_version='0.38.6' pymatgen_version='2022....,100.0,"[Element S, Element Zn]",2.0,Zn50 S50,Zn1 S1,ZnS,AB,S-Zn,2478.010635,...,,,,,,"['S2-', 'Zn2+']",[<HasProps.electronic_structure: 'electronic_s...,True,{},[]
4996,,,,,,,,,,,...,,,,,,,,,,
4997,,,,,,,,,,,...,,,,,,,,,,
4998,,,,,,,,,,,...,,,,,,,,,,


# Finally, we will use PaDELPy to collect 1875 descriptors:

In [22]:
from padelpy import from_smiles

In [24]:
# Calculate molecular descriptors for a sample molecule (propane, used only as an example).
# from_smiles is called once and its result reused for both the names and the values,
# rather than running the descriptor calculation twice.
propane_descriptors = from_smiles('CCC')
descriptors_PaDELPy = list(propane_descriptors)
descriptors_values = list(propane_descriptors.values())

In [25]:
# Print the name of every available PaDEL descriptor, one per line.
for descriptor_name in descriptors_PaDELPy:
    print(descriptor_name)

nAcid
ALogP
ALogp2
AMR
apol
naAromAtom
nAromBond
nAtom
nHeavyAtom
nH
nB
nC
nN
nO
nS
nP
nF
nCl
nBr
nI
nX
ATS0m
ATS1m
ATS2m
ATS3m
ATS4m
ATS5m
ATS6m
ATS7m
ATS8m
ATS0v
ATS1v
ATS2v
ATS3v
ATS4v
ATS5v
ATS6v
ATS7v
ATS8v
ATS0e
ATS1e
ATS2e
ATS3e
ATS4e
ATS5e
ATS6e
ATS7e
ATS8e
ATS0p
ATS1p
ATS2p
ATS3p
ATS4p
ATS5p
ATS6p
ATS7p
ATS8p
ATS0i
ATS1i
ATS2i
ATS3i
ATS4i
ATS5i
ATS6i
ATS7i
ATS8i
ATS0s
ATS1s
ATS2s
ATS3s
ATS4s
ATS5s
ATS6s
ATS7s
ATS8s
AATS0m
AATS1m
AATS2m
AATS3m
AATS4m
AATS5m
AATS6m
AATS7m
AATS8m
AATS0v
AATS1v
AATS2v
AATS3v
AATS4v
AATS5v
AATS6v
AATS7v
AATS8v
AATS0e
AATS1e
AATS2e
AATS3e
AATS4e
AATS5e
AATS6e
AATS7e
AATS8e
AATS0p
AATS1p
AATS2p
AATS3p
AATS4p
AATS5p
AATS6p
AATS7p
AATS8p
AATS0i
AATS1i
AATS2i
AATS3i
AATS4i
AATS5i
AATS6i
AATS7i
AATS8i
AATS0s
AATS1s
AATS2s
AATS3s
AATS4s
AATS5s
AATS6s
AATS7s
AATS8s
ATSC0c
ATSC1c
ATSC2c
ATSC3c
ATSC4c
ATSC5c
ATSC6c
ATSC7c
ATSC8c
ATSC0m
ATSC1m
ATSC2m
ATSC3m
ATSC4m
ATSC5m
ATSC6m
ATSC7m
ATSC8m
ATSC0v
ATSC1v
ATSC2v
ATSC3v
ATSC4v
ATSC5v
ATSC6v
ATSC7v
ATSC8v
ATSC0

ETA_Eta_F
ETA_EtaP_F
ETA_Eta_L
ETA_EtaP_L
ETA_Eta_R_L
ETA_Eta_F_L
ETA_EtaP_F_L
ETA_Eta_B
ETA_EtaP_B
ETA_Eta_B_RC
ETA_EtaP_B_RC
FMF
fragC
nHBAcc
nHBAcc2
nHBAcc3
nHBAcc_Lipinski
nHBDon
nHBDon_Lipinski
HybRatio
IC0
IC1
IC2
IC3
IC4
IC5
TIC0
TIC1
TIC2
TIC3
TIC4
TIC5
SIC0
SIC1
SIC2
SIC3
SIC4
SIC5
CIC0
CIC1
CIC2
CIC3
CIC4
CIC5
BIC0
BIC1
BIC2
BIC3
BIC4
BIC5
MIC0
MIC1
MIC2
MIC3
MIC4
MIC5
ZMIC0
ZMIC1
ZMIC2
ZMIC3
ZMIC4
ZMIC5
Kier1
Kier2
Kier3
nAtomLC
nAtomP
nAtomLAC
MLogP
McGowan_Volume
MDEC-11
MDEC-12
MDEC-13
MDEC-14
MDEC-22
MDEC-23
MDEC-24
MDEC-33
MDEC-34
MDEC-44
MDEO-11
MDEO-12
MDEO-22
MDEN-11
MDEN-12
MDEN-13
MDEN-22
MDEN-23
MDEN-33
MLFER_A
MLFER_BH
MLFER_BO
MLFER_S
MLFER_E
MLFER_L
MPC2
MPC3
MPC4
MPC5
MPC6
MPC7
MPC8
MPC9
MPC10
TPC
piPC1
piPC2
piPC3
piPC4
piPC5
piPC6
piPC7
piPC8
piPC9
piPC10
TpiPC
R_TpiPCTPC
PetitjeanNumber
nRing
n3Ring
n4Ring
n5Ring
n6Ring
n7Ring
n8Ring
n9Ring
n10Ring
n11Ring
n12Ring
nG12Ring
nFRing
nF4Ring
nF5Ring
nF6Ring
nF7Ring
nF8Ring
nF9Ring
nF10Ring
nF11Ring
nF12Ring
nFG

In [26]:
# Create an empty DataFrame with one column per PaDEL descriptor name.
df_PaDELPy = pd.DataFrame(columns=descriptors_PaDELPy)

In [None]:
# For every substance name: resolve it to a canonical SMILES via PubChem and compute
# the PaDEL descriptors for that SMILES.
# A name that fails at any step contributes an all-NaN row, so df_PaDELPy keeps
# exactly one row per row of df_initial, in the same order.
records_padel = []
for name in df_initial["Name"]:
    try:
        compound = pcp.get_compounds(name, 'name')
        smile = compound[0].canonical_smiles
        record = from_smiles(smile)
    except Exception:
        # Catch `Exception` rather than using a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) can still stop this long-running loop.
        record = {}
    records_padel.append(record)

# Build the frame once instead of concatenating row by row (which is quadratic),
# and reuse df_initial's index so a later column-wise concat lines up row by row.
df_PaDELPy = pd.DataFrame(records_padel, columns=descriptors_PaDELPy, index=df_initial.index)

In [28]:
# Inspect the collected PaDEL descriptor table.
df_PaDELPy

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,0.0,,,,25.780000,0.0,0.0,2.0,2.0,0.0,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,0.0,0.6954,0.483581,114.8273,52.304723,0.0,0.0,39.0,28.0,11.0,...,,,,,,,,,,
3,0.0,-0.0233,0.000543,1.4429,6.902000,0.0,0.0,2.0,2.0,0.0,...,,,,,,,,,,
4,0.0,-0.6854,0.469773,16.0871,11.416344,0.0,0.0,12.0,4.0,8.0,...,0.655206,0.220086,0.595615,0.529887,0.456928,3.178534,2.559691,6.315715,0.482809,1.582430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3.0,-0.6898,0.475824,34.1231,20.951551,0.0,0.0,19.0,12.0,7.0,...,,,,,,,,,,
4996,0.0,0.0000,0.000000,0.0000,14.200000,0.0,0.0,2.0,2.0,0.0,...,,,,,,,,,,
4997,0.0,0.1242,0.015426,21.5446,14.509930,0.0,0.0,15.0,5.0,10.0,...,0.495053,0.362138,0.529740,0.452957,0.371929,3.592450,3.893548,8.673008,0.285786,1.354626
4998,0.0,-1.7280,2.985984,17.4696,18.561516,0.0,0.0,18.0,6.0,12.0,...,0.443467,0.397111,0.585551,0.534113,0.413389,3.856162,4.611365,10.077385,0.260867,1.533053


In [29]:
# Join the source table and the four descriptor tables side by side to get the final one.
# Each descriptor table holds one row per row of df_initial, in the same order, but its
# index labels need not match df_initial's, and a column-wise concat aligns on labels.
# Resetting every index makes the join purely positional.
frames_to_join = [df_initial, df_pcp, df_rdkit, df_pymatgen, df_PaDELPy]
row_counts = {len(frame) for frame in frames_to_join}
if len(row_counts) != 1:
    raise ValueError(f"descriptor tables are not row-aligned; row counts: {sorted(row_counts)}")
# NOTE(review): both the PubChemPy and RDKit descriptor lists contain a `tpsa` column,
# so the result has a duplicated column name — consider prefixing columns per source.
df = pd.concat([frame.reset_index(drop=True) for frame in frames_to_join], axis=1)

In [30]:
# The final table has 2046 columns: the 11 original columns plus 2035 descriptor columns.
df

Unnamed: 0,DOI,Date,Journal,Title,Name,measurement_error,measurement_wavelength,measurement_method,normalised_name,raw_value,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,10.1016/j.apradiso.2018.05.013,5/21/2018,Applied Radiation and Isotopes,EFFECTSSODIUMSALICYLATEDETERMINATIONLEAD210BIS...,NaCl,0.00,,el_cde_tables,,1.3373,...,,,,,,,,,,
1,10.1016/j.jct.2012.03.002,3/19/2012,The Journal of Chemical Thermodynamics,PHYSICOCHEMICALSTUDIESSODIUMTETRAPHENYLBORATET...,NaPh4B,0.00,,el_mylogic,"[['B', 1.0], ['Na', 1.0], ['Ph', 4.0]]",1.5056,...,,,,,,,,,,
2,10.1016/j.jbiotec.2014.03.023,3/22/2014,Journal of Biotechnology,THERMODYNAMICINVESTIGATIONZ33ANTIBODYINTERACTI...,FITC,0.00,,el_cde_tables,Oc1ccc2c(Oc3cc(O)ccc3C24OC(=O)c5cc(ccc45)N=C=S)c1,1.62,...,,,,,,,,,,
3,10.1016/j.solmat.2012.12.035,1/31/2013,Solar Energy Materials and Solar Cells,OPTICALREFLECTIONHETEROJUNCTIONINTERFACETHINFI...,CuO,0.00,,el_cde_tables,"[['Cu', 1.0], ['O', 1.0]]",2.58,...,,,,,,,,,,
4,10.1016/j.fluid.2008.10.013,12/5/2008,Fluid Phase Equilibria,ERRATUMVAPORLIQUIDEQUILIBRIUMDENSITIESINTERFAC...,Propan-1-ol,0.00,,el_cde_tables,,1.38333,...,0.655206,0.220086,0.595615,0.529887,0.456928,3.178534,2.559691,6.315715,0.482809,1.582430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,10.1016/j.spmi.2015.10.029,10/27/2015,Superlattices and Microstructures,OPTICALINVESTIGATIONSBLUESHIFTINZNSQUANTUMDOTS,ZnS,0.33,,el_mylogic,"[['S', 1.0], ['Zn', 1.0]]",1.59 2.10 2.07 2.25,...,,,,,,,,,,
4996,10.1016/j.mseb.2005.03.029,5/4/2005,Materials Science and Engineering: B,NEWOXYNITRIDEPEARLESCENTPIGMENTS,tantalum nitride,0.00,,el_cde_text,N#[Ta],3.8,...,,,,,,,,,,
4997,10.1039/C6AN00509H,6/20/2016,Analyst,A photochromic–acidochromic HCl fluorescent pr...,i-Butanol,0.00,,rsc_cde_tables,CC(C)CO,1.3959,...,0.495053,0.362138,0.529740,0.452957,0.371929,3.592450,3.893548,8.673008,0.285786,1.354626
4998,10.1039/B208765K,1/16/2003,Physical Chemistry Chemical Physics,"Refractive indices, molar volumes and molar re...",Cyclohexane,0.00,,rsc_cde_tables,,5,...,0.443467,0.397111,0.585551,0.534113,0.413389,3.856162,4.611365,10.077385,0.260867,1.533053
