In [None]:
#!/usr/bin/env python3

In [2]:
"""
This notebook imports and preprocesses perovskite crystal data. The data was taken from a
scientific paper published by Nega et al. The dataset will be used to build a classification
model that predicts the crystal score for the perovskite that appears in the most
experimental runs in the dataset.
"""




In [51]:
#Importing the libraries needed for data collection and preprocessing

import pandas as pd
import numpy as np
import pubchempy as pcp

In [58]:
#Load the perovskite CSV, discard its 'name' column, and preview the first rows

perovskite_set = pd.read_csv('PSKITECSV.csv').drop(columns=['name'])
perovskite_set.head()

Unnamed: 0,_rxn_organic-inchikey,_rxn_M_acid,_rxn_M_inorganic,_rxn_M_organic,_solv_GBL,_solv_DMSO,_solv_DMF,_stoich_mmol_org,_stoich_mmol_inorg,_stoich_mmol_acid,...,_feat_organic_0_chainatomcount_std,_feat_organic_0_ringatomcount_std,_raw_modelname,_feat_primaryAmine,_feat_secondaryAmine,_rxn_plateEdgeQ,_feat_maxproj_per_N,_feat_vdw_per_N,_raw_RelativeHumidity,_out_crystalscore
0,XFYICZOIWSBQSK-UHFFFAOYSA-N,6.219603,0.249044,0.721707,1,0,0,0.470553,0.162377,4.055181,...,3,0,escalate_ExpertQuasiRandom_1.1,1,0,1,22.85,112.02,-1.0,1
1,XFYICZOIWSBQSK-UHFFFAOYSA-N,6.754488,0.253438,0.858792,1,0,0,0.576249,0.170057,4.532262,...,3,0,escalate_ExpertQuasiRandom_1.1,1,0,1,22.85,112.02,-1.0,1
2,XFYICZOIWSBQSK-UHFFFAOYSA-N,6.754488,0.415311,0.948975,1,0,0,0.636762,0.278674,4.532262,...,3,0,escalate_ExpertQuasiRandom_1.1,1,0,1,22.85,112.02,-1.0,4
3,XFYICZOIWSBQSK-UHFFFAOYSA-N,6.783878,0.494693,1.522682,1,0,0,1.023242,0.332434,4.558766,...,3,0,escalate_ExpertQuasiRandom_1.1,1,0,1,22.85,112.02,-1.0,4
4,XFYICZOIWSBQSK-UHFFFAOYSA-N,6.636048,0.335557,1.266763,1,0,0,0.844931,0.223817,4.426244,...,3,0,escalate_ExpertQuasiRandom_1.1,1,0,1,22.85,112.02,-1.0,1


In [None]:
"""
This dataset is very large and involves many A-site cations. We will refine our modelling by
focusing on only one type of perovskite. By combing through the data, we can find the
perovskite that has been experimented upon most widely.
"""

In [53]:
# Tally how many rows reference each organic compound (keyed by InChIKey).
# value_counts() sorts in descending order by default, so the first label is the most frequent.
inchikey_counts = perovskite_set['_rxn_organic-inchikey'].value_counts()
print(inchikey_counts)
most_common_inchikey = inchikey_counts.index[0]

# Look the InChIKey up with pubchempy and keep the IUPAC name of the first match.
top_compound = pcp.get_compounds(most_common_inchikey, 'inchikey')[0]
most_used = top_compound.iupac_name

print(f'\nThe most prevalent A-site cation is {most_used!s}')

XFYICZOIWSBQSK-UHFFFAOYSA-N    1645
GGYGJCFIYJVWIP-UHFFFAOYSA-N     656
UUDRLGYROXTISK-UHFFFAOYSA-N     384
DMFMZFFIQRMJQZ-UHFFFAOYSA-N     379
JMXLWMIFDJCGBV-UHFFFAOYSA-N     300
FCTHQYIDLRRROX-UHFFFAOYSA-N     280
VMLAEGAAHIIWJX-UHFFFAOYSA-N     207
QHJPGANWSLEMTI-UHFFFAOYSA-N     192
HBPSMMXRESDUSG-UHFFFAOYSA-N     192
JBOIAZWJIACNJF-UHFFFAOYSA-N     192
PBGZCCFVBVEIAS-UHFFFAOYSA-N     191
SQXJHWOXNLTOOO-UHFFFAOYSA-N     190
KFXBDBPOGBBVMC-UHFFFAOYSA-N     175
PPCHYMCMRUGLHR-UHFFFAOYSA-N     171
CQWGDVVCKBJLNX-UHFFFAOYSA-N     165
PXWSKGXEHZHFJA-UHFFFAOYSA-N     142
CALQKRVFTWDYDG-UHFFFAOYSA-N     142
NOHLSFNWSBZSBW-UHFFFAOYSA-N     141
QRFXELVDJSDWHX-UHFFFAOYSA-N     139
GIAPQOZCVIEHNY-UHFFFAOYSA-N     131
LLWRXQXPJMPHLR-UHFFFAOYSA-N     128
ZEVRFFCPALTVDN-UHFFFAOYSA-N      96
BJDYCCHRZIFCGN-UHFFFAOYSA-N      96
XZUCBFLUEBDNSJ-UHFFFAOYSA-N      96
VNAAUNTYIONOHR-UHFFFAOYSA-N      96
VAWHFUNJDMQUSB-UHFFFAOYSA-N      96
NXRUEVJQMBGVAT-UHFFFAOYSA-N      96
QZCGFUVVXNFSLE-UHFFFAOYSA-N 

In [None]:
"""
We can see that the most prevalent A-site compound in this dataset is ethanamine;hydriodide,
commonly known as ethylamine hydriodide. We can drop all rows that do not concern this
compound. Additionally, we can drop the inchikey column (since we know which compound we
are looking at), the raw model name column, and any row in which the raw relative
humidity is equal to -1 (these are errors), along with reaction time.

We will also drop all columns that contain only one, identical value for each observation.
"""

In [59]:
# Row masks: experiments that used the most common organic compound, and rows whose
# relative-humidity value is not the -1 placeholder.
same_compound = perovskite_set['_rxn_organic-inchikey'] == most_common_inchikey
humidity_valid = perovskite_set['_raw_RelativeHumidity'] != -1

# Keep the matching rows, remove the two identifier columns that are no longer needed,
# and renumber the rows from zero without keeping the old index as a column.
perovskite_set = (
    perovskite_set
    .loc[same_compound & humidity_valid]
    .drop(columns=['_raw_modelname', '_rxn_organic-inchikey'])
    .reset_index(drop=True)
)

perovskite_set.head()

Unnamed: 0,_rxn_M_acid,_rxn_M_inorganic,_rxn_M_organic,_solv_GBL,_solv_DMSO,_solv_DMF,_stoich_mmol_org,_stoich_mmol_inorg,_stoich_mmol_acid,_stoich_mmol_solv,...,_feat_organic_0_bondcount_std,_feat_organic_0_chainatomcount_std,_feat_organic_0_ringatomcount_std,_feat_primaryAmine,_feat_secondaryAmine,_rxn_plateEdgeQ,_feat_maxproj_per_N,_feat_vdw_per_N,_raw_RelativeHumidity,_out_crystalscore
0,6.823319,0.439475,0.974548,1,0,0,0.654897,0.295327,4.58527,5.486462,...,10,3,0,1,0,1,22.85,112.02,48.721311,4
1,6.665791,0.437176,0.964545,1,0,0,0.644316,0.292033,4.452748,5.514275,...,10,3,0,1,0,1,22.85,112.02,48.721311,4
2,6.725011,0.432593,0.978919,1,0,0,0.655876,0.289838,4.505757,5.503743,...,10,3,0,1,0,1,22.85,112.02,48.721311,4
3,6.81318,0.450241,0.91891,1,0,0,0.618426,0.303012,4.58527,5.531963,...,10,3,0,1,0,1,22.85,112.02,48.721311,4
4,6.485984,0.431188,0.935297,1,0,0,0.619166,0.285446,4.293721,5.548255,...,10,3,0,1,0,1,22.85,112.02,48.721311,3


In [64]:
# Number of distinct (non-NaN) values in every column; a count of exactly 1 marks a
# column that holds the same value in each row and therefore carries no information.
drop_tester = perovskite_set.nunique()
to_drop = drop_tester.index[drop_tester == 1]

# Remove those constant columns and preview the result.
perovskite_set = perovskite_set.drop(columns=to_drop)
perovskite_set.head()

Unnamed: 0,_rxn_M_acid,_rxn_M_inorganic,_rxn_M_organic,_stoich_mmol_org,_stoich_mmol_inorg,_stoich_mmol_acid,_stoich_mmol_solv,_stoich_org/solv,_stoich_inorg/solv,_stoich_acid/solv,...,_stoich_org+inorg+acid/solv,_stoich_org/liq,_stoich_inorg/liq,_stoich_org+inorg/liq,_stoich_org/inorg,_stoich_acid/inorg,_rxn_temperature_c,_rxn_plateEdgeQ,_raw_RelativeHumidity,_out_crystalscore
0,6.823319,0.439475,0.974548,0.654897,0.295327,4.58527,5.486462,0.119366,0.053828,0.835743,...,1.008937,0.065023,0.029322,0.094346,2.21753,15.526081,105,1,48.721311,4
1,6.665791,0.437176,0.964545,0.644316,0.292033,4.452748,5.514275,0.116845,0.05296,0.807495,...,0.977299,0.064645,0.0293,0.093945,2.206309,15.247395,105,1,48.721311,4
2,6.725011,0.432593,0.978919,0.655876,0.289838,4.505757,5.503743,0.119169,0.052662,0.818671,...,0.990502,0.065525,0.028956,0.094482,2.262907,15.545798,105,1,48.721311,4
3,6.81318,0.450241,0.91891,0.618426,0.303012,4.58527,5.531963,0.111791,0.054775,0.828869,...,0.995435,0.061126,0.02995,0.091076,2.04093,15.132303,105,1,48.721311,4
4,6.485984,0.431188,0.935297,0.619166,0.285446,4.293721,5.548255,0.111597,0.051448,0.773887,...,0.936931,0.062911,0.029003,0.091914,2.169118,15.042142,105,1,48.721311,3


In [65]:
"""
With the redundant and faulty columns and rows removed, we can now standardize some of 
our data points to further clean our data. We can also implement PCA to see if 
dimensionality reduction can bolster the efficiency of our future models without 
inhibiting performance too heavily.
"""

# Persist the cleaned DataFrame with IPython's %store magic so that another notebook or
# kernel session can restore it with `%store -r perovskite_set`.
%store perovskite_set

Stored 'perovskite_set' (DataFrame)
