In [1]:
%%capture
import numpy as np
import pandas as pd
!pip install openpyxl
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [2]:
max_calculated = pd.read_csv('max_calculated.csv')
max_elemental = pd.read_csv('max_elemental.csv')
list_mxene = pd.read_excel('synthesized-MXenes-MAX.xlsx',sheet_name=0)
list_failed = pd.read_excel('synthesized-MXenes-MAX.xlsx', sheet_name=2)
n_samples = max_elemental.shape[0]
synth_list = pd.unique(list_mxene['MXene'])[:-1]
to_drop = list(range(167,173))
mx_ene_df = list_mxene.drop(labels = to_drop, axis='index')
mx_ene_df = mx_ene_df.drop(['Unnamed: 9','Unnamed: 12','Notes','status','Reference method'],axis=1)
max_elemental['class'] = np.zeros(max_elemental.shape[0])
parents = mx_ene_df['Parent material'].unique()
banned_words = ['+','Mxene','topochemical','reaction', 'or',
               'synthesis','MXene','direct']
complete_parents = []
for i in range(len(parents)):
    inter = parents[i].split()
    for word in range(len(inter)):
        if inter[word] not in banned_words:
            complete_parents.append(inter[word])


for i in range(max_elemental.shape[0]):
    if max_elemental.loc[i,'compound_formula'] in complete_parents:
        max_elemental.loc[i,'class'] = 1

max_elemental = max_elemental.set_index('compound_formula',drop=True)
max_elemental = max_elemental.drop(['M_element', 'A_element', 'X_element'],axis=1)
max_calculated = max_calculated.set_index('prettyformula',drop=True)

In [3]:
whole_data = max_elemental.merge(max_calculated,how='inner',
                                 left_index=True,right_index=True)

In [4]:
whole_data = whole_data.drop(['PU_label','year'],axis=1)

In [5]:
M_elements = pd.get_dummies(whole_data['M'],dtype=float)
A_elements = pd.get_dummies(whole_data['A'],dtype=float)
X_elements = pd.get_dummies(whole_data['X'],dtype=float)

In [6]:
whole_data = whole_data.drop(['M','A','X'],axis=1)

In [7]:
imp_set = whole_data.index

In [8]:
M_elements.index = imp_set
A_elements.index = imp_set
X_elements.index = imp_set

In [9]:
groups = pd.get_dummies(whole_data['X_X_group'],prefix='group',dtype=float)

In [10]:
whole_data = whole_data.drop(['X_X_group'],axis=1)

In [11]:
whole_data = pd.concat([whole_data,M_elements,A_elements,X_elements,groups],axis=1)

In [14]:
test_tree = DecisionTreeClassifier().fit(X=whole_data.drop(['class'],axis=1),
                                          y=whole_data['class'])

In [21]:
imp_feat = test_tree.feature_importances_
names_feat = test_tree.feature_names_in_

In [28]:
imp_feat = imp_feat.reshape(-1,1)
names_feat = names_feat.reshape(-1,1)
test_df = pd.DataFrame(np.hstack((names_feat,imp_feat)))
test_df.columns = ['names_feat','imp_feat']
test_df = test_df.set_index('names_feat',drop=True)

In [39]:
test_df[test_df['imp_feat'] >0].sort_values(by='imp_feat',ascending=False)

Unnamed: 0_level_0,imp_feat
names_feat,Unnamed: 1_level_1
dH,0.227331
rMX,0.17083
dEf,0.089482
dEc,0.082167
A_A_atomic_mass,0.07399
dMA,0.063235
rMA,0.055926
mass.pa,0.053689
volume.pa,0.044741
Zn,0.039549


In [40]:
diff_z = test_df[test_df['imp_feat'] > 0].index

In [42]:
number_of_atoms = np.zeros(n_samples)
compteur = 0
for element in whole_data.index:
    inter = []
    for cara in element:
        if cara in list(str(1234567890)):
            inter.append(cara)
    if len(inter) == 1:
        number_of_atoms[compteur] = int(inter[0]) + 2
    elif len(inter) == 2:
        number_of_atoms[compteur] = int(inter[0]) + int(inter[1]) + 1
    elif len(inter) == 3:
        number_of_atoms[compteur] = int(inter[0]) + int(inter[1]) + int(inter[2])
    compteur += 1


In [43]:
columns_name = whole_data.columns.copy()
normalized = whole_data.drop(['class'],axis=1).to_numpy()/number_of_atoms.reshape(-1,1)

In [44]:
data_norm = pd.DataFrame(normalized)
data_norm['class'] = whole_data['class'].copy()
data_norm.columns = columns_name
data_norm['compound_name'] = whole_data.index
data_norm = data_norm.set_index('compound_name',drop=True)

In [45]:
data_norm

Unnamed: 0_level_0,M_M_atomic_number,M_M_atomic_mass,M_M_molar_volume,M_M_density,M_M_atomic_radius,M_M_atomic_radius_calculated,M_M_van_der_waals_radius,M_M_average_ionic_radius,M_M_average_cationic_radius,M_M_average_anionic_radius,...,Se,Si,Sn,Te,Tl,Zn,C,N,group_14,group_15
compound_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sc2AlC,5.25,11.238978,2.50,1.123898,0.40000,0.46000,0.5375,0.221250,0.221250,0.0,...,0.00,0.0,0.0,0.000,0.0,0.25,0.000,0.25,0.000,
Sc2SiC,5.25,11.238978,2.50,1.123898,0.40000,0.46000,0.5375,0.221250,0.221250,0.0,...,0.25,0.0,0.0,0.000,0.0,0.25,0.000,0.25,0.000,
Sc2PC,5.25,11.238978,2.50,1.123898,0.40000,0.46000,0.5375,0.221250,0.221250,0.0,...,0.00,0.0,0.0,0.000,0.0,0.25,0.000,0.25,0.000,
Sc2SC,5.25,11.238978,2.50,1.123898,0.40000,0.46000,0.5375,0.221250,0.221250,0.0,...,0.00,0.0,0.0,0.000,0.0,0.25,0.000,0.25,0.000,
Sc2MnC,5.25,11.238978,2.50,1.123898,0.40000,0.46000,0.5375,0.221250,0.221250,0.0,...,0.00,0.0,0.0,0.000,0.0,0.25,0.000,0.25,0.000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W4AuN3,9.25,22.980000,1.25,2.298000,0.16875,0.24125,0.2725,0.095833,0.095833,0.0,...,0.00,0.0,0.0,0.000,0.0,0.00,0.125,0.00,0.125,
W4HgN3,9.25,22.980000,1.25,2.298000,0.16875,0.24125,0.2725,0.095833,0.095833,0.0,...,0.00,0.0,0.0,0.000,0.0,0.00,0.125,0.00,0.125,
W4TlN3,9.25,22.980000,1.25,2.298000,0.16875,0.24125,0.2725,0.095833,0.095833,0.0,...,0.00,0.0,0.0,0.125,0.0,0.00,0.125,0.00,0.125,
W4PbN3,9.25,22.980000,1.25,2.298000,0.16875,0.24125,0.2725,0.095833,0.095833,0.0,...,0.00,0.0,0.0,0.000,0.0,0.00,0.125,0.00,0.125,
