In [1]:
%%capture
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
!pip install openpyxl

In [2]:
# Read the two CSV descriptor tables.
max_calculated, max_elemental = (
    pd.read_csv(path) for path in ('max_calculated.csv', 'max_elemental.csv')
)
# Sheet 0 of the workbook becomes `list_mxene`, sheet 2 becomes `list_failed`.
list_mxene, list_failed = (
    pd.read_excel('synthesized-MXenes-MAX.xlsx', sheet_name=sheet)
    for sheet in (0, 2)
)
# Number of rows of the elemental table, reused later when normalising.
n_samples = len(max_elemental)

In [3]:
# Join the calculated and elemental tables on the compound formula
# (`merge` defaults to an inner join).
whole_data = pd.merge(
    max_calculated,
    max_elemental,
    left_on='prettyformula',
    right_on='compound_formula',
)

In [4]:
# One-hot encode `X_X_group` into indicator columns named `group_<value>`.
whole_data = pd.get_dummies(
    whole_data,
    columns=['X_X_group'],
    prefix='group',
    prefix_sep='_',
)

The methods are k-fold cross-validated and evaluated by their true positive rate
against the 63 synthesized MXenes.

In [5]:
# 77 out of 1014, expressed as a percentage.
# NOTE(review): both counts are hard-coded and their origin is not visible in
# this notebook — presumably positives vs. total samples; confirm.
(77/1014)*100

7.593688362919132

Features retained in the article; each is followed, after `\\`, by its column name in the dataframe (when one is given):

n : number of layers \\ n

a : in plane lattice constant \\ a

c : out of plane lattice constant \\ c

d_MM, d_MX, d_MA, d_AA : layer distances \\ dMM, dMX, dMA, dAA

r_MX, r_MA : bond lengths \\ rMX, rMA

m : mass \\ mass

V : volume \\ volume

rho : density \\ density

Energy : total energy

E_pa : energy per atom \\ E_pa

E_form : formation energy \\ dEf

E_coh : cohesive energy

e_M,A,X : Bader charges on M,A,X atoms respectively \\ e_M,A,X

Z : atomic number 

N_V : number of valence electrons \\ i_valence_electron, $i \in \{M,A,X\}$

g_P : group number \\ i_group, $i \in \{M,A,X\}$

p_p : period number \\ i_row, $i \in \{M,A,X\}$

Xe : electronegativity \\ i_electronegativity, $i \in \{M,A,X\}$

EA : electron affinity \\ i_electron_affinity, $i \in \{M,A,X\}$

IE1 : 1st ionization potential \\ i_first_ionization_energy, $i \in \{M,A,X\}$

IE2 : 2nd ionization potential \\ i_second_ionization_energy, $i \in \{M,A,X\}$

E_chem : chemical potential 

E_atom : atom-in-box potential

In [6]:
# Use the formula as the row index; the column itself is removed from the
# frame (`drop` defaults to True).
whole_data = whole_data.set_index('prettyformula')

In [7]:
# Recode boolean values as integers (True -> 14, False -> 15), then remove
# the listed columns from the feature table.
# NOTE(review): the replacement is applied to every column of the frame, not
# only to the one-hot `group_*` indicators — confirm this is intended.
whole_data = (
    whole_data
    .replace(to_replace=True, value=14)
    .replace(to_replace=False, value=15)
    .drop(columns=['M_element', 'X_element', 'A_element',
                   'M', 'A', 'X', 'year', 'compound_formula'])
)

  whole_data = whole_data.replace(to_replace=False,value=15)


In [8]:
# Fit a decision tree on every column except the target `PU_label`.
# `random_state` is fixed because the tree permutes the candidate features at
# each split; without a seed, ties are broken differently from run to run and
# the feature importances are not reproducible.
first_tree = DecisionTreeClassifier(random_state=0).fit(
    X=whole_data.drop(['PU_label'], axis=1),
    y=whole_data['PU_label'],
)

In [9]:
# Importance the fitted tree assigns to each input column.
list_of_imp = first_tree.feature_importances_
# Names of those input columns: every column of the table except the target.
list_of_names = np.array(whole_data.columns[whole_data.columns != 'PU_label'])

In [10]:
# Two-column array: importance in column 0, feature name in column 1.
features_importance = np.column_stack((list_of_imp, list_of_names))

In [11]:
# Table of (importance, feature name), most important feature first.
# `features_importance` mixes floats and strings in one array, so both columns
# arrive as `object`; the importances are cast back to float so that numeric
# operations (sorting, `nlargest`, plotting, `describe`) behave as expected.
features_df = (
    pd.DataFrame(features_importance, columns=['value', 'name'])
    .astype({'value': 'float64'})
    .sort_values('value', ascending=False)
)

In [12]:
# Display the first five rows; the frame is sorted by descending importance,
# so these are the five most important features.
features_df.head()

Unnamed: 0,value,name
20,0.372909,dH
10,0.116049,e_A
9,0.071255,e_M
43,0.04472,A_A_atomic_radius_calculated
3,0.035467,dMX


## USING THE SYNTHESIZED MXENES AS POSITIVES

In [13]:
# Distinct values of the 'MXene' column, without the last one.
# NOTE(review): why the last distinct value is discarded is not shown — confirm.
synth_list = list_mxene['MXene'].unique()[:-1]
# Row labels 167 to 172 inclusive, dropped from the sheet in the next cell.
to_drop = [*range(167, 173)]

In [14]:
# Copy of the MXene sheet without the rows listed in `to_drop`.
mx_ene_df = list_mxene.drop(index=to_drop)

In [15]:
# Remove the unnamed spreadsheet columns and the free-text annotation columns.
mx_ene_df = mx_ene_df.drop(
    columns=['Unnamed: 9', 'Unnamed: 12', 'Notes', 'status', 'Reference method']
)

In [16]:
# Start with every compound labelled 0.0 (float column).
max_elemental['label'] = 0.0

In [17]:
# Collect the formulas mentioned in the 'Parent material' column and mark the
# matching rows of `max_elemental` as positives (label = 1).

# Tokens of the 'Parent material' text that are not formulas.
banned_words = ['+','Mxene','topochemical','reaction', 'or',
               'synthesis','MXene','direct']

# Missing cells are skipped up front: calling `.split()` on NaN would raise.
parents = mx_ene_df['Parent material'].dropna().unique()

# Every whitespace-separated token that is not a banned word is kept.
complete_parents = [
    word
    for parent in parents
    for word in str(parent).split()
    if word not in banned_words
]

# Vectorized membership test instead of a positional row loop, so the
# labelling no longer depends on the frame having a 0..n-1 integer index.
is_parent = max_elemental['compound_formula'].isin(complete_parents)
max_elemental.loc[is_parent, 'label'] = 1

In [18]:
# Index the table by compound formula; the column is removed from the frame
# (`drop` defaults to True).
max_elemental = max_elemental.set_index(keys='compound_formula')

In [19]:
# Remove the three `*_element` columns before fitting.
max_elemental = max_elemental.drop(
    columns=['M_element', 'A_element', 'X_element']
)

In [20]:
# Fit a decision tree that predicts `label` from all the other columns.
# `random_state` is fixed because the tree permutes the candidate features at
# each split; without a seed the importances change from run to run.
test_tree = DecisionTreeClassifier(random_state=0).fit(
    X=max_elemental.drop(['label'], axis=1),
    y=max_elemental['label'],
)

In [21]:
# Importances and the matching feature names, as recorded by the fitted tree.
imp_feat = test_tree.feature_importances_
names_feat = test_tree.feature_names_in_
# Build the frame column by column so the importances keep their float dtype;
# stacking them with the names in a single ndarray coerced them to `object`.
# The importance column is called 'features' (name kept for the cells that
# read it).
df_imp_feat = pd.DataFrame({'features': imp_feat, 'name': names_feat})
df_imp_feat = df_imp_feat.sort_values('features', ascending=False)

In [22]:
# Keep only the features whose importance is not zero.
df_diff_z = df_imp_feat.loc[df_imp_feat['features'].ne(0)]

In [23]:
# Shape (rows, columns) of the subset of `max_elemental` whose label equals 1.
max_elemental[max_elemental['label'] == 1].shape

(15, 52)

In [24]:
# Shape (rows, columns) of the whole table, for comparison with the subset above.
max_elemental.shape

(2262, 52)

In [25]:
# Percentage of rows labelled 1, computed from the frame itself rather than
# from counts copied by hand out of the two outputs above, which would go
# stale whenever the data or the labelling changes.
(max_elemental['label'] == 1).mean() * 100

0.6631299734748011

In [26]:
# Fewer than 1% of the samples are positive: the classes are heavily
# imbalanced, which makes the classification problem hard.

In [27]:
# Plain Python list of the entries in the 'MAX' column of the failed sheet.
failed = list(list_failed['MAX'])

In [28]:
# Label -1 every compound whose formula (the index) appears in `failed`.
# This runs after the positive labelling, so a formula present in both lists
# ends up at -1.
max_elemental.loc[max_elemental.index.isin(failed), 'label'] = -1

In [29]:
def _count_atoms(formula):
    """Return the number of atoms in one formula unit of `formula`.

    The formula is read as a sequence of element symbols (an upper-case letter
    optionally followed by one lower-case letter), each optionally followed by
    an integer count; a missing count stands for 1. ``nan`` is returned when
    no element symbol is found, so unparsable entries stay visible instead of
    silently receiving an arbitrary value.
    """
    import re  # local import: keeps this cell self-contained

    counts = re.findall(r'[A-Z][a-z]?(\d*)', str(formula))
    if not counts:
        return np.nan
    return float(sum(int(count) if count else 1 for count in counts))


# Atoms per formula unit, one value per row of `max_elemental` in index order.
# The previous digit-counting loop wrote into an `np.empty` buffer and left
# uninitialised memory for formulas with zero or more than three digits, split
# multi-digit counts into single digits, and assumed exactly three elements.
# For three-element formulas with single-digit counts the values are unchanged.
number_of_atoms = np.array(
    [_count_atoms(formula) for formula in max_elemental.index],
    dtype=float,
)

In [30]:
# Keep a copy of the column labels (features plus 'label') so they can be
# re-applied to the normalised frame, which is rebuilt from a bare array.
columns_name = max_elemental.columns.copy()

In [31]:
# Divide every feature column (everything but 'label') by the atom count of
# its row; the counts are reshaped to a column vector so they broadcast
# across the columns.
normalized = (
    max_elemental.drop(columns='label').to_numpy()
    / number_of_atoms.reshape(n_samples, 1)
)

In [32]:
# Wrap the normalised array in a DataFrame. No index or column labels are
# passed, so both are default integer ranges at this point.
max_elem_norm = pd.DataFrame(normalized)

In [33]:
# Assign by position: `max_elem_norm` still has its default integer index
# here, whereas `max_elemental['label']` is indexed by compound formula, so a
# Series assignment would align on index labels and fill the column with NaN.
# Row order is the same in both frames because `normalized` was computed from
# `max_elemental`.
max_elem_norm['label'] = max_elemental['label'].to_numpy()

In [34]:
# Restore the original column labels, then index the rows by compound name
# (taken positionally from the index of `max_elemental`).
max_elem_norm.columns = columns_name
max_elem_norm = (
    max_elem_norm
    .assign(compound_name=max_elemental.index)
    .set_index('compound_name')
)

In [35]:
# Both frames are now indexed by compound name, so the labels align row by row.
max_elem_norm = max_elem_norm.assign(label=max_elemental['label'])

In [37]:
# Why the label assignment is repeated: assigning a Series to a DataFrame
# column aligns on index labels. While `max_elem_norm` still has its default
# integer index, a Series indexed by compound formula matches no row and
# yields NaN, so the assignment is done again once the index has been set to
# the compound names.