In [None]:
%%capture
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.svm import SVC
!pip install openpyxl

In [None]:
def rbf(x,y,l_squared=1):
    """Gaussian (RBF) kernel ``exp(-||x - y||**2 / (2 * l_squared))``.

    Parameters
    -------------------------------
    x : array_like
    either a single point (same number of dimensions as ``y``) or a batch
    of points stacked along the first axis

    y : array_like
    a single point

    l_squared : float, non zero
    squared length-scale of the kernel (default 1)
    -------------------------------

    Returns
    -------------------------------
    float or numpy.ndarray
    a scalar when ``x`` and ``y`` have the same number of dimensions;
    otherwise a 1-D float array whose i-th entry is the kernel between
    ``x[i]`` and ``y``
    -------------------------------
    """
    # Accept plain Python numbers and lists as well as numpy arrays.
    x = np.asarray(x)
    y = np.asarray(y)

    def _kernel(diff):
        # np.linalg.norm flattens its argument, so this is the Euclidean
        # (Frobenius) norm of the difference whatever its shape.
        return np.exp(-((np.linalg.norm(diff))**2)/(2*l_squared))

    if x.ndim == y.ndim:
        # Point-to-point evaluation.
        return _kernel(x - y)
    # Batch evaluation: one kernel value per entry along the first axis of x.
    return np.array([_kernel(row - y) for row in x], dtype=float)


In [None]:
# Input tables. Both Excel sheets come from the same workbook: sheet 0 is read
# below for its 'MXene' / 'Parent material' columns and sheet 2 for its 'MAX'
# column.
workbook_path = 'synthesized-MXenes-MAX.xlsx'

max_calculated = pd.read_csv('max_calculated.csv')
max_elemental = pd.read_csv('max_elemental.csv')

list_mxene = pd.read_excel(workbook_path, sheet_name=0)
list_failed = pd.read_excel(workbook_path, sheet_name=2)

# Number of rows of the elemental-descriptor table; reused as the sample count.
n_samples = len(max_elemental)

In [None]:
# ---------------------------------------------------------------------------
# 1. Clean the sheet of synthesized MXenes
# ---------------------------------------------------------------------------
# Unique MXene names with the last unique value discarded (presumably a NaN
# produced by trailing rows -- TODO confirm). Not used further in this cell.
synth_list = pd.unique(list_mxene['MXene'])[:-1]
# NOTE(review): rows 167..172 are removed by hard-coded position; presumably
# they are footnote rows of the sheet -- confirm if the workbook changes.
to_drop = list(range(167,173))
mx_ene_df = list_mxene.drop(labels = to_drop, axis='index')
mx_ene_df = mx_ene_df.drop(['Unnamed: 9','Unnamed: 12','Notes','status','Reference method'],axis=1)

# ---------------------------------------------------------------------------
# 2. Class 1 = MAX phase that appears as a parent material of an MXene
# ---------------------------------------------------------------------------
# Empty cells are skipped so that .split() is only applied to text.
parents = mx_ene_df['Parent material'].dropna().unique()
banned_words = ['+','Mxene','topochemical','reaction', 'or',
               'synthesis','MXene','direct']
# Every whitespace-separated token that is not a banned word is kept as a
# candidate parent formula.
complete_parents = [word
                    for parent in parents
                    for word in str(parent).split()
                    if word not in banned_words]

max_elemental['class'] = max_elemental['compound_formula'].isin(complete_parents).astype(float)

max_elemental = max_elemental.set_index('compound_formula',drop=True)
max_elemental = max_elemental.drop(['M_element', 'A_element', 'X_element'],axis=1)

# ---------------------------------------------------------------------------
# 3. Feature selection: keep features with non-zero decision-tree importance
# ---------------------------------------------------------------------------
# random_state is fixed because the tree permutes features at each split; an
# unseeded tree can select a different feature subset on every run.
test_tree = DecisionTreeClassifier(random_state=0).fit(X=max_elemental.drop(['class'],axis=1),
                                                       y=max_elemental['class'])

imp_feat = test_tree.feature_importances_
names_feat = test_tree.feature_names_in_
# Column 'features' holds the importance value, 'name' the feature name.
df_imp_feat = pd.DataFrame({'features': imp_feat, 'name': names_feat})
# Stable sort so that ties keep a deterministic order.
df_imp_feat = df_imp_feat.sort_values('features', ascending=False, kind='mergesort')

df_diff_z = df_imp_feat[df_imp_feat['features'] != 0]

# ---------------------------------------------------------------------------
# 4. Class -1 = MAX phase listed in the 'MAX' column of the failed sheet
#    (set after the tree was fitted, so the tree only saw classes 0 / 1)
# ---------------------------------------------------------------------------
failed = list(list_failed['MAX'])
max_elemental.loc[max_elemental.index.isin(failed), 'class'] = -1

# ---------------------------------------------------------------------------
# 5. Number of atoms per formula unit, parsed from the digits of the formula
# ---------------------------------------------------------------------------
# Each digit character is one stoichiometric coefficient; with 1, 2 or 3 digits
# the remaining (3 - number of digits) elements count for one atom each.
number_of_atoms = np.zeros(n_samples)
for position, formula in enumerate(max_elemental.index):
    digits = [int(cara) for cara in formula if cara in '0123456789']
    if 1 <= len(digits) <= 3:
        number_of_atoms[position] = sum(digits) + (3 - len(digits))

# A formula with no digit or more than three digits leaves a zero here, which
# would silently turn the normalisation below into a division by zero.
unparsed_formulas = list(max_elemental.index[number_of_atoms == 0])
if unparsed_formulas:
    raise ValueError(f'Cannot derive the number of atoms for: {unparsed_formulas}')

# ---------------------------------------------------------------------------
# 6. Per-atom normalisation of the features; 'class' is carried over unchanged
# ---------------------------------------------------------------------------
columns_name = max_elemental.columns.copy()
feature_frame = max_elemental.drop(['class'],axis=1)
normalized = feature_frame.to_numpy()/number_of_atoms.reshape(n_samples,1)
# The frame is built directly with the compound index so that the 'class'
# assignment below is aligned on the compound names.
max_elem_norm = pd.DataFrame(normalized,
                             index=max_elemental.index.rename('compound_name'),
                             columns=feature_frame.columns)
max_elem_norm['class'] = max_elemental['class'].copy()

# Keep the important features (in decreasing importance) followed by 'class'.
# 'label' does not exist yet at this point and is ignored by filter().
list_of_imp_names = list(df_diff_z['name'])
list_of_imp_names.append('label')
list_of_imp_names.append('class')
max_elem_norm = max_elem_norm.filter(items=list_of_imp_names, axis=1)


In [None]:
# Binary label derived from 'class': +1 for class 1, -1 for everything else
# (classes 0 and -1 are both treated as unlabelled here).
is_class_one = max_elem_norm['class'] == 1
max_elem_norm['label'] = np.where(is_class_one, 1.0, -1.0)

positive_samples = max_elem_norm.loc[max_elem_norm['label'] == 1]
unlabelled_samples = max_elem_norm.loc[max_elem_norm['label'] == -1]

In [None]:
# ---------------------------------------------------------------------------
# Leave-one-positive-out evaluation over a grid of kernel widths.
#
# For every squared length-scale and every positive sample:
#   1. remove the positive sample from the data set,
#   2. cluster the remaining samples and pick "reliable negatives" from the
#      clusters farthest from the cluster with the highest positive ratio,
#   3. label the rest with an SVC trained on positives + reliable negatives,
#   4. solve an LS-SVM-style linear system on all samples and relabel them,
#   5. score the removed sample; it counts as a hit when its sign is +1.
# dico_of_perf maps str(l_squared) -> fraction of removed positives recovered.
# ---------------------------------------------------------------------------
list_of_l_squared = np.hstack((np.linspace(1,10,4,endpoint=True),
                               np.linspace(20,100,5,endpoint=True),
                               np.linspace(200,1000,2,endpoint=True)))

n_cluster = 5      # number of K-means clusters
gamma = 1          # multiplies the identity added to the kernel block
good_ratio = 1/2   # quantile of the scores used as the threshold b
max_iter = 1       # number of relabelling iterations
random_seed = 0    # makes K-means and the negative sampling reproducible

dico_of_perf = {}
# The per-fold work is done on copies, so max_elem_norm is never modified and
# this cell can be re-run.
true_max_elem_norm = max_elem_norm.copy()
feature_columns = [col for col in true_max_elem_norm.columns
                   if col not in ('class', 'label')]
list_of_positives_instances = positive_samples.index

for std in tqdm(list_of_l_squared):
    result_for_positive_instances = np.zeros(positive_samples.shape[0])

    for general_count, very_particular_element in enumerate(list_of_positives_instances):
        fold_data = true_max_elem_norm.drop(very_particular_element,axis=0)
        n_train = fold_data.shape[0]
        is_positive = (fold_data['class'] == 1).to_numpy()
        fold_data['label'] = np.where(is_positive, 1.0, -1.0)

        # --- clustering and choice of the positive cluster -------------------
        clustering = KMeans(n_clusters=n_cluster, n_init=10,
                            random_state=random_seed).fit(fold_data[feature_columns].to_numpy())
        fold_data['cluster'] = clustering.labels_
        cluster_ids = fold_data['cluster'].to_numpy()
        # Share of positive samples inside each cluster.
        list_of_ratio = np.array([is_positive[cluster_ids == cluster].mean()
                                  for cluster in range(n_cluster)])
        positive_cluster = np.argmax(list_of_ratio)
        centers = clustering.cluster_centers_
        # Distance of every cluster centre to the positive cluster centre.
        list_of_dist = np.linalg.norm(centers - centers[positive_cluster], axis=1)

        # --- reliable negatives: unlabelled samples of the farthest clusters --
        reliable_positives = fold_data[fold_data['label'] == 1]
        n_positives = reliable_positives.shape[0]
        df_unlab_pop = fold_data[fold_data['label'] == -1]
        list_of_pop = df_unlab_pop.groupby('cluster').size().to_frame('pop')
        # Index by cluster id: a cluster without unlabelled samples is absent
        # from the groupby result, so a positional assignment would fail.
        list_of_pop['dist'] = list_of_dist[list_of_pop.index.to_numpy()]
        list_of_pop = list_of_pop.sort_values('dist',ascending=False)
        list_of_pop['cumsum'] = list_of_pop['pop'].cumsum()
        index_ordered_distance = list(list_of_pop.index)
        negative_cluster = index_ordered_distance[0]

        enough = np.flatnonzero(list_of_pop['cumsum'].to_numpy() > n_positives)
        if enough.size == 0:
            raise ValueError('Not enough unlabelled samples to select '
                             f'{n_positives} reliable negatives and keep unlabelled data.')
        last_step = enough[0]
        if last_step == 0:
            # The farthest cluster alone is large enough: draw from it.
            reliable_negatives = df_unlab_pop[df_unlab_pop['cluster'] == negative_cluster]
            reliable_negatives = reliable_negatives.sample(n=n_positives,
                                                           random_state=random_seed)
        else:
            # Take clusters from the farthest inwards, then truncate.
            reliable_negatives = pd.concat(
                [df_unlab_pop[df_unlab_pop['cluster'] == cluster]
                 for cluster in index_ordered_distance[:last_step + 1]]
            ).head(n_positives)

        # --- initial labels for the remaining samples -------------------------
        train_clf_data = pd.concat([reliable_positives,reliable_negatives])
        unlabelled_data = fold_data.drop(labels=list(train_clf_data.index),axis=0)
        first_step_clf = SVC().fit(X=train_clf_data[feature_columns].to_numpy(),
                                   y=train_clf_data['label'].to_numpy())
        unlabelled_data['relab'] = first_step_clf.predict(unlabelled_data[feature_columns].to_numpy())
        train_clf_data['relab'] = train_clf_data['label'].copy()
        updated_data = pd.concat([train_clf_data,unlabelled_data])
        up_data_np = updated_data[feature_columns].to_numpy(dtype=float)

        # --- Gaussian kernel matrix (independent of the labels) ---------------
        # ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b ; clipped at 0 against round-off.
        sq_norms = np.einsum('ij,ij->i', up_data_np, up_data_np)
        sq_dists = sq_norms[:, None] + sq_norms[None, :] - 2.0 * (up_data_np @ up_data_np.T)
        kernel = np.exp(-np.maximum(sq_dists, 0.0)/(2*std))
        np.fill_diagonal(kernel, 1.0)

        right_side = np.concatenate(([0.0], np.ones(n_train)))

        # --- iterative relabelling --------------------------------------------
        # Row i of every array below corresponds to row i of updated_data.
        for _ in range(max_iter):
            labels = updated_data['relab'].to_numpy(dtype=float)

            omega = kernel * np.outer(labels, labels)
            np.fill_diagonal(omega, 1)

            # Block system [[0, y^T], [y, omega + gamma*I]] @ coeffs = [0, 1, ..., 1]
            whole_mat = np.zeros((n_train + 1, n_train + 1))
            whole_mat[0, 1:] = labels
            whole_mat[1:, 0] = labels
            whole_mat[1:, 1:] = omega + gamma*np.eye(n_train)

            coeffs = np.linalg.solve(whole_mat, right_side)
            # 1-D on purpose: a column vector times a row of labels broadcasts
            # to an n x n outer product and corrupts the sums below.
            alpha = coeffs[1:]

            # Score of sample i: sum_k alpha_k * y_k * K(x_k, x_i)
            to_det_b = kernel @ (alpha*labels)
            b = np.sort(to_det_b)[int(good_ratio*n_train)]

            check_array = np.sign(to_det_b - b)
            count_diff = np.count_nonzero(check_array != labels)
            if count_diff == 0:
                break
            updated_data['relab'] = check_array

        # --- score the held-out positive sample --------------------------------
        # NOTE(review): alpha comes from the labels used in the last solve, while
        # last_outcome holds the labels after the last relabelling (as before).
        last_outcome = updated_data['relab'].to_numpy(dtype=float)
        held_out = true_max_elem_norm.loc[very_particular_element, feature_columns].to_numpy(dtype=float)
        held_out_score = np.sum(alpha*last_outcome*rbf(x=up_data_np, y=held_out, l_squared=std)) - b
        result_for_positive_instances[general_count] = np.sign(held_out_score)

    true_positive = int(np.count_nonzero(result_for_positive_instances == 1))
    dico_of_perf[f'{std}'] = true_positive/result_for_positive_instances.shape[0]


"""
IMPORTANT PROBLEM TO SOLVE:
CHECK THAT THE ROW ORDER OF check_array MATCHES THE ROW ORDER OF THE DATAFRAME (updated_data).
STORING check_array AS A COLUMN OF THAT DATAFRAME MAY MAKE THIS EASIER.
"""
        

In [None]:
# True-positive rate over the held-out positives stored in
# result_for_positive_instances (entries equal to 1 count as hits).
true_positive = int(np.count_nonzero(result_for_positive_instances == 1))

print('TPR = ', true_positive/result_for_positive_instances.shape[0])

In [None]:
"""
n_cluster = 5
clustering = KMeans(n_clusters=n_cluster).fit(max_elem_norm.to_numpy()[:,:-2])
max_elem_norm['cluster'] = clustering.labels_
list_of_ratio = []
for i in range(n_cluster):
    list_of_ratio.append(max_elem_norm[max_elem_norm['cluster'] == i]['label'].sum()/max_elem_norm[max_elem_norm['cluster'] == i]['class'].shape[0])
for i in max_elem_norm.index:
    if max_elem_norm.loc[i,'label'] == 0:
        max_elem_norm.loc[i,'label'] = -1
list_of_ratio = np.array(list_of_ratio)
positive_cluster = np.argmax(list_of_ratio)
list_of_dist = np.zeros(5)
for i in range(5):
    list_of_dist[i] = np.linalg.norm(clustering.cluster_centers_[positive_cluster,:] - clustering.cluster_centers_[i,:])
negative_cluster = np.argmax(list_of_dist)
df_unlab_pop = max_elem_norm[max_elem_norm['label'] == -1]
list_of_pop = pd.DataFrame(df_unlab_pop.groupby('cluster')['class'].count())
list_of_pop.columns = ['pop']
list_of_pop['dist'] = list_of_dist #distance to the positive cluster
list_of_pop = list_of_pop.sort_values('dist',ascending=False)
list_of_pop['cumsum'] = np.cumsum(list_of_pop['pop'])     
reliable_positives = max_elem_norm[max_elem_norm['label'] == 1]
n_positives = reliable_positives.shape[0]
last_step = np.where(np.array(list_of_pop['cumsum'])>n_positives)[0][0]
index_ordered_distance = list(list_of_pop.index)
"""

In [None]:
"""
if last_step == 0:
    reliable_negatives = max_elem_norm[max_elem_norm['cluster'] == negative_cluster]
    reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]
    reliable_negatives = reliable_negatives.sample(n=n_positives)
else:
    compteur=0
    reliable_negatives = max_elem_norm[max_elem_norm['cluster'] == negative_cluster]
    reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]
    while compteur<last_step:
        interm_negatives = max_elem_norm[max_elem_norm['cluster'] == index_ordered_distance[compteur+1]]
        interm_negatives = interm_negatives[interm_negatives['label'] == -1]
        reliable_negatives = pd.concat([reliable_negatives,interm_negatives])
        compteur += 1
    reliable_negatives = reliable_negatives.head(n_positives)
"""

In [None]:
#max_elem_norm[max_elem_norm['cluster'] == positive_cluster].sum()

In [None]:
#max_elem_norm[max_elem_norm['cluster'] == negative_cluster].sum()['class']

In [None]:
"""
dict_of_instances = {}
for i in range(n_cluster):
    dict_of_instances[str(i)] = max_elem_norm[max_elem_norm['cluster'] == i].sum()['class']
"""

In [None]:
#dict_of_instances

In [None]:
#test = max_elem_norm[max_elem_norm['cluster'] == 4]

In [None]:
#test[test['class'] == -1]

In [None]:
#reliable_negatives

In [None]:
#true_max_elem_norm.loc['Sc2AlC',:].to_numpy()