In [1]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli
from sklearn.cluster import KMeans 
from scipy.stats import multivariate_normal

In [2]:
def update_cov(X,mean,weights,group):
    """a function to estimate the covariance with a new mean
    Parameter
    -------------------------------
    X : array-like
    the data with which we want to estimate the new covariance

    mean : array-like
    the new mean that doesn't correspond to the 'true mean'

    weights : arrar-like 
    the matrix of weights of the whole data

    group : int
    the group in which we want to update
    --------------------------------
    """
    sum_of_mat = np.zeros((X.shape[1],X.shape[1]))
    for i in range(X.shape[0]):
        temporal_cov = weights[i,group]*np.matmul((X[i,:]-mean).reshape((X.shape[1],1)),
                                                      (X[i,:]-mean).reshape((1,X.shape[1])))
        sum_of_mat += temporal_cov
    sum_of_weights = np.sum(weights[:,group])
    weighted_sigma = sum_of_mat/sum_of_weights
    return weighted_sigma

In [3]:
np.random.seed(1457065)
n_gen = 50
n_samples = 4*n_gen
data_red_1 = np.random.multivariate_normal(mean=np.array([2,2]),
                                             cov=np.array(([1,-0.25],[-0.25,1])),
                                             size=n_gen)
data_red_2 = np.random.uniform(low=1.0, high=3.0, size=(n_gen,2))

data_blue_1 = np.random.multivariate_normal(mean=np.array([-1,-1]),
                                            cov=np.eye(2)*0.5,
                                            size=n_gen)
data_blue_2 = np.random.uniform(low=-2, high=-1, size=(n_gen,2))

data_red = np.vstack((data_red_1,data_red_2))
labels_red = (np.ones(2*n_gen)*-1).reshape(2*n_gen,1)
data_red = np.hstack((data_red,labels_red))

data_blue = np.vstack((data_blue_1, data_blue_2))
labels_blue = (np.ones(2*n_gen)).reshape(2*n_gen,1)
data_blue = np.hstack((data_blue,labels_blue))

whole_data = np.vstack((data_blue, data_red))

In [4]:
np.random.seed(1457065)
whole_data_df = pd.DataFrame(whole_data)
whole_data_df.columns = ['x1','x2','class']
whole_data_df['label'] = np.zeros((n_samples))
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if whole_data_df.loc[i,'class'] == 1 and random == 0:
        whole_data_df.loc[i,'label'] = 1

In [5]:
n_cluster=3
fit_cluster = KMeans(n_clusters=n_cluster).fit(X=whole_data_df.drop(['class','label'], axis=1).to_numpy())
whole_data_df['cluster_lab'] = fit_cluster.labels_

In [6]:
positive_cluster = np.argmax(np.array(whole_data_df.groupby('cluster_lab')['label'].sum()))

In [7]:
list_dist_centroids = np.zeros(n_cluster)
array_centroids = fit_cluster.cluster_centers_
for i in range(n_cluster):
    list_dist_centroids[i] = np.linalg.norm(array_centroids[i,:] - array_centroids[positive_cluster,:])
negative_cluster = np.argmax(list_dist_centroids)

In [8]:
positive_data = whole_data_df[whole_data_df['cluster_lab'] == positive_cluster]
positive_data = positive_data[positive_data['label'] == 1]
reliable_negative = whole_data_df[whole_data_df['cluster_lab'] == negative_cluster]
reliable_negative = reliable_negative[reliable_negative['label'] == 0]

In [9]:
label_1 = positive_data.drop(['class','label','cluster_lab'], axis=1).to_numpy()
label_0 = reliable_negative.drop(['class','label','cluster_lab'], axis=1).to_numpy()

In [10]:
mean_1 = np.mean(label_1,axis=0)
mean_0 = np.mean(label_0, axis=0)

In [18]:
cov_1 = np.cov(label_1, rowvar=False)
cov_0 = np.cov(label_0, rowvar=False)

In [12]:
weights = np.zeros((n_samples,2))

In [13]:
for i in range(n_samples):
    if whole_data_df.loc[i,'cluster_lab'] == negative_cluster:
        whole_data_df.loc[i,'label'] = 0

In [14]:
for group in range(2):
    for i in range(n_samples):
        if whole_data_df.loc[i,'label'] == group:
            weights[i,group] = 1

In [15]:
pi_0 = 1/2
pi_1 = 1/2

In [16]:
dict_pi = {'pi_0':pi_0,'pi_1':pi_1}

In [19]:
dict_mean = {'mean_0':mean_0,'mean_1':mean_1}
dict_cov = {'cov_0':cov_0,'cov_1':cov_1}

In [24]:
data_gmm = whole_data_df.to_numpy()[:,:2]
iterations = 5
count = 0

#to opti : put the weights to O/1 before so only one condition in the loop

while count<=iterations:
    count+=1
    for group in range(2):
        for i in range(n_samples):
            if whole_data_df.loc[i,'label'] == 0:
                x_test = data_gmm[i,:]
                numerator = dict_pi['pi_{0}'.format(group)]*multivariate_normal.pdf(x=x_test,
                                                                                    mean=dict_mean['mean_{0}'.format(group)],
                                                                                    cov=dict_cov['cov_{0}'.format(group)],allow_singular=True)
                denom = pi_0*multivariate_normal.pdf(x=x_test,mean=mean_0,cov=cov_0,allow_singular=True)+pi_1*multivariate_normal.pdf(x=x_test,mean=mean_1,cov=cov_1,allow_singular=True)
                result = numerator/denom
                if numerator<0.000001 or denom < 0.000001 or result > 1:
                    weights[i,group] = 0
                else:
                    weights[i,group] = result
        sum_of_weights = np.sum(weights[:,group])
        dict_pi['pi_{0}'.format(group)] = np.mean(weights[:,group])
        dict_mean['mean_{0}'.format(group)] = np.sum((data_gmm*(weights[:,group].reshape(n_samples,1))),axis=0)/sum_of_weights
        dict_cov['cov_{0}'.format(group)] = update_cov(X=data_gmm,group=group,
                                                       mean=dict_mean['mean_{0}'.format(group)],weights=weights)           
        

In [33]:
n_gen_test = 10
n_samples_test = 4*n_gen_test
data_n_test_1 = np.random.multivariate_normal(mean=np.array([2,2]),
                                             cov=np.array(([1,-0.25],[-0.25,1])),
                                             size=n_gen_test)
data_n_test_2 = np.random.uniform(low=1., high=3., size=(n_gen_test,2))

data_p_test_1 = np.random.multivariate_normal(mean=np.array([-1,-1]),
                                            cov=np.eye(2)*0.5,
                                            size=n_gen_test)
data_p_test_2 = np.random.uniform(low=-2, high=-1., size=(n_gen_test,2))


data_test = np.vstack((data_n_test_1,data_n_test_2,data_p_test_1,data_p_test_2))
data_test = np.hstack((data_test,np.hstack((np.ones(n_gen_test*2)*(-1),np.ones(n_gen_test*2))).reshape(n_samples_test,1)))

In [34]:
weights_test = np.zeros((n_samples_test,2))
for group in range(2):
    for i in range(n_samples_test):
        x_test = data_test[i,:-1]
        numerator = dict_pi['pi_{0}'.format(group)]*multivariate_normal.pdf(x=x_test,
                                                                            mean=dict_mean['mean_{0}'.format(group)],
                                                                            cov=dict_cov['cov_{0}'.format(group)],allow_singular=True)
        denom = pi_0*multivariate_normal.pdf(x=x_test,mean=mean_0,cov=cov_0,allow_singular=True)+pi_1*multivariate_normal.pdf(x=x_test,mean=mean_1,cov=cov_1,allow_singular=True)
        result = numerator/denom
        if numerator<0.000001 or denom < 0.000001 or result > 1:
            weights_test[i,group] = 0
        else:
            weights_test[i,group] = result

In [35]:
predictions_test = np.zeros((n_samples_test))
for i in range(n_samples_test):
    if weights_test[i,1]>=weights_test[i,0]:
        predictions_test[i] = 1
    else:
        predictions_test[i] = -1

In [36]:
positives_test = 0
true_positives_test = 0
for i in range(n_samples_test):
    if predictions_test[i] == 1:
        positives_test += 1
        if data_test[i,2] == 1:
            true_positives_test += 1

precision_test = true_positives_test/positives_test
recall_test = true_positives_test/(2*n_gen_test)
f_1_score_test = (2*recall_test*precision_test)/(recall_test+precision_test)
weird_metric = (recall_test**2)/(positives_test/n_samples_test)

In [37]:
print('precision :', precision_test,'\n', 
      'recall :', recall_test, '\n', 
      'f_1 score :', f_1_score_test, '\n', 
      'weird metric :', weird_metric)

precision : 0.7142857142857143 
 recall : 1.0 
 f_1 score : 0.8333333333333333 
 weird metric : 1.4285714285714286


# On a less separable dataset

In [3]:
np.random.seed(1457065)
n_gen = 50
n_samples = 4*n_gen
data_red_1 = np.random.multivariate_normal(mean=np.array([0.5,0.5]),
                                             cov=np.array(([2,-0.5],[-0.5,2])),
                                             size=n_gen)
data_red_2 = np.random.uniform(low=0.5, high=2, size=(n_gen,2))

data_blue_1 = np.random.multivariate_normal(mean=np.array([-1,-1]),
                                            cov=np.eye(2),
                                            size=n_gen)
data_blue_2 = np.random.uniform(low=-1, high=0., size=(n_gen,2))

data_red = np.vstack((data_red_1,data_red_2))
labels_red = (np.ones(2*n_gen)*-1).reshape(2*n_gen,1)
data_red = np.hstack((data_red,labels_red))

data_blue = np.vstack((data_blue_1, data_blue_2))
labels_blue = (np.ones(2*n_gen)).reshape(2*n_gen,1)
data_blue = np.hstack((data_blue,labels_blue))

whole_data = np.vstack((data_blue, data_red))

In [4]:
np.random.seed(1457065)
whole_data_df = pd.DataFrame(whole_data)
whole_data_df.columns = ['x1','x2','class']
whole_data_df['label'] = np.zeros((n_samples))
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if whole_data_df.loc[i,'class'] == 1 and random == 0:
        whole_data_df.loc[i,'label'] = 1

In [5]:
n_cluster=3
fit_cluster = KMeans(n_clusters=n_cluster).fit(X=whole_data_df.drop(['class','label'], axis=1).to_numpy())
whole_data_df['cluster_lab'] = fit_cluster.labels_

In [6]:
positive_cluster = np.argmax(np.array(whole_data_df.groupby('cluster_lab')['label'].sum()))

In [7]:
list_dist_centroids = np.zeros(n_cluster)
array_centroids = fit_cluster.cluster_centers_
for i in range(n_cluster):
    list_dist_centroids[i] = np.linalg.norm(array_centroids[i,:] - array_centroids[positive_cluster,:])
negative_cluster = np.argmax(list_dist_centroids)

In [8]:
positive_data = whole_data_df[whole_data_df['cluster_lab'] == positive_cluster]
positive_data = positive_data[positive_data['label'] == 1]
reliable_negative = whole_data_df[whole_data_df['cluster_lab'] == negative_cluster]
reliable_negative = reliable_negative[reliable_negative['label'] == 0]

In [9]:
label_1 = positive_data.drop(['class','label','cluster_lab'], axis=1).to_numpy()
label_0 = reliable_negative.drop(['class','label','cluster_lab'], axis=1).to_numpy()

In [10]:
mean_1 = np.mean(label_1,axis=0)
mean_0 = np.mean(label_0, axis=0)

In [11]:
cov_1 = np.cov(label_1, rowvar=False)
cov_0 = np.cov(label_0, rowvar=False)

In [12]:
weights = np.zeros((n_samples,2))

In [13]:
for i in range(n_samples):
    if whole_data_df.loc[i,'cluster_lab'] == negative_cluster:
        whole_data_df.loc[i,'label'] = 0

In [14]:
for group in range(2):
    for i in range(n_samples):
        if whole_data_df.loc[i,'label'] == group:
            weights[i,group] = 1

In [15]:
pi_0 = 1/2
pi_1 = 1/2

In [16]:
dict_pi = {'pi_0':pi_0,'pi_1':pi_1}

In [17]:
dict_mean = {'mean_0':mean_0,'mean_1':mean_1}
dict_cov = {'cov_0':cov_0,'cov_1':cov_1}

In [18]:
data_gmm = whole_data_df.to_numpy()[:,:2]
iterations = 5
count = 0

#to opti : put the weights to O/1 before so only one condition in the loop

while count<=iterations:
    count+=1
    for group in range(2):
        for i in range(n_samples):
            if whole_data_df.loc[i,'label'] == 0:
                x_test = data_gmm[i,:]
                numerator = dict_pi['pi_{0}'.format(group)]*multivariate_normal.pdf(x=x_test,
                                                                                    mean=dict_mean['mean_{0}'.format(group)],
                                                                                    cov=dict_cov['cov_{0}'.format(group)],allow_singular=True)
                denom = pi_0*multivariate_normal.pdf(x=x_test,mean=mean_0,cov=cov_0,allow_singular=True)+pi_1*multivariate_normal.pdf(x=x_test,mean=mean_1,cov=cov_1,allow_singular=True)
                result = numerator/denom
                if numerator<0.000001 or denom < 0.000001 or result > 1:
                    weights[i,group] = 0
                else:
                    weights[i,group] = result
        sum_of_weights = np.sum(weights[:,group])
        dict_pi['pi_{0}'.format(group)] = np.mean(weights[:,group])
        dict_mean['mean_{0}'.format(group)] = np.sum((data_gmm*(weights[:,group].reshape(n_samples,1))),axis=0)/sum_of_weights
        dict_cov['cov_{0}'.format(group)] = update_cov(X=data_gmm,group=group,
                                                       mean=dict_mean['mean_{0}'.format(group)],weights=weights)           
        

In [19]:
n_gen_test = 10
n_samples_test = 4*n_gen_test
data_n_test_1 = np.random.multivariate_normal(mean=np.array([0.5,0.5]),
                                             cov=np.array(([2,-0.5],[-0.5,5])),
                                             size=n_gen_test)
data_n_test_2 = np.random.uniform(low=0.5, high=2., size=(n_gen_test,2))

data_p_test_1 = np.random.multivariate_normal(mean=np.array([0.5,0.5]),
                                            cov=np.eye(2),
                                            size=n_gen_test)
data_p_test_2 = np.random.uniform(low=-1, high=0., size=(n_gen_test,2))


data_test = np.vstack((data_n_test_1,data_n_test_2,data_p_test_1,data_p_test_2))
data_test = np.hstack((data_test,np.hstack((np.ones(n_gen_test*2)*(-1),np.ones(n_gen_test*2))).reshape(n_samples_test,1)))

In [20]:
weights_test = np.zeros((n_samples_test,2))
for group in range(2):
    for i in range(n_samples_test):
        x_test = data_test[i,:-1]
        numerator = dict_pi['pi_{0}'.format(group)]*multivariate_normal.pdf(x=x_test,
                                                                            mean=dict_mean['mean_{0}'.format(group)],
                                                                            cov=dict_cov['cov_{0}'.format(group)],allow_singular=True)
        denom = pi_0*multivariate_normal.pdf(x=x_test,mean=mean_0,cov=cov_0,allow_singular=True)+pi_1*multivariate_normal.pdf(x=x_test,mean=mean_1,cov=cov_1,allow_singular=True)
        result = numerator/denom
        if numerator<0.000001 or denom < 0.000001 or result > 1:
            weights_test[i,group] = 0
        else:
            weights_test[i,group] = result

In [21]:
predictions_test = np.zeros((n_samples_test))
for i in range(n_samples_test):
    if weights_test[i,1]>=weights_test[i,0]:
        predictions_test[i] = 1
    else:
        predictions_test[i] = -1

In [22]:
positives_test = 0
true_positives_test = 0
for i in range(n_samples_test):
    if predictions_test[i] == 1:
        positives_test += 1
        if data_test[i,2] == 1:
            true_positives_test += 1

precision_test = true_positives_test/positives_test
recall_test = true_positives_test/(2*n_gen_test)
f_1_score_test = (2*recall_test*precision_test)/(recall_test+precision_test)
weird_metric = (recall_test**2)/(positives_test/n_samples_test)

In [23]:
print('precision :', precision_test,'\n', 
      'recall :', recall_test, '\n', 
      'f_1 score :', f_1_score_test, '\n', 
      'weird metric :', weird_metric)

precision : 0.5384615384615384 
 recall : 0.7 
 f_1 score : 0.608695652173913 
 weird metric : 0.7538461538461537
