In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
from scipy.stats import multivariate_normal
from sklearn.preprocessing import StandardScaler

In [2]:
raisin_whole = pd.read_csv('raisin.csv')
raisin_whole.columns = ['area', 'maj_length','min_length', 'eccentricity','convex','extent',
                  'perimeter','class']
classes_to_keep = raisin_whole['class'].copy()
raisin_whole = StandardScaler().fit_transform(X=raisin_whole.to_numpy()[:,:-1])
raisin_whole = pd.DataFrame(raisin_whole)
raisin_whole.columns = ['area', 'maj_length','min_length', 'eccentricity','convex','extent',
                  'perimeter']
raisin_whole['class'] = classes_to_keep
raisin_whole = raisin_whole.replace('Kecimen',1)
raisin_whole = raisin_whole.replace('Besni',0)
raisin_data = raisin_whole.sample(frac=0.8)
list_train = raisin_data.index.copy()
list_test = []
for i in raisin_whole.index:
    if i not in list_train:
        list_test.append(i)
raisin_test = raisin_whole.filter(items=list_test, axis=0)

raisin_data = raisin_data.reset_index(drop=True)
raisin_test = raisin_test.reset_index(drop=True)

  raisin_whole = raisin_whole.replace('Besni',0)


In [3]:
def update_cov(X,mean,weights,group):
    """a function to estimate the covariance with a new mean
    Parameter
    -------------------------------
    X : array-like
    the data with which we want to estimate the new covariance

    mean : array-like
    the new mean that doesn't correspond to the 'true mean'

    weights : arrar-like 
    the matrix of weights of the whole data

    group : int
    the group in which we want to update
    --------------------------------
    """
    sum_of_mat = np.zeros((X.shape[1],X.shape[1]))
    for i in range(X.shape[0]):
        temporal_cov = weights[i,group]*np.matmul((X[i,:]-mean).reshape((X.shape[1],1)),
                                                      (X[i,:]-mean).reshape((1,X.shape[1])))
        sum_of_mat += temporal_cov
    sum_of_weights = np.sum(weights[:,group])
    weighted_sigma = sum_of_mat/sum_of_weights
    return weighted_sigma

In [4]:
n_samples = raisin_data.shape[0]

In [5]:
raisin_data['label'] = np.ones((n_samples))*99

for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if raisin_data.loc[i,'class'] == 1 and random == 0:
        raisin_data.loc[i,'label'] = 1

In [6]:
n_cluster=3
fit_cluster = KMeans(n_clusters=n_cluster).fit(X=raisin_data.drop(['class','label'], axis=1).to_numpy())
raisin_data['cluster_lab'] = fit_cluster.labels_

In [7]:
positive_cluster = np.where(np.array(raisin_data[raisin_data['label']==1].groupby('cluster_lab').count()['label']) == 
                            np.max(np.array(raisin_data[raisin_data['label']==1].groupby('cluster_lab').count()['label'])))[0][0]

In [8]:
list_dist_centroids = np.zeros(n_cluster)
array_centroids = fit_cluster.cluster_centers_
for i in range(n_cluster):
    list_dist_centroids[i] = np.linalg.norm(array_centroids[i,:] - array_centroids[positive_cluster,:])
negative_cluster = np.argmax(list_dist_centroids)


In [9]:
positive_data = raisin_data[raisin_data['cluster_lab'] == positive_cluster]
positive_data = positive_data[positive_data['label'] == 1]
reliable_negative = raisin_data[raisin_data['cluster_lab'] == negative_cluster]
reliable_negative = reliable_negative[reliable_negative['label'] == 99]

In [10]:
label_1 = positive_data.drop(['class','label','cluster_lab'], axis=1).to_numpy()
label_0 = reliable_negative.drop(['class','label','cluster_lab'], axis=1).to_numpy()

In [11]:
mean_1 = np.mean(label_1,axis=0)
mean_0 = np.mean(label_0, axis=0)

In [12]:
cov_1 = np.cov(label_1, rowvar=False)
cov_0 = np.cov(label_0, rowvar=False)

In [13]:
weights = np.zeros((n_samples,2))

In [14]:
for i in range(n_samples):
    if raisin_data.loc[i,'cluster_lab'] == negative_cluster:
        raisin_data.loc[i,'label'] = 0

In [15]:
for group in range(2):
    for i in range(n_samples):
        if raisin_data.loc[i,'label'] == group:
            weights[i,group] = 1

In [16]:
pi_0 = 1/2
pi_1 = 1/2

In [17]:
dict_pi = {'pi_0':pi_0,'pi_1':pi_1}

In [18]:
dict_mean = {'mean_0':mean_0,'mean_1':mean_1}
dict_cov = {'cov_0':cov_0,'cov_1':cov_1}

In [19]:
data_gmm = raisin_data.to_numpy()[:,:7]
iterations = 5
count = 0

#to opti : put the weights to O/1 before so only one condition in the loop

while count<=iterations:
    count+=1
    for group in range(2):
        for i in range(n_samples):
            if raisin_data.loc[i,'label'] == 99:
                x_test = data_gmm[i,:]
                numerator = dict_pi['pi_{0}'.format(group)]*multivariate_normal.pdf(x=x_test,
                                                                                    mean=dict_mean['mean_{0}'.format(group)],
                                                                                    cov=dict_cov['cov_{0}'.format(group)],allow_singular=True)
                denom = pi_0*multivariate_normal.pdf(x=x_test,mean=mean_0,cov=cov_0,allow_singular=True)+pi_1*multivariate_normal.pdf(x=x_test,mean=mean_1,cov=cov_1,allow_singular=True)
                result = numerator/denom
                if numerator<0.000001 or denom < 0.000001 or result > 1:
                    weights[i,group] = 0
                else:
                    weights[i,group] = result
        sum_of_weights = np.sum(weights[:,group])
        dict_pi['pi_{0}'.format(group)] = np.mean(weights[:,group])
        dict_mean['mean_{0}'.format(group)] = np.sum((data_gmm*(weights[:,group].reshape(n_samples,1))),axis=0)/sum_of_weights
        dict_cov['cov_{0}'.format(group)] = update_cov(X=data_gmm,group=group,
                                                       mean=dict_mean['mean_{0}'.format(group)],weights=weights)           
        

In [20]:
for i in range(n_samples):
    if weights[i,1] >= weights[i,0]:
        raisin_data.loc[i,'label'] = 1
    else:
        raisin_data.loc[i,'label'] = 0

positives = 0
true_positives = 0 
for i in range(n_samples):
    if raisin_data.loc[i,'label'] == 1:
        positives += 1
        if raisin_data.loc[i,'class'] == 1:
            true_positives += 1

print('the precision is :', true_positives/positives)

the precision is : 0.6096718480138169


In [21]:
correct_label = 0
for i in range(n_samples):
    if raisin_data.loc[i,'class'] == raisin_data.loc[i,'label']:
        correct_label += 1
print('the correct labeled points represent :', (correct_label*100)/n_samples)

the correct labeled points represent : 67.91666666666667


#### The precision is still a bit approximative but I haven't tried a 'simple' SVM or GMM on the data fully labeled 

In [22]:
n_samples_test = raisin_test.shape[0]
weights_test = np.zeros(shape=(n_samples_test,2))
data_gmm_test = raisin_test.to_numpy()[:,:-1]
for group in range(2):
    for i in range(n_samples_test):
        x_test = data_gmm_test[i,:]
        numerator = dict_pi['pi_{0}'.format(group)]*multivariate_normal.pdf(x=x_test,
                                                                            mean=dict_mean['mean_{0}'.format(group)],
                                                                            cov=dict_cov['cov_{0}'.format(group)],allow_singular=True)
        denom = pi_0*multivariate_normal.pdf(x=x_test,mean=mean_0,cov=cov_0,allow_singular=True)+pi_1*multivariate_normal.pdf(x=x_test,mean=mean_1,cov=cov_1,allow_singular=True)
        result = numerator/denom
        weights_test[i,group] = result

In [23]:
raisin_test['label'] = np.ones(n_samples_test)
for i in range(n_samples_test):
    if weights_test[i,1] > weights_test[i,0]:
        raisin_test.loc[i,'label'] = 1
    else:
        raisin_test.loc[i,'label'] = 0

positive_test = 0
true_positive_test = 0
for i in range(n_samples_test):
    if raisin_test.loc[i,'label'] == 1:
        positive_test += 1
        if raisin_test.loc[i,'class'] == 1:
            true_positive_test += 1

print(true_positive_test/positive_test)

0.7398373983739838


In [24]:
#not bad at all

In [25]:
positive_test

123

In [26]:
positives

579

In [27]:
true_positive_test

91

In [28]:
#the 'issue' is that it is quite 'generous' on the positives