In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from scipy.stats import bernoulli

## Without splitting the data into training and test sets

#### First, let's try an SVM on the fully labeled data

In [12]:
# Load the mushroom dataset from the CSV file in the working directory.
mushroom_data = pd.read_csv(filepath_or_buffer='mushroom.csv')

In [13]:
# Number of rows (samples) in the dataset.
n_samples = len(mushroom_data)

In [14]:
# Display the loaded frame; the rendering below is truncated to the first and last rows.
mushroom_data

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1
...,...,...,...,...,...,...,...,...,...
54030,73,5,3,2,0.887740,569,12,0.943195,1
54031,82,2,3,2,1.186164,490,12,0.943195,1
54032,82,5,3,2,0.915593,584,12,0.888450,1
54033,79,2,3,2,1.034963,491,12,0.888450,1


In [15]:
# Fit an RBF-kernel SVM on every row: the 'class' column is the target and
# all remaining columns are used as features (no train/test split here).
svm_features = mushroom_data.drop(columns=['class']).to_numpy()
svm_targets = np.array(mushroom_data['class'])
classifier = SVC(kernel='rbf', gamma='auto').fit(X=svm_features, y=svm_targets)

In [19]:
# Predict the class of every row with the fitted SVM (same rows it was trained on).
svm_inputs = mushroom_data.drop(columns=['class']).to_numpy()
predictions = classifier.predict(svm_inputs)

In [22]:
# Precision of the SVM = true positives / predicted positives, measured on the
# same rows the classifier was fitted on.
# Boolean masks replace the per-row `.loc[i, 'class']` lookup: the comparison is
# positional (it does not assume a 0..n-1 index) and needs no Python loop.
predicted_positive = np.asarray(predictions) == 1
actual_positive = mushroom_data['class'].to_numpy() == 1
positives = int(predicted_positive.sum())
true_positives = int((predicted_positive & actual_positive).sum())
# Precision is undefined when nothing is predicted positive; report NaN
# instead of raising ZeroDivisionError.
precision_svm = true_positives / positives if positives else float('nan')
print('the precision of the svm without any supplementary treatment is :', precision_svm)

the precision of the svm without any supplementary treatment is : 0.9992252762058744


So it's quite high, but we didn't split the dataset into training and test sets, so the model is probably overfitting.

#### Now, let's try a fully labeled GMM

In [30]:
#we compute the \hat{pi_i}
number_of_positives = 0
number_of_negatives = 0
for i in range(n_samples):
    if mushroom_data.loc[i,'class'] == 1:
        number_of_positives += 1
    else:
        number_of_negatives += 1
pi_0 = number_of_negatives
pi_1 = number_of_positives
#we now compute the estimated mean and estimated covariance for each group
mean_0 = mushroom_data.groupby('class').mean().to_numpy()[0,:]
mean_1 = mushroom_data.groupby('class').mean().to_numpy()[1,:]
cov_0 = np.cov(mushroom_data[mushroom_data['class'] == 0].drop(['class'],axis=1).to_numpy(), rowvar=False)
cov_1 = np.cov(mushroom_data[mushroom_data['class'] == 1].drop(['class'],axis=1).to_numpy(), rowvar=False)
dict_mean = {'mean_0':mean_0, 'mean_1':mean_1}
dict_cov = {'cov_0':cov_0, 'cov_1':cov_1}
dict_pi = {'pi_0':pi_0, 'pi_1':pi_1}

In [38]:
# One row per sample, one column per class; allocated uninitialised here.
y_hat = np.empty(shape=(n_samples, 2))

In [39]:
#we now check for each point which category maximizes the likelihood
# Score every sample under each class: log prior + Gaussian log-density.
# The features are every column except 'class' (no hard-coded column count),
# and logpdf is evaluated once per class on the whole feature matrix rather
# than once per row, which avoids 2 * n_samples separate scipy calls.
feature_matrix = mushroom_data.drop(columns=['class']).to_numpy()
for group in range(2):
    y_hat[:, group] = np.log(dict_pi['pi_{0}'.format(group)]) + multivariate_normal.logpdf(
        x=feature_matrix,
        mean=dict_mean['mean_{0}'.format(group)],
        cov=dict_cov['cov_{0}'.format(group)])



In [40]:
# One predicted class per sample; allocated uninitialised here.
decision = np.empty(shape=n_samples)

In [42]:
# Assign each sample to the class with the larger score; ties go to class 1.
decision[:] = np.where(y_hat[:, 1] >= y_hat[:, 0], 1, 0)

In [43]:
# Precision of the Gaussian classifier = true positives / predicted positives.
# Boolean masks replace the per-row `.loc[i, 'class']` lookup: the comparison is
# positional (it does not assume a 0..n-1 index) and needs no Python loop.
predicted_positive = np.asarray(decision) == 1
actual_positive = mushroom_data['class'].to_numpy() == 1
positives = int(predicted_positive.sum())
true_positives = int((predicted_positive & actual_positive).sum())

# Precision is undefined when nothing is predicted positive; report NaN
# instead of raising ZeroDivisionError.
precision_gmm = true_positives / positives if positives else float('nan')
print('the precision is :', precision_gmm)

the precision is : 0.6721133908953053


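So the data is apparently separable by an SVM but does not appear to be generated by a Gaussian mixture model, since the precision worsens a lot when we fit a GMM to it. (Both scores are computed on the training data, so the SVM figure in particular is optimistic.)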