In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from scipy.stats import bernoulli

## Without splitting the data into training and testing sets

#### First, let's try an SVM on the fully labelled data

In [2]:
# Load the full mushroom dataset; the 'class' column is the binary label (0/1)
# used as the target in the cells below.
mushroom_data = pd.read_csv('mushroom.csv')

In [3]:
# Keep a random 80% of the rows as the training set.
# A fixed random_state makes the split (and therefore the fitted models and
# every metric printed below) reproducible under Restart & Run All.
mushroom_data = mushroom_data.sample(n=int(0.8*mushroom_data.shape[0]), random_state=42)
# Remember the original row labels of the sampled rows so that the held-out
# 20% can be recovered later from a fresh read of the CSV.
training_indexes = mushroom_data.index

In [4]:
# Number of training rows; used as the loop / array bound in later cells.
n_samples = len(mushroom_data)

In [5]:
# Relabel the sampled rows 0..n_samples-1 so later cells can address them with
# .loc[i, ...] for i in range(n_samples). training_indexes was captured before
# this call and therefore still holds the original CSV row labels.
mushroom_data = mushroom_data.reset_index(drop=True)

In [6]:
# RBF-kernel SVM trained on every feature column against the 'class' label.
# NOTE(review): the features are fed in unscaled (some columns are in the
# hundreds/thousands) together with gamma='auto'; worth checking whether this
# contributes to the train/test gap reported below.
classifier = SVC(kernel='rbf', gamma='auto')
classifier.fit(
    X=mushroom_data.drop(['class'], axis=1).to_numpy(),
    y=mushroom_data['class'].to_numpy(),
)

In [7]:
# Predict on the very rows the SVM was fitted on (training-set predictions).
predictions = classifier.predict(mushroom_data.drop(columns='class').to_numpy())

In [8]:
# Display the training frame (after sampling and index reset) for inspection.
mushroom_data

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,520,6,5,8,0.863866,1073,11,0.943195,1
1,653,6,0,10,0.230356,1567,11,0.888450,0
2,384,2,6,7,1.248977,435,8,0.943195,1
3,282,6,0,5,0.179480,605,11,0.888450,0
4,107,6,0,10,0.290892,124,12,0.943195,0
...,...,...,...,...,...,...,...,...,...
43223,630,5,1,7,0.095070,668,1,0.943195,1
43224,549,2,6,11,0.365641,1955,12,0.943195,1
43225,416,6,0,1,0.358535,688,1,0.888450,0
43226,245,6,0,1,0.895698,492,1,0.943195,1


In [9]:
# Training-set precision of the SVM: TP / (TP + FP), i.e. among the rows
# predicted as class 1, the fraction whose true label is 1.
predicted_positive = predictions == 1
actual_positive = mushroom_data['class'].to_numpy() == 1
positives = int(predicted_positive.sum())
true_positives = int((predicted_positive & actual_positive).sum())
precision_svm = true_positives/positives
print('the precision of the svm without any supplementary treatment is :', precision_svm)

the precision of the svm without any supplementary treatment is : 0.9995372902031716


In [11]:
# Held-out evaluation: rebuild the 20% of rows that were NOT sampled for
# training and measure the SVM's precision on them.
mushroom_test = pd.read_csv('mushroom.csv')
# Row labels of the fresh read that are absent from the training sample.
list_to_keep = [row for row in range(mushroom_test.shape[0]) if row not in training_indexes]

mushroom_test = mushroom_test.loc[list_to_keep]
predictions_test = classifier.predict(X=mushroom_test.drop(['class'], axis=1).to_numpy())
mushroom_test = mushroom_test.reset_index(drop=True)

# Precision on the held-out rows: TP / (TP + FP).
predicted_positive_test = predictions_test == 1
actual_positive_test = mushroom_test['class'].to_numpy() == 1
positive_test = int(predicted_positive_test.sum())
true_positive_test = int((predicted_positive_test & actual_positive_test).sum())
print('the test precision is : ',true_positive_test/positive_test)

the test precision is :  0.6036142001710864


In [12]:
# The SVM clearly overfits: its precision is ~1.00 on the training rows but only ~0.60 on the held-out rows.

#### Now, let's try a fully labelled GMM

In [13]:
# Fit one Gaussian per class on the fully labelled training data:
# class priors pi_k, class means mu_k and class covariances Sigma_k.

# Class priors \hat{pi}_k = n_k / n. These are stored as proportions (not raw
# counts) so that they are actual probabilities; both log-scores shift by the
# same constant log(n), so the arg-max decision is unaffected.
class_labels = mushroom_data['class']
number_of_positives = int((class_labels == 1).sum())
number_of_negatives = int((class_labels != 1).sum())
pi_0 = number_of_negatives / n_samples
pi_1 = number_of_positives / n_samples

# Per-class feature means, selected by class label rather than by row position
# of the groupby result (computed once instead of twice).
class_means = mushroom_data.groupby('class').mean()
mean_0 = class_means.loc[0].to_numpy()
mean_1 = class_means.loc[1].to_numpy()

# Per-class sample covariances of the feature columns (rows are observations).
cov_0 = np.cov(mushroom_data[class_labels == 0].drop(['class'],axis=1).to_numpy(), rowvar=False)
cov_1 = np.cov(mushroom_data[class_labels == 1].drop(['class'],axis=1).to_numpy(), rowvar=False)

# Lookup tables keyed by '<name>_<class>' as expected by the scoring cell.
dict_mean = {'mean_0':mean_0, 'mean_1':mean_1}
dict_cov = {'cov_0':cov_0, 'cov_1':cov_1}
dict_pi = {'pi_0':pi_0, 'pi_1':pi_1}

In [14]:
# One row per training sample, one column per class; filled with the per-class
# log-scores in the next cell.
y_hat = np.empty(shape=(n_samples, 2))

In [15]:
# Score every training sample under each class:
#     y_hat[i, k] = log pi_k + log N(x_i | mu_k, Sigma_k)
# The feature matrix is built by dropping 'class' (the same selection used to
# estimate the means/covariances) instead of slicing the first 8 values of each
# row, and logpdf is evaluated once per class on the whole matrix rather than
# once per row.
feature_matrix = mushroom_data.drop(['class'], axis=1).to_numpy()
for group in range(2):
    y_hat[:, group] = np.log(dict_pi['pi_{0}'.format(group)]) + multivariate_normal.logpdf(
        x=feature_matrix,
        mean=dict_mean['mean_{0}'.format(group)],
        cov=dict_cov['cov_{0}'.format(group)])



In [16]:
# Predicted class (0 or 1) for every training sample; filled in the next cell.
decision = np.empty(shape=n_samples)

In [17]:
# Assign each sample to the class with the larger log-score; ties go to class 1.
decision[:] = np.where(y_hat[:, 1] >= y_hat[:, 0], 1, 0)

In [18]:
# Training-set precision of the Gaussian class-conditional model:
# among the rows assigned to class 1, the fraction whose true label is 1.
assigned_positive = decision == 1
labelled_positive = mushroom_data['class'].to_numpy() == 1
positives = int(assigned_positive.sum())
true_positives = int((assigned_positive & labelled_positive).sum())

precision_gmm = true_positives/positives
print('the precision is :', precision_gmm)

the precision is : 0.6700038505968425


So the data is apparently separable by an SVM, but it does not appear to be generated by a Gaussian mixture model: the precision worsens considerably when we fit a GMM to it.