# Imports

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

# Class Bayes

In [2]:
class NaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Every feature is modelled per class as an independent normal distribution.
    After `fit`, the instance exposes:

    - classes_   : sorted array of the distinct labels seen in `y`
    - mean       : (n_classes, n_features) per-class feature means
    - variance   : (n_classes, n_features) per-class feature variances
                   (including the smoothing term added in `fit`)
    - priors     : (n_classes,) relative class frequencies
    """

    def fit(self, X, y, var_smoothing=1e-9):
        """
        Estimate per-class means, variances and priors from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training data.
        y : array-like of shape (n_samples,)
            Class labels. Any set of distinct labels is accepted; they do not
            have to be the integers 0..n_classes-1.
        var_smoothing : float, default 1e-9
            Fraction of the largest feature variance of X that is added to
            every per-class variance. This keeps the density finite when a
            feature is constant within a class (variance 0).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        # get number of samples (rows) and features (columns)
        self.n_samples, self.n_features = X.shape

        # remember the actual labels so that row i of the statistics
        # corresponds to classes_[i], whatever the label values are
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)

        # containers for the summary statistics, rows=classes, cols=features
        self.mean = np.zeros((self.n_classes, self.n_features))
        self.variance = np.zeros((self.n_classes, self.n_features))
        self.priors = np.zeros(self.n_classes)

        # absolute smoothing term, scaled to the data like scikit-learn does
        epsilon = var_smoothing * np.var(X, axis=0).max()

        for idx, cls in enumerate(self.classes_):
            # subset of the data that belongs to the class `cls`
            X_c = X[y == cls]

            self.mean[idx, :] = np.mean(X_c, axis=0)
            self.variance[idx, :] = np.var(X_c, axis=0) + epsilon
            self.priors[idx] = X_c.shape[0] / self.n_samples

    def gaussian_density(self, x, mean, var):
        """
        Return the value of the normal density N(mean, var) evaluated at x.

        Works element-wise on arrays. Not used for prediction because the
        result underflows to 0 far away from the mean; see
        `_log_gaussian_density`.
        """
        const = 1 / np.sqrt(var * 2 * np.pi)
        proba = np.exp(-0.5 * ((x - mean) ** 2 / var))
        return const * proba

    def _log_gaussian_density(self, x, mean, var):
        """Return log N(x; mean, var) without forming the density itself."""
        return -0.5 * (np.log(2 * np.pi * var) + (x - mean) ** 2 / var)

    def get_class_probability(self, x):
        """
        Return the index (row in `mean` / position in `classes_`) of the class
        with the highest log-posterior for the single sample x.
        """
        x = np.asarray(x, dtype=float)

        # log prior + sum of per-feature log-likelihoods, for all classes at once
        log_prior = np.log(self.priors)
        log_likelihood = self._log_gaussian_density(
            x, self.mean, self.variance
        ).sum(axis=1)

        # return the index with the highest class probability
        return np.argmax(log_prior + log_likelihood)

    def predict(self, X):
        """
        Predict a class label for every row of X.

        Returns a NumPy array holding, for each sample, the entry of
        `classes_` with the highest posterior.
        """
        # go through an array so that DataFrames are iterated row by row
        X = np.asarray(X, dtype=float)
        y_hat = [self.classes_[self.get_class_probability(x)] for x in X]
        return np.array(y_hat)

# Import data and adjust labels

In [3]:
# Load the feature table from disk
beer_df = pd.read_csv('analysis/feature_list.csv')

# Replace each textual class name in the 'Label' column by its integer code,
# one class after the other
for class_name, class_code in (
    ('open_broken', 0),
    ('broken', 1),
    ('closed_sealed', 2),
    ('closed_seal_broken', 3),
):
    beer_df.loc[beer_df['Label'] == class_name, 'Label'] = class_code

# Divide the dataset into feature columns and one label column

In [4]:
# Features: every column except the last one, cast to float and handed over
# as a NumPy array
data = np.asarray(beer_df.iloc[:, :-1].astype(float))
# Labels: only the last column, likewise as a NumPy array
label = np.asarray(beer_df.iloc[:, -1])

# Split into train and test data

In [5]:
# Hold out 20 % of the samples for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=42,
)

# Instantiate, train and predict Bayes Classifier

In [6]:
# Create the hand-written classifier, estimate its parameters on the training
# split and predict a class for every row of the test split.
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

# Print results

In [7]:
def get_accuracy(y_true, y_hat):
    """
    Return the fraction of positions at which y_true and y_hat agree.

    Parameters
    ----------
    y_true, y_hat : array-like of the same shape
        Ground-truth and predicted labels. Lists are accepted; they are
        converted to arrays so that the comparison is element-wise.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_hat = np.asarray(y_hat)
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_hat {y_hat.shape}"
        )
    return np.sum(y_true == y_hat) / len(y_true)

# Integer class codes -> original class names
label_names = {
    0: 'open_broken',
    1: 'broken',
    2: 'closed_sealed',
    3: 'closed_seal_broken',
}


def decode_labels(codes):
    """
    Translate an iterable of integer class codes into an array of class names.

    Values without an entry in `label_names` are kept as their string
    representation, so applying the function to already decoded names leaves
    them unchanged.
    """
    return np.array([label_names.get(code, str(code)) for code in codes],
                    dtype=np.str_)


# Predict on the test split and bring predictions and ground truth into the
# same textual representation before scoring
Y_pred = decode_labels(nb.predict(X_test))
y_test = decode_labels(y_test)

#  Fast output of the classifier's accuracy
print('Accuracy Bayes',accuracy_score(y_test, Y_pred),'\n')
print('**********************************\n')


#***************************************************************
# Showing of the confusion matrix
# Predicting the classes with the test data set
print(confusion_matrix(y_test, Y_pred),'\n')
print('**********************************\n')

#***************************************************************
# Showing the classification report
print(classification_report(y_test, Y_pred))

Accuracy Bayes 0.6287878787878788 

**********************************

[[67  0  1  2]
 [ 8  3 45  8]
 [ 7  3 42 10]
 [ 4  1  9 54]] 

**********************************

                    precision    recall  f1-score   support

            broken       0.78      0.96      0.86        70
closed_seal_broken       0.43      0.05      0.08        64
     closed_sealed       0.43      0.68      0.53        62
       open_broken       0.73      0.79      0.76        68

          accuracy                           0.63       264
         macro avg       0.59      0.62      0.56       264
      weighted avg       0.60      0.63      0.57       264



# Sklearn implementation

In [9]:
# Train scikit-learn's Gaussian Naive Bayes on the same file as a reference
from sklearn.naive_bayes import GaussianNB  # reference Bayes classifier
df = pd.read_csv('analysis/feature_list.csv')

# X: everything except the last column, y: only the last column
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Same test fraction and seed as used for the hand-written classifier
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create the classifier and fit it on the training split
bayes_clf = GaussianNB()
bayes_clf.fit(X_train, y_train)

# Quick look at the accuracy on the test split
print('Accuracy Bayes:', bayes_clf.score(X_test, y_test), '\n')
print('**********************************\n')

# ---------------------------------------------------------------
# Confusion matrix: predict the classes of the test split first
y_pred = bayes_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred), '\n')
print('**********************************\n')

# ---------------------------------------------------------------
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))

Accuracy Bayes: 0.6287878787878788 

**********************************

[[67  0  1  2]
 [ 8  3 45  8]
 [ 7  3 42 10]
 [ 4  1  9 54]] 

**********************************

                    precision    recall  f1-score   support

            broken       0.78      0.96      0.86        70
closed_seal_broken       0.43      0.05      0.08        64
     closed_sealed       0.43      0.68      0.53        62
       open_broken       0.73      0.79      0.76        68

          accuracy                           0.63       264
         macro avg       0.59      0.62      0.56       264
      weighted avg       0.60      0.63      0.57       264

