In [2]:
import numpy as np
import pandas as pd
from scipy import stats
import math
from sklearn.model_selection import train_test_split

In [3]:
class NB_Gaussian():
    def __init__(self, training_df, test_df, p_class):
        self.training_df = training_df  
        self.test_df = test_df
        self.p_class = p_class
        # training df grouped by classes, with calculated means and standard deviation
        self.aggs = self.training_df.groupby(self.p_class).agg(['mean', 'std']) 

    # P(A|C) - for single example (with distinct feature from distinct group)
    # from gauss formula
    def cond_prob(self, row, feature, group):
        mu = self.aggs[feature]['mean'][group]  # mean
        std = self.aggs[feature]['std'][group]  # standard deviation

        # one selected observation (sigle value)
        selected_observ = self.test_df.loc[self.test_df['id'] == row][feature].values[0]

        prob = stats.norm.pdf(selected_observ, loc=mu, scale=std)

        return prob

    def predict_class(self, row):
        classes = []

        min_class = np.amin(self.training_df[self.p_class].values)
        max_class = np.amax(self.training_df[self.p_class].values)

        for group in range(min_class, max_class):
            # P(C)
            class_amount = len(self.training_df[self.training_df[self.p_class] == group])
            
            prob = class_amount/len(self.training_df.index)
            for feature in self.training_df.columns:
                if feature == self.p_class:
                    break
                # P(C) * P(A|C)
                prob *= self.cond_prob(row, feature, group)
            classes.append(prob)

        return np.argmax(classes) + min_class  # from what number classes starts

    def predict(self):
        values = dict()
        for index, row in self.test_df.iterrows():
            values[index] = self.predict_class(index)
        return values

    def check_result(self):
        res = {'True': 0, 'False': 0}
        predicted_val = self.predict()
        for index, row in self.test_df.iterrows():
            if self.test_df.at[index, self.p_class] == predicted_val[index]:
                res['True'] += 1
            else:
                res['False'] += 1
        return res


In [4]:
def split_set(df, ratio):
    train_df, test_df = train_test_split(df, train_size=ratio)

    return train_df, test_df


def check_model(test_df, predicted, p_class):
    res = {'True': 0, 'False': 0}
    for index, row in test_df.iterrows():
        if test_df.at[index, p_class] == predicted[index]:
            res['True'] += 1
        else:
            res['False'] += 1
    return res


In [5]:
df = pd.read_csv('winequality-red.csv', sep=";")
i=0
for index, row in df.iterrows():
    df.at[index, 'id'] = i
    i +=1

df


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0.0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1.0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2.0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3.0
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594.0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595.0
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1596.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,1597.0


In [6]:
train_df = df.iloc[:10, :]
test_df = df.iloc[10:13, :]
predicted_df = test_df.assign(quality=-1).copy(deep=False)

train_df.head()



Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3.0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4.0


In [7]:
test_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,5,10.0
11,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,11.0
12,5.6,0.615,0.0,1.6,0.089,16.0,59.0,0.9943,3.58,0.52,9.9,5,12.0


In [8]:
predicted_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,-1,10.0
11,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,-1,11.0
12,5.6,0.615,0.0,1.6,0.089,16.0,59.0,0.9943,3.58,0.52,9.9,-1,12.0


In [9]:
selected_observ = df.loc[df['id'] == 11]['alcohol'].values[0]
selected_observ

10.5

In [10]:
df.groupby('quality').count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,id
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,10,10,10,10,10,10,10,10,10,10,10,10
4,53,53,53,53,53,53,53,53,53,53,53,53
5,681,681,681,681,681,681,681,681,681,681,681,681
6,638,638,638,638,638,638,638,638,638,638,638,638
7,199,199,199,199,199,199,199,199,199,199,199,199
8,18,18,18,18,18,18,18,18,18,18,18,18


In [11]:
gauss_classifier = NB_Gaussian(train_df, test_df, 'quality')
gauss_classifier.aggs

Unnamed: 0_level_0,fixed acidity,fixed acidity,volatile acidity,volatile acidity,citric acid,citric acid,residual sugar,residual sugar,chlorides,chlorides,...,density,density,pH,pH,sulphates,sulphates,alcohol,alcohol,id,id
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
quality,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
5,7.6,0.223607,0.685714,0.119841,0.065714,0.132017,2.6,1.579029,0.079571,0.010998,...,0.997343,0.000597,3.377143,0.132125,0.61,0.11,9.671429,0.411154,3.857143,3.132016
6,11.2,,0.28,,0.56,,1.9,,0.075,,...,0.998,,3.16,,0.58,,9.8,,3.0,
7,7.55,0.353553,0.615,0.049497,0.01,0.014142,1.6,0.565685,0.069,0.005657,...,0.9957,0.001556,3.375,0.021213,0.52,0.070711,9.75,0.353553,7.5,0.707107


In [12]:
gauss_classifier.cond_prob(11, 'alcohol', 7)

0.11893028922362944

In [13]:
predicted = gauss_classifier.predict()

In [14]:
predicted

{10: 6, 11: 6, 12: 6}

In [15]:
test_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,id
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,5,10.0
11,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,11.0
12,5.6,0.615,0.0,1.6,0.089,16.0,59.0,0.9943,3.58,0.52,9.9,5,12.0


In [16]:
check_model(test_df, predicted, 'quality')

{'True': 0, 'False': 3}

In [17]:
train_df, test_df = split_set(df, 0.8)
gauss_classifierBIG = NB_Gaussian(train_df, test_df, 'quality')

predictions = gauss_classifierBIG.predict()
check_model(test_df, predictions, 'quality')

{'True': 170, 'False': 150}

In [18]:
shuffled = df.sample(frac=1)
parts = np.array_split(shuffled, 5)
results = []

for n in range(0, 5):
    test_set = parts[n]
    rest = parts.copy()
    rest.pop(n)
    train_set = pd.concat(parts, ignore_index=True)
    gauss_classifierh = NB_Gaussian(train_set, test_set, 'quality')
    res = gauss_classifierh.check_result()
    results.append(res)

results







[{'True': 170, 'False': 150},
 {'True': 181, 'False': 139},
 {'True': 184, 'False': 136},
 {'True': 174, 'False': 146},
 {'True': 186, 'False': 133}]

In [19]:
good = 0
all = 0
for res in results:
    good += res['True']
    all += res['True'] + res['False']

good/all

0.5597248280175109

In [1]:
df.describe()


NameError: name 'df' is not defined