In [1]:
import pandas as pd
import random
import numpy as np


# This code performs 2 algorithms, Winnow-2 and Naive Bayes, on 3 datasets
# The datasets can be found in the UCI Machine Learning Repository https://archive.ics.uci.edu/ml/index.php

[Datasets](#Datasets):

[Iris](#Iris)- 3 classes    
[Glass](#Glass)- 7 classes  
[Breast Cancer](#Breast_Cancer)- 2 classes  

[Test Splits](#Test_Splits)  
[Winnow-2](#Winnow)  
[Naive Bayes](#Naive_Bayes)  
[Models](#Models)  

<a id="Datasets"></a>
# Datasets

First we perform EDA on all Datasets

<a id="Iris"></a>
## Iris

In [2]:
iris = pd.read_csv( "iris.data", header=None, names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width','iris_class'])

In [3]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
iris_class      150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
iris_class      0
dtype: int64

In [5]:
iris.iris_class.value_counts(normalize = True)

Iris-versicolor    0.333333
Iris-setosa        0.333333
Iris-virginica     0.333333
Name: iris_class, dtype: float64

3 classes

In [6]:
iris = pd.concat([iris, pd.get_dummies(iris["iris_class"])], axis=1)
iris.sepal_length = iris.sepal_length.round(0)
iris.sepal_length = iris.sepal_length.astype(int)
iris.sepal_length.value_counts(normalize = True)

6    0.453333
5    0.313333
7    0.160000
8    0.040000
4    0.033333
Name: sepal_length, dtype: float64

In [7]:
iris = pd.concat([iris, pd.get_dummies(iris["sepal_length"],prefix = 'sepal_length_')], axis=1)

In [8]:
iris.sepal_width = iris.sepal_width.round(0)
iris.sepal_width = iris.sepal_width.astype(int)
iris.sepal_width.value_counts(normalize = True)
iris = pd.concat([iris, pd.get_dummies(iris["sepal_width"],prefix = 'sepal_width_')], axis=1)

In [9]:
iris.petal_length = iris.petal_length.round(0)
iris.petal_length = iris.petal_length.astype(int)
iris.petal_length.value_counts(normalize = True)
iris = pd.concat([iris, pd.get_dummies(iris["petal_length"],prefix = 'petal_length_')], axis=1)

In [10]:
iris.petal_width = iris.petal_width.round(0)
iris.petal_width = iris.petal_width.astype(int)
iris.petal_width.value_counts(normalize = True)
iris = pd.concat([iris, pd.get_dummies(iris["petal_width"],prefix = 'petal_width_')], axis=1)

In [11]:
iris = iris.drop(['sepal_length', 'sepal_width', 'petal_length', 'petal_width','iris_class'], axis=1)
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 21 columns):
Iris-setosa        150 non-null uint8
Iris-versicolor    150 non-null uint8
Iris-virginica     150 non-null uint8
sepal_length__4    150 non-null uint8
sepal_length__5    150 non-null uint8
sepal_length__6    150 non-null uint8
sepal_length__7    150 non-null uint8
sepal_length__8    150 non-null uint8
sepal_width__2     150 non-null uint8
sepal_width__3     150 non-null uint8
sepal_width__4     150 non-null uint8
petal_length__1    150 non-null uint8
petal_length__2    150 non-null uint8
petal_length__3    150 non-null uint8
petal_length__4    150 non-null uint8
petal_length__5    150 non-null uint8
petal_length__6    150 non-null uint8
petal_length__7    150 non-null uint8
petal_width__0     150 non-null uint8
petal_width__1     150 non-null uint8
petal_width__2     150 non-null uint8
dtypes: uint8(21)
memory usage: 3.2 KB


In [62]:
iris.head()

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica,sepal_length__4,sepal_length__5,sepal_length__6,sepal_length__7,sepal_length__8,sepal_width__2,sepal_width__3,...,petal_length__1,petal_length__2,petal_length__3,petal_length__4,petal_length__5,petal_length__6,petal_length__7,petal_width__0,petal_width__1,petal_width__2
0,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,1,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


<a id="Glass"></a>
## Glass

In [12]:
glass = pd.read_csv( "glass.data", header=None, names = ['ID','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','glass_class'])

In [13]:
glass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 11 columns):
ID             214 non-null int64
RI             214 non-null float64
Na             214 non-null float64
Mg             214 non-null float64
Al             214 non-null float64
Si             214 non-null float64
K              214 non-null float64
Ca             214 non-null float64
Ba             214 non-null float64
Fe             214 non-null float64
glass_class    214 non-null int64
dtypes: float64(9), int64(2)
memory usage: 18.5 KB


In [14]:
glass.isna().sum()

ID             0
RI             0
Na             0
Mg             0
Al             0
Si             0
K              0
Ca             0
Ba             0
Fe             0
glass_class    0
dtype: int64

In [15]:
glass.glass_class.value_counts(normalize = True)

2    0.355140
1    0.327103
7    0.135514
3    0.079439
5    0.060748
6    0.042056
Name: glass_class, dtype: float64

In [16]:
#round RI to 2 decimal places
glass.RI = glass.RI.round(2)
glass.RI.value_counts(normalize = True)

1.52    0.929907
1.53    0.037383
1.51    0.032710
Name: RI, dtype: float64

In [17]:
glass = pd.concat([glass, pd.get_dummies(glass["RI"],prefix = 'RI_')], axis=1)
glass.Na = glass.Na.round(0)
glass.Na.value_counts(normalize = True)

13.0    0.570093
14.0    0.285047
15.0    0.079439
12.0    0.032710
11.0    0.023364
16.0    0.004673
17.0    0.004673
Name: Na, dtype: float64

In [18]:
glass = pd.concat([glass, pd.get_dummies(glass["Na"],prefix = 'Na_')], axis=1)
glass.Mg = glass.Mg.round(0)
glass.Mg.value_counts(normalize = True)

4.0    0.467290
3.0    0.252336
0.0    0.200935
2.0    0.065421
1.0    0.014019
Name: Mg, dtype: float64

In [19]:
glass = pd.concat([glass, pd.get_dummies(glass["Mg"],prefix = 'Mg_')], axis=1)
glass.Al = glass.Al.round(0)
glass.Al.value_counts(normalize = True)

1.0    0.593458
2.0    0.341121
3.0    0.042056
0.0    0.018692
4.0    0.004673
Name: Al, dtype: float64

In [20]:
glass = pd.concat([glass, pd.get_dummies(glass["Al"],prefix = 'Al_')], axis=1)
glass.Si = glass.Si.round(0)
glass.Si.value_counts(normalize = True)

73.0    0.612150
72.0    0.266355
74.0    0.042056
71.0    0.037383
70.0    0.028037
75.0    0.014019
Name: Si, dtype: float64

In [21]:
glass = pd.concat([glass, pd.get_dummies(glass["Si"],prefix = 'Si_')], axis=1)
glass.K = glass.K.round(0)
glass.K.value_counts(normalize = True)

1.0    0.570093
0.0    0.406542
6.0    0.009346
2.0    0.009346
3.0    0.004673
Name: K, dtype: float64

In [22]:
glass = pd.concat([glass, pd.get_dummies(glass["K"],prefix = 'K_')], axis=1)
glass.Ca = glass.Ca.round(0)
glass.Ca.value_counts(normalize = True)

9.0     0.378505
8.0     0.369159
10.0    0.107477
11.0    0.042056
12.0    0.028037
7.0     0.023364
6.0     0.014019
13.0    0.014019
15.0    0.009346
5.0     0.004673
14.0    0.004673
16.0    0.004673
Name: Ca, dtype: float64

In [23]:
glass.Ca = glass.Ca *0.1
glass.Ca = glass.Ca.round(0)
glass.Ca.value_counts(normalize = True)

1.0    0.981308
2.0    0.014019
0.0    0.004673
Name: Ca, dtype: float64

In [24]:
glass = pd.concat([glass, pd.get_dummies(glass["Ca"],prefix = 'Ca_')], axis=1)
glass.Ba = glass.Ba.round(0)
glass.Ba.value_counts(normalize = True)

0.0    0.869159
1.0    0.070093
2.0    0.051402
3.0    0.009346
Name: Ba, dtype: float64

In [25]:
glass = pd.concat([glass, pd.get_dummies(glass["Ba"],prefix = 'Ba_')], axis=1)
glass.Fe = glass.Fe.round(1)
glass.Fe.value_counts(normalize = True)

0.0    0.686916
0.2    0.130841
0.1    0.126168
0.3    0.042056
0.4    0.009346
0.5    0.004673
Name: Fe, dtype: float64

In [26]:
glass = pd.concat([glass, pd.get_dummies(glass["Fe"],prefix = 'Fe_')], axis=1)
glass = pd.concat([glass, pd.get_dummies(glass["glass_class"],prefix = 'glass_class_')], axis=1)

In [27]:
glass = glass.drop(['ID', 'RI', 'Na', 'Mg','Al','Si','K','Ca','Ba','Fe','glass_class'], axis=1)
glass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 50 columns):
RI__1.51          214 non-null uint8
RI__1.52          214 non-null uint8
RI__1.53          214 non-null uint8
Na__11.0          214 non-null uint8
Na__12.0          214 non-null uint8
Na__13.0          214 non-null uint8
Na__14.0          214 non-null uint8
Na__15.0          214 non-null uint8
Na__16.0          214 non-null uint8
Na__17.0          214 non-null uint8
Mg__0.0           214 non-null uint8
Mg__1.0           214 non-null uint8
Mg__2.0           214 non-null uint8
Mg__3.0           214 non-null uint8
Mg__4.0           214 non-null uint8
Al__0.0           214 non-null uint8
Al__1.0           214 non-null uint8
Al__2.0           214 non-null uint8
Al__3.0           214 non-null uint8
Al__4.0           214 non-null uint8
Si__70.0          214 non-null uint8
Si__71.0          214 non-null uint8
Si__72.0          214 non-null uint8
Si__73.0          214 non-null uint8
Si__74.

<a id="Breast_Cancer"></a>
## Breast Cancer

In [28]:
breast_cancer = pd.read_csv( "breast-cancer-wisconsin.data", header=None, names = ['ID','clump_thick','unif_cell_size','unif_cell_shape','marg_adh','single_epit_cell_size','bare_nuclei','bland_chromatin','normal_nucleoli','mitosis','cancer_class'])

In [29]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
ID                       699 non-null int64
clump_thick              699 non-null int64
unif_cell_size           699 non-null int64
unif_cell_shape          699 non-null int64
marg_adh                 699 non-null int64
single_epit_cell_size    699 non-null int64
bare_nuclei              699 non-null object
bland_chromatin          699 non-null int64
normal_nucleoli          699 non-null int64
mitosis                  699 non-null int64
cancer_class             699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [30]:
breast_cancer.isna().sum()

ID                       0
clump_thick              0
unif_cell_size           0
unif_cell_shape          0
marg_adh                 0
single_epit_cell_size    0
bare_nuclei              0
bland_chromatin          0
normal_nucleoli          0
mitosis                  0
cancer_class             0
dtype: int64

In [31]:
breast_cancer.bare_nuclei.value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

With 16 records with ? we can drop those invalid values and update the column to be int

In [32]:
breast_cancer=breast_cancer[breast_cancer.bare_nuclei != '?']

In [33]:
breast_cancer.bare_nuclei = breast_cancer.bare_nuclei.astype(int)

In [34]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
ID                       683 non-null int64
clump_thick              683 non-null int64
unif_cell_size           683 non-null int64
unif_cell_shape          683 non-null int64
marg_adh                 683 non-null int64
single_epit_cell_size    683 non-null int64
bare_nuclei              683 non-null int32
bland_chromatin          683 non-null int64
normal_nucleoli          683 non-null int64
mitosis                  683 non-null int64
cancer_class             683 non-null int64
dtypes: int32(1), int64(10)
memory usage: 61.4 KB


In [35]:
breast_cancer.cancer_class.value_counts(normalize = True)

2    0.650073
4    0.349927
Name: cancer_class, dtype: float64

In [36]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["cancer_class"],prefix = 'cancer_class_')], axis=1)

In [37]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["clump_thick"],prefix = 'clump_thick_')], axis=1)

In [38]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["unif_cell_size"],prefix = 'unif_cell_size_')], axis=1)

In [39]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["unif_cell_shape"],prefix = 'unif_cell_shape_')], axis=1)

In [40]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["marg_adh"],prefix = 'marg_adh_')], axis=1)

In [41]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["single_epit_cell_size"],prefix = 'single_epit_cell_size_')], axis=1)

In [42]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["bare_nuclei"],prefix = 'bare_nuclei_')], axis=1)

In [43]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["bland_chromatin"],prefix = 'bland_chromatin_')], axis=1)

In [44]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["normal_nucleoli"],prefix = 'normal_nucleoli_')], axis=1)

In [45]:
breast_cancer = pd.concat([breast_cancer, pd.get_dummies(breast_cancer["mitosis"],prefix = 'mitosis_')], axis=1)

In [46]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Columns: 102 entries, ID to mitosis__10
dtypes: int32(1), int64(10), uint8(91)
memory usage: 122.1 KB


In [47]:
breast_cancer = breast_cancer.drop(['ID', 'clump_thick', 'unif_cell_size', 'unif_cell_shape','marg_adh','single_epit_cell_size','bare_nuclei','bland_chromatin','normal_nucleoli','mitosis','cancer_class'], axis=1)

In [48]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 91 columns):
cancer_class__2              683 non-null uint8
cancer_class__4              683 non-null uint8
clump_thick__1               683 non-null uint8
clump_thick__2               683 non-null uint8
clump_thick__3               683 non-null uint8
clump_thick__4               683 non-null uint8
clump_thick__5               683 non-null uint8
clump_thick__6               683 non-null uint8
clump_thick__7               683 non-null uint8
clump_thick__8               683 non-null uint8
clump_thick__9               683 non-null uint8
clump_thick__10              683 non-null uint8
unif_cell_size__1            683 non-null uint8
unif_cell_size__2            683 non-null uint8
unif_cell_size__3            683 non-null uint8
unif_cell_size__4            683 non-null uint8
unif_cell_size__5            683 non-null uint8
unif_cell_size__6            683 non-null uint8
unif_cell_size__7            68

<a id="Test_Splits"></a>
## Split into Test and Training DataSets

In [49]:
def winnow_2_n_fold_cross_val(data, n, target_class, attributes, threshold, alpha, model):
    metrics= []
    list_of_indices = list(range(len(data)))
    k = len(data) #this is the number of records in the data
    l = int(k/n) #this is the number of records in each test set

    random.shuffle(list_of_indices)
    splits = [list_of_indices[0:l-1]]

    test = data.iloc[splits[0]]
    training_indices = [index not in splits[0] for index in list_of_indices]
    training = data.iloc[training_indices]
    new_model = winnow_2_train(training, target_class, attributes, threshold, alpha, model)
    output = winnow_2_test(test, target_class, attributes, threshold, new_model)
        
    print(new_model)
    metrics.append(output)
        
    return metrics

In [50]:
def naive_bayes_n_fold_cross_val(data, n, target_class, attributes, model):
    metrics= []
    list_of_indices = list(range(len(data)))
    k = len(data) #this is the number of records in the data
    l = int(k/n) #this is the number of records in each split
    
    random.shuffle(list_of_indices)
    splits = [list_of_indices[0:l - 1]]
    
    test = data.iloc[splits[0]]
    training_indices = [index not in splits[0] for index in list_of_indices]
    training = data.iloc[training_indices]
    new_model = naive_bayes_train(training, target_class, attributes, model)
    output = naive_bayes_test(test, target_class, attributes, new_model)
        
    metrics.append(output)
    print(new_model)
    return metrics

<a id="Winnow"></a>
## Winnow-2

In [51]:
#target_class is the column # for the class you are checking
#attributes should be a list with the column #s for the features in the data to check
#threshold is used for model prediction
#alpha is a parameter used to adjust the weights

# Winnow-2 only works on 2 classes at a time. If your dataset has more classes, you will need to create boolean features 
# for each class in your dataset.

# The model is a list of weights, one for each feature in the dataset
# Predictions are made by taking the weighted sum of the features x weights, and comparing the result to a threshold theta
# If the weighted sum is greater than theta, the prediction is 1, otherwise the prediction is 0

# Weights are initialized to be all 1s
# Alpha is a number above 1


# Process:
# 1. Receive data instance
# 2. Make prediction for that instance (0 means not in class, 1 means in class)
# 3. Check if the prediction was correct
# 3.a. If correct, do nothing
# 3.b. If not correct, update weights via promotion or demotion
# Promotion- happens when the prediction is 0 but the correct class is 1
#            For only features with a value of 1, multiply the weight for that feature times alpha
# Demotion- happens when the prediction is 1 but the correct class is 0
#            For only features with a value of 1, divide the weight for that feature by alpha


def winnow_2_train(data, target_class, attributes, threshold, alpha, model):
    num_records = len(data)
    num_attributes = len(attributes)
    new_model = model
    
    for record in range(num_records): #Loop through all records in data
        #use model to predict class (pandas indexing starts at 0)
        curr_record = data.iloc[[record]] #pull current record data
        sum_weights = 0
        for index in range(num_attributes): #Loop through each attribute and add the weights
            sum_weights = sum_weights + new_model[index] * curr_record.iloc[0,attributes[index]]
            
        prediction = 0
        if(sum_weights > threshold):
            prediction = 1
        #check if class is correct
        #if class is correct -no action
        #if class is not correct, determine if promotion (predict 0 when correct is 1) or demotion (predict 1 when correct is 0)
        if(prediction == 0):
            if(curr_record.iloc[0,target_class]==1):
                #promoting only adjusts weights for attributes that have a value of 1 - wi * alpha
                for index in range(num_attributes):
                    if(curr_record.iloc[0,attributes[index]] == 1):
                        model[index] = model[index]*alpha
                
        else:
            if(curr_record.iloc[0,target_class]==0):
                #demoting only adjusts weights for attributes that have a value of 1 - wi / alpha
                for index in range(num_attributes):
                    if(curr_record.iloc[0,attributes[index]] == 1):
                        model[index] = model[index]/alpha
       
    return new_model
                

In [52]:
def winnow_2_test(data, target_class, attributes,threshold, model):
    num_records = len(data)
    num_attributes = len(attributes)
    metric = 0
    
    for record in range(num_records): #Loop through all records in data
        #use model to predict class (pandas indexing starts at 0)
        curr_record = data.iloc[[record]] #pull current record data
        sum_weights = 0
        for index in range(num_attributes): #Loop through each attribute and add the weights
            sum_weights = sum_weights + model[index] * curr_record.iloc[0,attributes[index]]
            
        prediction = 0
        if(sum_weights > threshold):
            prediction = 1
        #check if class is correct
        #if class is correct add 1 to metric 
        #if class is not correct, do nothing

        if(prediction == 0):
            if(curr_record.iloc[0,target_class]==0):
                metric = metric +1
                
        else:
            if(curr_record.iloc[0,target_class]==1):
                metric = metric +1

    return metric/num_records
    
    
    

<a id="Naive_Bayes"></a>
## Naive Bayes

In [53]:
#target_class is the column # for the class you are checking
#attributes should be a list with the column #s for the features in the data to check

# Naive Bayes can work for multiple classes at a time, however for the purposes of comparing to Winnow-2 we will keep the
# boolean attributes for each class.
# The purpose of the algorithm is to find the class that maximizes the posterior probability of that class given the data.
# It does this using Bayes Rule- P(c|f1,f2,...,fd)= P(f1,f2,...,fd|c)*P(c)/P(f1,f2,...,fd)
# with the assumption that the features are conditionally independent of eachother given the classes, this becomes
# P(c|f1,f2,...,fd)= P(f1|c)*P(f2|c)*...*P(fd|c)*P(c)/P(f1,f2,...,fd)
# The Naive Bayes Classifier is the argmax of this, which is (the denominator does not matter because it does not depend on c):
# class=argmax_c(P(c)*Product_i=1-d(P(fi|d)))

# Some negatives for using Naive Bayes is the assumption that all of the features are conditionally independent of eachother
# given the classes


def naive_bayes_train(data, target_class, attributes, model):
    num_records = len(data)
    num_attributes = len(attributes)
    new_model = model
    count_1s = 1
    count_0s = 1
    
    #Model of form
    # P(0) / P(xi = 0)
    # ---- / P(xi = 1)
    # P(1) / P(xi = 0)
    # ---- / P(xi = 1)
    for record in range(num_records): #Loop through all records in data
        #add 1 to model for every datapoint 
        curr_record = data.iloc[[record]] #pull current record data
        #first add for when target_class = 1
        if(curr_record.iloc[0,target_class]==1):
            for index in range(num_attributes): #Loop through each attribute
                if(curr_record.iloc[0,attributes[index]]==1):
                    new_model[3][index+1] = new_model[3][index+1] +1
                else:
                    new_model[2][index+1] = new_model[2][index+1] +1
            new_model[2][0] = new_model[2][0] +1
            count_1s = count_1s +1
                
        #then add for when target_class = 0
        else:
            for index in range(num_attributes): #Loop through each attribute  
                if(curr_record.iloc[0,attributes[index]]==0):
                    new_model[0][index+1] = new_model[0][index+1] +1
                else:
                    new_model[1][index+1]= new_model[1][index+1] +1
            new_model[0][0] = new_model[0][0] +1
            count_0s = count_0s+1
    
    
    
    new_model[0][0] = new_model[0][0] / num_records
    new_model[2][0] = new_model[2][0] / num_records
    
    for index in range(num_attributes):
        new_model[0][index+1] = (new_model[0][index+1] + 0.01) / count_0s
        new_model[1][index+1] = (new_model[1][index+1] + 0.01) / count_0s
        new_model[2][index+1] = (new_model[2][index+1] + 0.01) / count_1s
        new_model[3][index+1] = (new_model[3][index+1] + 0.01) / count_1s
    
    return new_model

In [54]:
def naive_bayes_test(data, target_class, attributes, model):
    num_records = len(data)
    num_attributes = len(attributes)
    count_correct = 0
    metric = 0

    #Model of form
    # P(0) / P(xi = 0)
    # ---- / P(xi = 1)
    # P(1) / P(xi = 0)
    # ---- / P(xi = 1)
    for record in range(num_records): #Loop through all records in data
        curr_record = data.iloc[[record]] #pull current record data
        p_0 = model[0][0]
        p_1 = model[2][0]
        for index in range(num_attributes): #Loop through each attribute
            if(curr_record.iloc[0,attributes[index]]==1):
                p_0 = p_0 * model[2][index+1]
                p_1 = p_1 * model[3][index+1]
            else:
                p_0 = p_0 * model[0][index+1]
                p_1 = p_1 * model[1][index+1]
                
        #determine class and add to count_correct if correct
        if(p_0>p_1):
            if(curr_record.iloc[0,target_class]==0):
                count_correct = count_correct +1
        else:
            if(curr_record.iloc[0,target_class]==1):
                count_correct = count_correct +1

    metric = count_correct/num_records
    
    return metric

<a id="Models"></a>
## Models

Winnow-2

In [55]:
# Run Iris through
attributes_iris = [3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
winnow_2_iris = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
print("winnow-2 Iris Class 1 model:")
winnow_2_iris_results_class1 = winnow_2_n_fold_cross_val(iris, 3, 0, attributes_iris, 4, 2, winnow_2_iris)
print("winnow-2 Iris Class 2 model:")
winnow_2_iris_results_class2 = winnow_2_n_fold_cross_val(iris, 3, 1, attributes_iris, 4, 2, winnow_2_iris)
print("winnow-2 Iris Class 3 model:")
winnow_2_iris_results_class3 = winnow_2_n_fold_cross_val(iris, 3, 2, attributes_iris, 4, 2, winnow_2_iris)

winnow-2 Iris Class 1 model:
[1, 1.0, 1, 0.5, 1, 0.5, 1.0, 1, 2, 1, 0.5, 1, 0.5, 1, 1, 2, 0.25, 1]
winnow-2 Iris Class 2 model:
[1, 1.0, 0.5, 1.0, 1, 1.0, 0.5, 1, 1.0, 1, 2.0, 1.0, 1.0, 0.25, 1, 1.0, 1.0, 0.5]
winnow-2 Iris Class 3 model:
[1, 1.0, 2.0, 1.0, 1, 1.0, 2.0, 1, 1.0, 1, 1.0, 2.0, 2.0, 0.5, 1, 1.0, 0.5, 4.0]


In [56]:
# Run Glass through
attributes_glass = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43]
winnow_2_glass = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
print("winnow-2 Glass Class 1 model:")
winnow_2_glass_results_class1 = winnow_2_n_fold_cross_val(glass, 3, 44, attributes_glass, 4, 2, winnow_2_glass)
print("winnow-2 Glass Class 2 model:")
winnow_2_glass_results_class2 = winnow_2_n_fold_cross_val(glass, 3, 45, attributes_glass, 4, 2, winnow_2_glass)
print("winnow-2 Glass Class 3 model:")
winnow_2_glass_results_class3 = winnow_2_n_fold_cross_val(glass, 3, 46, attributes_glass, 4, 2, winnow_2_glass)
print("winnow-2 Glass Class 5 model:")
winnow_2_glass_results_class5 = winnow_2_n_fold_cross_val(glass, 3, 47, attributes_glass, 4, 2, winnow_2_glass)
print("winnow-2 Glass Class 6 model:")
winnow_2_glass_results_class6 = winnow_2_n_fold_cross_val(glass, 3, 48, attributes_glass, 4, 2, winnow_2_glass)
print("winnow-2 Glass Class 7 model:")
winnow_2_glass_results_class7 = winnow_2_n_fold_cross_val(glass, 3, 49, attributes_glass, 4, 2, winnow_2_glass)

winnow-2 Glass Class 1 model:
[0.5, 0.03125, 0.125, 0.5, 0.5, 0.125, 0.125, 0.5, 1, 1, 0.125, 1, 1, 0.125, 0.125, 1, 0.125, 0.0625, 0.5, 0.5, 0.125, 0.25, 0.25, 0.25, 1, 1, 0.0625, 0.125, 0.5, 1, 0.5, 1, 0.00390625, 0.5, 0.0078125, 1, 0.5, 0.5, 0.0625, 0.5, 0.25, 0.25, 1, 1]
winnow-2 Glass Class 2 model:
[0.125, 0.0625, 0.5, 0.5, 1.0, 0.125, 0.125, 1.0, 1, 0.5, 0.125, 1, 0.5, 0.25, 0.25, 0.5, 0.25, 0.125, 0.5, 0.5, 0.25, 0.5, 0.5, 0.125, 1, 0.5, 0.125, 0.25, 0.25, 1, 0.5, 0.5, 0.0078125, 1.0, 0.03125, 0.5, 0.5, 0.5, 0.0625, 0.5, 0.5, 0.5, 1, 0.5]
winnow-2 Glass Class 3 model:
[0.125, 0.125, 0.5, 0.5, 0.5, 0.25, 0.25, 1.0, 1, 0.5, 0.0625, 1, 0.25, 0.5, 1.0, 0.5, 0.5, 0.125, 0.5, 0.5, 0.25, 0.5, 0.5, 0.5, 1, 0.25, 0.25, 0.5, 0.25, 0.5, 0.5, 0.5, 0.015625, 1.0, 0.0625, 0.5, 0.5, 0.5, 0.125, 0.5, 1.0, 0.5, 1, 0.25]
winnow-2 Glass Class 5 model:
[0.0625, 0.25, 0.5, 0.5, 0.5, 1.0, 0.0625, 1.0, 1, 0.5, 0.125, 1, 0.25, 0.25, 1.0, 0.5, 0.125, 0.25, 0.5, 1.0, 0.5, 0.5, 0.25, 0.5, 1.0, 0.25, 0.25

In [57]:
# Run Breast Cancer Through
attributes_breast_cancer = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90]
winnow_2_breast_cancer = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
print("winnow-2 Breast Cancer Class 1 model:")
winnow_2_breast_cancer_results_class1 = winnow_2_n_fold_cross_val(breast_cancer, 3, 0, attributes_breast_cancer, 4, 2, winnow_2_breast_cancer)
print("winnow-2 Breast Cancer Class 2 model:")
winnow_2_breast_cancer_results_class2 = winnow_2_n_fold_cross_val(breast_cancer, 3, 1, attributes_breast_cancer, 4, 2, winnow_2_breast_cancer)

winnow-2 Breast Cancer Class 1 model:
[1, 1.0, 1.0, 0.5, 1.0, 0.5, 0.25, 0.0625, 0.25, 0.0009765625, 1.0, 0.5, 1.0, 0.25, 0.5, 0.25, 0.25, 0.0625, 1.0, 0.00390625, 2.0, 0.25, 1.0, 0.125, 0.25, 0.25, 0.25, 0.0625, 1, 0.015625, 0.125, 0.25, 0.5, 0.5, 0.25, 0.25, 0.5, 0.0625, 1, 0.0625, 1, 0.5, 0.03125, 0.125, 0.25, 0.03125, 0.25, 1.0, 1, 0.25, 1.0, 1.0, 1.0, 0.5, 1.0, 0.5, 0.25, 0.5, 0.25, 0.0001220703125, 1.0, 1.0, 0.5, 0.0625, 0.0625, 1, 0.015625, 0.25, 0.5, 0.25, 1.0, 2, 0.03125, 0.25, 0.5, 0.5, 0.5, 0.0625, 0.25, 0.03125, 0.03125, 0.125, 0.0625, 0.25, 0.5, 1, 0.5, 0.25, 0.25]
winnow-2 Breast Cancer Class 2 model:
[0.5, 0.25, 0.0625, 0.25, 0.25, 0.5, 0.5, 1.0, 2.0, 2.0, 0.0625, 0.5, 0.125, 0.25, 2.0, 2.0, 0.25, 0.5, 1.0, 1.0, 0.125, 0.0625, 0.5, 1.0, 0.25, 1.0, 0.25, 1.0, 2, 1.0, 0.125, 0.5, 0.0625, 1.0, 1.0, 0.5, 0.5, 0.5, 1, 1.0, 0.5, 0.03125, 0.25, 2.0, 0.25, 0.5, 0.125, 2.0, 1, 2.0, 0.0078125, 1.0, 0.5, 2.0, 0.25, 1.0, 0.5, 0.25, 1.0, 2.0, 0.25, 0.25, 0.25, 0.25, 2.0, 0.5, 1.0, 0.

Naive Bayes

In [58]:
# Run Iris through
naive_bayes_iris=[[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]
print("Naive Bayes Iris Class 1 model:")
naive_bayes_iris_results_class1 = naive_bayes_n_fold_cross_val(iris, 3, 0, attributes_iris, naive_bayes_iris)
print("Naive Bayes Iris Class 2 model:")
naive_bayes_iris_results_class2 = naive_bayes_n_fold_cross_val(iris, 3, 1, attributes_iris, naive_bayes_iris)
print("Naive Bayes Iris Class 3 model:")
naive_bayes_iris_results_class3 = naive_bayes_n_fold_cross_val(iris, 3, 2, attributes_iris, naive_bayes_iris)

Naive Bayes Iris Class 1 model:
[[0.9900990099009901, 0.9901980198019803, 0.9208910891089109, 0.36643564356435643, 0.7525742574257426, 0.9307920792079208, 0.811980198019802, 0.20801980198019804, 0.9604950495049506, 0.9901980198019803, 0.9901980198019803, 0.9604950495049506, 0.6535643564356436, 0.6436633663366337, 0.7525742574257426, 0.9505940594059407, 0.9901980198019803, 0.6337623762376238, 0.3565346534653465], [0, 9.900990099009902e-05, 0.0694059405940594, 0.6238613861386139, 0.23772277227722774, 0.0595049504950495, 0.17831683168316834, 0.7822772277227723, 0.0298019801980198, 9.900990099009902e-05, 9.900990099009902e-05, 0.0298019801980198, 0.3367326732673267, 0.3466336633663366, 0.23772277227722774, 0.039702970297029704, 9.900990099009902e-05, 0.3565346534653465, 0.6337623762376238], [0.009900990099009901, 0.505, 0.005, 0.505, 0.505, 0.505, 0.505, 0.005, 0.505, 0.005, 0.505, 0.505, 0.505, 0.505, 0.505, 0.505, 0.005, 0.505, 0.505], [0, 0.005, 0.505, 0.005, 0.005, 0.005, 0.005, 0.505,

In [59]:
# Run Glass through
naive_bayes_glass=[[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]
print("Naive Bayes Glass Class 1 model:")
naive_bayes_glass_results_class1 = naive_bayes_n_fold_cross_val(glass, 3, 44, attributes_glass, naive_bayes_glass)
print("Naive Bayes Glass Class 2 model:")
naive_bayes_glass_results_class2 = naive_bayes_n_fold_cross_val(glass, 3, 45, attributes_glass, naive_bayes_glass)
print("Naive Bayes Glass Class 3 model:")
naive_bayes_glass_results_class3 = naive_bayes_n_fold_cross_val(glass, 3, 46, attributes_glass, naive_bayes_glass)
print("Naive Bayes Glass Class 5 model:")
naive_bayes_glass_results_class5 = naive_bayes_n_fold_cross_val(glass, 3, 47, attributes_glass, naive_bayes_glass)
print("Naive Bayes Glass Class 6 model:")
naive_bayes_glass_results_class6 = naive_bayes_n_fold_cross_val(glass, 3, 48, attributes_glass, naive_bayes_glass)
print("Naive Bayes Glass Class 7 model:")
naive_bayes_glass_results_class7 = naive_bayes_n_fold_cross_val(glass, 3, 49, attributes_glass, naive_bayes_glass)

Naive Bayes Glass Class 1 model:
[[1.0, 0.9517931034482758, 0.08972413793103448, 0.9448965517241379, 0.9586896551724138, 0.9517931034482758, 0.4966206896551725, 0.6966206896551724, 0.8828275862068965, 0.9862758620689654, 0.9862758620689654, 0.6966206896551724, 0.9724827586206896, 0.8966206896551724, 0.7724827586206897, 0.634551724137931, 0.9862758620689654, 0.5517931034482759, 0.5173103448275862, 0.931103448275862, 0.9862758620689654, 0.9517931034482758, 0.9517931034482758, 0.7517931034482759, 0.40006896551724136, 0.938, 0.9724827586206896, 0.5517931034482759, 0.47593103448275864, 0.9793793103448275, 0.9862758620689654, 0.9793793103448275, 0.9862758620689654, 0.027655172413793103, 0.9724827586206896, 0.18627586206896551, 0.8966206896551724, 0.9173103448275861, 0.9793793103448275, 0.2966206896551724, 0.8759310344827587, 0.8759310344827587, 0.9517931034482758, 0.9793793103448275, 0.9862758620689654], [0, 0.041448275862068965, 0.9035172413793103, 0.048344827586206895, 0.034551724137931034

[[0.915869944380144, 0.9694138839079514, 0.08418346999558933, 0.9466317084710486, 0.9694765064792414, 0.9618990274630678, 0.5265747802590183, 0.6797987660407486, 0.8779043001581596, 0.9924405452931676, 0.9924405452931676, 0.7323504376481267, 0.9771706470983257, 0.9234798019432519, 0.7637386024672324, 0.6036422775236435, 0.9924405452931676, 0.5197684828868709, 0.5416508300655545, 0.9465056055660617, 1.000016302868925, 0.969414741670358, 0.9618990274630678, 0.7559243691956106, 0.3819070632179175, 0.9541439857131094, 0.9771689315735125, 0.5572115274630677, 0.4583929443303916, 0.9923804960090976, 0.9924405452931676, 0.9999562535848552, 0.9924405452931676, 0.030617012220688893, 0.9771715048607325, 0.1986665279457809, 0.8931750561154083, 0.9236582342706487, 0.9848055961957467, 0.3129411614623559, 0.8703954421345199, 0.8703988731841465, 0.9618415573818204, 0.9848647818018074, 1.000016302868925], [0, 0.030738826313642418, 0.9159692402260045, 0.05352100175054518, 0.030676203742352307, 0.0382536

In [60]:
# Run Breast Cancer Through
naive_bayes_breast_cancer=[[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]
print("Naive Bayes Breast Cancer Class 2 model:")
naive_bayes_breast_cancer_results_class1 = naive_bayes_n_fold_cross_val(breast_cancer, 3, 0, attributes_breast_cancer, naive_bayes_breast_cancer)
print("Naive Bayes Breast Cancer Class 3 model:")
naive_bayes_breast_cancer_results_class2 = naive_bayes_n_fold_cross_val(breast_cancer, 3, 1, attributes_breast_cancer, naive_bayes_breast_cancer)

Naive Bayes Breast Cancer Class 2 model:
[[0.29978118161925604, 0.9855797101449275, 0.9783333333333333, 0.9421014492753622, 0.9203623188405797, 0.8189130434782609, 0.9131159420289855, 0.9058695652173914, 0.8116666666666668, 0.9493478260869564, 0.7102173913043479, 0.971086956521739, 0.971086956521739, 0.9276086956521739, 0.8334057971014494, 0.9131159420289855, 0.8841304347826088, 0.9203623188405797, 0.855144927536232, 0.971086956521739, 0.6884782608695652, 0.9855797101449275, 0.9638405797101448, 0.8986231884057971, 0.855144927536232, 0.8986231884057971, 0.8768840579710145, 0.8913768115942029, 0.8768840579710145, 0.9638405797101448, 0.7247101449275363, 0.8841304347826088, 0.8913768115942029, 0.9131159420289855, 0.9203623188405797, 0.8841304347826088, 0.9203623188405797, 0.9348550724637681, 0.8841304347826088, 0.9783333333333333, 0.7247101449275363, 0.9928260869565216, 0.9058695652173914, 0.789927536231884, 0.7826811594202899, 0.855144927536232, 0.8334057971014494, 0.9565942028985507, 0.9

Results- strict percentage of total correct guesses divided by the total population count

In [61]:
print("Winnow Results:")
print("Iris:")
print(winnow_2_iris_results_class1)
print(winnow_2_iris_results_class2)
print(winnow_2_iris_results_class3)
print("Glass:")
print(winnow_2_glass_results_class1)
print(winnow_2_glass_results_class2)
print(winnow_2_glass_results_class3)
print(winnow_2_glass_results_class5)
print(winnow_2_glass_results_class6)
print(winnow_2_glass_results_class7)
print("Breast Cancer:")
print(winnow_2_breast_cancer_results_class1)
print(winnow_2_breast_cancer_results_class2)
print("Naive-Bayes Results:")
print("Iris:")
print(naive_bayes_iris_results_class1)
print(naive_bayes_iris_results_class2)
print(naive_bayes_iris_results_class3)
print("Glass:")
print(naive_bayes_glass_results_class1)
print(naive_bayes_glass_results_class2)
print(naive_bayes_glass_results_class3)
print(naive_bayes_glass_results_class5)
print(naive_bayes_glass_results_class6)
print(naive_bayes_glass_results_class7)
print("Breast Cancer:")
print(naive_bayes_breast_cancer_results_class1)
print(naive_bayes_breast_cancer_results_class2)

Winnow Results:
Iris:
[0.9795918367346939]
[0.673469387755102]
[0.5510204081632653]
Glass:
[0.6571428571428571]
[0.6285714285714286]
[0.8857142857142857]
[0.9142857142857143]
[0.9571428571428572]
[0.14285714285714285]
Breast Cancer:
[0.9778761061946902]
[0.9646017699115044]
Naive-Bayes Results:
Iris:
[0.673469387755102]
[0.6326530612244898]
[0.7755102040816326]
Glass:
[0.7]
[0.6]
[0.9714285714285714]
[0.9285714285714286]
[0.9857142857142858]
[0.8571428571428571]
Breast Cancer:
[0.37610619469026546]
[0.6946902654867256]
