In [1]:
import pandas as pd
import random
import numpy as np
import math
import copy 

In [2]:
# Method to convert features to binomial
# data is the dataset to be used for the algorithm
# features is a list of the indicies for the features
# return data_binomial, which is the dataset with the features converted to binomial
def binomial_features(data, features):
    data_binomial = copy.deepcopy(data)
    for index in features:
        column_name = data_binomial.columns[index]
        mean = data_binomial[column_name].mean()
        data_ltm = data_binomial[column_name] < mean
        data_binomial.loc[data_ltm, column_name] = 0
        data_gtm = data_binomial[column_name] >= mean
        data_binomial.loc[data_gtm, column_name] = 1
    return data_binomial

[Datasets](#Datasets):

[Iris](#Iris)- 3 classes    
[Glass](#Glass)- 7 classes  
[Breast Cancer](#Breast_Cancer)- 2 classes  
[Votes](#Votes)- 2 classes  
[Soybean](#Soybean)- 4 classes  

[Cross Validation](#Cross_Val)  
[Logistic Regression](#Log_Regress)  
[Naive Bayes](#Naive_Bayes)  
[Models](#Models)  

<a id="Datasets"></a>
# Datasets

<a id="Iris"></a>
## Iris

Dummies for different Iris Classes:  
iris_class_Iris-setosa       
iris_class_Iris-versicolor    
iris_class_Iris-virginica  
Target Class Indices: [5,6,7]  
Feature Indices: [0,1,2,3]

In [3]:
iris = pd.read_csv( "iris.data", header=None, names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width','iris_class'])
iris_log = pd.read_csv( "iris.data", header=None, names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width','iris_class'])
iris = pd.concat([iris, pd.get_dummies(iris["iris_class"],prefix = 'iris_class')], axis=1)
iris_log = pd.concat([iris_log, pd.get_dummies(iris["iris_class"],prefix = 'iris_class')], axis=1)
iris_features = [0,1,2,3]
iris_binomial = binomial_features(iris, iris_features)

<a id="Glass"></a>
## Glass

Class Indices: [11,12,13,14,15,16]  
Feature Indices: [1,2,3,4,5,6,7,8,9]

In [4]:
glass = pd.read_csv( "glass.data", header=None, names = ['ID','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','glass_class'])
glass_log = pd.read_csv( "glass.data", header=None, names = ['ID','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','glass_class'])
glass = pd.concat([glass, pd.get_dummies(glass["glass_class"],prefix = 'glass_class')], axis=1)
glass_log = pd.concat([glass_log, pd.get_dummies(glass["glass_class"],prefix = 'glass_class')], axis=1)
glass_features = [1,2,3,4,5,6,7,8,9]
glass_binomial = binomial_features(glass, glass_features)

<a id="Breast_Cancer"></a>
## Breast Cancer

Remove 16 records with missing data.  
Class 2 changed to 1  
Class 4 changed to 0  
Class Index: 10  
Features: [1,2,3,4,5,6,7,8,9]

In [5]:
breast_cancer = pd.read_csv( "breast-cancer-wisconsin.data", header=None, names = ['ID','clump_thick','unif_cell_size','unif_cell_shape','marg_adh','single_epit_cell_size','bare_nuclei','bland_chromatin','normal_nucleoli','mitosis','cancer_class'])
breast_cancer_log = pd.read_csv( "breast-cancer-wisconsin.data", header=None, names = ['ID','clump_thick','unif_cell_size','unif_cell_shape','marg_adh','single_epit_cell_size','bare_nuclei','bland_chromatin','normal_nucleoli','mitosis','cancer_class'])
breast_cancer['cancer_class'] = breast_cancer['cancer_class'].map({2: 1, 4: 0})
breast_cancer_log['cancer_class'] = breast_cancer_log['cancer_class'].map({2: 1, 4: 0})
breast_cancer=breast_cancer[breast_cancer.bare_nuclei != '?']
breast_cancer.bare_nuclei = breast_cancer.bare_nuclei.astype(int)
breast_cancer_log=breast_cancer_log[breast_cancer_log.bare_nuclei != '?']
breast_cancer_log.bare_nuclei = breast_cancer_log.bare_nuclei.astype(int)
breast_cancer_features = [1,2,3,4,5,6,7,8,9]
breast_cancer_target_class = 10
breast_cancer_binomial = binomial_features(breast_cancer, breast_cancer_features)

<a id="Votes"></a>
## Votes

Changed all ? to 'n', 'n' to 0, 'y' to 1.  
Democrats = 1  
Republicans = 0  
Class Index: 0  
Feature Indices: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]

In [6]:
votes = pd.read_csv( "house-votes-84.data", header=None, names = ['votes_class','handicapped_infants','water_project','adoption_budget_resol','physician_fee_freeze','el_salv_aid','rel_groups_in_schools','anti_satellite_test_ban','aid_to_nica_contras','mx_missile','immigration','synfuels','education_spending','superfund_right_to_sue','crime','duty_free_exports','export_admin_act_south_africa'])
votes = votes.replace('y', 1)
votes = votes.replace('n', 0)
votes = votes.replace('?', 0)
votes = votes.replace('democrat', 1)
votes = votes.replace('republican', 0)
votes_features = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]

<a id="Soybean"></a>
## Soybean

Class Indices: [35,36,37,38]  
Feature Indices: [0,1,2,3,4,5,6,7,8,10,18,19,20,21,22,23,24,25,26,33]  

Not using features due to no info:  
seed  
mold_growth  
seed_discolor  
seed_size  
shriveling  
leaves  
leaf_shread  
leaf_malf  
leaf_mild  
germination  
leafspots_marg  
leafspot_size  
stem  
fruit_spots

In [7]:
soybean = pd.read_csv( "soybean-small.data", header=None, names = ['date','plant_stand','precip','temp','hail','crop_hist','area_damaged','severity','seed_tmt','germination','plant_growth','leaves','leafspots_marg','leafspot_size','leaf_shread','leaf_malf','leaf_mild','stem','lodging','stem_cankers','canker_lesion','fruiting_bodies','external_decay','mycelium','int_discolor','sclerotia','fruit_pods','fruit_spots','seed','mold_growth','seed_discolor','seed_size','shriveling','roots','soybean_class'])
soybean = pd.concat([soybean, pd.get_dummies(soybean["soybean_class"],prefix = 'soybean_class')], axis=1)
soybean_features = [1,2,3,4,5,6,7,8,10,18,19,20,21,22,23,24,25,26,33] 
soybean_binomial = binomial_features(soybean, soybean_features)

In [8]:
soybean_log = pd.read_csv( "soybean-small.data", header=None, names = ['date','plant_stand','precip','temp','hail','crop_hist','area_damaged','severity','seed_tmt','germination','plant_growth','leaves','leafspots_marg','leafspot_size','leaf_shread','leaf_malf','leaf_mild','stem','lodging','stem_cankers','canker_lesion','fruiting_bodies','external_decay','mycelium','int_discolor','sclerotia','fruit_pods','fruit_spots','seed','mold_growth','seed_discolor','seed_size','shriveling','roots','soybean_class'])
soybean_log = pd.concat([soybean_log, pd.get_dummies(soybean_log["soybean_class"],prefix = 'soybean_class')], axis=1)
soybean_log = soybean_log.append(soybean_log)

<a id="Cross_Val"></a>
## 5-Fold Cross Validation

In [9]:
# 5-Fold Cross Validation
# data is the dataset to be used for the algorithm
# target_class is the index of the target class feature
# features is a list of the indicies for the features to be used to measure distance
# Algorithm is which algorithm is chosen to be used
def five_fold_cross_val(data, target_class, features, Algorithm):
    metrics= []
    list_of_indices = list(range(len(data)))
    n_data = len(data) #this is the number of records in the data
    test_size = int(n_data/5) #this is the number of records in each test set
    df = copy.deepcopy(data)
    df = df.sample(frac=1).reset_index(drop=True) # a dataframe with shuffled records
    df.sort_values(by= df.columns[target_class]) # sorts the data by the target class
    
    split1 = []
    split2 = []
    split3 = []
    split4 = []
    split5 = []
    for count in range(test_size):
        split1.append(5*count)
        split2.append(5*count +1)
        split3.append(5*count +2)
        split4.append(5*count +3)
        split5.append(5*count +4)
    
    # test split1
    split1_train = []
    for index in range(n_data):
        if(index not in split1):
            split1_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes1, model1 = Algorithm(df, split1, split1_train, target_class, features)
    output1 = performance(df, split1, test_classes1, target_class) 
    metrics.append(output1)
    
    # test split2
    split2_train = []
    for index in range(n_data):
        if(index not in split2):
            split2_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes2, model2 = Algorithm(df, split2, split2_train, target_class, features)
    output2 = performance(df, split2, test_classes2, target_class)     
    metrics.append(output2)
    
    # test split3
    split3_train = []
    for index in range(n_data):
        if(index not in split3):
            split3_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes3, model3 = Algorithm(df, split3, split3_train, target_class, features)
    output3 = performance(df, split3, test_classes3, target_class)    
    metrics.append(output3)
    
    # test split4
    split4_train = []
    for index in range(n_data):
        if(index not in split4):
            split4_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes4, model4 = Algorithm(df, split4, split4_train, target_class, features)
    output4 = performance(df, split4, test_classes4, target_class)    
    metrics.append(output4)
    
    # test split5
    split5_train = []
    for index in range(n_data):
        if(index not in split5):
            split5_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes5, model5 = Algorithm(df, split5, split5_train, target_class, features)
    output5 = performance(df, split5, test_classes5, target_class)  
    metrics.append(output5)  
        
    return metrics, model1, test_classes1

In [10]:
# Performance
# data is the dataset to be used for the algorithm
# test is a list of indices from data to be used for testing
# test_classes is a list with the class for the test dataset that includes the estimate for the target_class
# features is a list of the indicies for the features to be used to measure distance
# target_class is the index of the target class feature
def performance(data, test, test_classes, target_class):
    n_data = len(test) # this is the number of records to be tested
    curr_perf = 0

    # classification error
    for record in range(n_data): # record is index in test 
        test_record = data.iloc[[test[record]]] # pulls record to test
        if(test_record.iloc[0,target_class] != test_classes[record]):
            curr_perf = curr_perf + 1
        # end if
    # end for
    curr_perf = curr_perf / n_data

    return curr_perf

<a id="Log_Regress"></a>
## Logistic Regression

In [11]:
# Logistic Regression
# data is the dataset to be used for the algorithm
# test is a list of indices from data to be used for testing
# train is a list of indices from data to be used for training
# target_class is the index of the target class feature
# features is a list of the indicies for the features to be used to measure distance
# returns target_classes, which is a list that contains the target class calculated for each test index
# returns betas, which is a list of the betas trained for the logistic regression model
def logistic_regression_train(data, test, train, target_class, features):
    # get the normalized featureset for the train data
    train_data = copy.deepcopy(data)
    train_data = train_data.iloc[train]
    train_data = train_data.iloc[:,features]
    train_data = train_data.values
    feature_data = normalize(train_data)
    # add a column of 1s in index 0
    feature_data = np.hstack((np.matrix(np.ones(feature_data.shape[0])).T, feature_data)) 
    y = copy.deepcopy(data.iloc[train])
    y = y.iloc[:,target_class]
    y = y.values
    # initial beta values 
    betas = np.matrix(np.zeros(feature_data.shape[1]))
    # use gradient descent to get final beta values
    betas = gradient_descent(feature_data, y, betas) 
    # get the normalized featureset for the test data
    test_data = copy.deepcopy(data)
    test_data = test_data.iloc[test]
    test_data = test_data.iloc[:, features]
    test_data = test_data.values
    test_data = normalize(test_data)
    # add a column of 1s in index 0
    test_data = np.hstack((np.matrix(np.ones(test_data.shape[0])).T, test_data)) 
    # calculate the target values for our test data
    feature_probabilities = 1.0/(1 + np.exp(-np.dot(test_data, betas.T))) 
    sum_probabilities = feature_probabilities.sum(axis=1)
    target_classes = np.where(sum_probabilities >= .5, 1, 0) 
    target_classes = np.squeeze(target_classes) 
    
    return target_classes, betas

In [12]:
# method to normalize features in the data
# used for continous features
# feature_data is the data frame with just the features needing normalized
# returns normed_features which is the data frame with normalized features
def normalize(feature_data): 
    feature_mins = np.min(feature_data, axis = 0) 
    feature_maxes = np.max(feature_data, axis = 0) 
    feature_range = feature_maxes - feature_mins 
    normed_features = 1 - ((feature_maxes - feature_data)/feature_range) 
    return normed_features 

In [13]:
# method to calculate the cost of a particular beta set
# betas is the betas being calculated for
# feature_data is the data frame with just the features
# y is the listing of target classes for the records from feature_data
def cost(betas, feature_data, y): 
    sigmoid_result = 1.0/(1 + np.exp(-np.dot(feature_data, betas.T)))  
    y = np.squeeze(y) 
    # uses log likelihood
    first_step = y * np.log(sigmoid_result) 
    second_step = (1 - y) * np.log(1 - sigmoid_result) 
    final_cost =  -first_step - second_step 
    return np.mean(final_cost) 

In [14]:
# method to use gradient descent to train the betas
# feature_data is the data frame with just the features
# y is the listing of target classes for the records from feature_data
# betas is the list of initial betas passed
# returns new betas learned by method
def gradient_descent(feature_data, y, betas): 
    converge_delta = 0.001
    learning_rate = 0.01
    new_cost = cost(betas, feature_data, y) 
    change_cost = 1
    counter = 1
      
    while(change_cost > converge_delta): 
        old_cost = new_cost 
        betas = betas - (learning_rate * log_gradient(betas, feature_data, y)) 
        new_cost = cost(betas, feature_data, y) 
        change_cost = old_cost - new_cost 
        counter += 1
      
    return betas 

In [15]:
# method to calculate the log gradient
# betas is the list of betas being calcualted for
# feature_data is the data frame with just the features
# y is the listing of target classes for the records from feature_data
def log_gradient(betas, feature_data, y): 
    initial_calculation = (1.0/(1 + np.exp(-np.dot(feature_data, betas.T)))) - y.reshape(feature_data.shape[0], -1)
    final_calculation = np.dot(initial_calculation.T, feature_data) 
    return final_calculation 

<a id="Naive_Bayes"></a>
## Naive Bayes

In [16]:
# Naive Bayes 
# data is the dataset to be used for the algorithm
# test is a list of indices from data to be used for testing
# train is a list of indices from data to be used for training
# target_class is the index of the target class feature
# attributes is a list of the indicies for the features to be used 
# returns target_classes, which is a list that contains the target class calculated for each test index
# returns new_model, which is a list of the probabilities trained for the naive bayes model
def naive_bayes_train(data, test, train, target_class, attributes):
    num_records = len(train)
    num_attributes = len(attributes)
    model_part = [0] * (num_attributes+1)
    new_model = [copy.deepcopy(model_part),copy.deepcopy(model_part),copy.deepcopy(model_part),copy.deepcopy(model_part)]
    count_1s = 1
    count_0s = 1
    
    #Model of form
    # P(0) / P(xi = 0)
    # ---- / P(xi = 1)
    # P(1) / P(xi = 0)
    # ---- / P(xi = 1)
    for index_test in range(num_records): #Loop through all records in data
        #add 1 to model for every datapoint 
        curr_record = data.iloc[[train[index_test]]] #pull current record data
        #first add for when target_class = 1
        if(curr_record.iloc[0,target_class]==1):
            for index in range(num_attributes): #Loop through each attribute
                if(curr_record.iloc[0,attributes[index]]==1):
                    new_model[3][index+1] = new_model[3][index+1] +1
                else:
                    new_model[2][index+1] = new_model[2][index+1] +1
            new_model[2][0] = new_model[2][0] +1
            count_1s = count_1s +1
                
        #then add for when target_class = 0
        else:
            for index in range(num_attributes): #Loop through each attribute 
                if(curr_record.iloc[0,attributes[index]]==0):
                    new_model[0][index+1] = new_model[0][index+1] +1
                else:
                    new_model[1][index+1]= new_model[1][index+1] +1
            new_model[0][0] = new_model[0][0] +1
            count_0s = count_0s+1

    
    new_model[0][0] = new_model[0][0] / num_records
    new_model[2][0] = new_model[2][0] / num_records
    for index in range(num_attributes):
        new_model[0][index+1] = (new_model[0][index+1] + 0.01) / count_0s
        new_model[1][index+1] = (new_model[1][index+1] + 0.01) / count_0s
        new_model[2][index+1] = (new_model[2][index+1] + 0.01) / count_1s
        new_model[3][index+1] = (new_model[3][index+1] + 0.01) / count_1s
   
    test_classes = naive_bayes_test(data, test, target_class, attributes, new_model)
    return test_classes, new_model

In [17]:
# method to predict target class based off naive bayes model
# data is the dataset to be used for the algorithm
# test is a list of indices from data to be used for testing
# target_class is the index of the target class feature
# attributes is a list of the indicies for the features to be used 
# model is the naive bayes model
# returns test_classes, which is a list that contains the target class calculated for each test index
def naive_bayes_test(data, test, target_class, attributes, model):
    num_records = len(test)
    num_attributes = len(attributes)
    test_classes = []

    #Model of form
    # P(0) / P(xi = 0)
    # ---- / P(xi = 1)
    # P(1) / P(xi = 0)
    # ---- / P(xi = 1)
    for index_test in range(num_records): #Loop through all records in data
        curr_record = data.iloc[[test[index_test]]] #pull current record data
        p_0 = model[0][0]
        p_1 = model[2][0]
        for index in range(num_attributes): #Loop through each attribute
            if(curr_record.iloc[0,attributes[index]]==1):
                p_0 = p_0 * model[2][index+1]
                p_1 = p_1 * model[3][index+1]
            else:
                p_0 = p_0 * model[0][index+1]
                p_1 = p_1 * model[1][index+1]
                
        #determine class and add to count_correct if correct
        if(p_0>p_1):
            test_classes.append(0)
        else:
            test_classes.append(1)
    
    return test_classes

<a id="Processing"></a>
## Processing

In [18]:
breast_cancer_metrics, breast_cancer_model, bc_nb_output = five_fold_cross_val(breast_cancer_binomial, breast_cancer_target_class, breast_cancer_features, naive_bayes_train)

In [19]:
print(bc_nb_output)

[1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0]


In [20]:
print(breast_cancer_metrics)

[0.014705882352941176, 0.051470588235294115, 0.007352941176470588, 0.03676470588235294, 0.03676470588235294]


In [21]:
print(breast_cancer_model)

[[0.3656307129798903, 0.1294029850746269, 0.15427860696517415, 0.13935323383084577, 0.21398009950248756, 0.2587562189054726, 0.16422885572139304, 0.1891044776119403, 0.17915422885572138, 0.5423383084577115], [0, 0.8657213930348259, 0.8408457711442786, 0.8557711442786069, 0.7811442786069651, 0.73636815920398, 0.8308955223880596, 0.8060199004975124, 0.8159701492537313, 0.45278606965174134], [0.6343692870201096, 0.7356609195402298, 0.9712931034482758, 0.9483045977011494, 0.8937068965517241, 0.9569252873563218, 0.9483045977011494, 0.9483045977011494, 0.939683908045977, 0.9770402298850575], [0, 0.2615229885057471, 0.02589080459770115, 0.04887931034482759, 0.10347701149425287, 0.040258620689655175, 0.04887931034482759, 0.04887931034482759, 0.0575, 0.02014367816091954]]


In [22]:
iris_metrics, iris_model, iris_nb_output = five_fold_cross_val(iris_binomial, 5, iris_features, naive_bayes_train)

In [23]:
print(iris_nb_output)

[0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1]


In [24]:
print(iris_metrics)

[0.0, 0.06666666666666667, 0.03333333333333333, 0.06666666666666667, 0.03333333333333333]


In [25]:
print(iris_model)

[[0.675, 0.3171951219512195, 0.7196341463414634, 0.08548780487804877, 0.12207317073170731], [0, 0.6708536585365853, 0.2684146341463415, 0.9025609756097561, 0.8659756097560977], [0.325, 0.97525, 0.17525, 0.97525, 0.97525], [0, 0.00025, 0.8002499999999999, 0.00025, 0.00025]]


In [26]:
glass_metrics, glass_model, glass_nb_output = five_fold_cross_val(glass_binomial, 15, glass_features, naive_bayes_train)

In [27]:
print(glass_nb_output)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [28]:
print(glass_metrics)

[0.023809523809523808, 0.047619047619047616, 0.0, 0.023809523809523808, 0.023809523809523808]


In [29]:
print(glass_model)

[[0.9476744186046512, 0.6525000000000001, 0.5671341463414634, 0.2622560975609756, 0.5854268292682927, 0.39640243902439026, 0.36591463414634146, 0.7256707317073171, 0.8415243902439024, 0.6890853658536585], [0, 0.3415243902439024, 0.42689024390243907, 0.7317682926829269, 0.4085975609756098, 0.5976219512195122, 0.628109756097561, 0.2683536585365853, 0.1525, 0.3049390243902439], [0.05232558139534884, 0.301, 0.001, 0.901, 0.301, 0.301, 0.901, 0.20099999999999998, 0.901, 0.901], [0, 0.601, 0.901, 0.001, 0.601, 0.601, 0.001, 0.701, 0.001, 0.001]]


In [30]:
votes_metrics, votes_model, votes_nb_output = five_fold_cross_val(votes, 0, votes_features, naive_bayes_train)

In [31]:
print(votes_nb_output)

[0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


In [32]:
print(votes_metrics)

[0.13793103448275862, 0.06896551724137931, 0.06896551724137931, 0.06896551724137931, 0.13793103448275862]


In [33]:
print(votes_model)

[[0.3879310344827586, 0.81625, 0.56625, 0.8603676470588235, 0.029485294117647057, 0.06625, 0.11036764705882353, 0.7647794117647059, 0.8456617647058824, 0.8750735294117648, 0.4265441176470588, 0.8603676470588235, 0.17654411764705882, 0.1986029411764706, 0.051544117647058824, 0.9044852941176471, 0.41919117647058823], [0, 0.17654411764705882, 0.4265441176470588, 0.1324264705882353, 0.9633088235294117, 0.9265441176470589, 0.8824264705882353, 0.22801470588235295, 0.1471323529411765, 0.11772058823529413, 0.56625, 0.1324264705882353, 0.81625, 0.7941911764705882, 0.9412499999999999, 0.08830882352941176, 0.5736029411764706], [0.6120689655172413, 0.3878971962616823, 0.5514485981308411, 0.11686915887850469, 0.9533177570093457, 0.817803738317757, 0.5748130841121496, 0.215, 0.14957943925233644, 0.2850934579439252, 0.5187383177570094, 0.5327570093457944, 0.8598598130841121, 0.7243457943925233, 0.6869626168224299, 0.3832242990654206, 0.3645327102803739], [0, 0.6075233644859813, 0.4439719626168225, 0.

In [34]:
soybean_metrics, soybean_model, soybean_nb_output = five_fold_cross_val(soybean_binomial, 36, soybean_features, naive_bayes_train)

In [35]:
print(soybean_nb_output)

[0, 0, 0, 0, 0, 1, 0, 0, 0]


In [36]:
print(soybean_metrics)

[0.0, 0.0, 0.0, 0.0, 0.0]


In [37]:
print(soybean_model)

[[0.7631578947368421, 0.06699999999999999, 0.46699999999999997, 0.7336666666666667, 0.4003333333333333, 0.9670000000000001, 0.46699999999999997, 0.5670000000000001, 0.5003333333333333, 0.267, 0.8003333333333333, 0.5336666666666667, 0.5670000000000001, 0.7003333333333334, 0.267, 0.8003333333333333, 0.9670000000000001, 0.9670000000000001, 0.267, 0.5336666666666667], [0, 0.9003333333333334, 0.5003333333333333, 0.23366666666666666, 0.5670000000000001, 0.0003333333333333333, 0.5003333333333333, 0.4003333333333333, 0.46699999999999997, 0.7003333333333334, 0.16699999999999998, 0.43366666666666664, 0.4003333333333333, 0.267, 0.7003333333333334, 0.16699999999999998, 0.0003333333333333333, 0.0003333333333333333, 0.7003333333333334, 0.43366666666666664], [0.23684210526315788, 0.901, 0.001, 0.40099999999999997, 0.40099999999999997, 0.001, 0.901, 0.40099999999999997, 0.601, 0.001, 0.701, 0.901, 0.001, 0.901, 0.901, 0.901, 0.001, 0.001, 0.901, 0.901], [0, 0.001, 0.901, 0.501, 0.501, 0.901, 0.001, 0.

Logistic Regression

In [38]:
breast_cancer_log_metrics, breast_cancer_log_model, breast_cancer_output = five_fold_cross_val(breast_cancer_log, breast_cancer_target_class, breast_cancer_features, logistic_regression_train)

In [39]:
print(breast_cancer_output)

[1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 1 1 1 1 0 1 0 1 1
 0 1 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 0 0
 1 1 0 1 1 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 0 1 0 0 1 0 1 0 1 1 1 1 0 0
 1 0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 0 1 0]


In [40]:
print(breast_cancer_log_metrics)

[0.03676470588235294, 0.03676470588235294, 0.04411764705882353, 0.022058823529411766, 0.022058823529411766]


In [41]:
print(breast_cancer_log_model)

[[ 7.14383053 -5.56185825 -1.29464269 -1.92306616 -2.41674833 -0.5385677
  -3.41618193 -3.14474547 -1.73555005 -2.42626869]]


In [42]:
iris_log_metrics, iris_log_model, iris_output = five_fold_cross_val(iris_log, 5, iris_features, logistic_regression_train)

In [43]:
print(iris_output)

[0 0 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 1]


In [44]:
print(iris_log_metrics)

[0.0, 0.0, 0.0, 0.0, 0.0]


In [45]:
print(iris_log_model)

[[ 2.88054314 -3.24987857  5.79453883 -7.87053119 -7.57496416]]


In [46]:
glass_log_metrics, glass_log_model, glass_output = five_fold_cross_val(glass_log, 15, glass_features, logistic_regression_train)

In [47]:
print(glass_output)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]


In [48]:
print(glass_log_metrics)

[0.047619047619047616, 0.07142857142857142, 0.07142857142857142, 0.16666666666666666, 0.023809523809523808]


In [49]:
print(glass_log_model)

[[ -3.56568035  -3.59112973   9.92041539  -2.69954827  -1.84912899
    0.11651702  -7.26233494  -0.41371806 -11.77698603  -5.80958883]]


In [50]:
votes_log_metrics, votes_log_model, votes_output = five_fold_cross_val(votes, 0, votes_features, logistic_regression_train)

In [51]:
print(votes_output)

[1 0 1 0 1 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 0 1 1 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 1 0 1 0 1
 1 0 1 1 1 1 1 1 1 1 1 1 1]


In [52]:
print(votes_log_metrics)

[0.011494252873563218, 0.034482758620689655, 0.06896551724137931, 0.034482758620689655, 0.034482758620689655]


In [53]:
print(votes_log_model)

[[-0.21249096  0.37492762  1.63462776  3.61962031 -7.32366436 -0.3287923
   1.1798216  -0.71941434 -0.53903372  2.21084609 -1.75585308  3.35425818
  -0.99305289  0.4689131   0.88643717  1.95781593 -0.56544845]]


In [54]:
soybean_log_metrics, soybean_log_model, soybean_output = five_fold_cross_val(soybean_log, 36, soybean_features, logistic_regression_train)

  if __name__ == '__main__':


In [55]:
print(soybean_output)

[0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1]


In [56]:
print(soybean_log_metrics)

[0.0, 0.0, 0.0, 0.0, 0.2222222222222222]


In [57]:
print(soybean_log_model)

[[-0.48061779 -2.02620935  0.65700547  0.43620733 -0.2719909   0.78127257
  -0.93130679 -0.13679937 -0.42946579 -0.0579817   0.05504234 -1.32773283
   0.71477194 -0.6399324  -1.42650624 -0.23620877  1.786354    1.786354
  -1.62703939 -1.17962496]]
