In [1]:
import pandas as pd
import random
import numpy as np
import math
import copy 

In [2]:
# Method to convert features to binomial
# data is the dataset to be used for the algorithm
# features is a list of the indicies for the features
# return data_binomial, which is the dataset with the features converted to binomial
def binomial_features(data, features):
    data_binomial = copy.deepcopy(data)
    for index in features:
        column_name = data_binomial.columns[index]
        mean = data_binomial[column_name].mean()
        data_ltm = data_binomial[column_name] < mean
        data_binomial.loc[data_ltm, column_name] = 0
        data_gtm = data_binomial[column_name] >= mean
        data_binomial.loc[data_gtm, column_name] = 1
    return data_binomial

## This code performs Logistic Regression on datasets from the UCI Machine Learning Repository https://archive.ics.uci.edu/ml/index.php


[Datasets](#Datasets):

[Iris](#Iris)- 3 classes    
[Glass](#Glass)- 7 classes  
[Breast Cancer](#Breast_Cancer)- 2 classes  
[Congressional Voting Records](#Votes)- 2 classes  

[Cross Validation](#Cross_Val)  
[Logistic Regression](#Log_Regress)  
[Models](#Models)  

<a id="Datasets"></a>
# Datasets

<a id="Iris"></a>
## Iris

Dummies for different Iris Classes:  
iris_class_Iris-setosa       
iris_class_Iris-versicolor    
iris_class_Iris-virginica  
Target Class Indices: [5,6,7]  
Feature Indices: [0,1,2,3]

In [3]:
iris = pd.read_csv( "iris.data", header=None, names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width','iris_class'])
iris_log = pd.read_csv( "iris.data", header=None, names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width','iris_class'])
iris = pd.concat([iris, pd.get_dummies(iris["iris_class"],prefix = 'iris_class')], axis=1)
iris_log = pd.concat([iris_log, pd.get_dummies(iris["iris_class"],prefix = 'iris_class')], axis=1)
iris_features = [0,1,2,3]
iris_binomial = binomial_features(iris, iris_features)

<a id="Glass"></a>
## Glass

Class Indices: [11,12,13,14,15,16]  
Feature Indices: [1,2,3,4,5,6,7,8,9]

In [4]:
glass = pd.read_csv( "glass.data", header=None, names = ['ID','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','glass_class'])
glass_log = pd.read_csv( "glass.data", header=None, names = ['ID','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','glass_class'])
glass = pd.concat([glass, pd.get_dummies(glass["glass_class"],prefix = 'glass_class')], axis=1)
glass_log = pd.concat([glass_log, pd.get_dummies(glass["glass_class"],prefix = 'glass_class')], axis=1)
glass_features = [1,2,3,4,5,6,7,8,9]
glass_binomial = binomial_features(glass, glass_features)

<a id="Breast_Cancer"></a>
## Breast Cancer

Remove 16 records with missing data.  
Class 2 changed to 1  
Class 4 changed to 0  
Class Index: 10  
Features: [1,2,3,4,5,6,7,8,9]

In [5]:
breast_cancer = pd.read_csv( "breast-cancer-wisconsin.data", header=None, names = ['ID','clump_thick','unif_cell_size','unif_cell_shape','marg_adh','single_epit_cell_size','bare_nuclei','bland_chromatin','normal_nucleoli','mitosis','cancer_class'])
breast_cancer_log = pd.read_csv( "breast-cancer-wisconsin.data", header=None, names = ['ID','clump_thick','unif_cell_size','unif_cell_shape','marg_adh','single_epit_cell_size','bare_nuclei','bland_chromatin','normal_nucleoli','mitosis','cancer_class'])
breast_cancer['cancer_class'] = breast_cancer['cancer_class'].map({2: 1, 4: 0})
breast_cancer_log['cancer_class'] = breast_cancer_log['cancer_class'].map({2: 1, 4: 0})
breast_cancer=breast_cancer[breast_cancer.bare_nuclei != '?']
breast_cancer.bare_nuclei = breast_cancer.bare_nuclei.astype(int)
breast_cancer_log=breast_cancer_log[breast_cancer_log.bare_nuclei != '?']
breast_cancer_log.bare_nuclei = breast_cancer_log.bare_nuclei.astype(int)
breast_cancer_features = [1,2,3,4,5,6,7,8,9]
breast_cancer_target_class = 10
breast_cancer_binomial = binomial_features(breast_cancer, breast_cancer_features)

<a id="Votes"></a>
## Votes

Changed all ? to 'n', 'n' to 0, 'y' to 1.  
Democrats = 1  
Republicans = 0  
Class Index: 0  
Feature Indices: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]

In [6]:
votes = pd.read_csv( "house-votes-84.data", header=None, names = ['votes_class','handicapped_infants','water_project','adoption_budget_resol','physician_fee_freeze','el_salv_aid','rel_groups_in_schools','anti_satellite_test_ban','aid_to_nica_contras','mx_missile','immigration','synfuels','education_spending','superfund_right_to_sue','crime','duty_free_exports','export_admin_act_south_africa'])
votes = votes.replace('y', 1)
votes = votes.replace('n', 0)
votes = votes.replace('?', 0)
votes = votes.replace('democrat', 1)
votes = votes.replace('republican', 0)
votes_features = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]

<a id="Cross_Val"></a>
## 5-Fold Cross Validation

In [7]:
# 5-Fold Cross Validation
# data is the dataset to be used for the algorithm
# target_class is the index of the target class feature
# features is a list of the indicies for the features to be used to measure distance
# Algorithm is which algorithm is chosen to be used
def five_fold_cross_val(data, target_class, features, Algorithm):
    metrics= []
    list_of_indices = list(range(len(data)))
    n_data = len(data) #this is the number of records in the data
    test_size = int(n_data/5) #this is the number of records in each test set
    df = copy.deepcopy(data)
    df = df.sample(frac=1).reset_index(drop=True) # a dataframe with shuffled records
    df.sort_values(by= df.columns[target_class]) # sorts the data by the target class
    
    split1 = []
    split2 = []
    split3 = []
    split4 = []
    split5 = []
    for count in range(test_size):
        split1.append(5*count)
        split2.append(5*count +1)
        split3.append(5*count +2)
        split4.append(5*count +3)
        split5.append(5*count +4)
    
    # test split1
    split1_train = []
    for index in range(n_data):
        if(index not in split1):
            split1_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes1, model1 = Algorithm(df, split1, split1_train, target_class, features)
    output1 = performance(df, split1, test_classes1, target_class) 
    metrics.append(output1)
    
    # test split2
    split2_train = []
    for index in range(n_data):
        if(index not in split2):
            split2_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes2, model2 = Algorithm(df, split2, split2_train, target_class, features)
    output2 = performance(df, split2, test_classes2, target_class)     
    metrics.append(output2)
    
    # test split3
    split3_train = []
    for index in range(n_data):
        if(index not in split3):
            split3_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes3, model3 = Algorithm(df, split3, split3_train, target_class, features)
    output3 = performance(df, split3, test_classes3, target_class)    
    metrics.append(output3)
    
    # test split4
    split4_train = []
    for index in range(n_data):
        if(index not in split4):
            split4_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes4, model4 = Algorithm(df, split4, split4_train, target_class, features)
    output4 = performance(df, split4, test_classes4, target_class)    
    metrics.append(output4)
    
    # test split5
    split5_train = []
    for index in range(n_data):
        if(index not in split5):
            split5_train.append(index)
        # end if
    # end for
    # run algorithm
    test_classes5, model5 = Algorithm(df, split5, split5_train, target_class, features)
    output5 = performance(df, split5, test_classes5, target_class)  
    metrics.append(output5)  
        
    return metrics, model1, test_classes1

In [8]:
# Performance
# data is the dataset to be used for the algorithm
# test is a list of indices from data to be used for testing
# test_classes is a list with the class for the test dataset that includes the estimate for the target_class
# features is a list of the indicies for the features to be used to measure distance
# target_class is the index of the target class feature
def performance(data, test, test_classes, target_class):
    n_data = len(test) # this is the number of records to be tested
    curr_perf = 0

    # classification error
    for record in range(n_data): # record is index in test 
        test_record = data.iloc[[test[record]]] # pulls record to test
        if(test_record.iloc[0,target_class] != test_classes[record]):
            curr_perf = curr_perf + 1
        # end if
    # end for
    curr_perf = curr_perf / n_data

    return curr_perf

<a id="Log_Regress"></a>
## Logistic Regression

In [9]:
# Logistic Regression
# data is the dataset to be used for the algorithm
# test is a list of indices from data to be used for testing
# train is a list of indices from data to be used for training
# target_class is the index of the target class feature
# features is a list of the indicies for the features to be used to measure distance
# returns target_classes, which is a list that contains the target class calculated for each test index
# returns betas, which is a list of the betas trained for the logistic regression model
def logistic_regression_train(data, test, train, target_class, features):
    # get the normalized featureset for the train data
    train_data = copy.deepcopy(data)
    train_data = train_data.iloc[train]
    train_data = train_data.iloc[:,features]
    train_data = train_data.values
    feature_data = normalize(train_data)
    # add a column of 1s in index 0
    feature_data = np.hstack((np.matrix(np.ones(feature_data.shape[0])).T, feature_data)) 
    y = copy.deepcopy(data.iloc[train])
    y = y.iloc[:,target_class]
    y = y.values
    # initial beta values 
    betas = np.matrix(np.zeros(feature_data.shape[1]))
    # use gradient descent to get final beta values
    betas = gradient_descent(feature_data, y, betas) 
    # get the normalized featureset for the test data
    test_data = copy.deepcopy(data)
    test_data = test_data.iloc[test]
    test_data = test_data.iloc[:, features]
    test_data = test_data.values
    test_data = normalize(test_data)
    # add a column of 1s in index 0
    test_data = np.hstack((np.matrix(np.ones(test_data.shape[0])).T, test_data)) 
    # calculate the target values for our test data
    feature_probabilities = 1.0/(1 + np.exp(-np.dot(test_data, betas.T))) 
    sum_probabilities = feature_probabilities.sum(axis=1)
    target_classes = np.where(sum_probabilities >= .5, 1, 0) 
    target_classes = np.squeeze(target_classes) 
    
    return target_classes, betas

In [10]:
# method to normalize features in the data
# used for continous features
# feature_data is the data frame with just the features needing normalized
# returns normed_features which is the data frame with normalized features
def normalize(feature_data): 
    feature_mins = np.min(feature_data, axis = 0) 
    feature_maxes = np.max(feature_data, axis = 0) 
    feature_range = feature_maxes - feature_mins 
    normed_features = 1 - ((feature_maxes - feature_data)/feature_range) 
    return normed_features 

In [11]:
# method to calculate the cost of a particular beta set
# betas is the betas being calculated for
# feature_data is the data frame with just the features
# y is the listing of target classes for the records from feature_data
def cost(betas, feature_data, y): 
    sigmoid_result = 1.0/(1 + np.exp(-np.dot(feature_data, betas.T)))  
    y = np.squeeze(y) 
    # uses log likelihood
    first_step = y * np.log(sigmoid_result) 
    second_step = (1 - y) * np.log(1 - sigmoid_result) 
    final_cost =  -first_step - second_step 
    return np.mean(final_cost) 

In [12]:
# method to use gradient descent to train the betas
# feature_data is the data frame with just the features
# y is the listing of target classes for the records from feature_data
# betas is the list of initial betas passed
# returns new betas learned by method
def gradient_descent(feature_data, y, betas): 
    converge_delta = 0.001
    learning_rate = 0.01
    new_cost = cost(betas, feature_data, y) 
    change_cost = 1
    counter = 1
      
    while(change_cost > converge_delta): 
        old_cost = new_cost 
        betas = betas - (learning_rate * log_gradient(betas, feature_data, y)) 
        new_cost = cost(betas, feature_data, y) 
        change_cost = old_cost - new_cost 
        counter += 1
      
    return betas 

In [13]:
# method to calculate the log gradient
# betas is the list of betas being calcualted for
# feature_data is the data frame with just the features
# y is the listing of target classes for the records from feature_data
def log_gradient(betas, feature_data, y): 
    initial_calculation = (1.0/(1 + np.exp(-np.dot(feature_data, betas.T)))) - y.reshape(feature_data.shape[0], -1)
    final_calculation = np.dot(initial_calculation.T, feature_data) 
    return final_calculation 

<a id="Processing"></a>
## Processing

Logistic Regression

Performance is based on Classification Error

In [14]:
breast_cancer_log_metrics, breast_cancer_log_model, breast_cancer_output = five_fold_cross_val(breast_cancer_log, breast_cancer_target_class, breast_cancer_features, logistic_regression_train)

In [15]:
print(breast_cancer_output)

[0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1
 1 1 1 1 1 0 1 1 1 0 1 0 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0
 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0]


In [16]:
print(breast_cancer_log_metrics)

[0.029411764705882353, 0.04411764705882353, 0.058823529411764705, 0.014705882352941176, 0.022058823529411766]


In [17]:
print(breast_cancer_log_model)

[[ 7.04270679 -4.87595501 -0.1080613  -2.26185057 -4.02614809 -1.41508764
  -3.59446137 -3.24903743 -1.45842438 -3.40202366]]


In [18]:
iris_log_metrics, iris_log_model, iris_output = five_fold_cross_val(iris_log, 5, iris_features, logistic_regression_train)

In [19]:
print(iris_output)

[0 0 0 1 1 0 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0]


In [20]:
print(iris_log_metrics)

[0.0, 0.0, 0.0, 0.0, 0.0]


In [21]:
print(iris_log_model)

[[ 2.54770755 -3.30689559  5.83134949 -7.95385444 -7.56531688]]


In [22]:
glass_log_metrics, glass_log_model, glass_output = five_fold_cross_val(glass_log, 15, glass_features, logistic_regression_train)

In [23]:
print(glass_output)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]


In [24]:
print(glass_log_metrics)

[0.0, 0.047619047619047616, 0.11904761904761904, 0.07142857142857142, 0.023809523809523808]


In [25]:
print(glass_log_model)

[[ -3.96340581  -3.92663025   9.79763478  -2.37918345  -0.46490111
    0.14772643  -9.7627006    0.68073363 -13.84404376  -6.96701087]]


In [26]:
votes_log_metrics, votes_log_model, votes_output = five_fold_cross_val(votes, 0, votes_features, logistic_regression_train)

In [27]:
print(votes_output)

[1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1
 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 1
 0 0 0 1 0 0 0 1 1 1 0 1 1]


In [28]:
print(votes_log_metrics)

[0.04597701149425287, 0.04597701149425287, 0.034482758620689655, 0.034482758620689655, 0.05747126436781609]


In [29]:
print(votes_log_model)

[[-0.22141687  0.10644634  1.10119753  3.22181472 -8.66302572  1.50500071
   2.15606686 -0.19888698 -0.19466821  2.14107699 -1.64371001  4.36716674
  -1.63245533 -1.18256009  1.4078778   2.16473481 -0.40465216]]
