--------------------------------------------------------------------------------
2 Logistic Regression Spam Classification
--------------------------------------------------------------------------------

In [107]:
import pandas as pd
import numpy as np
import math
import random

In [108]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [109]:
# Reads in the data, ignoring the first row (header) and first column (index).
df = pd.read_csv('gdrive/My Drive/spambase.data')
numpyArray = df.to_numpy()

In [110]:
# Randomizes the data
np.random.seed(0)
np.random.shuffle(numpyArray)

In [111]:
# Selects the first 2/3 (round up) of the data for training and the remaining for testing
trainingData = numpyArray[:math.ceil(len(numpyArray) * 2/3)]
testingData = numpyArray[math.ceil(len(numpyArray) * 2/3):]

X_train = np.delete(trainingData, -1, axis=1)
X_test = np.delete(testingData, -1, axis=1)

Y_train =  np.delete(trainingData, slice(0, 57), axis=1)
Y_test =  np.delete(testingData, slice(0, 57), axis=1)

In [112]:
# Standardizes the data (except for the last column of course) using the training data
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0, ddof=1)

sX_train = (X_train-mean)/std
sX_test = (X_test-mean)/std

In [113]:
# Add bias to standardized input data
sX_train = np.insert(sX_train, 0, 1, axis=1)
sX_test = np.insert(sX_test, 0, 1, axis=1)

In [114]:
# Use a learning rate η = 0.01.
learning_rate = .01
num_of_features = len(numpyArray[0])

In [115]:
#Set randomized values between [-1,1] for thetas
thetas = []
for i in range(0, num_of_features):
  lst=[]
  lst.append(random.uniform(-1.0, 1.0))
  thetas.append(lst)

thetas = np.array(thetas)

In [116]:
def sigmoid(X, thetas):
  return (1 / (1 + np.exp((-1) * X @ thetas)))

def log_loss_calc(X_train, Y_train, thetas):
  sum = 0
  for i in range(len(X_train)):
    if((1 - sigmoid(X_train[i], thetas)) < 0):
      print(sigmoid(X_train[i], thetas))
    if((sigmoid(X_train[i], thetas)) < 0):
      print(sigmoid(X_train[i], thetas))
    sum += (-Y_train[i] * np.exp(sigmoid(X_train[i], thetas))) - ((1 - Y_train[i]) * np.exp((1 - sigmoid(X_train[i], thetas))))
  
  return sum

def run_batch_gradient_descent(thetas, X_train, X_test, Y_train, Y_test, N, learning_rate, max_iterations):
   log_loss = log_loss_calc(X_train, Y_train, thetas)
   while(max_iterations >= 1):
     thetas  = thetas + ((learning_rate/N) * (X_train.T @ (Y_train - sigmoid(X_train, thetas))))
     
     new_log_loss = log_loss_calc(X_train, Y_train, thetas)

     if(abs(new_log_loss - log_loss) <= 2**-23):
       break

     log_loss = new_log_loss
     
     max_iterations -= 1
     
     #Return the final calculated theta values
   return thetas


In [117]:
final_thetas = run_batch_gradient_descent(thetas, sX_train, sX_test, Y_train, Y_test, N=len(trainingData), learning_rate=.01, max_iterations=1500)


[-7152.20025173]


In [118]:
def calc_accuracy(Y_test, sX_test, thetas):
  correct = 0
  for i in range(len(Y_test)):
    obs_class = 0
    Prob_1 = sigmoid(sX_test[i], thetas)
    Prob_0 = 1 - sigmoid(sX_test[i], thetas)

    if(Prob_1[0] > Prob_0[0]):
      obs_class = 1
    else:
      obs_class = 0

    if(Y_test[i][0] == obs_class):
      correct += 1

  return correct/len(Y_test)

In [119]:
def calc_precision(Y_test, sX_test, thetas):
  true_positives = 0
  false_positives = 0
  for i in range(len(Y_test)):
    obs_class = 0
    Prob_1 = sigmoid(sX_test[i], thetas)
    Prob_0 = 1 - sigmoid(sX_test[i], thetas)

    if(Prob_1[0] > Prob_0[0]):
      obs_class = 1
    else:
      obs_class = 0

    if(obs_class == 1 and int(Y_test[i]) == 1):
      true_positives += 1
    if(obs_class == 1 and int(Y_test[i]) == 0):
      false_positives += 1
  
  return (true_positives / (true_positives + false_positives))

In [120]:
def calc_recall(Y_test, sX_test, thetas):
  true_positives = 0
  false_negatives = 0
  for i in range(len(Y_test)):
    obs_class = 0
    Prob_1 = sigmoid(sX_test[i], thetas)
    Prob_0 = 1 - sigmoid(sX_test[i], thetas)

    if(Prob_1[0] > Prob_0[0]):
      obs_class = 1
    else:
      obs_class = 0

    if(obs_class == 1 and int(Y_test[i]) == 1):
      true_positives += 1
    if(obs_class == 0 and int(Y_test[i]) == 1):
      false_negatives += 1
  
  return (true_positives / (true_positives + false_negatives))

In [121]:
def calc_f_measure(precision, recall):
  return ((2 * precision * recall) / (precision + recall))

In [122]:
LR_accuracy = calc_accuracy(Y_test, sX_test, final_thetas)

In [126]:
LR_precision = calc_precision(Y_test, sX_test, final_thetas)

In [127]:
LR_recall = calc_recall(Y_test, sX_test, final_thetas)

In [128]:
LR_f_measure = calc_f_measure(LR_precision, LR_recall)

In [129]:
LR_accuracy

0.8904109589041096

In [130]:
LR_precision

0.8671454219030521

In [131]:
LR_recall

0.8370883882149047

In [132]:
LR_f_measure

0.8518518518518519

--------------------------------------------------------------------------------
3 Naive Bayes Classifier
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

In [133]:
import pandas as pd
import numpy as np
import math
import random

In [134]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [135]:
# Reads in the data, ignoring the first row (header) and first column (index).
df = pd.read_csv('gdrive/My Drive/spambase.data')
numpyArray = df.to_numpy()

In [136]:
# Randomizes the data
np.random.seed(0)
np.random.shuffle(numpyArray)

In [137]:
# Selects the first 2/3 (round up) of the data for training and the remaining for testing
trainingData = numpyArray[:math.ceil(len(numpyArray) * 2/3)]
testingData = numpyArray[math.ceil(len(numpyArray) * 2/3):]

X_train = np.delete(trainingData, -1, axis=1)
X_test = np.delete(testingData, -1, axis=1)

Y_train =  np.delete(trainingData, slice(0, 57), axis=1)
Y_test =  np.delete(testingData, slice(0, 57), axis=1)

In [138]:
# Standardizes the data (except for the last column of course) using the training data
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0, ddof=1)

sX_train = (X_train-mean)/std
sX_test = (X_test-mean)/std

In [139]:
# Add bias to standardized input data
sX_train = np.insert(sX_train, 0, 1, axis=1)
sX_test = np.insert(sX_test, 0, 1, axis=1)

In [140]:
def get_non_spam_class_prob(Y_test):
  total_non_spams = 0
  for output in Y_test:
    if(int(output[0]) == 0):
      total_non_spams += 1
  
  return (total_non_spams / len(Y_test))

In [141]:
def get_spam_class_prob(Y_test):
  total_spams = 0
  for output in Y_test:
    if(int(output[0]) == 1):
      total_spams += 1
  
  return (total_spams / len(Y_test))

In [142]:
non_spam_class_prob = get_non_spam_class_prob(Y_test)

In [143]:
spam_class_prob = get_spam_class_prob(Y_test)

In [144]:
spam_samples = []
non_spam_samples = []
for row in range(len(sX_train)):
  if(int(Y_train[row][0]) == 0):
    non_spam_samples.append(sX_train[row])
  else:
    spam_samples.append(sX_train[row])

spam_samples = np.array(spam_samples)
non_spam_samples = np.array(non_spam_samples)

In [145]:
spam_features_mean = np.mean(spam_samples, axis=0)  

In [146]:
non_spam_features_mean = np.mean(non_spam_samples, axis=0)  

In [147]:
spam_features_std = np.std(spam_samples, axis=0, ddof=1)

In [148]:
non_spam_features_std = np.std(non_spam_samples, axis=0, ddof=1)

In [149]:
def get_gaussian_approx(feature, mean, std):
  exp = (-1) * (( (feature - mean)**2 ) / (2 * (std**2)))
  return ( (1 / (std * math.sqrt(2 * math.pi))) * (math.e**exp))

In [150]:
predictions = []
for i in range(len(sX_test)):
  spam_prob = spam_class_prob
  non_spam_prob = non_spam_class_prob
  for feature_index in range(1, len(sX_test[i])):
    spam_gaussian_approx = get_gaussian_approx(sX_test[i][feature_index], spam_features_mean[feature_index], spam_features_std[feature_index])
    non_spam_gaussian_approx = get_gaussian_approx(sX_test[i][feature_index], non_spam_features_mean[feature_index], non_spam_features_std[feature_index])

    spam_prob *= spam_gaussian_approx
    non_spam_prob *= non_spam_gaussian_approx
  
  if(spam_prob > non_spam_prob):
    predictions.append(1)
  else:
    predictions.append(0)

In [151]:
def get_naive_bayes_accuracy(predictions, Y_test):
  correct = 0

  for i in range(len(Y_test)):
    if(predictions[i] == int(Y_test[i][0])):
      correct += 1
    
  return correct/len(Y_test)

In [152]:
def get_naive_bayes_precision(predictions, Y_test):
  true_positives = 0
  false_positives = 0
  for i in range(len(Y_test)):
    if(predictions[i] == 1 and int(Y_test[i][0]) == 1):
      true_positives += 1
    if(predictions[i] == 1 and int(Y_test[i][0]) == 0):
      false_positives += 1
  
  return (true_positives / (true_positives + false_positives))


In [153]:
def get_naive_bayes_recall(predictions, Y_test):
  true_positives = 0
  false_negatives = 0

  for i in range(len(Y_test)):
    if(predictions[i] == 1 and int(Y_test[i][0]) == 1):
      true_positives += 1
    if(predictions[i] == 0 and int(Y_test[i][0]) == 1):
      false_negatives += 1
  

  return (true_positives / (true_positives + false_negatives))

In [154]:
def get_naive_bayes_f_measure(precision, recall):
  return ((2 * precision * recall) / (precision + recall))

In [155]:
naive_bayes_accuracy = get_naive_bayes_accuracy(predictions, Y_test)

In [156]:
naive_bayes_accuracy

0.7879973907371167

In [157]:
naive_bayes_precision = get_naive_bayes_precision(predictions, Y_test)

In [158]:
naive_bayes_precision

0.6465116279069767

In [159]:
naive_bayes_recall = get_naive_bayes_recall(predictions, Y_test)

In [160]:
naive_bayes_recall

0.9636048526863085

In [161]:
naive_bayes_f_measure = get_naive_bayes_f_measure(naive_bayes_precision, naive_bayes_recall)

In [162]:
naive_bayes_f_measure

0.7738343771746694

--------------------------------------------------------------------------------
4 Decision Trees
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

In [304]:
import pandas as pd
import numpy as np
import math
import random

In [305]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [306]:
# Reads in the data, ignoring the first row (header) and first column (index).
df = pd.read_csv('gdrive/My Drive/spambase.data')
numpyArray = df.to_numpy()

In [307]:
# Randomizes the data
np.random.seed(0)
np.random.shuffle(numpyArray)

In [308]:
# Selects the first 2/3 (round up) of the data for training and the remaining for testing
trainingData = numpyArray[:math.ceil(len(numpyArray) * 2/3)]
testingData = numpyArray[math.ceil(len(numpyArray) * 2/3):]

X_train = np.delete(trainingData, -1, axis=1)
X_test = np.delete(testingData, -1, axis=1)

Y_train =  np.delete(trainingData, slice(0, 57), axis=1)
Y_test =  np.delete(testingData, slice(0, 57), axis=1)

In [309]:
# Standardizes the data (except for the last column of course) using the training data
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0, ddof=1)

sX_train = (X_train-mean)/std
sX_test = (X_test-mean)/std

sX_train = np.append(sX_train, Y_train, axis=1)

In [310]:
#Divides the training data into two groups: Spam samples, Non-Spam samples
spam_samples = []
non_spam_samples = []
for row in range(len(sX_train)):
  if(int(sX_train[row][-1]) == 0):
    non_spam_samples.append(sX_train[row])
  else:
    spam_samples.append(sX_train[row])

spam_samples = np.array(spam_samples)
non_spam_samples = np.array(non_spam_samples)

In [311]:
spam_samples_medians = np.median(spam_samples, axis=0)

In [312]:
non_spam_samples_medians = np.median(non_spam_samples, axis=0)

In [313]:
binary_spam_samples = []
for row in range(len(spam_samples)):
  lst = []
  for i in range(len(spam_samples[row]) - 1):
    if(spam_samples[row][i] >= spam_samples_medians[i]):
      lst.append(1)
    else:
      lst.append(0)
  lst.append(spam_samples[row][-1])
  binary_spam_samples.append(lst)

binary_spam_samples = np.array(binary_spam_samples)

In [314]:
binary_non_spam_samples = []
for row in range(len(non_spam_samples)):
  lst = []
  for i in range(len(non_spam_samples[row])):
    if(non_spam_samples[row][i] >= non_spam_samples_medians[i]):
      lst.append(1)
    else:
      lst.append(0)
    
  lst.append(non_spam_samples[row][-1])
  binary_non_spam_samples.append(lst)

binary_non_spam_samples = np.array(binary_non_spam_samples)

In [315]:
p_spam = (-1) * (len(binary_spam_samples) / (len(binary_spam_samples) + len(binary_non_spam_samples))) * math.log(len(binary_spam_samples) / (len(binary_spam_samples) + len(binary_non_spam_samples)), 2)

In [316]:
p_non_spam = (-1) * (len(binary_non_spam_samples) / (len(binary_non_spam_samples) + len(binary_spam_samples))) * math.log(len(binary_non_spam_samples) / (len(binary_non_spam_samples) + len(binary_spam_samples)), 2)

In [317]:
p_non_spam

0.4440576038237948

In [318]:
#Entropy of entire system
H_data_set = p_non_spam + p_spam

In [319]:
H_data_set

0.972493094244988

In [320]:
def calculate_entropy(totalPositive, totalNegative, totalObs):
  if(totalObs <= 0):
    return 0
  return (-1) * ((totalPositive/totalObs) * math.log(totalPositive/totalObs, 2) + (totalNegative/totalObs) * math.log(totalNegative/totalObs, 2))
   

In [321]:
def find_entropy(data):
  classValues = np.delete(data, slice(0, 57), axis=1)
  classValues = classValues.astype(int)
  values, counts = np.unique(classValues, return_counts=True)
  entropy = 0

  for value in values:
    prob = counts[value]/len(classValues)
    entropy += -prob * np.log2(prob)
  return entropy

In [322]:
def find_entropy_attribute(data, attribute_index):
  #Class = df.keys()[-1]
  classValues = np.delete(data, slice(0, 57), axis=1)
  classValues = classValues.astype(int)
  #target_values = df[Class].unique()
  values, counts = np.unique(classValues, return_counts=True)

  #attribute_values = df[attribute].unique()
  attributeValues = data[:, [attribute_index]]
  attributeValuesUnique = np.unique(attributeValues)

  avg_entropy = 0
  for value in attributeValuesUnique:
    entropy = 0
    for value1 in values:
      numerator = 1
      denominator = 1
      for row in range(len(data)):
        if(data[row][attribute_index] == value and data[row][-1] == value1):
          numerator += 1
        if(data[row][attribute_index] == value):
          denominator += 1

      entropy += -(numerator/denominator) * np.log2((numerator/denominator) + 0.00000001)
    avg_entropy += (denominator/len(data))*entropy
  return avg_entropy


In [323]:
def best_attribute(data):
  information_gains = []
  for i in range(len(data[0])):
    information_gains.append(find_entropy(data) - find_entropy_attribute(data, i))
  
  max = 0
  index = 0

  for i in range(len(information_gains)):
    if(information_gains[i] > max):
      max = information_gains[i]
      index = i

  return i

In [324]:
find_entropy(sX_train)

0.972493094244988

In [325]:
find_entropy_attribute(sX_train, 57)

0.006883392744106848

In [326]:
best_attribute(sX_train)

57

In [327]:
def DTL(samples, attributes, defaultTree = None):
  if(len(examples) == 0):
    return defaultTree
  elif(sameClassification(examples)[0] == True):
    return sameClassification(examples)[1]
  elif(len(attributes) == 0):
    return Mode(samples)
  else:
    bestAtt = ChooseAttribute(attributes, samples)
    tree = {}
    tree[bestAtt] = None
    for value in bestAtt:
      samples[bestAtt]