In [None]:
from csv import reader
from math import *
from random import *

In [None]:
import zipfile
from google.colab import drive

zip_ref = zipfile.ZipFile("/content/drive/MyDrive/dataset.zip", 'r')
zip_ref.extractall("/content/")
zip_ref.close()


In [None]:
def load_csv(filename):
  dataset = list()
  with open(filename,"r") as file:
    csv_reader = reader(file)
    for row in csv_reader:
      if not row:
        continue
      dataset.append(row)
  return dataset

In [None]:
def str_column_to_float(dataset,column):
  for row in dataset:
    row[column]=float(row[column].strip())

In [None]:
def str_column_to_int(dataset,column):
  class_values=[row[column] for row in dataset]
  unique=set(class_values)
  lookup=dict()
  for i, value in enumerate(unique):
    lookup[value] = i
  for row in dataset:
    row[column] = lookup[row[column]]
  return lookup

In [None]:
filename="diabetes.csv"
dataset=load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset,i)

In [None]:
dataset[1]

[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]

In [None]:
dataset[2]

[8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0]

In [None]:
def dataset_minmax(dataset):
  minmax=list()
  for i,value in enumerate(dataset):
    col_values = [row for row in value]
    value_min = min(col_values)
    value_max = max(col_values)
    minmax.append([value_min,value_max])
  return minmax

In [None]:
minmax = dataset_minmax(list(dataset[1:]))

In [None]:
def normalize_dataset(dataset,minmax):
  for row in dataset:
    for i,value in enumerate(row):
      if i:
        row[i] = (row[i] - minmax[i][0])/(minmax[i][1]-minmax[i][0])

In [None]:
normalize_dataset(list(dataset[1:]),minmax)

In [None]:
filename="diabetes.csv"
dataset=load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset,i)

In [None]:
def column_means(dataset):
  means = [0 for i  in range(len(dataset[0]))]
  for i,value in enumerate(dataset[0]):
    if i:
      col_values=[row[i] for row in dataset]
      means[i] = sum(col_values)/float(len(dataset))
  return means

In [None]:
def column_stdevs(dataset,means):
  stddevs = [0 for i in range(len(dataset[0]))]
  for i, value in enumerate(dataset[0]):
    variance=[pow(row[i]-means[i],2) for row in dataset]
    stddevs[i]=sum(variance)
  stddevs=[sqrt(x/float(len(dataset)-1)) for x in stddevs]
  return stddevs


In [None]:
means = column_means(dataset[1:])
stddevs = column_stdevs(dataset[1:],means)

In [None]:
def standardize_dataset(dataset,means,stddevs):
  for row in dataset:
    for i in range(len(row)):
      row[i] = (row[i]-means[i])/stddevs[i]
      

In [None]:
standardize_dataset(dataset[1:],means,stddevs)

In [None]:
dataset[1]

[0.19557189140478415,
 -1.1213542545166537,
 -0.16014405966562575,
 0.5316765423046242,
 -0.6931069396435374,
 -0.6832830888060338,
 -0.364027201812711,
 -0.18881717054245747,
 -0.7302765099363351]

In [None]:
filename="diabetes.csv"
dataset=load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset,i)

In [None]:
def train_test_split(dataset,split=0.60):
  train = list()
  train_size = split * len(dataset)
  dataset_copy = list(dataset)
  while len(train) < train_size:
    index = randrange(len(dataset_copy))
    train.append(dataset_copy.pop(index))
  return train,dataset_copy

In [None]:
train,test = train_test_split(dataset[1:])

In [None]:
print(len(dataset),len(train),len(test))

768 461 306


In [None]:
filename="diabetes.csv"
dataset=load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset,i)

In [None]:
def cross_validation_split(dataset,folds=3):
  dataset_split = list()
  dataset_copy = list(dataset)
  fold_size = int(len(dataset)/folds)
  for i in range(folds):
    fold = list()
    while len(fold) < fold_size:
      index = randrange(len(dataset_copy))
      fold.append(dataset_copy.pop(index))
    dataset_split.append(fold)
  return dataset_split


  

In [None]:
split_dataset = cross_validation_split(dataset[1:])

In [None]:
len(split_dataset)

3

In [None]:
[len(i) for i in split_dataset]

[255, 255, 255]

In [None]:
def accuracy_metric(actual,predicted):
  correct = 0
  for i in range(len(actual)):
    if actual[i] == predicted[i]:
      correct+=1
  return correct/float(len(actual)) * 100.00

In [None]:
actual = [1,0,0,0,1,1,1,1,1,1]
predicted = [0,1,0,0,0,1,0,1,1,1]
accuracy = accuracy_metric(actual,predicted)
accuracy

60.0

In [None]:
def confusion_matrix(actual,predicted):
  unique = set(actual)
  matrix = [list() for x in range(len(unique))]
  for i in range(len(unique)):
    matrix[i] = [0 for x in range(len(unique))]
  lookup = dict()
  for i,value in enumerate(unique):
    lookup[value] = i
  for i in range(len(actual)):
    x=lookup[actual[i]]
    y=lookup[predicted[i]]
    matrix[y][x] +=1 
  return unique, matrix

In [None]:
def print_confusion_matrix(unique,matrix):
  print('(A)'+' '.join(str(x) for x in unique))
  print('(P)----')
  for i,x in enumerate(unique):
    print("%s|%s" % (x, ' '.join(str(x) for x in matrix[i])))

In [None]:
unique,matrix = confusion_matrix(actual,predicted)
print_confusion_matrix(unique,matrix)

(A)0 1
(P)----
0|2 3
1|1 4


In [None]:
def mae_metric(actual,predicted):
  sum_error=0.0
  for i in range(len(actual)):
    sum_error+=abs(predicted[i]-actual[i])
  return sum_error/float(len(actual))

In [None]:
actual = [0.1, 0.2, 0.3, 0.4, 0.5]
predicted = [0.11, 0.19, 0.29, 0.41, 0.5]
mae = mae_metric(actual,predicted)

In [None]:
mae

0.007999999999999993

In [None]:
def rmse_metric(actual,predicted):
  sum_error=0.0
  for i in range(len(actual)):
    prediction_error=predicted[i]-actual[i]
    sum_error+=(prediction_error ** 2)
  mean_error = sum_error / float(len(actual))
  return sqrt(mean_error)


In [None]:
actual = [0.1, 0.2, 0.3, 0.4, 0.5]
predicted = [0.11, 0.19, 0.29, 0.41, 0.5]
rmse = rmse_metric(actual,predicted)

In [None]:
rmse

0.00894427190999915

In [None]:
def random_algorithm(train,test):
  output_values = [row[-1] for row in train]
  unique = list(set(output_values))
  predicted = list()
  for _ in test:
    index = randrange(len(unique))
    predicted.append(unique[index])
  return predicted

In [None]:
seed(1)
train = [[0], [1], [0], [1], [0], [1]]
test = [[None], [None], [None], [None]]

In [None]:
predictions = random_algorithm(train,test)

In [None]:
predictions

[0, 0, 1, 0]

In [None]:
def zero_rule_algorithm_classification(train,test):
  output_values = [row[-1] for row in train]
  prediction = max(set(output_values),key=output_values.count)
  predicted = [prediction for i in range(len(test))]
  return predicted 


In [None]:
seed(1)
train = [['0'], ['0'], ['0'], ['0'], ['1'], ['1']]
test = [[None], [None], [None], [None]]

In [None]:
predictions = zero_rule_algorithm_classification(train,test)

In [None]:
predictions

['0', '0', '0', '0']

In [None]:
def zero_rule_algorithm_regression(train,test):
  output_values = [row[-1] for row in train]
  prediction = sum(output_values) / float(len(output_values))
  predicted = [prediction for i in range(len(test))]
  return predicted

In [None]:
seed(1)
train = [[10], [15], [12], [15], [18], [20]]
test = [[None], [None], [None], [None]]

In [None]:
predictions = zero_rule_algorithm_regression(train,test)

In [None]:
predictions

[15.0, 15.0, 15.0, 15.0]

In [None]:
def evaluate_algorithm(dataset,algorithm,split,*args):
  train,test = train_test_split(dataset,split)
  test_set = list()
  for row in test:
    row_copy = list(row)
    row_copy[-1]=None
    test_set.append(row_copy)
  predicted = algorithm(train,test_set,*args)
  actual = [row[-1] for row in test]
  accuracy = accuracy_metric(actual,predicted)
  return accuracy

In [None]:
# Test the train/test harness
seed(1)
# load and prepare data
filename = 'diabetes.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset, i)

In [None]:
split = 0.6
accuracy = evaluate_algorithm(dataset[1:],zero_rule_algorithm_classification,split)

In [None]:
print('Accuracy: %.3f%%' % (accuracy))

Accuracy: 69.608%


In [None]:
def evaluate_algorithm_n_fold(dataset,algorithm,n_folds,*args):
  folds = cross_validation_split(dataset,n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set,[])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = algorithm(train_set,test_set,*args)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual,predicted)
    scores.append(accuracy)
  return scores

In [None]:
n_folds = 5
scores = evaluate_algorithm_n_fold(dataset[1:],zero_rule_algorithm_classification,n_folds)
print(f"Scores: {scores}")
print('Mean Accuracy: %.3f%%' % (sum(scores)/len(scores)))

Scores: [68.62745098039215, 63.39869281045751, 67.97385620915033, 66.66666666666666, 59.47712418300654]
Mean Accuracy: 65.229%


In [None]:
def mean(values):
  return sum(values)/float(len(values))

In [None]:
def variance(values,mean):
  return sum([(x-mean)**2 for x in values])

In [None]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]

In [None]:
x = [row[0] for row in dataset]
y = [row[1] for row in dataset]

In [None]:
mean_x, mean_y = mean(x),mean(y)
var_x, var_y = variance(x,mean_x),variance(y,mean_y)
print('x stats: mean=%.3f variance=%.3f' % (mean_x, var_x))
print('y stats: mean=%.3f variance=%.3f' % (mean_y, var_y))

x stats: mean=3.000 variance=10.000
y stats: mean=2.800 variance=8.800


In [None]:
def covariance(x,mean_x,y,mean_y):
  covar = 0.0
  for i in range(len(x)):
    covar += (x[i] - mean_x) * (y[i] - mean_y)
  return covar

In [None]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
x = [row[0] for row in dataset]
y = [row[1] for row in dataset]
mean_x, mean_y = mean(x), mean(y)
covar = covariance(x, mean_x, y, mean_y)
print('Covariance: %.3f' % (covar))

Covariance: 8.000


In [None]:
def coefficients(dataset):
  x = [row[0] for row in dataset]
  y = [row[1] for row in dataset]
  x_mean,y_mean = mean(x), mean(y)
  b1 = covariance(x,mean_x,y,mean_y) / variance(x,x_mean)
  b0 = y_mean - b1 * x_mean
  return [b0, b1]

In [None]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
b0, b1 = coefficients(dataset)
print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

Coefficients: B0=0.400, B1=0.800


In [None]:
def simple_linear_regression(train, test):
  predictions = list()
  b0, b1 = coefficients(train)
  for row in test:
    yhat = b0 + b1 * row[0]
    predictions.append(yhat)
  return predictions

In [None]:
def evaluate_algorithm_rmse(dataset, algorithm, split, *args):
  train, test = train_test_split(dataset, split)
  test_set = list()
  for row in test:
    row_copy = list(row)
    row_copy[-1] = None
    test_set.append(row_copy)
  predicted = algorithm(train, test_set, *args)
  actual = [row[-1] for row in test]
  rmse = rmse_metric(actual, predicted)
  return rmse

In [None]:
def evaluate_algorithm_rmse_folds(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset, n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
  predicted = algorithm(train_set, test_set, *args)
  actual = [row[-1] for row in fold]
  rmse = rmse_metric(actual, predicted)
  scores.append(rmse)
  return scores

In [None]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
rmse = evaluate_algorithm_rmse(dataset, simple_linear_regression,split=0.7)
print('RMSE: %.3f' % (rmse))

RMSE: 0.050


In [None]:
seed(1)
# load and prepare data
filename = 'insurance.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset, i)

In [None]:
split = 0.6
rmse = evaluate_algorithm_rmse(dataset, simple_linear_regression, split)
print('RMSE: %.3f' % (rmse))

RMSE: 77.966


In [None]:
def predict(row,coefficients):
  yhat = coefficients[0]
  for i in range(len(row)-1):
    yhat += coefficients[i+1] * row[i]
  return yhat

In [None]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
coef = [0.4, 0.8]

In [None]:
for row in dataset:
  yhat = predict(row,coef)
  print("Expected=%.3f, Predicted=%.3f" % (row[-1], yhat))

Expected=1.000, Predicted=1.200
Expected=3.000, Predicted=2.000
Expected=3.000, Predicted=3.600
Expected=2.000, Predicted=2.800
Expected=5.000, Predicted=4.400


In [None]:
def coefficients_sgd(train,l_rate,n_epoch):
  coef = [0.0 for i in range(len(train[0]))]
  for epoch in range(n_epoch):
    sum_error = 0
    for row in train:
      yhat = predict(row,coef)
      error = yhat - row[-1]
      sum_error += error ** 2
      coef[0] = coef[0] - l_rate * error
      for i in range(len(row) - 1):
        coef[i+1] = coef[i+1] - l_rate * error * row[i]
    print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
  return coef

In [None]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
l_rate = 0.001
n_epoch = 50
coef = coefficients_sgd(dataset,l_rate,n_epoch)
print(coef)

>epoch=0, lrate=0.001, error=46.236
>epoch=1, lrate=0.001, error=41.305
>epoch=2, lrate=0.001, error=36.930
>epoch=3, lrate=0.001, error=33.047
>epoch=4, lrate=0.001, error=29.601
>epoch=5, lrate=0.001, error=26.543
>epoch=6, lrate=0.001, error=23.830
>epoch=7, lrate=0.001, error=21.422
>epoch=8, lrate=0.001, error=19.285
>epoch=9, lrate=0.001, error=17.389
>epoch=10, lrate=0.001, error=15.706
>epoch=11, lrate=0.001, error=14.213
>epoch=12, lrate=0.001, error=12.888
>epoch=13, lrate=0.001, error=11.712
>epoch=14, lrate=0.001, error=10.668
>epoch=15, lrate=0.001, error=9.742
>epoch=16, lrate=0.001, error=8.921
>epoch=17, lrate=0.001, error=8.191
>epoch=18, lrate=0.001, error=7.544
>epoch=19, lrate=0.001, error=6.970
>epoch=20, lrate=0.001, error=6.461
>epoch=21, lrate=0.001, error=6.009
>epoch=22, lrate=0.001, error=5.607
>epoch=23, lrate=0.001, error=5.251
>epoch=24, lrate=0.001, error=4.935
>epoch=25, lrate=0.001, error=4.655
>epoch=26, lrate=0.001, error=4.406
>epoch=27, lrate=0.001,

In [None]:
def linear_regression_sgd(train, test, l_rate, n_epoch):
  predictions = list()
  coef = coefficients_sgd(train, l_rate, n_epoch)
  for row in test:
    yhat = predict(row, coef)
    predictions.append(yhat)
  return(predictions)

In [None]:
seed(1)
# load and prepare data
filename = 'winequality-red.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset, i)

In [None]:
minmax=dataset_minmax(dataset)
normalize_dataset(dataset,minmax)

In [None]:
# evalueate algorithm
n_folds = 5
l_rate = 0.01
n_epoch = 50
scores = evaluate_algorithm_rmse_folds(dataset, linear_regression_sgd, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean RMSE: %.3f' % (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.010, error=0.264
>epoch=1, lrate=0.010, error=0.175
>epoch=2, lrate=0.010, error=0.143
>epoch=3, lrate=0.010, error=0.130
>epoch=4, lrate=0.010, error=0.125
>epoch=5, lrate=0.010, error=0.122
>epoch=6, lrate=0.010, error=0.121
>epoch=7, lrate=0.010, error=0.121
>epoch=8, lrate=0.010, error=0.120
>epoch=9, lrate=0.010, error=0.120
>epoch=10, lrate=0.010, error=0.120
>epoch=11, lrate=0.010, error=0.119
>epoch=12, lrate=0.010, error=0.119
>epoch=13, lrate=0.010, error=0.119
>epoch=14, lrate=0.010, error=0.119
>epoch=15, lrate=0.010, error=0.119
>epoch=16, lrate=0.010, error=0.119
>epoch=17, lrate=0.010, error=0.119
>epoch=18, lrate=0.010, error=0.118
>epoch=19, lrate=0.010, error=0.118
>epoch=20, lrate=0.010, error=0.118
>epoch=21, lrate=0.010, error=0.118
>epoch=22, lrate=0.010, error=0.118
>epoch=23, lrate=0.010, error=0.118
>epoch=24, lrate=0.010, error=0.118
>epoch=25, lrate=0.010, error=0.118
>epoch=26, lrate=0.010, error=0.118
>epoch=27, lrate=0.010, error=0.117
>e

In [None]:
from math import exp

In [None]:
def predict(row,coefficients):
  yhat = coefficients[0]
  for i in range(len(row)-1):
    yhat += coefficients[i+1] * row[i]
  return 1.0/(1.0+exp(-yhat))

In [None]:
# test predictions
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
coef = [-0.406605464, 0.852573316, -1.104746259]

In [None]:
for row in dataset:
  yhat = predict(row,coef)
  print("Expected=%.3f, Predicted=%.3f [%d]" % (row[-1], yhat, round(yhat)))

Expected=0.000, Predicted=0.299 [0]
Expected=0.000, Predicted=0.146 [0]
Expected=0.000, Predicted=0.085 [0]
Expected=0.000, Predicted=0.220 [0]
Expected=0.000, Predicted=0.247 [0]
Expected=1.000, Predicted=0.955 [1]
Expected=1.000, Predicted=0.862 [1]
Expected=1.000, Predicted=0.972 [1]
Expected=1.000, Predicted=0.999 [1]
Expected=1.000, Predicted=0.905 [1]


In [None]:
def coefficients_sgd(train,l_rate,n_epoch):
  coef=[0.0 for i in range(len(train[0]))]
  for epoch in range(n_epoch):
    sum_error=0
    for row in train:
      yhat = predict(row,coef)
      error=row[-1]-yhat
      sum_error += error ** 2
      coef[0] = coef[0] + l_rate * error * yhat * (1-yhat)
      for i in range(len(row)-1):
        coef[i+1] = coef[i+1] + l_rate * error * yhat * (1-yhat) * row[i]
    print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
  return coef

In [None]:
l_rate = 0.3
n_epoch = 100
coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)

>epoch=0, lrate=0.300, error=2.217
>epoch=1, lrate=0.300, error=1.613
>epoch=2, lrate=0.300, error=1.113
>epoch=3, lrate=0.300, error=0.827
>epoch=4, lrate=0.300, error=0.623
>epoch=5, lrate=0.300, error=0.494
>epoch=6, lrate=0.300, error=0.412
>epoch=7, lrate=0.300, error=0.354
>epoch=8, lrate=0.300, error=0.310
>epoch=9, lrate=0.300, error=0.276
>epoch=10, lrate=0.300, error=0.248
>epoch=11, lrate=0.300, error=0.224
>epoch=12, lrate=0.300, error=0.205
>epoch=13, lrate=0.300, error=0.189
>epoch=14, lrate=0.300, error=0.174
>epoch=15, lrate=0.300, error=0.162
>epoch=16, lrate=0.300, error=0.151
>epoch=17, lrate=0.300, error=0.142
>epoch=18, lrate=0.300, error=0.134
>epoch=19, lrate=0.300, error=0.126
>epoch=20, lrate=0.300, error=0.119
>epoch=21, lrate=0.300, error=0.113
>epoch=22, lrate=0.300, error=0.108
>epoch=23, lrate=0.300, error=0.103
>epoch=24, lrate=0.300, error=0.098
>epoch=25, lrate=0.300, error=0.094
>epoch=26, lrate=0.300, error=0.090
>epoch=27, lrate=0.300, error=0.087
>e

In [None]:
def logistic_regression(train,test,l_rate,n_epoch):
  predictions=list()
  coef=coefficients_sgd(train,l_rate,n_epoch)
  for row in test:
    yhat=predict(row,coef)
    yhat=round(yhat)
    predictions.append(yhat)
  return predictions

In [None]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset, n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual, predicted)
    scores.append(accuracy)
  return scores

In [None]:
seed(1)
# load and prepare data
filename = 'diabetes.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset, i)
# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

In [None]:
n_folds = 5
l_rate = 0.1
n_epoch = 100
scores = evaluate_algorithm(dataset[1:], logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.100, error=8.135
>epoch=1, lrate=0.100, error=1.447
>epoch=2, lrate=0.100, error=0.851
>epoch=3, lrate=0.100, error=0.607
>epoch=4, lrate=0.100, error=0.472
>epoch=5, lrate=0.100, error=0.387
>epoch=6, lrate=0.100, error=0.329
>epoch=7, lrate=0.100, error=0.286
>epoch=8, lrate=0.100, error=0.253
>epoch=9, lrate=0.100, error=0.227
>epoch=10, lrate=0.100, error=0.206
>epoch=11, lrate=0.100, error=0.188
>epoch=12, lrate=0.100, error=0.174
>epoch=13, lrate=0.100, error=0.161
>epoch=14, lrate=0.100, error=0.151
>epoch=15, lrate=0.100, error=0.141
>epoch=16, lrate=0.100, error=0.133
>epoch=17, lrate=0.100, error=0.126
>epoch=18, lrate=0.100, error=0.119
>epoch=19, lrate=0.100, error=0.113
>epoch=20, lrate=0.100, error=0.108
>epoch=21, lrate=0.100, error=0.103
>epoch=22, lrate=0.100, error=0.099
>epoch=23, lrate=0.100, error=0.095
>epoch=24, lrate=0.100, error=0.091
>epoch=25, lrate=0.100, error=0.088
>epoch=26, lrate=0.100, error=0.085
>epoch=27, lrate=0.100, error=0.082
>e

In [None]:
def predict(row,weights):
  activation=weights[0]
  for i in range(len(row)-1):
    activation += weights[i+1] * row[i]
  return 1.0 if activation >= 0.0 else 0.0

In [None]:
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
weights = [-0.1, 0.20653640140000007, -0.23418117710000003]

In [None]:
for row in dataset:
  prediction=predict(row,weights)
  print('Expected=%d, Predicted=%d' % (row[-1],prediction))

Expected=0, Predicted=0
Expected=0, Predicted=0
Expected=0, Predicted=0
Expected=0, Predicted=0
Expected=0, Predicted=0
Expected=1, Predicted=1
Expected=1, Predicted=1
Expected=1, Predicted=1
Expected=1, Predicted=1
Expected=1, Predicted=1


In [None]:
def train_weights(train,l_rate,n_epoch):
  """
  Psalms 32:8
  I will instruct thee and teach thee in the way which thou shalt go: I will guide thee with mine eye.
  """
  weights = [0.0 for i in range(len(train[0]))]
  for epoch in range(n_epoch):
    sum_error=0.0
    for row in train:
      prediction = predict(row,weights)
      error = row[-1] - prediction
      sum_error += error ** 2
      weights[0] = weights[0] + l_rate * error
      for i in range(len(row)-1):
        weights[i+1]=weights[i+1] + l_rate * error * row[i]
    print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
  return weights

In [None]:
l_rate=0.1
n_epoch = 5
weights = train_weights(dataset,l_rate,n_epoch)
print(weights)

>epoch=0, lrate=0.100, error=2.000
>epoch=1, lrate=0.100, error=1.000
>epoch=2, lrate=0.100, error=0.000
>epoch=3, lrate=0.100, error=0.000
>epoch=4, lrate=0.100, error=0.000
[-0.1, 0.20653640140000007, -0.23418117710000003]


In [None]:
def perceptron(train,test,l_rate,n_epoch):
  predictions=list()
  weights=train_weights(train,l_rate,n_epoch)
  for row in test:
    prediction=predict(row,weights)
    predictions.append(prediction)
  return (predictions)

In [None]:
seed(1)
filename = 'sonar_csv.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset, i)
# convert string class to integers
str_column_to_int(dataset, len(dataset[0])-1)

{'Mine': 1, 'Rock': 0}

In [None]:
n_folds = 3
l_rate = 0.01
n_epoch = 500
scores = evaluate_algorithm(dataset, perceptron, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.010, error=65.000
>epoch=1, lrate=0.010, error=49.000
>epoch=2, lrate=0.010, error=45.000
>epoch=3, lrate=0.010, error=36.000
>epoch=4, lrate=0.010, error=54.000
>epoch=5, lrate=0.010, error=43.000
>epoch=6, lrate=0.010, error=38.000
>epoch=7, lrate=0.010, error=38.000
>epoch=8, lrate=0.010, error=38.000
>epoch=9, lrate=0.010, error=43.000
>epoch=10, lrate=0.010, error=40.000
>epoch=11, lrate=0.010, error=38.000
>epoch=12, lrate=0.010, error=45.000
>epoch=13, lrate=0.010, error=31.000
>epoch=14, lrate=0.010, error=41.000
>epoch=15, lrate=0.010, error=41.000
>epoch=16, lrate=0.010, error=39.000
>epoch=17, lrate=0.010, error=30.000
>epoch=18, lrate=0.010, error=36.000
>epoch=19, lrate=0.010, error=36.000
>epoch=20, lrate=0.010, error=30.000
>epoch=21, lrate=0.010, error=43.000
>epoch=22, lrate=0.010, error=40.000
>epoch=23, lrate=0.010, error=40.000
>epoch=24, lrate=0.010, error=32.000
>epoch=25, lrate=0.010, error=31.000
>epoch=26, lrate=0.010, error=43.000
>epoch=27, 

In [None]:
def gini_index(groups,classes):
  n_instances=float(sum([len(group) for group in groups]))
  gini = 0.0
  
  for group in groups:
    size = float(len(group))
    if size == 0:
      continue
    score = 0.0
    for class_val in classes:
      p=[row[-1] for row in group].count(class_val) / size
      score += p * p
    gini += (1.0 - score) * (size/n_instances)
  return gini

In [None]:
print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1]))
print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1]))

0.5
0.0


In [None]:
def test_split(index,value,dataset):
  left,right = list(),list()
  for row in dataset:
    if row[index] < value:
      left.append(row)
    else:
      right.append(row)
  return left,right
  

In [None]:
def get_split(dataset):
  """get the split"""
  class_values = list(set(row[-1] for row in dataset))
  b_index,b_value,b_score,b_groups = 999,999,999,None
  for index in range(len(dataset[0])-1):
    for row in dataset:
      groups = test_split(index,row[index],dataset)
      gini = gini_index(groups,class_values)
      print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini))
      if gini < b_score:
        b_index,b_value,b_score,b_groups = index,row[index],gini,groups
  return {'index':b_index,'value':b_value,'groups':b_groups}


In [None]:
dataset = [[2.771244718,1.784783929,0],
[1.728571309,1.169761413,0],
[3.678319846,2.81281357,0],
[3.961043357,2.61995032,0],
[2.999208922,2.209014212,0],
[7.497545867,3.162953546,1],
[9.00220326,3.339047188,1],
[7.444542326,0.476683375,1],
[10.12493903,3.234550982,1],
[6.642287351,3.319983761,1]]
split = get_split(dataset)
print('Split: [X%d < %.3f]' % ((split['index']+1), split['value']))

X1 < 2.771 Gini=0.444
X1 < 1.729 Gini=0.500
X1 < 3.678 Gini=0.286
X1 < 3.961 Gini=0.167
X1 < 2.999 Gini=0.375
X1 < 7.498 Gini=0.286
X1 < 9.002 Gini=0.375
X1 < 7.445 Gini=0.167
X1 < 10.125 Gini=0.444
X1 < 6.642 Gini=0.000
X2 < 1.785 Gini=0.500
X2 < 1.170 Gini=0.444
X2 < 2.813 Gini=0.320
X2 < 2.620 Gini=0.417
X2 < 2.209 Gini=0.476
X2 < 3.163 Gini=0.167
X2 < 3.339 Gini=0.444
X2 < 0.477 Gini=0.500
X2 < 3.235 Gini=0.286
X2 < 3.320 Gini=0.375
Split: [X1 < 6.642]


In [None]:
def to_terminal(group):
  outcomes = [row[-1] for row in group]
  return max(set(outcomes),key=outcomes.count)

In [None]:
def split(node,max_depth,min_size,depth):
  left,right = node['groups']
  del(node['groups'])

  if not left or not right:
    node['left'] = node['right'] = to_terminal(left+right)
    return
  
  if depth >= max_depth:
    node['left'],node['right'] = to_terminal(left),to_terminal(right)
    return
  
  if len(left) <= min_size:
    node['left'] = to_terminal(left)
  else:
    node['left'] = get_split(left)
    split(node['left'],max_depth,min_size,depth+1)

  if len(right) <= min_size:
    node['right'] = to_terminal(right)
  else:
    node['right'] = get_split(right)
    split(node['right'],max_depth,min_size,depth+1)


In [None]:
def build_tree(train, max_depth, min_size):
  root = get_split(train)
  split(root,max_depth,min_size,1)
  return root

In [None]:
def print_tree(node, depth=0):
  if isinstance(node, dict):
    print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
    print_tree(node['left'], depth+1)
    print_tree(node['right'], depth+1)
  else:
    print('%s[%s]' % ((depth*' ', node)))

In [None]:
dataset = [[2.771244718,1.784783929,0],
[1.728571309,1.169761413,0],
[3.678319846,2.81281357,0],
[3.961043357,2.61995032,0],
[2.999208922,2.209014212,0],
[7.497545867,3.162953546,1],
[9.00220326,3.339047188,1],
[7.444542326,0.476683375,1],
[10.12493903,3.234550982,1],
[6.642287351,3.319983761,1]]
tree = build_tree(dataset, 1, 1)
print_tree(tree)

X1 < 2.771 Gini=0.444
X1 < 1.729 Gini=0.500
X1 < 3.678 Gini=0.286
X1 < 3.961 Gini=0.167
X1 < 2.999 Gini=0.375
X1 < 7.498 Gini=0.286
X1 < 9.002 Gini=0.375
X1 < 7.445 Gini=0.167
X1 < 10.125 Gini=0.444
X1 < 6.642 Gini=0.000
X2 < 1.785 Gini=0.500
X2 < 1.170 Gini=0.444
X2 < 2.813 Gini=0.320
X2 < 2.620 Gini=0.417
X2 < 2.209 Gini=0.476
X2 < 3.163 Gini=0.167
X2 < 3.339 Gini=0.444
X2 < 0.477 Gini=0.500
X2 < 3.235 Gini=0.286
X2 < 3.320 Gini=0.375
[X1 < 6.642]
 [0]
 [1]


In [None]:
tree = build_tree(dataset, 2, 1)
print_tree(tree)

X1 < 2.771 Gini=0.444
X1 < 1.729 Gini=0.500
X1 < 3.678 Gini=0.286
X1 < 3.961 Gini=0.167
X1 < 2.999 Gini=0.375
X1 < 7.498 Gini=0.286
X1 < 9.002 Gini=0.375
X1 < 7.445 Gini=0.167
X1 < 10.125 Gini=0.444
X1 < 6.642 Gini=0.000
X2 < 1.785 Gini=0.500
X2 < 1.170 Gini=0.444
X2 < 2.813 Gini=0.320
X2 < 2.620 Gini=0.417
X2 < 2.209 Gini=0.476
X2 < 3.163 Gini=0.167
X2 < 3.339 Gini=0.444
X2 < 0.477 Gini=0.500
X2 < 3.235 Gini=0.286
X2 < 3.320 Gini=0.375
X1 < 2.771 Gini=0.000
X1 < 1.729 Gini=0.000
X1 < 3.678 Gini=0.000
X1 < 3.961 Gini=0.000
X1 < 2.999 Gini=0.000
X2 < 1.785 Gini=0.000
X2 < 1.170 Gini=0.000
X2 < 2.813 Gini=0.000
X2 < 2.620 Gini=0.000
X2 < 2.209 Gini=0.000
X1 < 7.498 Gini=0.000
X1 < 9.002 Gini=0.000
X1 < 7.445 Gini=0.000
X1 < 10.125 Gini=0.000
X1 < 6.642 Gini=0.000
X2 < 3.163 Gini=0.000
X2 < 3.339 Gini=0.000
X2 < 0.477 Gini=0.000
X2 < 3.235 Gini=0.000
X2 < 3.320 Gini=0.000
[X1 < 6.642]
 [X1 < 2.771]
  [0]
  [0]
 [X1 < 7.498]
  [1]
  [1]


In [None]:
tree = build_tree(dataset, 3, 1)
print_tree(tree)

X1 < 2.771 Gini=0.444
X1 < 1.729 Gini=0.500
X1 < 3.678 Gini=0.286
X1 < 3.961 Gini=0.167
X1 < 2.999 Gini=0.375
X1 < 7.498 Gini=0.286
X1 < 9.002 Gini=0.375
X1 < 7.445 Gini=0.167
X1 < 10.125 Gini=0.444
X1 < 6.642 Gini=0.000
X2 < 1.785 Gini=0.500
X2 < 1.170 Gini=0.444
X2 < 2.813 Gini=0.320
X2 < 2.620 Gini=0.417
X2 < 2.209 Gini=0.476
X2 < 3.163 Gini=0.167
X2 < 3.339 Gini=0.444
X2 < 0.477 Gini=0.500
X2 < 3.235 Gini=0.286
X2 < 3.320 Gini=0.375
X1 < 2.771 Gini=0.000
X1 < 1.729 Gini=0.000
X1 < 3.678 Gini=0.000
X1 < 3.961 Gini=0.000
X1 < 2.999 Gini=0.000
X2 < 1.785 Gini=0.000
X2 < 1.170 Gini=0.000
X2 < 2.813 Gini=0.000
X2 < 2.620 Gini=0.000
X2 < 2.209 Gini=0.000
X1 < 2.771 Gini=0.000
X1 < 3.678 Gini=0.000
X1 < 3.961 Gini=0.000
X1 < 2.999 Gini=0.000
X2 < 1.785 Gini=0.000
X2 < 2.813 Gini=0.000
X2 < 2.620 Gini=0.000
X2 < 2.209 Gini=0.000
X1 < 7.498 Gini=0.000
X1 < 9.002 Gini=0.000
X1 < 7.445 Gini=0.000
X1 < 10.125 Gini=0.000
X1 < 6.642 Gini=0.000
X2 < 3.163 Gini=0.000
X2 < 3.339 Gini=0.000
X2 < 0.4

In [None]:
def predict(node,row):
  if row[node['index']] < node['value']:
    if isinstance(node['left'],dict):
      return predict(node['left'],row)
    else:
      return node['left']
  else:
    if isinstance(node['right'],dict):
      return predict(node['right'],row)
    else:
      return node['right']

In [None]:
dataset = [[2.771244718,1.784783929,0],
[1.728571309,1.169761413,0],
[3.678319846,2.81281357,0],
[3.961043357,2.61995032,0],
[2.999208922,2.209014212,0],
[7.497545867,3.162953546,1],
[9.00220326,3.339047188,1],
[7.444542326,0.476683375,1],
[10.12493903,3.234550982,1],
[6.642287351,3.319983761,1]]
stump = {'index': 0, 'right': 1, 'value': 6.642287351, 'left': 0}
for row in dataset:
  prediction = predict(stump,row)
  print('Expected=%d, Got=%d' % (row[-1], prediction))

Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1


In [None]:
def decision_tree(train,test,max_depth,min_size):
  tree = build_tree(train,max_depth,min_size)
  predictions = list()
  for row in test:
    prediction = predict(tree,row)
    predictions.append(prediction)
  return predictions


In [None]:
seed(1)
# load and prepare data
filename = 'banknote.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(len(dataset[0])):
  str_column_to_float(dataset, i)

In [None]:
# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
X3 < 1.613 Gini=0.119
X3 < -0.188 Gini=0.115
X3 < -0.822 Gini=0.114
X3 < -4.046 Gini=0.121
X3 < -2.424 Gini=0.120
X3 < 2.257 Gini=0.119
X3 < -4.031 Gini=0.122
X3 < 0.276 Gini=0.115
X3 < -3.557 Gini=0.122
X3 < 0.353 Gini=0.115
X3 < 0.661 Gini=0.116
X3 < -2.548 Gini=0.120
X3 < -1.943 Gini=0.116
X3 < 0.419 Gini=0.115
X3 < -0.588 Gini=0.115
X3 < 0.465 Gini=0.116
X3 < 2.281 Gini=0.119
X3 < 0.486 Gini=0.116
X3 < 0.706 Gini=0.117
X3 < 1.705 Gini=0.118
X3 < 0.183 Gini=0.115
X3 < -1.388 Gini=0.115
X3 < -2.260 Gini=0.118
X3 < 3.090 Gini=0.120
X3 < 6.599 Gini=0.121
X3 < 1.456 Gini=0.119
X3 < -0.793 Gini=0.115
X3 < -2.123 Gini=0.117
X3 < 6.010 Gini=0.121
X3 < -1.430 Gini=0.114
X3 < 0.757 Gini=0.117
X3 < -4.358 Gini=0.122
X3 < 0.511 Gini=0.116
X3 < -1.361 Gini=0.115
X3 < 2.033 Gini=0.118
X3 < 1.545 Gini=0.119
X3 < 1.951 Gini=0.118
X3 < 2.071 Gini=0.118
X3 < 3.994 Gini=0.120
X3 < -2.429 Gini=0.120
X3 < -1.969 Gini=0.116
X3 < 7.385 Gini

In [None]:
def separate_by_class(dataset):
  separated = dict()
  for i in range(len(dataset)):
    vector = dataset[i]
    class_value = vector[-1]
    if (class_value not in separated):
      separated[class_value] = list()
    separated[class_value].append(vector)
  return separated

In [None]:
dataset = [[3.393533211,2.331273381,0],
[3.110073483,1.781539638,0],
[1.343808831,3.368360954,0],
[3.582294042,4.67917911,0],
[2.280362439,2.866990263,0],
[7.423436942,4.696522875,1],
[5.745051997,3.533989803,1],
[9.172168622,2.511101045,1],
[7.792783481,3.424088941,1],
[7.939820817,0.791637231,1]]
separated = separate_by_class(dataset)

In [None]:
for label in separated:
  print(label)
  for row in separated[label]:
    print(row)

0
[3.393533211, 2.331273381, 0]
[3.110073483, 1.781539638, 0]
[1.343808831, 3.368360954, 0]
[3.582294042, 4.67917911, 0]
[2.280362439, 2.866990263, 0]
1
[7.423436942, 4.696522875, 1]
[5.745051997, 3.533989803, 1]
[9.172168622, 2.511101045, 1]
[7.792783481, 3.424088941, 1]
[7.939820817, 0.791637231, 1]


In [None]:
def stdev(numbers):
  avg = mean(numbers)
  variance = sum([(x-avg)**2 for x in numbers])/float(len(numbers)-1)
  return sqrt(variance)

In [None]:
def summarize_dataset(dataset):
  summaries = [(mean(column),stdev(column),len(column)) for column in zip(*dataset)]
  del(summaries[-1])
  return summaries


In [None]:
summary = summarize_dataset(dataset)

In [None]:
summary

[(5.178333386499999, 2.7665845055177263, 10),
 (2.9984683241, 1.218556343617447, 10)]

In [None]:
def summarize_by_class(dataset):
  separated = separate_by_class(dataset)
  summaries = dict()
  for class_value,rows in separated.items():
    summaries[class_value] = summarize_dataset(rows)
  return summaries

In [None]:
summary = summarize_by_class(dataset)

In [None]:
summary

{0: [(2.7420144012, 0.9265683289298018, 5),
  (3.0054686692, 1.1073295894898725, 5)],
 1: [(7.6146523718, 1.2344321550313704, 5),
  (2.9914679790000003, 1.4541931384601618, 5)]}

In [None]:
for label in summary:
  print(label)
  for row in summary[label]:
    print(row)

0
(2.7420144012, 0.9265683289298018, 5)
(3.0054686692, 1.1073295894898725, 5)
1
(7.6146523718, 1.2344321550313704, 5)
(2.9914679790000003, 1.4541931384601618, 5)


In [None]:
def calculate_probability(x,mean,stdev):
  exponent = exp(-((x-mean)**2/(2 * stdev**2)))
  return (1/(sqrt(2*pi)*stdev)) * exponent

In [None]:
print(calculate_probability(1.0, 1.0, 1.0))

0.3989422804014327


In [None]:
def calculate_class_probabilities(summaries,row):
  total_rows = sum([summaries[label][0][2] for label in summaries])
  probabilities = dict()
  for class_value,class_summaries in summaries.items():
    probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
    for i in range(len(summaries)):
      mean,stdev,count = class_summaries[i]
      probabilities[class_value] *= calculate_probability(row[i],mean,stdev)
  return probabilities

In [None]:
dataset = [[3.393533211,2.331273381,0],
[3.110073483,1.781539638,0],
[1.343808831,3.368360954,0],
[3.582294042,4.67917911,0],
[2.280362439,2.866990263,0],
[7.423436942,4.696522875,1],
[5.745051997,3.533989803,1],
[9.172168622,2.511101045,1],
[7.792783481,3.424088941,1],
[7.939820817,0.791637231,1]]
summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summary,dataset[0])

In [None]:
probabilities

{0: 0.05032427673372076, 1: 0.00011557718379945765}

In [None]:
def predict(summaries, row):
  probabilities = calculate_class_probabilities(summaries, row)
  best_label, best_prob = None, -1
  for class_value, probability in probabilities.items():
    if best_label is None or probability > best_prob:
      best_prob = probability
      best_label = class_value
  return best_label

In [None]:
def naive_bayes(train,test):
  summarize = summarize_by_class(train)
  predictions = list()
  for row in test:
    output = predict(summarize,row)
    predictions.append(output)
  return predictions

In [None]:
filename = 'Iris.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset, i)
str_column_to_int(dataset,len(dataset[0])-1)
n_folds = 5
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [100.0, 96.66666666666667, 96.66666666666667, 100.0, 100.0]
Mean Accuracy: 98.667%


In [None]:
def euclidean_distance(row1,row2):
  distance = 0.0
  for i in range(len(row1)-1):
    distance += (row1[i]-row2[i])**2
  return sqrt(distance)

In [None]:
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
row0 = dataset[0]
for row in dataset:
  distance = euclidean_distance(row0,row)
  print(distance)

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


In [None]:
def get_neighbors(train,test_row,num_neighbors):
  distances = list()
  for train_row in train:
    dist = euclidean_distance(test_row,train_row)
    distances.append((train_row,dist))
  distances.sort(key=lambda tup: tup[1])
  neighbors = list()
  for i in range(num_neighbors):
    neighbors.append(distances[i][0])
  return neighbors

In [None]:
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]

In [None]:
neighbors=get_neighbors(dataset,dataset[0],3)
for neighbor in neighbors:
  print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]


In [None]:
def predict_classification(train,test_row,num_neighbors):
  neighbors = get_neighbors(train,test_row,num_neighbors)
  output_values = [row[-1] for row in neighbors]
  prediction = max(set(output_values),key=output_values.count)
  return prediction

In [None]:
prediction = predict_classification(dataset, dataset[0], 3)
print('Expected %d, Got %d.' % (dataset[0][-1], prediction))

Expected 0, Got 0.


In [None]:
def predict_regression(train,test_row,num_neighbors):
  neighbors = get_neighbors(train,test_row,num_neighbors)
  output_values = [row[-1] for row in neighbors]
  prediction = sum(output_values)/float(len(output_values))
  return prediction

In [None]:
def k_nearest_neighbors(train,test,num_neighbors):
  predictions = list()
  for row in test:
    output = predict_classification(train,row,num_neighbors)
    predictions.append(output)
  return predictions

In [None]:
filename="abalone.csv"
dataset = load_csv(filename)
for i in range(1, len(dataset[0])):
  str_column_to_float(dataset, i)
# convert first column to integers
str_column_to_int(dataset, 0)

{'F': 2, 'I': 0, 'M': 1}

In [None]:
# evaluate algorithm
n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [23.11377245508982, 23.592814371257482, 21.916167664670656, 24.191616766467067, 27.305389221556887]
Mean Accuracy: 24.024%


In [None]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset, n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in fold]
    rmse = rmse_metric(actual, predicted)
    scores.append(rmse)
  return scores

In [None]:
def k_nearest_neighbors(train, test, num_neighbors):
  predictions = list()
  for row in test:
    output = predict_regression(train, row, num_neighbors)
    predictions.append(output)
  return(predictions)

In [None]:
seed(1)
# load and prepare data
filename = 'abalone.csv'
dataset = load_csv(filename)
for i in range(1, len(dataset[0])):
  str_column_to_float(dataset, i)
# convert first column to integers
str_column_to_int(dataset, 0)

{'F': 2, 'I': 0, 'M': 1}

In [None]:
# evaluate algorithm
n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean RMSE: %.3f' % (sum(scores)/float(len(scores))))

Scores: [2.170383116929243, 2.2087035241256405, 2.2321118594939215, 2.4013070293283603, 2.2274928845898017]
Mean RMSE: 2.248


In [None]:
def get_best_matching_unit(codebooks,test_row):
  distances = list()
  for codebook in codebooks:
    dist = euclidean_distance(codebook,test_row)
    distances.append((codebook,dist))
  distances.sort(key=lambda tup:tup[1])
  return distances[0][0]
  

In [None]:
# Test best matching unit function
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
test_row = dataset[0]
bmu = get_best_matching_unit(dataset, test_row)
print(bmu)

[2.7810836, 2.550537003, 0]


In [None]:
def random_codebook(train):
  n_records = len(train)
  n_features = len(train[0])
  codebook = [train[randrange(n_records)][i] for i in range(n_features)]
  return codebook


In [None]:
def train_codebooks(train,n_codebooks,lrate,epochs):
  codebooks = [random_codebook(train) for i in range(n_codebooks)]
  for epoch in range(epochs):
    rate = lrate * (1-(epoch/float(epochs)))
    sum_error = 0.0
    for row in train:
      bmu = get_best_matching_unit(codebooks,row)
      for i in range(len(row)-1):
        error=row[i]-bmu[i]
        sum_error += error**2
        if bmu[-1] == row[-1]:
          bmu[i] += rate * error
        else:
          bmu[i] -= rate * error
    print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, rate, sum_error))
  return codebooks

In [None]:
seed(1)
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
learn_rate = 0.3
n_epochs = 10
n_codebooks = 2
codebooks = train_codebooks(dataset, n_codebooks, learn_rate, n_epochs)
print('Codebooks: %s' % codebooks)

>epoch=0, lrate=0.300, error=43.270
>epoch=1, lrate=0.270, error=30.403
>epoch=2, lrate=0.240, error=27.146
>epoch=3, lrate=0.210, error=26.301
>epoch=4, lrate=0.180, error=25.537
>epoch=5, lrate=0.150, error=24.789
>epoch=6, lrate=0.120, error=24.058
>epoch=7, lrate=0.090, error=23.346
>epoch=8, lrate=0.060, error=22.654
>epoch=9, lrate=0.030, error=21.982
Codebooks: [[2.432316086217663, 2.839821664184211, 0], [7.319592257892681, 1.97013382654341, 1]]


In [None]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset, n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual, predicted)
    scores.append(accuracy)
  return scores

In [None]:
def predict(codebooks,test_row):
  bmu = get_best_matching_unit(codebooks,test_row)
  return bmu[-1]


In [None]:
def learning_vector_quantization(train,test,n_codebooks,lrate,epochs):
  codebooks = train_codebooks(train,n_codebooks,lrate,epochs)
  prediction = list()
  for row in test:
    output = predict(codebooks,row)
    predictions.append(output)
  return predictions

In [None]:
# Test LVQ on Ionosphere dataset
seed(1)
# load and prepare data
filename = 'ionosphere_data_kaggle.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

{'b': 0, 'g': 1}

In [None]:
n_folds = 5
learn_rate = 0.3
n_epochs = 50
n_codebooks = 20
scores = evaluate_algorithm(dataset, learning_vector_quantization, n_folds, n_codebooks,
learn_rate, n_epochs)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.300, error=2106.606
>epoch=1, lrate=0.294, error=2033.558
>epoch=2, lrate=0.288, error=1931.089
>epoch=3, lrate=0.282, error=1899.031
>epoch=4, lrate=0.276, error=1906.729
>epoch=5, lrate=0.270, error=1881.332
>epoch=6, lrate=0.264, error=1869.365
>epoch=7, lrate=0.258, error=1857.928
>epoch=8, lrate=0.252, error=1839.964
>epoch=9, lrate=0.246, error=1839.005
>epoch=10, lrate=0.240, error=1824.272
>epoch=11, lrate=0.234, error=1825.209
>epoch=12, lrate=0.228, error=1807.112
>epoch=13, lrate=0.222, error=1798.466
>epoch=14, lrate=0.216, error=1788.285
>epoch=15, lrate=0.210, error=1776.740
>epoch=16, lrate=0.204, error=1763.173
>epoch=17, lrate=0.198, error=1755.526
>epoch=18, lrate=0.192, error=1747.907
>epoch=19, lrate=0.186, error=1740.369
>epoch=20, lrate=0.180, error=1729.157
>epoch=21, lrate=0.174, error=1721.589
>epoch=22, lrate=0.168, error=1714.244
>epoch=23, lrate=0.162, error=1706.964
>epoch=24, lrate=0.156, error=1699.747
>epoch=25, lrate=0.150, error=1692.

In [None]:
# backpropagation
def initialize_network(n_inputs,n_hidden,n_outputs):
  network = list()
  hidden_layer = [{'weights':[random() for i in range(n_inputs+1)]} for i in range(n_hidden)]
  network.append(hidden_layer)
  output_layer = [{'weights':[random() for i in range(n_hidden+1)]} for i in range(n_outputs)]
  network.append(output_layer)
  return network

In [None]:
seed(1)
network = initialize_network(2,1,2)
for layer in network:
  print(layer)

[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}]
[{'weights': [0.2550690257394217, 0.49543508709194095]}, {'weights': [0.4494910647887381, 0.651592972722763]}]


In [None]:
network

[[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}],
 [{'weights': [0.2550690257394217, 0.49543508709194095]},
  {'weights': [0.4494910647887381, 0.651592972722763]}]]

In [None]:
def activate(weights,inputs):
  activation = weights[-1]
  for i in range(len(weights)-1):
    activation += weights[i] * inputs[i]
  return activation

In [None]:
def transfer(activation):
  return 1.0/(1.0+exp(-activation))

In [None]:
def forward_propagate(network,row):
  inputs = row
  for layer in network:
    new_inputs = []
    for neuron in layer:
      activation = activate(neuron['weights'],inputs)
      neuron['output'] = transfer(activation)
      new_inputs.append(neuron['output'])
    inputs = new_inputs
  return inputs

In [None]:
# test forward propagation
network = [[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}],
[{'weights': [0.2550690257394217, 0.49543508709194095]}, {'weights':
[0.4494910647887381, 0.651592972722763]}]]
row = [1, 0, None]
output = forward_propagate(network,row)
print(output)

[0.6629970129852887, 0.7253160725279748]


In [None]:
def transfer_derivative(output):
  return output * (1.0 -output)

In [None]:
def backward_propagate_error(network,expected):
  for i in reversed(range(len(network))):
    layer = network[i]
    errors = list()
    if i!= len(network)-1:
      for j in range(len(layer)):
        error = 0.0
        for neuron in network[i+1]:
          error += (neuron['weights'][j] * neuron.get('delta',0))
        errors.append(error)
    else:
      for j in range(len(layer)):
        neuron = layer[j]
        errors.append(expected[j]-neuron['output'])
    for j in range(len(layer)):
      neuron = layer[i]
      neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])

In [None]:
# test backpropagation of error
network = [[{'output': 0.7105668883115941, 'weights': [0.13436424411240122,
0.8474337369372327, 0.763774618976614]}],
[{'output': 0.6213859615555266, 'weights': [0.2550690257394217, 0.49543508709194095]},
{'output': 0.6573693455986976, 'weights': [0.4494910647887381, 0.651592972722763]}]]

In [None]:
expected = [0,1]
backward_propagate_error(network,expected)
for layer in network:
  print(layer)

[{'output': 0.7105668883115941, 'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614], 'delta': 0.007134049565623459}]
[{'output': 0.6213859615555266, 'weights': [0.2550690257394217, 0.49543508709194095]}, {'output': 0.6573693455986976, 'weights': [0.4494910647887381, 0.651592972722763], 'delta': 0.0771723774346327}]


In [None]:
def update_weights(network,row,l_rate):
  for i in range(len(network)):
    inputs = row[:-1]
    if i != 0:
      inputs = [neuron['output'] for neuron in network[i]]
    for neuron in network[i]:
      for j in range(len(inputs)):
        neuron['weights'][j] += l_rate * neuron.get('delta',0.0) * inputs[j]
      neuron['weights'][-1] += l_rate * neuron.get('delta',0.0)


In [None]:
def train_network(network,train,l_rate,n_epoch,n_outputs):
  for epoch in range(n_epoch):
    sum_error = 0.0
    for row in train:
      outputs = forward_propagate(network,row)
      expected = [0 for i in range(n_outputs)]
      expected[row[-1]] = 1
      sum_error += sum([(expected[i] - output[i]) ** 2 for i in range(len(expected))])
      backward_propagate_error(network,expected)
      update_weights(network,row,l_rate)
    print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))

In [None]:
seed(1)
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
n_inputs = len(dataset[0]) - 1
n_outputs = len(set([row[-1] for row in dataset]))
network = initialize_network(n_inputs, 2, n_outputs)
train_network(network, dataset, 0.5, 20, n_outputs)

>epoch=0, lrate=0.500, error=5.773
>epoch=1, lrate=0.500, error=5.773
>epoch=2, lrate=0.500, error=5.773
>epoch=3, lrate=0.500, error=5.773
>epoch=4, lrate=0.500, error=5.773
>epoch=5, lrate=0.500, error=5.773
>epoch=6, lrate=0.500, error=5.773
>epoch=7, lrate=0.500, error=5.773
>epoch=8, lrate=0.500, error=5.773
>epoch=9, lrate=0.500, error=5.773
>epoch=10, lrate=0.500, error=5.773
>epoch=11, lrate=0.500, error=5.773
>epoch=12, lrate=0.500, error=5.773
>epoch=13, lrate=0.500, error=5.773
>epoch=14, lrate=0.500, error=5.773
>epoch=15, lrate=0.500, error=5.773
>epoch=16, lrate=0.500, error=5.773
>epoch=17, lrate=0.500, error=5.773
>epoch=18, lrate=0.500, error=5.773
>epoch=19, lrate=0.500, error=5.773


In [None]:
for layer in network:
  print(layer)

[{'weights': [0.3468014439838053, 0.6505135531585023, 0.7129247930551437], 'output': 0.9964992964198438, 'delta': 0.0001462104326616336}, {'weights': [0.2550690257394217, 0.49543508709194095, 0.4494910647887381], 'output': 0.9844051047537846}]
[{'weights': [0.651592972722763, 0.7887233511355132, 0.0938595867742349], 'output': 0.8204788078436999}, {'weights': [-0.1725281891632022, 0.4159668626900584, 0.1284512081754094], 'output': 0.5590505547396606, 'delta': 0.10869978470320286}]


In [None]:
def predict(network,row):
  outputs = forward_propagate(network,row)
  return outputs.index(max(outputs))

In [None]:
# Test making predictions with the network
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
network = [[{'weights': [-1.482313569067226, 1.8308790073202204, 1.078381922048799]},
{'weights': [0.23244990332399884, 0.3621998343835864, 0.40289821191094327]}],
[{'weights': [2.5001872433501404, 0.7887233511355132, -1.1026649757805829]}, {'weights':
[-2.429350576245497, 0.8357651039198697, 1.0699217181280656]}]]

In [None]:
for row in dataset:
  prediction = predict(network, row)
  print('Expected=%d, Got=%d' % (row[-1], prediction))

Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=0, Got=0
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1
Expected=1, Got=1


In [None]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset, n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual, predicted)
    scores.append(accuracy)
  return scores

In [None]:
def train_network(network, train, l_rate, n_epoch, n_outputs):
  for _ in range(n_epoch):
    for row in train:
      forward_propagate(network, row)
      expected = [0 for i in range(n_outputs)]
      expected[round(row[-1])] = 1
      backward_propagate_error(network, expected)
      update_weights(network, row, l_rate)


In [None]:
def backward_propagation(train,test,l_rate,n_epoch,n_hidden):
  n_inputs = len(train[0])-1
  n_outputs = len(set([row[-1] for row in train]))
  network = initialize_network(n_inputs,n_hidden,n_outputs)
  train_network(network,train,l_rate,n_epoch,n_outputs)
  predictions = list()
  for row in test:
    prediction=predict(network,row)
    predictions.append(predict)
  return predictions


In [None]:
# Test Backprop on Seeds dataset
seed(1)
# load and prepare data
filename = 'seeds.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# normalize input variables
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

In [None]:
# evaluate algorithm
n_folds = 5
l_rate = 0.3
n_epoch = 500
n_hidden = 5
scores = evaluate_algorithm(dataset, backward_propagation, n_folds, l_rate, n_epoch, n_hidden)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [0.0, 0.0, 0.0, 0.0, 0.0]
Mean Accuracy: 0.000%


In [None]:
def subsample(dataset, ratio=1.0):
  sample = list()
  n_sample = round(len(dataset) * ratio)
  while len(sample) < n_sample:
    index = randrange(len(dataset))
    sample.append(dataset[index])
  return sample

In [None]:
# Test subsampling a dataset
seed(1)
# True mean
dataset = [[randrange(10)] for i in range(20)]
print('True Mean: %.3f' % mean([row[0] for row in dataset]))
# Estimated means
ratio = 0.10
for size in [1, 10, 100]:
  sample_means = list()
  for i in range(size):
    sample = subsample(dataset, ratio)
    sample_mean = mean([row[0] for row in sample])
    sample_means.append(sample_mean)
  print('Samples=%d, Estimated Mean: %.3f' % (size, mean(sample_means)))

True Mean: 4.500
Samples=1, Estimated Mean: 4.000
Samples=10, Estimated Mean: 4.700
Samples=100, Estimated Mean: 4.570


In [None]:
def predict(node, row):
  if row[node['index']] < node['value']:
    if isinstance(node['left'], dict):
      return predict(node['left'], row)
    else:
      return node['left']
  else:
    if isinstance(node['right'], dict):
      return predict(node['right'], row)
    else:
      return node['right']

In [None]:
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
  predictions = [predict(tree, row) for tree in trees]
  return max(set(predictions), key=predictions.count)

In [None]:
# Bootstrap Aggregation Algorithm
def bagging(train, test, max_depth, min_size, sample_size, n_trees):
  trees = list()
  for _ in range(n_trees):
    sample = subsample(train, sample_size)
    tree = build_tree(sample, max_depth, min_size)
    trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
  return(predictions)

In [None]:
seed(1)
# load and prepare data
filename = 'sonar_csv.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

{'Mine': 1, 'Rock': 0}

In [None]:
# evaluate algorithm
n_folds = 5
max_depth = 1
min_size = 2
sample_size = 0.50
for n_trees in [1, 5]:
  scores = evaluate_algorithm(dataset, bagging, n_folds, max_depth, min_size, sample_size,
  n_trees)
  print('Trees: %d' % n_trees)
  print('Scores: %s' % scores)
  print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
X60 < 0.017 Gini=0.489
X60 < 0.002 Gini=0.492
X60 < 0.002 Gini=0.492
X60 < 0.010 Gini=0.492
X60 < 0.005 Gini=0.487
X60 < 0.003 Gini=0.491
X60 < 0.009 Gini=0.491
X60 < 0.007 Gini=0.493
X60 < 0.002 Gini=0.492
X60 < 0.002 Gini=0.492
X60 < 0.006 Gini=0.492
X60 < 0.002 Gini=0.492
X60 < 0.006 Gini=0.492
X60 < 0.021 Gini=0.483
X60 < 0.010 Gini=0.476
X60 < 0.013 Gini=0.492
X60 < 0.005 Gini=0.489
X60 < 0.003 Gini=0.491
X60 < 0.004 Gini=0.493
X60 < 0.005 Gini=0.490
X60 < 0.011 Gini=0.481
X60 < 0.004 Gini=0.493
X60 < 0.003 Gini=0.492
X60 < 0.002 Gini=0.492
X60 < 0.004 Gini=0.492
X60 < 0.004 Gini=0.492
X60 < 0.009 Gini=0.492
X60 < 0.003 Gini=0.493
X60 < 0.004 Gini=0.493
X60 < 0.005 Gini=0.490
X60 < 0.002 Gini=0.492
X60 < 0.003 Gini=0.492
X60 < 0.010 Gini=0.492
X60 < 0.004 Gini=0.493
X60 < 0.003 Gini=0.492
X60 < 0.010 Gini=0.481
X60 < 0.005 Gini=0.491
X60 < 0.013 Gini=0.492
X60 < 0.010 Gini=0.490
X60 < 0.017 Gini=0.489
X60 < 0.007 Gin

In [None]:
# Select the best split point for a dataset
def get_split(dataset, n_features):
  class_values = list(set(row[-1] for row in dataset))
  b_index, b_value, b_score, b_groups = 999, 999, 999, None
  features = list()
  while len(features) < n_features:
    index = randrange(len(dataset[0])-1)
    if index not in features:
      features.append(index)
  for index in features:
    for row in dataset:
      groups = test_split(index, row[index], dataset)
      gini = gini_index(groups, class_values)
      if gini < b_score:
        b_index, b_value, b_score, b_groups = index, row[index], gini, groups
  return {'index':b_index, 'value':b_value, 'groups':b_groups}

In [None]:
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
  left, right = node['groups']
  del(node['groups'])
  # check for a no split
  if not left or not right:
    node['left'] = node['right'] = to_terminal(left + right)
    return
  # check for max depth
  if depth >= max_depth:
    node['left'], node['right'] = to_terminal(left), to_terminal(right)
    return
  # process left child
  if len(left) <= min_size:
    node['left'] = to_terminal(left)
  else:
    node['left'] = get_split(left, n_features)
    split(node['left'], max_depth, min_size, n_features, depth+1)
    # process right child
  if len(right) <= min_size:
    node['right'] = to_terminal(right)
  else:
    node['right'] = get_split(right, n_features)
    split(node['right'], max_depth, min_size, n_features, depth+1)

In [None]:
def build_tree(train, max_depth, min_size, n_features):
  root = get_split(train, n_features)
  split(root, max_depth, min_size, n_features, 1)
  return root

In [None]:
def bagging_predict(trees, row):
  predictions = [predict(tree, row) for tree in trees]
  return max(set(predictions), key=predictions.count)

In [None]:
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
  trees = list()
  for _ in range(n_trees):
    sample = subsample(train, sample_size)
    tree = build_tree(sample, max_depth, min_size, n_features)
    trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
  return(predictions)

In [None]:
seed(1)
# load and prepare data
filename = 'sonar_csv.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

{'Mine': 1, 'Rock': 0}

In [None]:
# evaluate algorithm
n_folds = 5
max_depth = 1
min_size = 1
sample_size = 1.0
n_features = int(sqrt(len(dataset[0])-1))
for n_trees in [1, 5]:
  scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size,
  sample_size, n_trees, n_features)
  print('Trees: %d' % n_trees)
  print('Scores: %s' % scores)
  print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Trees: 1
Scores: [60.97560975609756, 65.85365853658537, 68.29268292682927, 68.29268292682927, 63.41463414634146]
Mean Accuracy: 65.366%
Trees: 5
Scores: [75.60975609756098, 78.04878048780488, 65.85365853658537, 75.60975609756098, 80.48780487804879]
Mean Accuracy: 75.122%


In [None]:
def knn_model(train):
  return train

In [None]:
# Make a prediction with KNN
def knn_predict(model, test_row, num_neighbors=2):
  neighbors = get_neighbors(model, test_row, num_neighbors)
  output_values = [row[-1] for row in neighbors]
  prediction = max(set(output_values), key=output_values.count)
  return prediction

In [None]:
def perceptron_predict(model, row):
  activation = model[0]
  for i in range(len(row)-1):
    activation += model[i + 1] * row[i]
  return 1.0 if activation >= 0.0 else 0.0

In [None]:
def perceptron_model(train, l_rate=0.01, n_epoch=5000):
  weights = [0.0 for i in range(len(train[0]))]
  for epoch in range(n_epoch):
    for row in train:
      prediction = perceptron_predict(weights, row)
      error = row[-1] - prediction
      weights[0] = weights[0] + l_rate * error
      for i in range(len(row)-1):
        weights[i + 1] = weights[i + 1] + l_rate * error * row[i]
  return weights

In [None]:
def logistic_regression_predict(model, row):
  yhat = model[0]
  for i in range(len(row)-1):
    if model[i+1] and row[i]:
      yhat += model[i + 1] * row[i]
  return 1.0 / (1.0 + exp(-yhat))

In [None]:
def logistic_regression_model(train, l_rate=0.01, n_epoch=5000):
  coef = [0.0 for i in range(len(train[0]))]
  for epoch in range(n_epoch):
    for row in train:
      yhat = logistic_regression_predict(coef, row)
      error = row[-1] - yhat if row[-1] else 0.0
      coef[0] = coef[0] + l_rate * error * yhat * (1.0 - yhat)
      for i in range(len(row)-1):
        coef[i + 1]
  return coef

In [None]:
def to_stacked_row(models, predict_list, row):
  stacked_row = list()
  for i in range(len(models)):
    prediction = predict_list[i](models[i], row)
    stacked_row.append(prediction)
    stacked_row.append(row[-1])
  return row[0:len(row)-1] + stacked_row

In [None]:
# Stacked Generalization Algorithm
def stacking(train, test):
  model_list = [knn_model, perceptron_model]
  predict_list = [knn_predict, perceptron_predict]
  models = list()
  for i in range(len(model_list)):
    model = model_list[i](train)
    models.append(model)
    stacked_dataset = list()
  for row in train:
    stacked_row = to_stacked_row(models, predict_list, row)
    stacked_dataset.append(stacked_row)
    stacked_model = logistic_regression_model(stacked_dataset)
    predictions = list()
  for row in test:
    stacked_row = to_stacked_row(models, predict_list, row)
    stacked_dataset.append(stacked_row)
  stacked_model = logistic_regression_model(stacked_dataset)
  predictions = list()
  for row in test:
    stacked_row = to_stacked_row(models, predict_list, row)
    stacked_dataset.append(stacked_row)
    prediction = logistic_regression_predict(stacked_model, stacked_row)
    prediction = round(prediction)
    predictions.append(prediction)
  return predictions

In [None]:
seed(1)
# load and prepare data
filename = 'sonar_csv.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(len(dataset[0])-1):
  str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

{'Mine': 1, 'Rock': 0}

In [None]:
n_folds = 2
scores = evaluate_algorithm(dataset, stacking, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [49.03846153846153, 57.692307692307686]
Mean Accuracy: 53.365%


In [None]:
# completion