In [None]:
#Dataset with two skin segments (1, 2 based on Red/Green/Blue value features) 
#from: https://archive.ics.uci.edu/ml/datasets/Skin+Segmentation

In [1]:
import pandas as pd
import math
import random


skin_segmentation = pd.read_csv("../Data/SkinSegmentation.txt", sep="\t")
#Randomizing dataset so that we don't have deterministic results (i.e. results can vary from run to run)
skin_segmentation = skin_segmentation.sample(frac=1)
#Adding column-names to dataset for readability and usability
skin_segmentation.columns = ["R","G","B","Class"]

#k-fold cross validation
k=10

chunk_size = math.floor(skin_segmentation.shape[0]/k)
accuracy_accumulation = []

for iteration in range(1, k+1):
    print("Computing fold where test chunk is:", k-iteration)
    
    #Ascribing the test_set and training_set depending on the fold
    test_set = skin_segmentation.iloc[(k-iteration)*chunk_size:(k-iteration+1)*chunk_size, ]
    training_set_left = skin_segmentation.iloc[:(k-iteration)*chunk_size, ]
    training_set_right = skin_segmentation.iloc[(k-iteration+1)*chunk_size:, ]
    training_set = training_set_left.append(training_set_right)
    
    #We train on the training-data here, very simple training classifier/model where we just tally up how many 
    #of a certain class we see and compute the probability of the given class for the whole training data
    print("Training...")
    ones = sum((training_set[["Class"]] == 1).values.T.tolist()[0])
    twos = sum((training_set[["Class"]] == 2).values.T.tolist()[0])

    total = ones + twos
    ones_prob = ones/total
    twos_prob = twos/total
    
    print("Finished Training, now Testing...")
    # We trained out simple model, now we test on our test data and see how well we do
    true_values = test_set[["Class"]]
    predicted_values = []
    for sample in test_set.iterrows():
        #Random number between 0 and 1
        class_generation = random.random()
        #Here we pick the class that has the closest probability (from training) based on the random draw
        choose_one = abs(class_generation - ones_prob)
        choose_two = abs(class_generation - twos_prob)
        if choose_one < choose_two:
            predicted_values.append(1)
        else:
            predicted_values.append(2)

    comparison_list = (true_values == predicted_values).values.T.tolist()
    accuracy = sum(comparison_list[0])/len(predicted_values)
    print("Fold Accuracy:", accuracy*100, "%")
    accuracy_accumulation.append(accuracy)
    
#Printing accuracy for each iteration
print(accuracy_accumulation)
#Printing averaged accuracy
print("Averaged Accuracy:", (sum(accuracy_accumulation)/len(accuracy_accumulation))*100, "%")
#Printing averaged error
print("Averaged Error:", (1-sum(accuracy_accumulation)/len(accuracy_accumulation))*100, "%")


Computing fold where test chunk is: 9
Training...
Finished Training, now Testing...
Fold Accuracy: 49.23893083044277 %
Computing fold where test chunk is: 8
Training...
Finished Training, now Testing...
Fold Accuracy: 50.018363599265456 %
Computing fold where test chunk is: 7
Training...
Finished Training, now Testing...
Fold Accuracy: 49.94082840236687 %
Computing fold where test chunk is: 6
Training...
Finished Training, now Testing...
Fold Accuracy: 50.27137318914507 %
Computing fold where test chunk is: 5
Training...
Finished Training, now Testing...
Fold Accuracy: 50.11222199551112 %
Computing fold where test chunk is: 4
Training...
Finished Training, now Testing...
Fold Accuracy: 50.00204039991838 %
Computing fold where test chunk is: 3
Training...
Finished Training, now Testing...
Fold Accuracy: 49.675576412976945 %
Computing fold where test chunk is: 2
Training...
Finished Training, now Testing...
Fold Accuracy: 49.85513160579473 %
Computing fold where test chunk is: 1
Training