In [2]:
#Import Functions
import numpy as np
import pandas as pd
import math
from scipy.stats import bernoulli, norm
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from tabulate import tabulate
import os

In [3]:
def variable_split(mat): # Gets Y values after correlation (Selecting a Variable to Split On)
    """Pick the feature column to split on.

    The last column of ``mat`` is treated as the target. Every other column is
    ranked by the absolute value of its correlation with that target and the
    label of the top-ranked column is returned.

    Parameters
    ----------
    mat : DataFrame or anything ``pd.DataFrame`` accepts
        Feature columns followed by the target column.

    Returns
    -------
    The column label of the feature with the largest absolute correlation.
    Columns whose correlation is NaN (e.g. constant columns) are ranked last.
    """
    # Work on the DataFrame wrapper so list/array input is accepted too.
    frame = pd.DataFrame(mat)
    target_corr = frame.corr().iloc[:, -1]
    # Drop the target's correlation with itself (the last entry) before ranking.
    ranked = target_corr[:-1].abs().sort_values(ascending=False)
    return ranked.index[0]

def midpt_arr(mat, ind): #Gets Midpoints based on the index found in variable_split [Treat as alpha values to find the best one]
    """Return the midpoints between consecutive values of column ``ind``.

    The column is sorted in descending order and the average of each adjacent
    pair is collected, so the result has one row fewer than ``mat``.
    The midpoints come back as a single-column DataFrame.
    """
    ordered = pd.DataFrame(mat[ind].sort_values(ascending=False))
    pair_count = len(ordered) - 1
    midpoints = [
        (ordered.iloc[pos] + ordered.iloc[pos + 1]) / 2
        for pos in range(pair_count)
    ]
    return pd.DataFrame(midpoints)

def threshold_calc(mat): #Calculates the Thresholds after having the variable to split on and the midpoints
    """Choose the split row for the feature selected by ``variable_split``.

    Every midpoint between consecutive sorted feature values is tried as a
    candidate cut ``alpha``. Rows with ``feature <= alpha`` go left, rows with
    ``feature > alpha`` go right, and the candidate is scored by the
    size-weighted mean squared deviation of the *target* (last column of
    ``mat``) inside each side. The lowest-scoring candidate wins.

    Parameters
    ----------
    mat : pandas.DataFrame
        Feature columns followed by the target column.

    Returns
    -------
    The index label (in ``mat``) of the row sitting just above the winning
    midpoint in descending feature order. Callers read the split value as
    ``mat[feature][label]``.

    Raises
    ------
    ValueError
        If ``mat`` has fewer than two rows, so there is no midpoint to try.
    """
    feature = variable_split(mat)
    midpoints = midpt_arr(mat, feature).to_numpy(dtype=float).ravel()
    if midpoints.size == 0:
        raise ValueError("threshold_calc needs at least two rows to choose a split")

    # Sort whole rows (not just the feature column) so the target column stays
    # attached; the split must be scored on the target, not on the feature.
    ordered = mat.sort_values(feature, ascending=False)
    feature_vals = ordered[feature].to_numpy(dtype=float)
    target_vals = ordered.iloc[:, -1].to_numpy(dtype=float)

    errors = []
    for alpha in midpoints:
        y_left = target_vals[feature_vals <= alpha]
        y_right = target_vals[feature_vals > alpha]
        # ndarray.var() defaults to ddof=0, i.e. mean((y - mean(y)) ** 2).
        left_error = y_left.var() if y_left.size else 0.0
        right_error = y_right.var() if y_right.size else 0.0
        lr_data = y_left.size + y_right.size
        errors.append((y_left.size / lr_data) * left_error
                      + (y_right.size / lr_data) * right_error)

    # Midpoint k lies between sorted rows k and k + 1; report row k's label.
    best = int(np.argmin(errors))
    return ordered.index[best]
        

In [4]:
#Node of the decision tree: holds the split description, both children and the rows that reached it
class Node:
    """One node of the regression tree.

    A node whose ``name`` is still the default ``"Leaf"`` is treated as a leaf
    by the code that walks the tree.
    """

    def __init__(
        self,
        name = "Leaf",
        feature_index=None,
        threshold=None,
        left=None,
        right=None,
        dataset=None,
        depth=None,
    ):
        # Rows that reached this node and how deep the node sits.
        self.dataset = dataset
        self.depth = depth
        # Split description.
        self.name = name
        self.feature_index = feature_index
        self.threshold = threshold
        # Children.
        self.left = left
        self.right = right

In [5]:
#Generates a tree with a node inserted as the starting point. Uses all the functions above to generate a tree based on the thresholds of the data.
def tree_sample_size(node, depth):
    """Recursively grow a regression tree below ``node``.

    Parameters
    ----------
    node : Node
        Node whose ``dataset`` holds the rows to split. It is modified in
        place: ``depth`` is always set, and for a split node ``name``,
        ``feature_index``, ``threshold``, ``left`` and ``right`` are set too.
    depth : int
        Depth recorded on ``node``; children get ``depth + 1``.

    Returns
    -------
    The same ``node`` object. It stays a leaf (``name == "Leaf"``) when it has
    at most one row, when the stop test below fires, or when no split yields
    two non-empty children.
    """
    node.depth = depth
    if len(node.dataset) <= 1:
        return node

    feature = variable_split(node.dataset)
    threshold = threshold_calc(node.dataset)
    temp_dataset = pd.DataFrame(node.dataset)
    # ``threshold`` is a row label; the split value is that row's feature value.
    split_value = temp_dataset[feature][threshold]

    # NOTE(review): this stop test compares a row count with a feature value,
    # which looks like a mixed-up minimum-sample-size rule. Kept unchanged
    # because the intended rule cannot be determined from this file.
    if len(node.dataset) < split_value:
        return node

    goes_left = temp_dataset[feature] <= split_value
    goes_right = temp_dataset[feature] > split_value
    if not goes_right.any():
        # The split value is the maximum: move the rows equal to it to the right.
        goes_left = temp_dataset[feature] < split_value
        goes_right = temp_dataset[feature] >= split_value

    left_partition = temp_dataset[goes_left].reset_index(drop = True)
    right_partition = temp_dataset[goes_right].reset_index(drop = True)

    # If one side is still empty (e.g. every row has the same feature value)
    # the other child would be identical to this node and the recursion would
    # never end, so keep the node as a leaf instead.
    if len(left_partition) == 0 or len(right_partition) == 0:
        return node

    node.name = feature
    node.feature_index = feature
    node.threshold = threshold

    node.left = tree_sample_size(Node(dataset = left_partition), depth + 1)
    node.right = tree_sample_size(Node(dataset = right_partition), depth + 1)

    return node

#Predicts the Y value of every row by walking the decision tree down to a leaf
def error_calc(node, dataset):
    """Return the tree's prediction for each row of ``dataset``.

    Parameters
    ----------
    node : Node
        Root of a tree built by ``tree_sample_size``.
    dataset : pandas.DataFrame
        Rows to predict. Cell ``dataset[feature][i]`` is read by label, so the
        rows are expected to be labelled ``0 .. len(dataset) - 1``.

    Returns
    -------
    list
        One value per row: the mean of the last column of the dataset stored
        on the leaf the row ends up in. A row that runs into a missing child
        gets ``nan``.
    """
    mat = dataset
    predictions = []
    for i in range(len(mat)):
        current = node
        prediction = float('nan')
        while current is not None:
            if current.name == "Leaf":
                prediction = pd.DataFrame(current.dataset).iloc[:, -1].mean()
                break
            # ``threshold`` is a row label inside the CURRENT node's dataset
            # (child datasets are re-indexed), so it must be resolved there and
            # not in the root's dataset.
            split_value = current.dataset[current.feature_index][current.threshold]
            if mat[current.feature_index][i] <= split_value:
                current = current.left
            else:
                current = current.right
        predictions.append(prediction)
    return predictions

In [110]:
#Generates Dataset for both leaves and wood image
leaves_image = 'Leaves_Masked.jpg'
woods_image = 'Wood_Masked.jpg'

IMAGE_SIZE = 900     # only the top-left IMAGE_SIZE x IMAGE_SIZE pixels are used
WHITE_CUTOFF = 235   # a pixel whose R, G and B all exceed this counts as mask (white)


def build_pixel_dataset(pixels, size=IMAGE_SIZE, white_cutoff=WHITE_CUTOFF):
    """Turn an image array into training rows for the per-channel trees.

    For every non-white pixel, visited row by row, the features are the pixel's
    own (R, G, B) and the targets are the (R, G, B) of the pixel immediately to
    its right. Pixels in the last column have no right-hand neighbour and get
    a target of 0.

    Parameters
    ----------
    pixels : array-like
        Image data; assumed to be (rows, cols, >=3 channels) -- TODO confirm
        for images that are not plain RGB.
    size : int
        Number of rows and columns taken from the top-left corner.
    white_cutoff : int
        A pixel is skipped when all three channels are above this value.

    Returns
    -------
    (features, y_r, y_g, y_b, white_count)
        ``features`` is a list of ``[r, g, b]`` lists, the ``y_*`` are lists of
        ints aligned with it, and ``white_count`` is the number of skipped
        pixels.
    """
    region = np.asarray(pixels)[:size, :size, :3].astype(int)

    # Right-hand neighbour of every pixel; the last column stays 0.
    right_neighbour = np.zeros_like(region)
    right_neighbour[:, :-1] = region[:, 1:]

    is_white = (region > white_cutoff).all(axis=2)
    keep = ~is_white
    # Boolean-mask indexing walks the image in row-major order, i.e. the same
    # order as a nested "for row / for column" loop.
    features = region[keep].tolist()
    targets = right_neighbour[keep]
    return (
        features,
        targets[:, 0].tolist(),
        targets[:, 1].tolist(),
        targets[:, 2].tolist(),
        int(is_white.sum()),
    )


image_leaf = Image.open(leaves_image)
image_wood = Image.open(woods_image)
# np.array copies the pixel data into a writable array. np.asarray on a PIL
# image yields a read-only array, which cannot be filled in place later on.
arr_image_leaf = np.array(image_leaf)
arr_image_woods = np.array(image_wood)

leaves_df, Y_r, Y_g, Y_b, white_count = build_pixel_dataset(arr_image_leaf)
woods_df, Y_r_w, Y_g_w, Y_b_w, white_count = build_pixel_dataset(arr_image_woods)

In [111]:
#Creates 3 separate datasets for wood and leaves image, one for red values, one for green values, and one for blue values
def _join_target(features, target):
    """Join one target column onto the feature frame by row index.

    Both frames carry a column labelled 0, so the merge suffixes them as
    '0_x' (feature side) and '0_y' (target side); they are renamed to 0 and 3.
    """
    joined = pd.merge(features, target, left_index=True, right_index=True)
    return joined.rename(columns = {'0_x': 0, '0_y': 3})


# Leaves image: shared features plus one target frame per colour channel.
leaves_df = pd.DataFrame(leaves_df)
Y_r_df = pd.DataFrame(Y_r)
Y_g_df = pd.DataFrame(Y_g)
Y_b_df = pd.DataFrame(Y_b)
synth_data_red = _join_target(leaves_df, Y_r_df)
synth_data_green = _join_target(leaves_df, Y_g_df)
synth_data_blue = _join_target(leaves_df, Y_b_df)

# Wood image: same layout.
woods_df = pd.DataFrame(woods_df)
Y_r_w_df = pd.DataFrame(Y_r_w)
Y_g_w_df = pd.DataFrame(Y_g_w)
Y_b_w_df = pd.DataFrame(Y_b_w)
synth_data_red_w = _join_target(woods_df, Y_r_w_df)
synth_data_green_w = _join_target(woods_df, Y_g_w_df)
synth_data_blue_w = _join_target(woods_df, Y_b_w_df)


In [113]:
#Creates a decision tree for training data
def _fit_channel_tree(channel_data):
    """Grow one tree on the first rows of a per-channel dataset.

    Returns ``(root, tree, train_rows, test_rows)``. ``root`` and ``tree`` are
    the same Node object because ``tree_sample_size`` returns the node it was
    given. ``.loc`` slices by label and includes the end label, so
    ``train_rows`` covers labels 0..2000 and ``test_rows`` labels 0..500.
    """
    frame = pd.DataFrame(channel_data)
    train_rows = frame.loc[0:2000]
    test_rows = frame.loc[0:500]
    root = Node(dataset = train_rows)
    grown = tree_sample_size(root, 1)
    return root, grown, train_rows, test_rows


# Leaves image, one tree per colour channel.
tree_node_red, test_tree_red, mat_train, mat_test = _fit_channel_tree(synth_data_red)
tree_node_green, test_tree_green, mat_train, mat_test = _fit_channel_tree(synth_data_green)
tree_node_blue, test_tree_blue, mat_train, mat_test = _fit_channel_tree(synth_data_blue)

# Wood image, one tree per colour channel.
tree_node_red_w, test_tree_red_w, mat_train, mat_test = _fit_channel_tree(synth_data_red_w)
tree_node_green_w, test_tree_green_w, mat_train, mat_test = _fit_channel_tree(synth_data_green_w)
tree_node_blue_w, test_tree_blue_w, mat_train, mat_test = _fit_channel_tree(synth_data_blue_w)

In [118]:
#Uses Decision trees to predict results within white space. Additionally appends predictions to original array
leaves_image = 'Leaves_Masked.jpg'
wood_image = 'Wood_Masked.jpg'

MASK_START = 300   # first row/column of the square region that gets filled
MASK_STOP = 600    # one past the last row/column of that region

# Aliases of the trained trees, kept so any cell that refers to them still works.
temp_tree_red = test_tree_red
temp_tree_green = test_tree_green
temp_tree_blue = test_tree_blue
temp_tree_red_w = test_tree_red_w
temp_tree_green_w = test_tree_green_w
temp_tree_blue_w = test_tree_blue_w

image = Image.open(leaves_image)
arr_image = np.asarray(image)
image_wood = Image.open(wood_image)
# np.array returns writable copies; np.asarray on a PIL image is read-only and
# would reject the in-place pixel assignments below.
arr_image_leaf = np.array(image)
arr_image_woods = np.array(image_wood)


def fill_masked_region(pixels, trees, start=MASK_START, stop=MASK_STOP):
    """Fill the square ``[start, stop)`` of ``pixels`` in place, left to right.

    The trees are trained to map a pixel's (R, G, B) to the channel values of
    the pixel on its right, so pixel (i, j) is predicted from pixel (i, j - 1).
    Because each row is filled left to right, a freshly predicted pixel is the
    input for the next one. The training rows contain only non-masked pixels,
    so they cannot be addressed by the flat pixel position ``i * 900 + j``;
    features are therefore read from the image array itself.

    Parameters
    ----------
    pixels : numpy.ndarray
        Writable image array, modified in place. Assumed to have exactly three
        channels -- TODO confirm for non-RGB images.
    trees : sequence of three Nodes
        Red, green and blue trees, in that order.
    start, stop : int
        Bounds of the square region, applied to both rows and columns.

    Returns
    -------
    list
        The raw ``[r, g, b]`` predictions, in row-major order.
    """
    predictions = []
    for i in range(start, stop):
        for j in range(start, stop):
            left_pixel = pixels[i][max(j - 1, 0)]
            features = pd.DataFrame(
                [[int(left_pixel[0]), int(left_pixel[1]), int(left_pixel[2])]]
            )
            rgb = [error_calc(tree, features)[0] for tree in trees]
            predictions.append(rgb)
            # Predictions are float means (possibly NaN); make them valid
            # 0..255 channel values before storing them in the image.
            pixels[i][j] = np.clip(np.rint(np.nan_to_num(rgb)), 0, 255)
    return predictions


vals_two = fill_masked_region(
    arr_image_leaf, (test_tree_red, test_tree_green, test_tree_blue)
)
vals_two_w = fill_masked_region(
    arr_image_woods, (test_tree_red_w, test_tree_green_w, test_tree_blue_w)
)

In [None]:
#Saves both images with the predictions written over the white space
leaf_final = Image.fromarray(arr_image_leaf)
wood_final = Image.fromarray(arr_image_woods)
# Image.save picks the file format from the extension, so the name must end
# in ".png" (the previous 'final_leaf_dt_png' had no extension at all).
leaf_final.save('final_leaf_dt.png')
wood_final.save('final_wood_dt.png')