### Only for format  the raw data.

In [7]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log
from anytree import Node, RenderTree
from anytree import search as anys
from anytree.exporter import DotExporter
from IPython.display import Image

#### Load and format dataset Titanic survived dataset. 

In [8]:
# laod original trainning dataset.
train_data = pd.read_csv(
    "train.csv.titanic",
    sep=r'\s*,\s*',
    quotechar="'",
    engine='python',
    index_col=False,
    na_values="?")

# Counting...
Target = 'Survived'
Labels = train_data[Target].unique()
counts = train_data[Target].value_counts()
print(counts)

# laod original testing dataset (dataset without labels).
test_data = pd.read_csv(
    "test.csv.titanic",
    sep=r'\s*,\s*',
    engine='python',
    quotechar="'",
    index_col=False,
    na_values="?")

# Load labels for testing dataset.
test_labels = pd.read_csv(
    "gender_submission.csv.titanic",
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")

# Counting...
Target = 'Survived'
Labels = test_labels[Target].unique()
counts = test_labels[Target].value_counts()
print(counts)

0    549
1    342
Name: Survived, dtype: int64
0    266
1    152
Name: Survived, dtype: int64


In [9]:
# Fusion labels to fusion train and test.
test_data['Survived'] = test_labels['Survived']

In [10]:
# Fusion the datasets using only selected features.
final = pd.concat([train_data[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Survived"]], test_data[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Survived"]]])

In [11]:
# Shuffling the data
final = final.sample(frac=1)
# Drop missing data
final = final.dropna()
# Convert to integer
final['Age'] = final['Age'].astype('float').astype('int64')
final['Fare'] = final['Fare'].astype('float').astype('int64')
# Converto to categorical features
final = final.astype('category')

In [12]:
# How much samples remaing.
Target = 'Survived'
Labels = final[Target].unique()
counts = final[Target].value_counts()
print(counts)

0    628
1    415
Name: Survived, dtype: int64


In [13]:
# Split the data by labels.
final0 = final.loc[final['Survived'] == 0]
final1 = final.loc[final['Survived'] == 1]
final0.reset_index(inplace=True)
final1.reset_index(inplace=True)

In [14]:
# A function to use 'frac' parameter.
# Split the data in frac < 1 for trainning and (1 - frac) for testing.

def makedata(final0,final1,frac):
    side0A = final0.loc[1:int(round(final0['index'].count()*frac)),:]
    side0B = final0.loc[int(round(final0['index'].count()*frac)):,:]
    side1A = final1.loc[1:int(round(final1['index'].count()*frac)),:]
    side1B = final1.loc[int(round(final1['index'].count()*frac)):,:]
    train = [side0A, side1A]
    test = [side0B, side1B]
    tempA = pd.concat(train).sample(frac=1)
    tempA.reset_index(inplace=True)
    tempB = pd.concat(test).sample(frac=1)
    tempB.reset_index(inplace=True)
    tempA = tempA.drop(tempA.columns[0:2], axis=1)
    tempB = tempB.drop(tempB.columns[0:2], axis=1)
    tempA = tempA.astype('str')
    tempB = tempB.astype('str')
    return tempA, tempB

In [15]:
train, test = makedata(final0,final1,0.6)

In [16]:
# Counting...

Target = 'Survived'
Labels = train[Target].unique()
counts = train[Target].value_counts()
print(counts)

Target = 'Survived'
Labels = test[Target].unique()
counts = test[Target].value_counts()
print(counts)

0    377
1    249
Name: Survived, dtype: int64
0    251
1    166
Name: Survived, dtype: int64


In [17]:
# Save the dataset in CSV.
train.to_csv('dataset.training.csv.titanic', sep=',',  encoding='ascii', decimal='.', index=False, header=False)
test.to_csv('dataset.test.csv.titanic', sep=',',  encoding='ascii', decimal='.', index=False, header=False)

In [18]:
def find_entropy(df):
    entropy = 0
    values = df[Target].unique()
    for value in values:
        temp = df[Target].value_counts()[value]/len(df[Target])
        entropy += -temp*np.log2(temp)
    return entropy

def find_entropy_attribute(df,attribute):
    
    if not np.issubdtype(df[attribute].dtype, np.number):   
        return find_entropy_attribute_not_number(df,attribute), None
    else:
        return find_entropy_attribute_number(df,attribute)
          
        
def find_entropy_attribute_not_number(df,attribute):
    target_variables = df[Target].unique()  #This gives all 'Yes' and 'No'
    variables = df[attribute].unique()    #This gives different features in that attribute (like 'Hot','Cold' in Temperature)
    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]==variable][df[Target] ==target_variable])
            den = len(df[attribute][df[attribute]==variable])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        entropy2 += -(den/len(df))*entropy
    return abs(entropy2)


def find_entropy_attribute_number(df,attribute):
    target_variables = df[Target].unique()  #This gives all 'Yes' and 'No'
    variables = df[attribute].unique()    #This gives different features in that attribute (like 'Hot','Cold' in Temperature)
    variables.sort()
    if len(variables)>2:
        variables = variables[1:-1]
        vk3 = variables[0]
        entropy3 = 0
    else:
        vk3 = variables[0]
        entropy3 = np.Inf
    
    for vk in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]<=vk][df[Target] ==target_variable])
            den = len(df[attribute][df[attribute]<=vk])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]>vk][df[Target] ==target_variable])
            den = len(df[attribute][df[attribute]>vk])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        entropy2 = (den/len(df))*abs(entropy)
        #print(str(entropy2)+"|"+str(vk))
        if entropy2>entropy3:
            entropy3 = entropy2
            vk3 = vk
    return abs(entropy3),vk3

def find_winner(df):
    IG = []
    vk = list()
    for key in df.columns.difference([Target]):
        temp,temp2 = find_entropy_attribute(df,key)
        vk.append(temp2)
        IG.append(find_entropy(df)-temp)
    return df.columns.difference([Target])[np.argmax(IG)], vk[np.argmax(IG)]

def buildtree(df,tree=None, mytree=None, T_pro=0.1, T_pro_num=0.4):
    
    def ramificatree(Thd):
        if (len(clValue)==1):
            tree[node][value] = clValue[0]
            print(node +' : '+value+' : '+clValue[0])
        else:
            rel_counts = counts.min() / counts.max()
            if (rel_counts<Thd):
                tree[node][value] = clValue[counts.argmax()]
                print(node +' : '+value+' : '+clValue[counts.argmax()])
            else:
                tree[node][value] = buildtree(subtable)
                print(node +' : '+value+' : *')

    #print(find_winner(df))
    #formata_dados(dados)
    node,vk = find_winner(df)

    if tree is None:
        tree={}
        tree[node] = {}

    if vk is None:
        attValue = np.unique(df[node])
        for value in attValue:

            subtable = df[df[node] == value].reset_index(drop=True)
            clValue,counts = np.unique(subtable[Target],return_counts=True)

            ramificatree(T_pro)
    else:
        
        if (len(df[node][df[node] <= vk].unique())>0) and (len(df[node][df[node] > vk].unique())>0):
           
            # >vk
            value = node+' >'+str(vk)
            subtable = df[df[node] > vk].rename(columns = {node:value}).reset_index(drop=True)
            clValue,counts = np.unique(subtable[Target],return_counts=True)            
            if (len(subtable[value].unique())==1) and (len(clValue)>1):
                clValue = clValue[counts.argmax()]
                tree[node][value] = clValue[0]
                print(node +' : '+value+' : '+clValue[0])
            else:
                ramificatree(T_pro_num)
            clValue_antes = clValue[0]
            value_antes = value
            # <=vk
            value = node+' <='+str(vk)
            subtable = df[df[node] <= vk].rename(columns = {node:value}).reset_index(drop=True)
            clValue,counts = np.unique(subtable[Target],return_counts=True)
            if ((len(subtable[value].unique())==1) and (len(clValue)>1)):
                tree[node][value] = clValue[counts.argmax()]
                print(node +' : '+value+' : '+clValue[counts.argmax()])
            else:
                ramificatree(T_pro_num)

        else:
            df[node] = df[node].astype(str)
            buildtree(df)
        
    return tree


# Only to see

def print_tree(arg):
    for pre, fill, node in RenderTree(arg):
        print("%s%s" % (pre, node.name))
        
def converte_para_anytree(tree,node=None,mytree=None):
    
    if node is None:
        temp = list(tree.keys())
        node = temp[0]
        mytree = {}
        mytree[node] = Node(node)
        converte_para_anytree(tree,node,mytree)
    else:
        tree = tree[node]
        if not isinstance(tree, str):
            childs = list(tree.keys())
            for child in childs:
                if ((tree[child] == Labels[0]) or (tree[child] == Labels[1])):
                    temp = mytree[node]
                    mytree[child] = Node(child, parent=temp, target=tree[child])
                else:
                    temp = mytree[node]
                    mytree[child] = Node(child, parent=temp)
                    converte_para_anytree(tree,child,mytree)
        else:
            mytree[node] = 'Fim'
                
    return mytree

#anys.findall_by_attr(mytree['Taste'], name="target", value='Yes')

def mostra_tree(tree):
    mytree = converte_para_anytree(tree)

    temp = list(tree.keys())
    root = temp[0]
    mytree[root]

    for pre, fill, node in RenderTree(mytree[root]):
        txt_node = str(node)
        if  Labels[0] in txt_node:
            print("%s%s" % (pre, node.name+': '+Labels[0]))
        elif Labels[1] in txt_node:
            print("%s%s" % (pre, node.name+': '+Labels[1]))
        else:
            print("%s%s" % (pre, node.name))
    
    
def mostra_tree_graph(tree, largura=None, altura=None):
    mytree = converte_para_anytree(tree)

    temp = list(tree.keys())
    root = temp[0]
    mytree[root]
    DotExporter(mytree[root]).to_picture("tree.png")
    return Image(filename='tree.png', width=largura, height=altura) 


def predict(inst,tree):

    for node in tree.keys():     

        if ('<=' in str(tree[node].keys())):

            childs = list(tree[node].keys())

            if ('<=' in childs[1]):
                temp = childs[1]
                childs[1] = childs[0]
                childs[0] = temp

            vk = float(childs[1].split('>')[1])
            valor = float(str(inst[node]))
            if (valor > vk):
                tree = tree[node][childs[1]]
                prediction = None
                if type(tree) is dict:
                    prediction = predict(inst, tree)
                else:
                    prediction = tree
                    break;
            else:
                tree = tree[node][childs[0]]
                prediction = None
                if type(tree) is dict:
                    prediction = predict(inst, tree)
                else:
                    prediction = tree
                    break;
            
        else:
            value = inst[node]
            if value in tree[node].keys():
                tree = tree[node][value]
                prediction = None
                if type(tree) is dict:
                    prediction = predict(inst, tree)
                else:
                    prediction = tree
                    break;
            else:
                prediction = 'Not exists node.'
        
    return prediction

def test_step(arg,tree):
    S = 0
    for i in range(0,len(arg)):
        S += (predict(arg.iloc[i],tree) == arg.iloc[i].Target)*1

    print(S / len(arg))

In [19]:
tree = None
tree = buildtree(train_data,T_pro = 0.2)

TypeError: must be str, not numpy.int64