In [3]:
import pandas as pd
import numpy as np
from pprint import pprint
import random


![](https://github.com/thesouth97/Code/blob/master/wether_tree.png?raw=true)

In [4]:
# Load the weather dataset and remove the 'id' column; of the remaining
# columns, every one except the last is later offered as a candidate split
# feature, and the last one is read as the label.
df = pd.read_csv('weather.csv')
df = df.drop('id', axis=1)

In [5]:
def train_test_split(df, test_size=0.2, random_state=None):
    """Randomly partition ``df`` into a training and a test DataFrame.

    Rows are selected purely by position, so the split is correct for any
    index (non-default, non-integer or containing duplicate labels).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    test_size : float, default 0.2
        Fraction of rows placed in the test set; ``int(n_rows * test_size)``
        rows are sampled without replacement.
    random_state : int or None, default None
        Seed for a private ``random.Random`` generator, making the split
        reproducible. With ``None`` the module-level ``random`` state is used,
        which is the original behaviour.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        ``train_df`` keeps the original row order; ``test_df`` is in sampled
        order.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when the requested number of test
        rows is negative or larger than the number of rows.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_size)

    sampler = random if random_state is None else random.Random(random_state)
    test_positions = sampler.sample(population=list(range(n_rows)), k=n_test)

    # Build a positional mask instead of dropping by label: ``df.drop`` works
    # on index labels, which only coincide with positions for a default index.
    is_test = np.zeros(n_rows, dtype=bool)
    is_test[np.asarray(test_positions, dtype=int)] = True

    test_df = df.iloc[test_positions]
    train_df = df.iloc[~is_test]

    return train_df, test_df

In [169]:
# Hold out 40% of the rows as the test set. The sampling uses Python's
# `random` module and no seed is set in the cells shown here, so the split
# (and presumably everything derived from it) can differ between runs.
train_df, test_df = train_test_split(df, test_size=0.4)

# Helper functions

## Data pure?

In [7]:
def check_purity(df):
    """Tell whether all rows of ``df`` share one label.

    The label is read from the last column. Returns ``True`` when exactly one
    distinct label is present and ``False`` otherwise.
    """
    distinct_labels = np.unique(df.iloc[:, -1])
    return distinct_labels.shape[0] == 1

## Classify

In [9]:
def classify_data(df):
    """Return the majority label of ``df`` (label = last column).

    When several labels are tied for the highest count, the one that sorts
    first is returned, because ``np.unique`` yields sorted values and
    ``argmax`` picks the first maximum.
    """
    classes, frequencies = np.unique(df.iloc[:, -1], return_counts=True)
    return classes[frequencies.argmax()]

## Potential splits

In [11]:
def get_potential_split(df):
    """Collect the candidate split values for every feature column.

    Every column except the last one is treated as a feature. The result maps
    each feature column name to the sorted list of distinct values found in
    that column.
    """
    feature_columns = df.columns[:-1]
    return {
        name: np.unique(df[name].tolist()).tolist()
        for name in feature_columns
    }

## Split data

In [35]:
def split_data(df, split_column, split_feature):
    """Split ``df`` into the rows that match a feature value and the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    split_column : column label
        Column whose values are tested.
    split_feature : any
        Value to test for. The comparison is made on string forms, so a
        numeric or boolean column matches its own values as well (previously
        the column was compared against ``str(split_feature)`` directly, which
        never matched non-string columns).

    Returns
    -------
    (data_same_feature, data_different_feature) : tuple of pandas.DataFrame
        Rows where the column equals the value, and all remaining rows.
    """
    matches = df[split_column].astype(str) == str(split_feature)

    data_same_feature = df[matches]
    data_different_feature = df[~matches]

    return data_same_feature, data_different_feature

In [14]:
def split_data_column(df, split_column):
    """Partition ``df`` by the distinct values of ``split_column``.

    Returns a list with one DataFrame per distinct value, ordered by the
    sorted distinct values; each DataFrame holds the rows carrying that value.
    """
    distinct_values = np.unique(df[split_column].tolist())
    return [df[df[split_column] == value] for value in distinct_values]

## Lowest overall entropy 

In [17]:
def calculate_entropy(df):
    """Compute the entropy of the label column (last column) of ``df``.

    The natural logarithm is used, so the value is expressed in nats.
    """
    label_values = df.iloc[:, -1].tolist()
    counts = np.unique(label_values, return_counts=True)[1]

    # Relative frequency of each distinct label.
    total = counts.sum()
    proportions = counts * 1.0 / total

    return np.sum(-proportions * np.log(proportions))

In [41]:
def calculate_overall_entropy(data_same_feature, data_different_feature):
    """Return the size-weighted entropy of a two-way split.

    Each partition's entropy (from ``calculate_entropy``) is weighted by that
    partition's share of the combined row count, and the two weighted values
    are added.
    """
    rows_same = data_same_feature.shape[0]
    rows_different = data_different_feature.shape[0]
    rows_total = rows_same + rows_different

    entropy_same = calculate_entropy(data_same_feature)
    entropy_different = calculate_entropy(data_different_feature)

    weighted_same = entropy_same * float(rows_same) / rows_total
    weighted_different = entropy_different * float(rows_different) / rows_total

    return weighted_same + weighted_different

In [44]:
def determine_best_split(df, potential_split):
    """Pick the (column, value) question with the lowest weighted entropy.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; the last column is the label.
    potential_split : dict
        Maps each candidate column name to the list of values to try.

    Returns
    -------
    (best_column_split, best_feature_split) : tuple
        The winning column and value. ``(None, None)`` is returned when no
        candidate actually divides the data (no candidates at all, or every
        candidate leaves one side empty) instead of failing with an unbound
        local variable.
    """
    # Start from +inf rather than an arbitrary large constant so that any real
    # entropy value is accepted as an improvement.
    lowest_entropy = float('inf')
    best_column_split = None
    best_feature_split = None

    for column_name, candidate_values in potential_split.items():
        for feature in candidate_values:
            data_same_feature, data_different_feature = split_data(df, column_name, feature)

            # A split with an empty side does not separate anything; choosing
            # it would hand an empty DataFrame to the next recursion level.
            if data_same_feature.shape[0] == 0 or data_different_feature.shape[0] == 0:
                continue

            curr_overall_entropy = calculate_overall_entropy(data_same_feature, data_different_feature)
            if curr_overall_entropy < lowest_entropy:
                lowest_entropy = curr_overall_entropy
                best_column_split = column_name
                best_feature_split = feature

    return best_column_split, best_feature_split

# Decision Tree Algorithm

sub_tree = {question: [yes_answer, no_answer]}

In [23]:
# Hand-written illustration of a nested tree: each key is an
# '<attribute> = <value>' string and each value is a one-element list holding
# either a class label or a nested dict of further keys.
# NOTE(review): this variable is not referenced by any other cell shown here,
# and its shape (one key per attribute value) differs from the
# {question: [yes_answer, no_answer]} layout described just above -- confirm
# whether it is still needed.
tree_example = {'outlook = sunny':[{'hummidity = normal':['play'], 
                                    'hummidity = high':['no']}], 
                
                'outlook = overcast':['play'], 
                
                'outlook = rainy':[{'wind = weak':['play'], 
                                    'wind = strong':['no']}]}

### Algorithm

In [170]:
def Decition_tree(df, counter=0, max_depth=5):
    """Recursively build a binary decision tree from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; the last column is the label, all other columns are
        features.
    counter : int, default 0
        Current depth. Callers normally leave this at 0; it is incremented on
        each recursive call.
    max_depth : int, default 5
        Maximum number of nested questions.

    Returns
    -------
    dict or label
        ``{question: [yes_answer, no_answer]}`` where ``question`` is the
        string ``'<column> = <value>'`` and each answer is either a nested
        tree of the same form or a class label. A bare class label is returned
        when the data is pure, the depth limit is reached, or no split divides
        the data.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if counter == 0:
        # Kept for backward compatibility: the top-level call has always
        # published the column index of the training data as a module-level
        # global named ``columns``.
        global columns
        columns = df.columns

    data = df

    if data.shape[0] == 0:
        raise ValueError('Cannot build a decision tree from an empty DataFrame')

    # ``>=`` (rather than ``==``) also stops when the caller starts with a
    # counter that is already past max_depth.
    if check_purity(data) or counter >= max_depth:
        return classify_data(data)

    potential_split = get_potential_split(data)
    if not any(potential_split.values()):
        # No feature column offers a value to split on.
        return classify_data(data)

    best_split_column, best_split_feature = determine_best_split(data, potential_split)
    if best_split_column is None:
        # The splitter reported that no usable split exists.
        return classify_data(data)

    data_same_feature, data_different_feature = split_data(data, best_split_column, best_split_feature)
    if data_same_feature.shape[0] == 0 or data_different_feature.shape[0] == 0:
        # The chosen question does not separate the rows; recursing would pass
        # an empty DataFrame down and fail, so stop with a leaf instead.
        return classify_data(data)

    question = '{} = {}'.format(best_split_column, best_split_feature)
    next_depth = counter + 1

    yes_answer = Decition_tree(data_same_feature, next_depth, max_depth)
    no_answer = Decition_tree(data_different_feature, next_depth, max_depth)

    return {question: [yes_answer, no_answer]}

In [176]:
# Build a tree from the training split, allowing at most two levels of
# questions, and pretty-print the resulting nested dict.
tree = Decition_tree(train_df, counter=0, max_depth=2)
pprint(tree)

{'outlook = sunny': [{'humidity = high': ['no', 'yes']},
                     {'wind = strong': ['no', 'yes']}]}


## Classification

In [178]:
def classify_example(example, tree):
    """Predict the label of one example by walking ``tree``.

    Parameters
    ----------
    example : mapping (e.g. a pandas Series row or a dict)
        Maps column names to the example's feature values.
    tree : dict or label
        A tree of the form ``{'<column> = <value>': [yes_answer, no_answer]}``
        whose answers are nested trees or class labels. A bare label is
        accepted and returned unchanged.

    Returns
    -------
    The class label stored at the leaf that the example reaches.
    """
    node = tree
    while isinstance(node, dict):
        question = list(node.keys())[0]

        # Split on the first ' = ' separator only, so column names and values
        # that contain spaces are parsed intact (a plain ``split()`` breaks on
        # every whitespace run and fails to unpack).
        column_name, _, expected_value = question.partition(' = ')

        # The question stores the value as text; compare string forms so that
        # numeric feature values match as well.
        if str(example[column_name]) == expected_value:
            node = node[question][0]
        else:
            node = node[question][1]

    return node

## Accuracy

In [179]:
def accuracy(df, tree):
    """Return the share of rows in ``df`` that ``tree`` labels correctly.

    Each row is classified with ``classify_example`` and the prediction is
    compared with the row's true label (last column). The result is the mean
    of the per-row hit/miss flags.
    """
    true_labels = df.iloc[:, -1].tolist()

    # Predict one label per row, in row order.
    pred_labels = [
        classify_example(df.iloc[row, :], tree)
        for row in range(df.shape[0])
    ]

    hits = [
        True if predicted == actual else False
        for predicted, actual in zip(pred_labels, true_labels)
    ]
    return np.asarray(hits).mean()

In [180]:
# Share of held-out test rows for which the tree's prediction equals the true label.
accuracy(test_df, tree)

0.8