In [19]:
import numpy as np 
import pandas as pd
from scipy.io import loadmat

In [2]:
def check_purity(data):
    """Tell whether every row of ``data`` carries the same label.

    The label is taken from the last column of the 2-D array ``data``.
    Returns ``True`` only when exactly one distinct label is present.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [4]:
def create_leaf(data, ml_task):
    """Build the prediction stored in a leaf node.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the labels.
    ml_task : str
        Either ``'regression'`` or ``'classification'``.

    Returns
    -------
    The mean of the labels for regression, or the most frequent label for
    classification (on a tie, the first of the tied labels in the order
    returned by ``np.unique``).

    Raises
    ------
    ValueError
        If ``ml_task`` is neither of the two supported values.
    """
    label_column = data[:, -1]

    if ml_task == 'regression':
        return np.mean(label_column)

    if ml_task == 'classification':
        unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
        return unique_classes[counts_unique_classes.argmax()]

    # Previously this only printed a message and then crashed with an
    # UnboundLocalError on the return statement; fail with a clear error.
    raise ValueError("ML TASK NOT FOUND!! Unknown ml_task: {!r}".format(ml_task))

        

In [5]:
def get_potential_splits(data):
    """Collect the candidate split values for every feature column.

    Every column except the last one (the label) is treated as a feature.
    Returns a dict mapping the column index to the array of unique values
    found in that column.
    """
    n_rows, n_columns = data.shape
    feature_indices = range(n_columns - 1)
    return {
        column_index: np.unique(data[:, column_index])
        for column_index in feature_indices
    }
        

In [6]:
def calculate_entropy(data):
    """Compute the base-2 entropy of the labels in the last column of ``data``."""
    labels = data[:, -1]
    class_counts = np.unique(labels, return_counts=True)[1]

    # Relative frequency of each class among the rows.
    class_shares = class_counts / class_counts.sum()

    # Builtin sum over the per-class terms (kept as-is so the accumulation
    # order and the result for an empty input stay the same).
    return sum(class_shares * -np.log2(class_shares))

In [7]:
def calculate_mse(data):
    """Mean squared error of the labels around their own mean.

    The labels are read from the last column of ``data``. An input with no
    rows yields 0.
    """
    targets = data[:, -1]

    # Nothing to average over: report zero error instead of a NaN mean.
    if len(targets) == 0:
        return 0

    residuals = targets - np.mean(targets)
    return np.mean(residuals ** 2)
    

In [8]:
def calculate_overall_metric(data_below, data_above, metric_function):
    """Size-weighted average of ``metric_function`` over the two partitions.

    Each partition's metric is weighted by its share of the combined row
    count ``len(data_below) + len(data_above)``.
    """
    n_below, n_above = len(data_below), len(data_above)
    total = n_below + n_above

    weighted_below = (n_below / total) * metric_function(data_below)
    weighted_above = (n_above / total) * metric_function(data_above)
    return weighted_below + weighted_above

    

In [86]:
def determine_best_split(data, potential_splits, ml_task):
    """Pick the (column, value) split with the lowest weighted impurity.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; the last column holds the labels.
    potential_splits : dict
        Maps a column index to the candidate split values for that column.
    ml_task : str
        ``'regression'`` scores splits with ``calculate_mse``; any other
        value scores them with ``calculate_entropy``.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``.

    Raises
    ------
    ValueError
        If ``potential_splits`` offers no candidate value at all
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if not any(len(values) for values in potential_splits.values()):
        raise ValueError("potential_splits contains no candidate split values")

    # The impurity measure depends only on the task, so choose it once
    # instead of once per candidate.
    metric_function = calculate_mse if ml_task == 'regression' else calculate_entropy

    best_overall_metric = None
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_metric = calculate_overall_metric(
                data_below, data_above, metric_function=metric_function
            )

            # "<=" means that among equally good candidates the one visited
            # last wins, exactly as before.
            if best_overall_metric is None or current_overall_metric <= best_overall_metric:
                best_overall_metric = current_overall_metric
                best_split_column = column_index
                best_split_value = value

    return best_split_column, best_split_value

In [10]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one feature.

    The kind of test is looked up in the module-level ``FEATURE_TYPES`` list:
    a "continuous" feature splits into ``<= split_value`` / ``> split_value``,
    any other feature type splits into ``== split_value`` / ``!= split_value``.
    Returns ``(data_below, data_above)``.
    """
    column = data[:, split_column]

    if FEATURE_TYPES[split_column] == "continuous":
        below_mask = column <= split_value
        above_mask = column > split_value
    else:
        # categorical feature: membership test instead of a threshold
        below_mask = column == split_value
        above_mask = column != split_value

    return data[below_mask], data[above_mask]

In [77]:
def determine_type_of_feature(df, label_column="label", n_unique_values_threshold=15):
    """Classify every feature column as "categorical" or "continuous".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    label_column : str, optional
        Column name to skip (default ``"label"``, the previous hard-coded
        name).
    n_unique_values_threshold : int, optional
        A column with at most this many distinct values is treated as
        categorical (default 15, the previous hard-coded threshold).

    Returns
    -------
    list of str
        One entry per inspected column, in column order.
    """
    feature_types = []

    for feature in df.columns:
        if feature == label_column:
            continue

        unique_values = df[feature].unique()

        # Guard the peek at the first value: a column without any values
        # used to raise IndexError here.
        holds_strings = len(unique_values) > 0 and isinstance(unique_values[0], str)

        if holds_strings or len(unique_values) <= n_unique_values_threshold:
            feature_types.append("categorical")
        else:
            feature_types.append("continuous")

    return feature_types

In [88]:
def decision_tree_algorithm(df, ml_task, counter = 0, min_samples=2, max_depth = 5):
    """Recursively grow a decision tree.

    On the first call (``counter == 0``) ``df`` is a DataFrame: its columns
    and feature types are stored in the globals ``COLUMN_HEADERS`` and
    ``FEATURE_TYPES`` and its values become the working array. On recursive
    calls ``df`` is already that array.

    Returns either a leaf (the result of ``create_leaf``) or a dict of the
    form ``{question: [yes_answer, no_answer]}`` where ``question`` is
    "<feature> <= <value>" for continuous features and "<feature> = <value>"
    otherwise.
    """
    global COLUMN_HEADERS, FEATURE_TYPES

    if counter == 0:
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df

    # Stop growing: pure node, too few rows, or depth limit reached.
    must_stop = check_purity(data) or (len(data) < min_samples) or counter == max_depth
    if must_stop:
        return create_leaf(data, ml_task)

    depth = counter + 1
    candidate_splits = get_potential_splits(data)
    split_column, split_value = determine_best_split(data, candidate_splits, ml_task)
    data_below, data_above = split_data(data, split_column, split_value)

    # A split that leaves one side empty separates nothing; make a leaf.
    if len(data_below) == 0 or len(data_above) == 0:
        return create_leaf(data, ml_task)

    feature_name = COLUMN_HEADERS[split_column]
    if FEATURE_TYPES[split_column] == "continuous":
        question = "{} <= {}".format(feature_name, split_value)
    else:
        question = "{} = {}".format(feature_name, split_value)

    yes_answer = decision_tree_algorithm(data_below, ml_task, depth, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, ml_task, depth, min_samples, max_depth)

    # Both branches agree, so the question is redundant: collapse the node.
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}

In [115]:

# Scratch cell: take the root question of `tree`, split it into its three
# whitespace-separated parts and look up that feature in `example`.
# NOTE(review): `tree` and `example` are only assigned in cells further down
# this notebook, so this cell fails on a fresh kernel run from top to bottom.
question = list(tree.keys())[0]
feature_name, operator, value = question.split()
example[int(feature_name)]

5.842658519744873

In [116]:
def classify_example(example, tree):
    """Walk ``tree`` for one example and return the leaf it ends in.

    Parameters
    ----------
    example : sequence or pandas.Series
        Indexed with the integer feature name found in each question.
    tree : dict
        Nested ``{question: [yes_answer, no_answer]}`` dict, where a question
        reads "<feature> <= <value>" (continuous feature) or
        "<feature> = <value>" (categorical feature).

    Returns
    -------
    The first non-dict answer reached while following the questions.
    """
    question = list(tree.keys())[0]
    # maxsplit keeps a categorical value that itself contains spaces intact.
    feature_name, comparison_operator, value = question.split(maxsplit=2)
    feature_value = example[int(feature_name)]

    # ask question
    if comparison_operator == "<=":
        is_yes = feature_value <= float(value)
    else:
        # Categorical question: the operator used to be ignored and the value
        # was always compared with "<=", which picked wrong branches and
        # failed on non-numeric categories. Compare for equality instead,
        # first textually, then numerically (so 1 matches "1.0").
        is_yes = str(feature_value) == value
        if not is_yes:
            try:
                is_yes = float(feature_value) == float(value)
            except (TypeError, ValueError):
                is_yes = False

    answer = tree[question][0] if is_yes else tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer

    # recursive part
    return classify_example(example, answer)

In [98]:
def make_predictions(df, tree):
    """Predict one value per row of ``df`` using ``tree``.

    Each row is passed to ``classify_example`` together with ``tree``.
    Returns a pandas Series of predictions; an empty ``df`` yields an empty
    Series.
    """
    if len(df) != 0:
        # The per-row classifier defined in this notebook is
        # classify_example; the previously referenced predict_example does
        # not exist here.
        predictions = df.apply(classify_example, args=(tree,), axis=1)
    else:
        # "df.apply()" with an empty dataframe returns an empty dataframe,
        # but "predictions" should be a series instead. The dtype is given
        # explicitly so the result does not depend on the pandas version.
        predictions = pd.Series(dtype="object")

    return predictions


In [92]:
def calculate_accuracy(df, tree):
    """Share of rows in ``df`` whose predicted class equals the "label" column.

    Side effect: the columns "classification" and "classification_correct"
    are written into ``df`` itself.
    """
    predicted = df.apply(classify_example, axis=1, args=(tree,))
    df["classification"] = predicted
    df["classification_correct"] = df["classification"] == df["label"]

    return df["classification_correct"].mean()

In [82]:
# Read the MATLAB file into `data`; the next cell pulls the 'XR1' and 'ZR1'
# entries out of it. Presumably this is the 2010 data (going by the file
# name); it ends up in `df`, which the tree is fitted on.
data = loadmat('2010rainfall.mat')

In [83]:
# 'XR1' is wrapped as the feature frame and 'ZR1' as the target.
# NOTE(review): array orientation is not visible here; both frames are
# transposed again before being concatenated into `df` in a later cell.
XR1 = data['XR1']
ZR1 = data['ZR1']
dfx = pd.DataFrame(XR1)
# Build a one-column 'target' frame from ZR1.T, then flip it so that 'target'
# becomes the row label.
dfz = pd.DataFrame(ZR1.T, columns=['target']).transpose()  
dfz.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,112,113,114,115,116,117,118,119,120,121
target,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Read the second MATLAB file and assemble `df1` the same way as `df`:
# transposed feature frame plus a 'target' column.
data = loadmat('2011rainfall.mat')
XR = data['XR']
ZR = data['ZR']
dfx1 = pd.DataFrame(XR)
dfz1 = pd.DataFrame(ZR.T, columns=['target']).transpose()
df1 = pd.concat([dfx1.T, dfz1.T] , axis = 1)

# Take an independent copy of the first row before editing it: assigning
# into the bare `df1.iloc[0]` result can write through to `df1`, which is
# later used to measure accuracy.
example = df1.iloc[0].copy()
example.loc[0] = 121

0          0.000000
1          0.000000
2          0.000000
3          0.000000
4          0.000000
5          0.000000
6          0.000000
7          0.000000
8          0.000000
9          0.000000
10         0.000000
11         0.000000
12         0.000000
13         0.000000
14         0.000000
15         0.000000
16         0.000000
17         0.000000
18         0.000000
19         0.000000
20         0.000000
21         0.000000
22         0.000000
23         0.000000
24         0.000000
25         1.676334
26         0.000000
27         0.000000
28         0.000000
29         0.000000
            ...    
328        3.864232
329        0.000000
330       11.902644
331       17.620415
332        6.513855
333        2.125646
334        0.000000
335        7.720507
336        5.400000
337        5.404264
338        5.785634
339        4.732689
340       15.177852
341       15.400001
342        5.435688
343        7.697778
344       13.311983
345       14.942623
346       30.764194


In [84]:
# Join the transposed feature frame and the transposed target frame side by
# side, so 'target' ends up as the last column (the position every tree
# helper reads the label from).
df = pd.concat([dfx.T, dfz.T] , axis = 1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,348,349,350,351,352,353,354,355,356,target
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,15.400000,15.404682,15.400000,7.700000,15.400000,10.266666,0.0,0.0,0.0,0
1,0.000000,0.000000,0.467299,0.469477,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,22.299999,22.299438,22.299999,11.150000,22.299999,14.866666,0.0,0.0,0.0,0
2,0.000000,0.000000,10.927622,6.805963,6.611897,0.000000,0.0,16.400000,15.209424,6.960495,...,3.600000,3.600033,3.600000,1.800000,3.600000,2.400000,0.0,0.0,0.0,0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,18.700001,18.699162,18.700001,9.350000,18.700003,12.466667,0.0,0.0,0.0,0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,5.200000,5.199905,5.200000,2.600000,5.200000,3.466666,0.0,0.0,0.0,0
5,13.246254,13.200000,18.999964,23.731779,10.257844,13.199999,0.0,0.000000,0.000000,0.000000,...,34.400002,34.400826,34.400002,17.200001,34.400005,22.933334,0.0,0.0,0.0,0
6,67.325218,67.599998,27.752983,33.303631,56.923618,67.599998,0.0,17.500000,22.559948,66.398994,...,11.800000,11.802683,11.800000,5.900000,11.800001,7.866667,0.0,0.0,0.0,0
7,0.000000,0.000000,2.414766,0.000000,0.000000,0.000000,0.0,6.700000,6.095693,0.000000,...,9.900000,9.900876,9.900000,4.950000,9.900000,6.600000,0.0,0.0,0.0,0
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,10.400000,10.399607,10.400000,5.200000,10.400001,6.933333,0.0,0.0,0.0,0
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,33.799999,33.799137,33.799999,16.900000,33.800003,22.533335,0.0,0.0,0.0,0


In [95]:
# Fit a classification tree on `df`; counter=0 marks this as the root call,
# which is what makes the function record COLUMN_HEADERS and FEATURE_TYPES.
tree = decision_tree_algorithm(df, ml_task='classification', counter = 0, min_samples=2, max_depth = 5)   

In [117]:
# Classify the single `example` row with the fitted tree.
# NOTE(review): `example` had its feature 0 overwritten with 121 in the cell
# that created it, so this is not an untouched data row.
classify_example(example, tree)

1.0

In [129]:
# One prediction per row of df1, in row order.
predictions = [classify_example(row, tree) for row in df1.to_numpy()]

In [138]:
actual_values = df1['target'].to_numpy()

# The row count used to be hard-coded as 122; pair the values up instead and
# make sure both sides really have the same length.
assert len(predictions) == len(actual_values), "predictions and targets differ in length"

correct_predictions = sum(
    1 for predicted, actual in zip(predictions, actual_values) if predicted == actual
)

Accuracy = correct_predictions/len(predictions)
print(Accuracy)

0.7377049180327869
