In [128]:
import numpy as np
import pandas as pd
from time import time
from sklearn import tree

In [129]:
def train_test_split(X, ratio=0.8):
    X_shuffled = np.copy(X)
    np.random.shuffle(X_shuffled)
    return X_shuffled[:int(len(X_shuffled) * ratio)], X_shuffled[int(len(X_shuffled) * ratio):]

In [130]:
class Node:
    def __init__(self, feature=-1, split=None, entropy=0):
        # Split feature
        self.feature = feature
        # Split criterion
        self.split = split
        self.entropy = entropy
        self.children = []
        self.parent = None
        self.leaf = False
        self.label = None
        self.depth = 0
        
    def get_elements(self, X, y=None):
        # Categorical feature
        if isinstance(self.split, list):
            splitted_data = []
            for value in self.split:
                indices = X.iloc[:, self.feature] == value
                if y is None:
                    splitted_data.append(X[indices])
                else:
                    splitted_data.append((X[indices], y[indices]))
                    
            return splitted_data
        
        # Numerical feature
        indices_left = X.iloc[:, self.feature] < self.split
        indices_right = X.iloc[:, self.feature] >= self.split
        
        if y is None:
            return [X[indices_left], X[indices_right]]
        
        return [(X[indices_left], y[indices_left]), (X[indices_right], y[indices_right])]
            
    def entropy_for_split(self, X, y):
        splitted_data = self.get_elements(X, y)
        entropies = np.zeros(len(splitted_data))
        for index, data in enumerate(splitted_data):
            entropies[index] = self.calc_entropy(data[1]) * len(data[0])
        return np.sum(entropies) / len(X)
    
    def calc_entropy(self, y, store=False):
        unique_y = np.unique(y)
        probs = np.zeros(len(unique_y))
        y_len = len(y)
        for i, y_i in enumerate(unique_y):
            probs[i] = len(y[y == y_i]) / y_len
        
        entropy = -np.sum(probs * np.log2(probs + 10e-8))
        if store:
            self.entropy = entropy
        return entropy

In [205]:
class DecisionTreeClassifier:
    def __init__(self, tol=0.5, max_depth=10, min_members=50):
        self.tol = tol
        self.tree = None
        self.tree_depth = 0
        self.max_depth = max_depth
        self.min_members = min_members
    
    def fit(self, X, y):
        self.tree_ = Node()
        self.__generate_tree(self.tree_, X, y)
    
    def __generate_tree(self, tree, X, y):
        if len(y) <= self.min_members or tree.calc_entropy(y, store=True) < self.tol:
            self.__label_node(tree, y)
            return
        
        best_feature_split = self.__split_attribute(tree, X, y)        
        tree.feature = best_feature_split[0]
        tree.split = best_feature_split[1]
        
        if tree.feature is None or tree.split is None:
            self.__label_node(tree, y)
            return
        
        splitted_data = tree.get_elements(X, y)
        
        if len(splitted_data) < 2:
            self.__label_node(tree, y)
            return
                
        for el in splitted_data:
            new_node = Node()
            tree.children.append(new_node)
            new_node.parent = tree
            self.__generate_tree(new_node, el[0], el[1])
        
    
    def __split_attribute(self, tree, X, y):
        min_entropy = 100 if tree.parent is None else tree.parent.entropy
        entropy = min_entropy
        best_feature = None
        best_split_value = None
        for index, feature in enumerate(X.columns):
            tree.feature = index
            if X[feature].dtype.name == 'category':
                tree.split = list(X[feature].unique())
                if len(tree.split) < 2:
                    continue
                entropy = tree.entropy_for_split(X, y)
                if entropy < min_entropy:
                    min_entropy = entropy
                    best_feature = index
                    best_split_value = tree.split
            else:
                X_feature_sorted = X.iloc[1:, index].sort_values()
                y_sorted = y[X_feature_sorted.index].values
                X_feature_sorted_values = X_feature_sorted.values
                thresholds = (X_feature_sorted_values[1:] + X_feature_sorted_values[:-1])/2
                thresholds_len = len(thresholds)
                for value_index, value in enumerate(thresholds):
                    if (value_index < thresholds_len - 1) and (y_sorted[value_index] == y_sorted[value_index+1] or thresholds[value_index] == thresholds[value_index+1]):
                        continue
                    
                    tree.split = value
                    entropy = tree.entropy_for_split(X, y)

                    if entropy < min_entropy:
                        min_entropy = entropy
                        best_feature = index
                        best_split_value = tree.split
                
        return best_feature, best_split_value
    
    def __label_node(self, node, y):
        most_frequent = y.mode()
        rand = np.random.randint(len(most_frequent))
        node.leaf = True
        node.label = y.mode()[rand]
    
    def predict(self, X):
        pred = pd.Series(-1, X.index)
        self.__decide(self.tree_, X, pred)
        return pred
    def __decide(self, node, X, pred):
        if node.leaf:
            pred[X.index] = node.label
            return
            
        branches = node.get_elements(X)
        for index, branch in enumerate(branches):
            self.__decide(node.children[index], branch, pred)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return y_pred[y == y_pred].size / y_pred.size
        

In [206]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [207]:
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [208]:
df.corr()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
battery_power,1.0,0.011252,0.011482,-0.041847,0.033334,0.015665,-0.004004,0.034085,0.001844,-0.029727,...,0.014901,-0.008402,-0.000653,-0.029959,-0.021421,0.05251,0.011522,-0.010516,-0.008343,0.200723
blue,0.011252,1.0,0.021419,0.035198,0.003593,0.013443,0.041177,0.004049,-0.008605,0.036161,...,-0.006872,-0.041533,0.026351,-0.002952,0.000613,0.013934,-0.030236,0.010061,-0.021863,0.020573
clock_speed,0.011482,0.021419,1.0,-0.001315,-0.000434,-0.043073,0.006545,-0.014364,0.01235,-0.005724,...,-0.014523,-0.009476,0.003443,-0.029078,-0.007378,-0.011432,-0.046433,0.019756,-0.024471,-0.006606
dual_sim,-0.041847,0.035198,-0.001315,1.0,-0.029123,0.003187,-0.015679,-0.022142,-0.008979,-0.024658,...,-0.020875,0.014291,0.041072,-0.011949,-0.016666,-0.039404,-0.014008,-0.017117,0.02274,0.017444
fc,0.033334,0.003593,-0.000434,-0.029123,1.0,-0.01656,-0.029133,-0.001791,0.023618,-0.013356,...,-0.00999,-0.005176,0.015099,-0.011014,-0.012373,-0.006829,0.001793,-0.014828,0.020085,0.021998
four_g,0.015665,0.013443,-0.043073,0.003187,-0.01656,1.0,0.00869,-0.001823,-0.016537,-0.029706,...,-0.019236,0.007448,0.007313,0.027166,0.037005,-0.046628,0.584246,0.016758,-0.01762,0.014772
int_memory,-0.004004,0.041177,0.006545,-0.015679,-0.029133,0.00869,1.0,0.006886,-0.034214,-0.02831,...,0.010441,-0.008335,0.032813,0.037771,0.011731,-0.00279,-0.009366,-0.026999,0.006993,0.044435
m_dep,0.034085,0.004049,-0.014364,-0.022142,-0.001791,-0.001823,0.006886,1.0,0.021756,-0.003504,...,0.025263,0.023566,-0.009434,-0.025348,-0.018388,0.017003,-0.012065,-0.002638,-0.028353,0.000853
mobile_wt,0.001844,-0.008605,0.01235,-0.008979,0.023618,-0.016537,-0.034214,0.021756,1.0,-0.018989,...,0.000939,9e-05,-0.002581,-0.033855,-0.020761,0.006209,0.001551,-0.014368,-0.000409,-0.030302
n_cores,-0.029727,0.036161,-0.005724,-0.024658,-0.013356,-0.029706,-0.02831,-0.003504,-0.018989,1.0,...,-0.006872,0.02448,0.004868,-0.000315,0.025826,0.013148,-0.014733,0.023774,-0.009964,0.004399


In [209]:
df['ram_below_mean'] = (df['ram'] < df['ram'].mean()) * 1

In [210]:
df['battery_power_below_mean'] = (df['battery_power'] < df['battery_power'].mean()) * 1

In [211]:
df['ram_below_75'] = (df['ram'] < df['ram'].quantile(0.75)) * 1

In [212]:
for feature in ['ram_below_mean', 'battery_power_below_mean', 'ram_below_75']:
    df[feature] = df[feature].astype('category')
    #df_test[feature] = df_test[feature].astype('category')    

In [213]:
model = DecisionTreeClassifier()

In [214]:
X = df[['ram', 'battery_power']]
y = df['price_range']

In [215]:
model.fit(X, y)

In [216]:
model.score(X, y)

0.8275

In [127]:
model1 = tree.DecisionTreeClassifier(min_samples_leaf=2)

In [219]:
df = pd.read_csv('weatherAUS.csv')

In [220]:
df.dropna(inplace=True)

In [221]:
df.iloc[:, 9:]

Unnamed: 0,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
5939,ENE,SW,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,0.0,No
5940,SSE,SSE,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No,0.0,No
5942,NNE,NNW,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No,0.0,No
5943,WNW,WSW,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No,0.0,No
5944,NW,WNW,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139108,SE,NE,9.0,20.0,63.0,32.0,1013.9,1010.5,0.0,1.0,24.5,32.3,No,0.0,No
139109,SE,SE,13.0,11.0,56.0,28.0,1014.6,1011.2,7.0,0.0,24.8,32.0,No,0.0,No
139110,E,W,17.0,11.0,46.0,23.0,1015.3,1011.8,0.0,0.0,24.8,32.1,No,0.0,No
139111,SE,NNW,9.0,17.0,62.0,58.0,1014.9,1010.7,1.0,1.0,24.8,29.2,No,0.0,No


In [222]:
model = DecisionTreeClassifier(min_members=20)

In [223]:
for feature in ['Location', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    df[feature] = df[feature].astype('category')
    df[feature] = df[feature].cat.codes

In [224]:
X = df[['Location', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'RainToday', 'Temp3pm']]
y = df['RainTomorrow']

In [160]:
a=time()
model.fit(X, y)
b=time()

b-a

27.67387342453003

In [61]:
y_pred = model.predict(X)

In [69]:
y_pred[y==y_pred].size / y.size

0.7880716058135413

In [161]:
model_s = tree.DecisionTreeClassifier(min_samples_leaf=20)

In [164]:
a=time()
model_s.fit(X, y)
b=time()

b-a

0.1601123809814453

In [165]:
model_s.score(X, y)

0.8345267635590217

In [239]:
np.issubdtype(x[:, 0].dtype, np.object)

  np.issubdtype(x[:, 0].dtype, np.object)


True

In [None]:
X