In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/anime-traits-bett/anime_traits_better.csv


In [62]:
import numpy as np
from collections import Counter

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None,*,value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        
    def is_leaf_node(self):
        return self.value is not None


class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, n_features=None):
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth
        self.n_features=n_features
        self.root=None
        
    

    def fit(self, X, y):
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1],self.n_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # check the stopping criteria
        if (depth>=self.max_depth or n_labels==1 or n_samples<self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

        # find the best split
        best_feature, best_thresh = self._best_split(X, y, feat_idxs)

        # create child nodes
        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        return Node(best_feature, best_thresh, left, right)


    def _best_split(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)

            for thr in thresholds:
                # calculate the information gain
                gain = self._information_gain(y, X_column, thr)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold


    def _information_gain(self, y, X_column, threshold):
        # parent entropy
        parent_entropy = self._entropy(y)

        # create children
        left_idxs, right_idxs = self._split(X_column, threshold)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        
        # calculate the weighted avg. entropy of children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l/n) * e_l + (n_r/n) * e_r

        # calculate the IG
        information_gain = parent_entropy - child_entropy
        return information_gain

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log(p) for p in ps if p>0])


    def _most_common_label(self, y):
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        print(value)
        return value

    def predict(self, x):
        return np.array([self._traverse_tree(x, self.root)])
    
    def akinator(self):
        return np.array([self._akinator_q(self.root)])

    
    def _akinator_q(self,node):
        if node.is_leaf_node():
            return node.value
        
        if node==self.root:
            print(self.root.feature)
            global s
            s=float(input("yes/no"))
       
        if s <= node.threshold:
            print(node.left.feature)
            print("L")
            s=float(input("yes/no"))
            return self._akinator_q(node.left)
        else:
            print(node.right.feature)
            print("R")
            s=float(input("yes/no"))
            return self._akinator_q(node.right)

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            print(node.feature)
            print(x[node.feature])
            return self._traverse_tree(x, node.left)
        else:
            print(node.feature)
            print(x[node.feature])
            return self._traverse_tree(x, node.right)

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
data=pd.read_csv("/kaggle/input/anime-traits-bett/anime_traits_better.csv")

a=data['Names']
b=data.drop(['Names'],axis=1)
c=pd.get_dummies(b)
y=c['Id']
x=c.drop(['Id'],axis=1)


y=y.to_numpy()
x=x.to_numpy()

y,x
k=xv[4]
xt,xv,yt,yv=train_test_split(x,y,test_size=0.2,random_state=1)
clf=DecisionTree()
clf.fit(xt,yt)
clf.akinator()

Counter({179: 1})
Counter({521: 1})
Counter({1003: 1})
Counter({31: 1})
Counter({32: 1})
Counter({968: 1})
Counter({219: 1})
Counter({996: 1})
Counter({1174: 1})
Counter({325: 1})
Counter({652: 1})
Counter({83: 1})
Counter({1087: 1})
Counter({1137: 1})
Counter({434: 1})
Counter({322: 1})
Counter({207: 1})
Counter({1002: 1})
Counter({869: 1})
Counter({448: 1})
Counter({313: 1})
Counter({1191: 1})
Counter({1115: 1})
Counter({371: 1})
Counter({338: 1})
Counter({329: 1})
Counter({93: 1})
Counter({964: 1})
Counter({512: 1})
Counter({642: 1})
Counter({253: 1})
Counter({1085: 1})
Counter({597: 1})
Counter({241: 1})
Counter({986: 1})
Counter({829: 1})
Counter({1179: 1})
Counter({246: 1})
Counter({461: 1})
Counter({346: 1})
Counter({533: 1})
Counter({167: 1})
Counter({823: 1})
Counter({144: 1})
Counter({403: 1})
Counter({1188: 1})
Counter({882: 1})
Counter({991: 1})
Counter({381: 1})
Counter({1081: 1})
Counter({1020: 1})
Counter({514: 1})
Counter({958: 1})
Counter({245: 1})
Counter({532: 1})
Co

In [71]:
k=xv[4]
for i in range(465):
    print("index",i,":",k[i])



index 0 : 0
index 1 : 0
index 2 : 0
index 3 : 0
index 4 : 0
index 5 : 0
index 6 : 0
index 7 : 0
index 8 : 0
index 9 : 0
index 10 : 0
index 11 : 0
index 12 : 0
index 13 : 0
index 14 : 0
index 15 : 0
index 16 : 0
index 17 : 0
index 18 : 0
index 19 : 0
index 20 : 1
index 21 : 0
index 22 : 0
index 23 : 0
index 24 : 0
index 25 : 0
index 26 : 0
index 27 : 0
index 28 : 0
index 29 : 0
index 30 : 0
index 31 : 0
index 32 : 0
index 33 : 0
index 34 : 0
index 35 : 0
index 36 : 0
index 37 : 0
index 38 : 0
index 39 : 0
index 40 : 0
index 41 : 0
index 42 : 0
index 43 : 0
index 44 : 0
index 45 : 0
index 46 : 0
index 47 : 0
index 48 : 0
index 49 : 0
index 50 : 0
index 51 : 0
index 52 : 0
index 53 : 0
index 54 : 0
index 55 : 0
index 56 : 0
index 57 : 0
index 58 : 0
index 59 : 0
index 60 : 0
index 61 : 0
index 62 : 0
index 63 : 0
index 64 : 0
index 65 : 0
index 66 : 0
index 67 : 0
index 68 : 0
index 69 : 0
index 70 : 0
index 71 : 0
index 72 : 0
index 73 : 0
index 74 : 0
index 75 : 0
index 76 : 0
index 77 

In [60]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
data=datasets.load_breast_cancer()
X1,y1=data.data,data.target

xt,xv,yt,yv=train_test_split(X1,y1,test_size=0.2,random_state=1)
clf=DecisionTree()
clf.fit(xt,yt)
k1=xv[4]
clf.predict(k1)

Counter({1: 249})
Counter({0: 1})
Counter({1: 2})
Counter({1: 6})
Counter({0: 1})
Counter({1: 2})
Counter({0: 5})
Counter({1: 13})
Counter({1: 8})
Counter({1: 2})
Counter({0: 5})
Counter({1: 2})
Counter({0: 2})
Counter({0: 21})
Counter({1: 1})
Counter({0: 3})
Counter({0: 132})
22
110.1
27
0.06575
21
36.91
20
17.26
26
0.1547
1
29.81


array([0])

In [61]:
y1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [31]:
xv[4]

array([1.513e+01, 2.981e+01, 9.671e+01, 7.195e+02, 8.320e-02, 4.605e-02,
       4.686e-02, 2.739e-02, 1.852e-01, 5.294e-02, 4.681e-01, 1.627e+00,
       3.043e+00, 4.538e+01, 6.831e-03, 1.427e-02, 2.489e-02, 9.087e-03,
       3.151e-02, 1.750e-03, 1.726e+01, 3.691e+01, 1.101e+02, 9.314e+02,
       1.148e-01, 9.866e-02, 1.547e-01, 6.575e-02, 3.233e-01, 6.165e-02])

In [None]:
xv[2]