In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=UserWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
%matplotlib inline

In [None]:
#Displaying static image https://plotly.com/python/orca-management/
!pip install plotly>=4.7.1
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4

<h3>Data Preparation & Wrangling</h3>

In [4]:
from sklearn import datasets
# Load the iris dataset. `df` is not a DataFrame yet: later cells index it by key
# ('data', 'feature_names', 'target', 'target_names') to build one.
df = datasets.load_iris() #also from here px.data.iris()

<i><b>df</b> is a dictionary-like (JSON-style) object containing the feature data, targets, feature names, target names, etc. Our task is to read that data and build a DataFrame from it so that it becomes more readable. Print <b>df</b> in another cell if you want to look inside it.</i>

In [5]:
# Tabulate the raw measurements, labelling the columns with the dataset's feature names.
temp = pd.DataFrame(data=df['data'], columns=df['feature_names'])
temp.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Attach each row's class label as a new 'target' column (this mutates `temp` in place).
temp['target'] = df['target']
# Show five randomly chosen rows as a spot check.
temp.sample(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
141,6.9,3.1,5.1,2.3,2
40,5.0,3.5,1.3,0.3,0
115,6.4,3.2,5.3,2.3,2
24,4.8,3.4,1.9,0.2,0
42,4.4,3.2,1.3,0.2,0


In [7]:
# Class names: position i holds the name for integer label i (used below to decode 'target').
df['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [8]:
# Start 'target_names' as a copy of the integer labels; a later cell replaces the integers with class names.
temp['target_names'] = temp['target']
temp.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0


In [9]:
# Decode the integer labels into class names in one vectorised pass.
# The lookup (position -> name) is built from df['target_names'] itself, so it is
# not tied to exactly three classes. Reading from the untouched 'target' column
# means the cell gives the same result however often it is re-run.
# A label with no entry in the lookup would become NaN.
temp['target_names'] = temp['target'].map(dict(enumerate(df['target_names'])))
temp.sample(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
39,5.1,3.4,1.5,0.2,0,setosa
128,6.4,2.8,5.6,2.1,2,virginica
8,4.4,2.9,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
129,7.2,3.0,5.8,1.6,2,virginica
115,6.4,3.2,5.3,2.3,2,virginica
97,6.2,2.9,4.3,1.3,1,versicolor
84,5.4,3.0,4.5,1.5,1,versicolor
130,7.4,2.8,6.1,1.9,2,virginica
74,6.4,2.9,4.3,1.3,1,versicolor


<h3>Decision-Tree Algo.</h3>

*A decision-tree algorithm is a **supervised learning** classification algorithm. It builds a decision tree by recursive partitioning to classify the data, selecting the most predictive characteristic (feature) of the dataset at each step. In this dataset we have four features, and the algorithm may choose any of them, but always the most predictive one, to predict the target name. Since our dataset contains a categorical target variable, the decision tree is called a **categorical variable decision tree** (if the target were continuous, it would be a continuous variable decision tree).* 

In [10]:
# Quick reminder of the column layout before modelling.
temp.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa


<h4>Model Building & Algorithms</h4>

<h4>1. ID3-Algorithm</h4>
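<i>It uses entropy and information gain for constructing a decision tree. Note that scikit-learn's <code>DecisionTreeClassifier</code> implements an optimised version of CART rather than ID3 itself; setting <code>criterion='entropy'</code> makes it choose splits with the same information-gain measure that ID3 uses.</i>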

**Entropy**: It is the measure of randomness or disorder in a dataset. To construct a good decision tree, all that is needed is to find the attributes that maximize information gain (or, equivalently, minimize entropy). While running, the ID3 algorithm splits the dataset in all possible ways and returns the attribute that results in the maximum information gain, i.e. the least entropy. That attribute becomes a *decision node* of the tree, and the process is repeated for each of its branches, and so on. The entropy decreases with each downward step of the tree and becomes 0 at a *leaf node*; in other words, **a branch with entropy 0 is a leaf node**. If, at any branch, the entropy isn't 0, the data is split further, and the ID3 algorithm continues recursively until an entropy of 0 is reached. 

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion tree: every column except the two label columns is a feature,
# and the integer 'target' column is the label.
model_ent = DecisionTreeClassifier(criterion='entropy')
model_ent.fit(temp.drop(columns=['target', 'target_names']), temp['target'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [13]:
#Visualizing Tree
import pydotplus
# sklearn.externals.six has been removed from scikit-learn, so that import fails on
# current versions; the standard-library StringIO is the same in-memory text buffer.
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import matplotlib.image as mpimg

In [None]:
# Render the fitted entropy tree: export it as Graphviz DOT text into an
# in-memory buffer, turn that text into a graph and display it as a PNG.
dot_data = StringIO()
export_graphviz(
    model_ent,
    out_file=dot_data,
    feature_names=df['feature_names'],
    filled=True,
    rounded=True,
    special_characters=True,
    rotate=False,
)
img = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(img.create_png())

<h4>2. Gini-index</h4>

In [None]:
# Gini-impurity tree ('gini' is already the default criterion), fitted on every
# column except the two label columns, with the integer 'target' column as the label.
model_gini = DecisionTreeClassifier(criterion='gini')
model_gini.fit(temp.drop(columns=['target', 'target_names']), temp['target'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')