# Decision Tree

### Problem Statement -

        - A cloth manufacturing company is interested to know about the segment or attributes contributing 
          to high sale..

### Data Understanding

In [40]:
import warnings
warnings.filterwarnings('ignore')

In [41]:
import pandas as pd
import numpy as np
cd = pd.read_csv ("~/desktop/Digi 360/Module 19/Company_Data.csv",encoding='mac_roman')
cd.head(5)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [42]:
# Renaming columns

cd = cd.rename(columns={'Sales':'sales', 'CompPrice':'cprice',
                       'Income':'inc','Advertising':'adv',
                       'Population':'pop','Price':'price',
                       'ShelveLoc':'sloc','Age':'age','Education':'edu','Urban':'urban','US':'us'})
cd.columns

Index(['sales', 'cprice', 'inc', 'adv', 'pop', 'price', 'sloc', 'age', 'edu',
       'urban', 'us'],
      dtype='object')

In [43]:
cd.shape

(400, 11)

In [44]:
cd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
sales     400 non-null float64
cprice    400 non-null int64
inc       400 non-null int64
adv       400 non-null int64
pop       400 non-null int64
price     400 non-null int64
sloc      400 non-null object
age       400 non-null int64
edu       400 non-null int64
urban     400 non-null object
us        400 non-null object
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [45]:
cd.isnull().sum()

sales     0
cprice    0
inc       0
adv       0
pop       0
price     0
sloc      0
age       0
edu       0
urban     0
us        0
dtype: int64

### Data Preparation

In [46]:
# Converting output variable into categorical variable

cd.sales = cd.sales.apply(lambda x: 'High' if x<=7.49 else 'Low')

In [47]:
cd.sales.value_counts()

High    201
Low     199
Name: sales, dtype: int64

In [48]:
### Label encoding the categorical varibales

cd.sloc = cd.sloc.apply(lambda x: 1 if x=='Good' else (0 if x=='Bad' else 2))
cd.urban = cd.urban.apply(lambda x: 1 if x=='Yes' else 0)
cd.us = cd.us.apply(lambda x: 1 if x=='Yes' else 0)

In [49]:
cd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
sales     400 non-null object
cprice    400 non-null int64
inc       400 non-null int64
adv       400 non-null int64
pop       400 non-null int64
price     400 non-null int64
sloc      400 non-null int64
age       400 non-null int64
edu       400 non-null int64
urban     400 non-null int64
us        400 non-null int64
dtypes: int64(10), object(1)
memory usage: 34.5+ KB


### Splitting the dataset 

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
cd_X = cd.iloc[:,1:11]
cd_X.head()

Unnamed: 0,cprice,inc,adv,pop,price,sloc,age,edu,urban,us
0,138,73,11,276,120,0,42,17,1,1
1,111,48,16,260,83,1,65,10,1,1
2,113,35,10,269,80,2,59,12,1,1
3,117,100,4,466,97,2,55,14,1,1
4,141,64,3,340,128,0,38,13,1,0


In [52]:
cd_y = cd.iloc[:,0]
cd_y.head()

0     Low
1     Low
2     Low
3    High
4    High
Name: sales, dtype: object

In [53]:
X_train, X_test, y_train, y_test = train_test_split(cd_X, cd_y, test_size=0.2,random_state=4)

In [54]:
X_train.head()

Unnamed: 0,cprice,inc,adv,pop,price,sloc,age,edu,urban,us
153,150,36,7,488,150,2,25,17,0,1
42,77,69,0,25,24,2,50,18,1,0
339,134,44,4,219,126,1,44,15,1,1
128,133,100,3,350,126,0,55,13,1,1
102,113,22,0,57,97,2,65,16,0,0


In [55]:
y_train.head()

153    High
42      Low
339     Low
128    High
102    High
Name: sales, dtype: object

### Building the model

In [56]:
from sklearn.tree import DecisionTreeClassifier

In [57]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [58]:
# Checking the score on train data
dt.score(X_train,y_train)

1.0

In [59]:
# Checking the score on test data
dt.score(X_test,y_test)

0.8

So our model is overfitting here because train score is 100% and test score is 69%. Let's go for ensemble methods.

### Bagging - Gradient

In [60]:
from sklearn.ensemble import BaggingClassifier

In [62]:
bg = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5,max_features=1.0,n_estimators=20)

In [63]:
bg.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [64]:
#Finding score for test Data
bg.score(X_test, y_test)

0.75

In [65]:
#Finding score for train Data
bg.score(X_train, y_train)

0.95625

### Extract Feature Importance

In [75]:
# Extract feature importances
import pandas as pd
fi = pd.DataFrame({'feature': list(X_train.columns),
                   'importance': dt.feature_importances_}).\
                    sort_values('importance', ascending = False)

# Display
fi.head()

Unnamed: 0,feature,importance
4,price,0.309422
0,cprice,0.155644
5,sloc,0.153516
2,adv,0.110616
1,inc,0.086242


Feature importances can give us insight into a problem by telling us what variables are the most discerning between classes. 

For example, here `price`, indicating whether the product price , is the most important feature which makes sense in the problem context.

### Visualization of a single tree

In [72]:
data_feature_names = X_train.columns

In [74]:
from IPython.display import Image  
from sklearn import tree
import pydotplus
import collections

# Visualize data
dot_data = tree.export_graphviz(dt,
                                feature_names=data_feature_names,
                                out_file=None,
                                filled=True,
                                rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('turquoise', 'orange')
edges = collections.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('tree.png')

True

### Conclusion

        - Accuracy of our model is 75%
        - Important feature is `price` to classify whether the sales are `High` or `Low`. 