# Decision Tree

### Problem Statement -

        - Use Decision Trees to prepare a model on fraud data, treating those who have taxable_income <= 30000 
          as "Risky" and others are "Good"

### Data Understanding

In [43]:
import warnings
warnings.filterwarnings('ignore')

In [44]:
import pandas as pd
import numpy as np
fc = pd.read_csv ("~/desktop/Digi 360/Module 19/Fraud_check.csv",encoding='mac_roman')
fc.head(5)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [45]:
# Renaming columns

fc = fc.rename(columns={'Undergrad':'ug', 'Marital.Status':'mstatus',
                       'Taxable.Income':'inc',
                       'City.Population':'pop','Work.Experience':'exp',
                       'Urban':'urban'})
fc.columns

Index(['ug', 'mstatus', 'inc', 'pop', 'exp', 'urban'], dtype='object')

In [46]:
fc.shape

(600, 6)

In [47]:
fc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
ug         600 non-null object
mstatus    600 non-null object
inc        600 non-null int64
pop        600 non-null int64
exp        600 non-null int64
urban      600 non-null object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [48]:
fc.isnull().sum()

ug         0
mstatus    0
inc        0
pop        0
exp        0
urban      0
dtype: int64

### Data Preparation

In [49]:
fc.mstatus.value_counts()

Single      217
Married     194
Divorced    189
Name: mstatus, dtype: int64

In [50]:
# Converting output variable into categorical variable

fc.inc = fc.inc.apply(lambda x: 'Risky' if x<= 30000 else 'Good')

In [51]:
fc.inc.value_counts()

Good     476
Risky    124
Name: inc, dtype: int64

In [52]:
### Label encoding the categorical varibales

fc.ug = fc.ug.apply(lambda x: 1 if x=='YES' else 0)
fc.urban = fc.urban.apply(lambda x: 1 if x=='YES' else 0)
fc.mstatus = fc.mstatus.apply(lambda x: 1 if x=='Married' else (0 if x=='Single' else 2))

In [53]:
print(fc.mstatus.value_counts())
print(fc.urban.value_counts())
print(fc.ug.value_counts())

0    217
1    194
2    189
Name: mstatus, dtype: int64
1    302
0    298
Name: urban, dtype: int64
1    312
0    288
Name: ug, dtype: int64


In [54]:
fc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
ug         600 non-null int64
mstatus    600 non-null int64
inc        600 non-null object
pop        600 non-null int64
exp        600 non-null int64
urban      600 non-null int64
dtypes: int64(5), object(1)
memory usage: 28.2+ KB


### Splitting the dataset 

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
fc_y = fc['inc']
fc_y.head()

0    Good
1    Good
2    Good
3    Good
4    Good
Name: inc, dtype: object

In [58]:
fc_y.value_counts()

Good     476
Risky    124
Name: inc, dtype: int64

In [57]:
fc_X = fc.loc[:,fc.columns != 'inc']
fc_X.head()

Unnamed: 0,ug,mstatus,pop,exp,urban
0,0,0,50047,10,1
1,1,2,134075,18,1
2,0,1,160205,30,1
3,1,0,193264,15,1
4,0,1,27533,28,0


In [59]:
X_train, X_test, y_train, y_test = train_test_split(fc_X, fc_y, test_size=0.2,random_state=4)

In [60]:
X_train.head()

Unnamed: 0,ug,mstatus,pop,exp,urban
369,0,1,89122,28,0
117,0,0,96370,29,1
138,0,0,155335,25,0
14,0,1,57529,13,1
595,1,2,39492,7,1


In [61]:
y_train.head()

369    Good
117    Good
138    Good
14     Good
595    Good
Name: inc, dtype: object

### Building the model

In [62]:
from sklearn.tree import DecisionTreeClassifier

In [63]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [64]:
# Checking the score on train data
dt.score(X_train,y_train)

1.0

In [65]:
# Checking the score on test data
dt.score(X_test,y_test)

0.675

So our model is overfitting here because train score is 100% and test score is 68%. Let's go for ensemble methods.

### Bagging - Gradient

In [66]:
from sklearn.ensemble import BaggingClassifier

In [67]:
bg = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5,max_features=1.0,n_estimators=20)

In [68]:
bg.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [69]:
#Finding score for test Data
bg.score(X_test, y_test)

0.7666666666666667

In [70]:
#Finding score for train Data
bg.score(X_train, y_train)

0.8791666666666667

### Extract Feature Importance

In [71]:
# Extract feature importances
import pandas as pd
fi = pd.DataFrame({'feature': list(X_train.columns),
                   'importance': dt.feature_importances_}).\
                    sort_values('importance', ascending = False)

# Display
fi.head()

Unnamed: 0,feature,importance
2,pop,0.603579
3,exp,0.195696
4,urban,0.076788
1,mstatus,0.066571
0,ug,0.057365


Feature importances can give us insight into a problem by telling us what variables are the most discerning between classes. 

For example, here `pop`, indicating whether the `City Population` , is the most important feature which makes sense in the problem context.

### Visualization of a single tree

In [72]:
data_feature_names = X_train.columns

In [74]:
from IPython.display import Image  
from sklearn import tree
import pydotplus
import collections

# Visualize data
dot_data = tree.export_graphviz(dt,
                                feature_names=data_feature_names,
                                out_file=None,
                                filled=True,
                                rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('turquoise', 'orange')
edges = collections.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('tree.png')

True

### Conclusion

        - Accuracy of our model is 77%
        - Important feature is `City Population` to classify whether the customer is `Risky` or `Good`. 