# Using Decision Tree to generate rules

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn import tree
from sklearn.externals.six import StringIO
import pydot_ng as pydot

# report
from sklearn.metrics import classification_report

In [2]:
# date stamp (YYYYMMDD) and run tag that are interpolated into output file names
today = '{:%Y%m%d}'.format(datetime.today())
info = 'Demo'

In [32]:
# loading dataset; the first CSV column is used as the row index
dataset = pd.read_csv('../op/dataset_n5_0303_kmean_demo_nan.csv', index_col=0)

In [33]:
# keep only the rows whose label is not the dormant marker
dataset = dataset.loc[dataset['label'] != 'Z.Dorment']

In [34]:
# preview the first rows of the filtered frame
dataset.head()

Unnamed: 0_level_0,@A_activeDay,@C_activeDay,@E_activeDay,@B_activeDay,@D_activeDay,@A_spending,@C_spending,@E_spending,@B_spending,@D_spending,@Special_spending,label,silhouette_vals2,silhouette_vals
KeyID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4,2.0,,,,,2401.0,,,,,822.0,0,0.845566,0.845566
5,2.0,,,6.0,4.0,1821.0,,,5675.0,2042.0,,0,0.80737,0.80737
13,2.0,,,,,1062.0,,,,,,0,0.845511,0.845511
14,2.0,,,3.0,,3375.5,,,4677.5,,,0,0.843125,0.843125
17,117.0,,,1.0,,7889.4,,,,,32510.0,2,0.375341,0.375341


In [11]:
# list the distinct values left in the label column
dataset['label'].unique()

array(['0.0', '2.0', '1.0', '3.0'], dtype=object)

In [12]:
# labels and X
# X is every column except 'label', with missing values replaced by 0.
X = dataset.drop('label', axis=1).fillna(0).values
y = dataset['label'].values
# Take the names from the same label-free frame that produced X, so that
# name i always describes column i of X regardless of where 'label' sits
# in the frame (slicing columns[:-1] only works when 'label' is last).
feature_names = dataset.drop('label', axis=1).columns
# NOTE(review): any non-label helper columns present in the frame (the preview
# shows silhouette_vals / silhouette_vals2) are still fed to the tree as
# features - confirm that is intended.

In [13]:
# decision tree
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can be
# broken differently from run to run, which changes the exported rules.
tree_cls = tree.DecisionTreeClassifier(criterion='entropy',
                                       max_leaf_nodes=20,
                                       min_samples_split=30,
                                       min_samples_leaf=100,
                                       random_state=0,)
tree_train = tree_cls.fit(X, y)

In [14]:
def output_pdf(tree_train):
    """Render a fitted decision tree to a PDF file under ``../op/``.

    Parameters
    ----------
    tree_train : fitted tree estimator
        Passed unchanged to ``tree.export_graphviz``.

    Notes
    -----
    Reads the notebook-level names ``feature_names``, ``info`` and ``today``.
    The file is written to ``../op/tree_grah_<info>_<today>.pdf``.
    """
    # out_file=None makes export_graphviz return the DOT source as a string,
    # so no StringIO buffer (and no sklearn.externals.six import) is needed.
    dot_source = tree.export_graphviz(tree_train, out_file=None,
                                      feature_names=feature_names,
                                      filled=True, rounded=True,
                                      special_characters=True,
                                      node_ids=True,
                                      proportion=True)
    graph = pydot.graph_from_dot_data(dot_source)
    graph.write_pdf('../op/tree_grah_%s_%s.pdf' % (info, today))

In [15]:
# write the fitted tree to a PDF under ../op/
output_pdf(tree_train)

In [16]:
# report: per-label precision / recall / f1, scored on the same X, y the tree
# was fitted on (a training-set fit, not a held-out estimate)
print(classification_report(y, tree_train.predict(X)))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00     17154
        1.0       0.95      0.95      0.95      1247
        2.0       0.96      1.00      0.98      1723
        3.0       0.98      0.90      0.94       608

avg / total       0.99      0.99      0.99     20732



## Fine-tune rules

In [24]:
# fine-tune rules
# work on a copy of the data in which missing values count as 0
data_whole = dataset.fillna(0)

# elementary threshold tests (True means the value is below the cut-off)
A_day_bewlow = data_whole['@A_activeDay'] < 87  # 1
B_day_bewlow = data_whole['@B_activeDay'] < 78  # 2
D_day_bewlow = data_whole['@D_activeDay'] < 21  # 3

# the complementary tests are the negations:
#   4: ~A_day_bewlow
#   5: ~B_day_bewlow
#   6: ~D_day_bewlow

# combine the tests into four mutually exclusive row masks
flter_2 = ~A_day_bewlow
flter_1 = A_day_bewlow & ~D_day_bewlow
flter_3 = A_day_bewlow & D_day_bewlow & ~B_day_bewlow
flter_0 = A_day_bewlow & D_day_bewlow & B_day_bewlow

def cluster_filter(flter, data=None):
    """Summarise how many rows of each label a boolean rule selects.

    Parameters
    ----------
    flter : boolean mask
        Row selector, applied to the frame as ``frame[flter]``.
    data : pandas.DataFrame, optional
        Frame holding a ``label`` column. Defaults to the notebook-level
        ``data_whole`` so existing ``cluster_filter(flter=...)`` calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by label, with columns ``1.Counts`` (rows selected by the
        mask), ``2.Cluster_counts`` (all rows carrying that label) and
        ``3.Percentage`` (their ratio). A label with no selected rows gets NaN
        in the first and third columns.
    """
    frame = data_whole if data is None else data
    # each group size is computed once and reused for the ratio
    cluster_counts = frame.groupby('label').size()
    counts = frame[flter].groupby('label').size()
    percentage = counts / cluster_counts
    return pd.DataFrame({'1.Counts': counts,
                         '2.Cluster_counts': cluster_counts,
                         '3.Percentage': percentage})

# one summary table per rule mask, keyed 'filter0' .. 'filter3'
filter_table_pre = {
    'filter%d' % idx: cluster_filter(flter=mask)
    for idx, mask in enumerate((flter_0, flter_1, flter_2, flter_3))
}

In [29]:
# stack the per-filter tables row-wise (the dict keys become the outer index
# level) and replace the missing entries with 0
filter_table = pd.concat(filter_table_pre, axis=0).fillna(0)

In [30]:
# show the combined table
filter_table

Unnamed: 0_level_0,Unnamed: 1_level_0,1.Counts,2.Cluster_counts,3.Percentage
Unnamed: 0_level_1,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
filter0,0.0,17108.0,17154,0.997318
filter0,1.0,19.0,1247,0.015237
filter0,2.0,3.0,1723,0.001741
filter0,3.0,8.0,608,0.013158
filter1,0.0,22.0,17154,0.001282
filter1,1.0,1179.0,1247,0.945469
filter1,2.0,0.0,1723,0.0
filter1,3.0,44.0,608,0.072368
filter2,0.0,10.0,17154,0.000583
filter2,1.0,49.0,1247,0.039294


In [28]:
# save the combined table as an Excel file whose name carries the `today` stamp
filter_table.to_excel('../op/Demo_rules_%s.xlsx' % today)