In [1]:
from sklearn.datasets import load_iris
# More toy data sets from sklearn please refer:
# http://scikit-learn.org/stable/datasets/
import numpy as np
import pandas as pd
from sklearn import tree
import graphviz
from sklearn.metrics import accuracy_score as acc_rate

# use this line in terminal if graphviz does not work: conda install python-graphviz 

In [2]:
p2p = pd.read_csv("./p2p.csv")
p2p = p2p.iloc[:,1:-1]
p2p

Unnamed: 0,ratio001,ratio002,ratio003,ratio004,ratio005,ratio006,ratio008,ratio011,ratio012,ratio017,...,ratio030,DIO,DPO,DSO,turnover,status,nace,ratio036,ratio037,ratio039
0,-0.238174,-0.155186,-0.124784,-0.422984,-0.441998,-0.281685,-0.476657,-3.467956,-3.405227,-1.297426,...,-0.301233,-0.297901,-0.462928,-0.722769,-0.381662,1,4635,1,1,1
1,-0.758472,-0.356497,-0.659060,-0.076745,0.260873,-0.515998,-1.119889,1.754630,0.921839,4.328932,...,-0.643458,-0.317154,-0.279493,-0.198444,4.591706,1,4791,1,1,1
2,-0.333616,-0.207378,0.508892,-0.263181,-0.496910,-0.276421,-0.372174,-0.547247,-0.088445,1.102322,...,-0.263208,-0.224098,-0.268703,-0.603186,0.408384,1,4752,1,1,1
3,-0.444811,-0.478278,-1.056660,0.145204,-0.255298,-0.239247,-0.424368,-3.467956,0.064050,1.449654,...,-0.529383,-0.166338,-0.398186,-0.685974,0.094786,1,4771,1,1,1
4,-0.479096,-0.523014,-1.156061,-0.902393,-0.749504,-0.371168,-0.400809,1.754630,-0.107507,0.039276,...,-2.924961,-0.320363,-0.160800,0.638637,0.040900,1,3212,1,1,1
5,-1.368040,-0.309276,-0.485109,-0.334205,-0.046633,-0.238589,3.449548,-0.118991,-3.405227,-0.213329,...,-0.263208,-0.317154,2.320963,-0.796358,0.000006,1,4941,1,1,1
6,-0.118640,-0.167613,-0.286309,-0.964539,-1.078975,-0.300766,-1.119889,-0.654312,0.921839,-0.518561,...,-0.681484,-0.073282,-0.732685,-0.685974,-0.108669,1,4711,1,1,1
7,-0.891442,-1.040260,-0.609360,-0.565031,-1.177817,-0.332019,-0.272288,-0.172523,-0.298127,-1.276376,...,-0.567408,-0.320363,5.485103,-0.198444,-0.131375,1,4120,1,1,1
8,-0.218252,-0.257084,-0.162059,-0.023477,0.271855,-0.250432,-0.138404,0.469861,0.369042,0.460284,...,0.117042,-0.297901,-0.074478,-0.106457,-0.163345,1,1330,0,0,1
9,-0.355392,-0.291879,1.105293,1.148411,1.787421,4.064475,-1.119889,-3.467956,0.626378,-0.055451,...,-2.924961,-0.320363,0.055005,0.592644,-0.187972,1,4752,0,0,1


In [3]:
print(p2p.columns.values)

['ratio001' 'ratio002' 'ratio003' 'ratio004' 'ratio005' 'ratio006'
 'ratio008' 'ratio011' 'ratio012' 'ratio017' 'ratio018' 'ratio019'
 'ratio027' 'ratio029' 'ratio030' 'DIO' 'DPO' 'DSO' 'turnover' 'status'
 'nace' 'ratio036' 'ratio037' 'ratio039']


In [4]:
# Glance the data set
print(np.delete(p2p.columns.values,19,axis=0)) # name of features
p2p_features = p2p.drop("status",axis=1)
print(p2p.drop("status",axis=1)) # data for the features
p2p_target = p2p["status"]
print("status")  # name of the targets
print(p2p["status"])  # data for the targets

['ratio001' 'ratio002' 'ratio003' 'ratio004' 'ratio005' 'ratio006'
 'ratio008' 'ratio011' 'ratio012' 'ratio017' 'ratio018' 'ratio019'
 'ratio027' 'ratio029' 'ratio030' 'DIO' 'DPO' 'DSO' 'turnover' 'nace'
 'ratio036' 'ratio037' 'ratio039']
       ratio001  ratio002  ratio003  ratio004  ratio005  ratio006  ratio008  \
0     -0.238174 -0.155186 -0.124784 -0.422984 -0.441998 -0.281685 -0.476657   
1     -0.758472 -0.356497 -0.659060 -0.076745  0.260873 -0.515998 -1.119889   
2     -0.333616 -0.207378  0.508892 -0.263181 -0.496910 -0.276421 -0.372174   
3     -0.444811 -0.478278 -1.056660  0.145204 -0.255298 -0.239247 -0.424368   
4     -0.479096 -0.523014 -1.156061 -0.902393 -0.749504 -0.371168 -0.400809   
5     -1.368040 -0.309276 -0.485109 -0.334205 -0.046633 -0.238589  3.449548   
6     -0.118640 -0.167613 -0.286309 -0.964539 -1.078975 -0.300766 -1.119889   
7     -0.891442 -1.040260 -0.609360 -0.565031 -1.177817 -0.332019 -0.272288   
8     -0.218252 -0.257084 -0.162059 -0.023477  0.2

In [5]:
# randomly choose 1/3 of samples as testing data
np.random.seed(123)
test_idx = np.random.randint(0, len(p2p), len(p2p) // 3)
print(test_idx)
print(np.shape(test_idx))


[ 3582 11646  1346 ...  5602 10586 11928]
(5015,)


In [6]:
# training data
train_data = np.delete(np.array(p2p_features), test_idx, axis=0)
train_target = np.delete(np.array(p2p_target), test_idx,axis=0)

In [7]:
# testing data
test_data = np.array(p2p_features)[test_idx]
test_target = np.array(p2p_target)[test_idx]

In [8]:
# train the model
# initial an decision tree classifier object with given arguments
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  splitter='best')
# A lot of arguments can be placed into the object
# Refer docs: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
clf.fit(X=train_data, y=train_target)

DecisionTreeClassifier(criterion='entropy')

In [9]:
# make prediction
print('\nThe target test data set is:\n', test_target)
print('\nThe predicted result is:\n', clf.predict(test_data))
print('\nAccuracy rate is:\n', acc_rate(test_target, clf.predict(test_data)))


The target test data set is:
 [0 0 1 ... 0 0 0]

The predicted result is:
 [0 0 0 ... 0 0 0]

Accuracy rate is:
 0.8699900299102692


In [10]:
# visualizing the tree

dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=list(np.delete(p2p.columns.values,19,axis=0)),
                                class_names=list("status"),
                                filled=True,
                                rounded=True,
                                impurity=False,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("p2p_lending", view = True)


'p2p_lending.pdf'