In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score
from sklearn.preprocessing import LabelEncoder 
from sklearn.tree import export_text
import joblib

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Sử dụng toàn bộ tập dữ liệu

In [2]:
# Load the train/test splits. The last column of train.csv is used as the
# target and every column before it as a feature; test.csv is indexed with
# those same column names.
train0 = pd.read_csv("train.csv")
test0 = pd.read_csv("test.csv")
columns = train0.columns
feature_cols = list(columns[:-1])
target_col = columns[-1]
feature_train = train0[feature_cols].values
target_train = train0[target_col].values
feature_test = test0[feature_cols].values
target_test = test0[target_col].values


In [3]:
# Quick look at the training labels (the displayed array is abbreviated).
target_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [4]:
# Depth-limited Gini decision tree, fit on the full (unbalanced) training split.
gini_params = dict(
    criterion="gini",
    splitter="best",
    max_depth=5,
    min_samples_split=2,
    random_state=6,
)
clf = DecisionTreeClassifier(**gini_params)
clf.fit(feature_train, target_train)


In [5]:
# Accuracy of the full-data tree on the test split.
test_accuracy = clf.score(feature_test, target_test)
print(f"accuracy is {test_accuracy}")

accuracy is 0.9505247376311844


In [6]:
# Per-class precision and recall of `clf` on the test split.
# With average=None scikit-learn returns one score per entry of `labels`,
# computed from the complete label/prediction vectors. This replaces the
# earlier approach of pre-filtering both arrays with boolean masks and scoring
# each class separately, which yields the same ratios but breaks down when a
# mask selects no samples (e.g. a class that is never predicted).
class_labels = [0, 1]
pred = clf.predict(feature_test)
precision = [
    float(score)
    for score in precision_score(target_test, pred, labels=class_labels, average=None)
]
recall = [
    float(score)
    for score in recall_score(target_test, pred, labels=class_labels, average=None)
]

print(f"precision of label 0 and 1 are {precision[0], precision[1]}")
print(f"recall of label 0 and 1 are {recall[0], recall[1]}")

precision of label 0 and 1 are (0.9670710571923743, 0.8444444444444444)
recall of label 0 and 1 are (0.9755244755244755, 0.8)


In [7]:
# Render the fitted tree's decision rules as text, labelling each split with
# the name of the feature column it tests (every column except the last).
feature_labels = list(columns[:-1])
tree_rule = export_text(clf, feature_names=feature_labels)
print(tree_rule)

|--- Total day charge <= 44.95
|   |--- Customer service calls <= 3.50
|   |   |--- International plan <= 0.50
|   |   |   |--- Total day charge <= 37.95
|   |   |   |   |--- Total eve charge <= 29.27
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Total eve charge >  29.27
|   |   |   |   |   |--- class: 0
|   |   |   |--- Total day charge >  37.95
|   |   |   |   |--- Total eve charge <= 20.82
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Total eve charge >  20.82
|   |   |   |   |   |--- class: 1
|   |   |--- International plan >  0.50
|   |   |   |--- Total intl calls <= 2.50
|   |   |   |   |--- class: 1
|   |   |   |--- Total intl calls >  2.50
|   |   |   |   |--- Total intl charge <= 3.53
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Total intl charge >  3.53
|   |   |   |   |   |--- class: 1
|   |--- Customer service calls >  3.50
|   |   |--- Total day charge <= 29.50
|   |   |   |--- Total eve charge <= 18.05
|   |   |   |   |--- Total night charge <

Sử dụng tập dữ liệu cân bằng

In [8]:
# Load the balanced training set. It is indexed with the column names taken
# from train.csv: all but the last are features, the last is the target.
train1 = pd.read_csv("train_balance.csv")
balanced_feature_cols = list(columns[:-1])
feature_train1 = train1[balanced_feature_cols].values
target_train1 = train1[columns[-1]].values

In [9]:
# Depth-limited entropy decision tree, fit on the balanced training set.
entropy_params = dict(
    criterion="entropy",
    splitter="best",
    max_depth=5,
    min_samples_split=2,
    random_state=6,
)
clf1 = DecisionTreeClassifier(**entropy_params)
clf1.fit(feature_train1, target_train1)

In [10]:
# Accuracy of the balanced-data tree on the same test split used for `clf`.
test_accuracy1 = clf1.score(feature_test, target_test)
print(f"accuracy is {test_accuracy1}")

accuracy is 0.9115442278860569


In [11]:
# Per-class precision and recall of `clf1` on the test split.
# With average=None scikit-learn returns one score per entry of `labels`,
# computed from the complete label/prediction vectors. This replaces the
# earlier approach of pre-filtering both arrays with boolean masks and scoring
# each class separately, which yields the same ratios but breaks down when a
# mask selects no samples (e.g. a class that is never predicted).
class_labels1 = [0, 1]
pred1 = clf1.predict(feature_test)
precision1 = [
    float(score)
    for score in precision_score(target_test, pred1, labels=class_labels1, average=None)
]
recall1 = [
    float(score)
    for score in recall_score(target_test, pred1, labels=class_labels1, average=None)
]

print(f"precision of label 0 and 1 are {precision1[0], precision1[1]}")
print(f"recall of label 0 and 1 are {recall1[0], recall1[1]}")

precision of label 0 and 1 are (0.9723756906077348, 0.6451612903225806)
recall of label 0 and 1 are (0.9230769230769231, 0.8421052631578947)


In [12]:
# Render the balanced-data tree's decision rules as text, labelling each split
# with the name of the feature column it tests (every column except the last).
feature_labels1 = list(columns[:-1])
tree_rule1 = export_text(clf1, feature_names=feature_labels1)
print(tree_rule1)

|--- Total day charge <= 39.92
|   |--- Customer service calls <= 3.50
|   |   |--- International plan <= 0.50
|   |   |   |--- Total day charge <= 35.45
|   |   |   |   |--- Total eve charge <= 14.34
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Total eve charge >  14.34
|   |   |   |   |   |--- class: 0
|   |   |   |--- Total day charge >  35.45
|   |   |   |   |--- Total eve charge <= 20.86
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Total eve charge >  20.86
|   |   |   |   |   |--- class: 1
|   |   |--- International plan >  0.50
|   |   |   |--- Total intl calls <= 2.50
|   |   |   |   |--- class: 1
|   |   |   |--- Total intl calls >  2.50
|   |   |   |   |--- Total intl charge <= 3.52
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Total intl charge >  3.52
|   |   |   |   |   |--- class: 1
|   |--- Customer service calls >  3.50
|   |   |--- Total day charge <= 30.99
|   |   |   |--- Total intl calls <= 6.50
|   |   |   |   |--- class: 1
|   |   |   

In [13]:
# Persist both fitted trees to disk with joblib.
# clf1 is the tree fit on train_balance.csv; clf is the tree fit on train.csv.
# The second call is the cell's last expression, so its return value is what
# the notebook displays as the cell output.
joblib.dump(clf1, 'decision_tree_model1.joblib')
joblib.dump(clf, 'decision_tree_model.joblib')


['decision_tree_model.joblib']