In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

In [4]:
# CART classification on the iris dataset.
# Load the feature matrix and the class labels.
iris = load_iris()
features = iris.data
labels = iris.target
# Hold out 30% of the samples for testing; the seed fixes the split.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.3, random_state=0
)
# Gini impurity is the CART split criterion. The estimator is seeded as well
# as the split: scikit-learn permutes candidate features at each split, so an
# unseeded tree (and its reported accuracy) can differ between runs.
cart = DecisionTreeClassifier(criterion="gini", random_state=0)
cart = cart.fit(train_features, train_labels)
test_predict = cart.predict(test_features)
# Accuracy on the held-out test split.
acc = accuracy_score(test_labels, test_predict)
print(f"CART树分类准确率:{acc}")

CART树分类准确率:0.9777777777777777


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [14]:
# CART regression on the California housing dataset.
housing = fetch_california_housing()
print(housing.feature_names)
features = housing.data
labels = housing.target
# Hold out 30% of the samples for testing; the seed fixes the split.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.3, random_state=0
)
# Seed the regressor too: tie-breaking between equally good splits is
# randomised in scikit-learn, so an unseeded tree is not reproducible.
cart = DecisionTreeRegressor(random_state=0)
cart = cart.fit(train_features, train_labels)
test_predict = cart.predict(test_features)
# Evaluation metrics on the held-out test split:
# mean squared error and mean absolute error.
MSE = mean_squared_error(test_labels, test_predict)
MAE = mean_absolute_error(test_labels, test_predict)
print(f"回归树的均方差为:{MSE}")
print(f"回归树的绝对偏差为:{MAE}")

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
回归树的均方差为:0.5419527317160046
回归树的绝对偏差为:0.4679359738372093


#### 经典数据集
生存预测-分类问题

In [55]:
# Load the Titanic train/test CSV files (paths are relative to the notebook's
# working directory) and print a quick overview of the training frame.
train_data = pd.read_csv("./titanic/train.csv")
test_data = pd.read_csv("./titanic/test.csv")
print("训练数据集大小")
print(train_data.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_data.info()
print("-"*50)
print(train_data.describe())
print("-"*50)
print(train_data.head())

训练数据集大小
(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
--------------------------------------------------
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838

In [56]:
# Fill missing ages and fares with the mean of the TRAINING set.
#
# Two fixes compared with the previous version of this cell:
# * The result of fillna() is assigned back to the column instead of calling
#   fillna(..., inplace=True) on `frame[col]`. That chained in-place form
#   operates on an intermediate Series; it emits a FutureWarning on recent
#   pandas and does not modify the frame under Copy-on-Write.
# * Test-set ages are filled with the training mean (previously the test
#   mean), matching how the fare is handled and keeping test-set statistics
#   out of the preprocessing.
age_mean = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_mean)
test_data['Age'] = test_data['Age'].fillna(age_mean)
# Impute the ticket fare the same way.
fare_mean = train_data['Fare'].mean()
train_data['Fare'] = train_data['Fare'].fillna(fare_mean)
test_data['Fare'] = test_data['Fare'].fillna(fare_mean)

In [57]:
# Print how often each embarkation port occurs: training set first, then the
# test set.
for frame in (train_data, test_data):
    print(frame['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64
S    270
C    102
Q     46
Name: Embarked, dtype: int64


In [58]:
# Fill missing embarkation ports with 'S', the most frequent value in the
# value counts printed for both frames.
# The filled column is assigned back rather than using
# `frame[col].fillna(..., inplace=True)`: that chained in-place call works on
# an intermediate Series, warns on recent pandas, and leaves the frame
# untouched under Copy-on-Write.
train_data['Embarked'] = train_data['Embarked'].fillna('S')
test_data['Embarked'] = test_data['Embarked'].fillna('S')

In [59]:
# Keep only the columns used as model inputs, dropping redundant or
# irrelevant ones; 'Survived' is the prediction target.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
train_labels = train_data['Survived']
train_features = train_data[features]
test_features = test_data[features]

In [60]:
from sklearn.feature_extraction import DictVectorizer
dvec = DictVectorizer(sparse=False)
# Convert each training row to a dict and vectorize it; string-valued columns
# become one indicator column per category (e.g. 'Sex=male'). The vectorizer
# is fitted on the training set only (fit_transform).
# 'records' is the documented orient name. The abbreviation 'record' raised a
# FutureWarning and is no longer accepted by newer pandas releases.
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


  train_features = dvec.fit_transform(train_features.to_dict(orient='record'))


In [52]:
# Train a decision tree with the entropy criterion (labelled "ID3" in this
# notebook). The estimator is seeded so the fitted tree is reproducible.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(train_features,train_labels)
# The test set is only transformed with the already-fitted vectorizer.
# The isinstance guard keeps the cell re-runnable: once test_features has been
# vectorized it is an ndarray and has no .to_dict().
# 'records' replaces the deprecated orient abbreviation 'record'.
if isinstance(test_features, pd.DataFrame):
    test_features = dvec.transform(test_features.to_dict(orient='records'))
test_predict = clf.predict(test_features)

  test_features = dvec.transform(test_features.to_dict(orient='record'))


In [53]:
# Score the tree on the same data it was fitted on (i.e. training accuracy),
# rounded to two decimals for display.
train_accuracy = clf.score(train_features, train_labels)
acc_decision_tree = round(train_accuracy, 2)
print(f"ID3决策树的准确率为:{acc_decision_tree}")

ID3决策树的准确率为:0.98


In [54]:
# K-fold cross-validation (10 folds) of the entropy tree on the training
# data; the reported figure is the mean of the per-fold scores.
from sklearn.model_selection import cross_val_score
fold_scores = cross_val_score(clf, train_features, train_labels, cv=10)
k_folds_acc = np.mean(fold_scores)
print(f"K折交叉验证的准确率:{k_folds_acc}")

K折交叉验证的准确率:0.7734082397003745


In [61]:
# Train a CART decision tree (Gini criterion) on the Titanic training data.
# The estimator is seeded so the fitted tree is reproducible.
cart = DecisionTreeClassifier(criterion='gini', random_state=0)
cart = cart.fit(train_features,train_labels)
# Vectorize the test set only if it is still a DataFrame. When the notebook is
# run top to bottom an earlier cell has already turned test_features into an
# ndarray, and calling .to_dict() on it raised AttributeError.
# 'records' replaces the deprecated orient abbreviation 'record'.
if isinstance(test_features, pd.DataFrame):
    test_features = dvec.transform(test_features.to_dict(orient='records'))
# Predict on the test set (previously this predicted on the training
# features, unlike the entropy-tree cell, which predicts on the test set).
predict_val = cart.predict(test_features)

  test_features = dvec.transform(test_features.to_dict(orient='record'))


In [62]:
# Training accuracy of the CART tree (scored on the data it was fitted on),
# rounded to two decimals.
cart_train_accuracy = cart.score(train_features, train_labels)
acc_cart = round(cart_train_accuracy, 2)
print(f"CART树准确率:{acc_cart}")
# Mean accuracy over a 10-fold cross-validation on the training data.
cart_fold_scores = cross_val_score(cart, train_features, train_labels, cv=10)
acc_k_folds = np.mean(cart_fold_scores)
print(f"K折交叉验证的准确率为:{acc_k_folds}")

CART树准确率:0.98
K折交叉验证的准确率为:0.7901622971285892


In [12]:
# Handwritten-digits classification with a CART tree.
from sklearn.datasets import load_digits
digits = load_digits()
features = digits.data
labels = digits.target
print("shape:",features.shape)
# Hold out 30% of the samples for testing; the seed fixes the split.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.3, random_state=0
)
# Seed the estimator as well as the split: scikit-learn permutes candidate
# features at each split, so an unseeded tree can give a different accuracy
# on every run.
clf = DecisionTreeClassifier(criterion='gini', random_state=0)
clf = clf.fit(train_features, train_labels)
predict_val = clf.predict(test_features)
print(predict_val)
# Accuracy on the held-out test split.
acc = accuracy_score(test_labels, predict_val)
print(f"分类准确率：{acc}")

shape: (1797, 64)
[2 8 2 6 6 7 1 9 8 5 2 8 6 6 6 6 4 0 5 8 8 7 3 4 7 5 4 9 2 9 4 7 6 8 9 1 3
 1 0 1 1 6 7 7 1 0 3 0 2 1 9 1 7 9 9 0 9 8 6 3 0 2 3 4 1 9 2 6 9 1 8 6 5 1
 2 8 3 2 9 7 1 2 6 0 9 3 7 5 1 2 5 9 3 1 7 7 1 8 5 8 5 4 2 5 9 0 7 1 4 7 3
 4 8 9 7 9 8 0 4 5 2 5 1 4 7 7 0 6 1 5 8 3 9 5 9 9 3 7 5 6 1 8 6 5 6 1 5 1
 5 9 9 1 8 3 6 1 8 9 2 7 6 7 9 5 6 0 8 1 9 3 6 1 0 4 4 6 3 9 6 7 4 9 6 7 9
 3 3 3 0 7 7 5 7 8 0 7 8 9 6 4 5 0 1 4 6 4 3 3 0 9 5 5 3 3 4 2 1 6 0 9 9 4
 9 3 7 6 2 3 3 1 6 9 3 6 3 3 2 0 7 6 1 4 3 8 2 7 8 5 5 7 5 6 3 7 8 6 5 5 8
 0 9 1 6 5 1 7 4 3 8 0 3 6 4 6 3 3 6 4 8 8 4 6 7 8 6 4 8 3 2 4 6 9 0 5 4 3
 4 4 2 9 0 1 7 2 0 9 6 6 4 1 0 7 9 8 5 7 8 2 3 4 3 9 2 6 9 1 5 9 0 8 5 4 3
 5 6 8 2 7 2 3 5 1 6 4 5 0 9 4 1 1 7 0 1 9 0 5 4 7 8 8 6 5 3 4 4 4 9 8 7 0
 9 6 3 5 2 3 0 8 2 3 1 3 4 0 0 4 6 0 7 7 6 2 0 4 4 2 3 7 8 9 8 6 8 5 6 2 2
 3 1 7 3 8 0 3 3 2 6 5 5 9 1 3 7 0 0 9 0 8 5 8 3 3 4 7 1 8 9 8 3 6 6 1 6 2
 1 7 5 5 1 9 2 9 9 7 2 1 4 9 3 2 6 2 5 9 4 5 8 3 0 6 3 0 4 8 4 1 8 6 4 3 4
 8 0 4 