# Report: Decision trees

## Task:

1. You need to select a dataset for this task. Please note that it must be run for both classification and regression
2. Explore this dataset and remove outliers if necessary
3. Fill in missing values in the dataset if necessary
4. Use sklearn.tree.DecisionTreeClassifier to classify the data. Adjust the classifier parameters to achieve the best result
5. Use sklearn.tree.DecisionTreeRegressor for regression, adjust the algorithm parameters to get the best result

- Student: Jingyu Yan

## Solutions

### 1. Select a dataset



In [5]:
import pandas as pd
from sklearn.datasets import load_boston

# Load the Boston house price dataset
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df['MEDV'] = boston.target  # Add the median home price as a new column

# Displays the first few rows of the data set and descriptive statistics
boston_df.head()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
# 检查数据集中是否存在缺失值
missing_values = boston_df.isnull().sum()

# 查看各特征的异常值情况，通过计算Z-score的方式
from scipy import stats
import numpy as np

# 计算Z-score
z_scores = np.abs(stats.zscore(boston_df))
threshold = 3  # 通常Z-score大于3被认为是异常值
outliers = (z_scores > threshold).sum(axis=0)

missing_values, outliers


(CRIM       0
 ZN         0
 INDUS      0
 CHAS       0
 NOX        0
 RM         0
 AGE        0
 DIS        0
 RAD        0
 TAX        0
 PTRATIO    0
 B          0
 LSTAT      0
 MEDV       0
 dtype: int64,
 CRIM        8
 ZN         14
 INDUS       0
 CHAS       35
 NOX         0
 RM          8
 AGE         0
 DIS         5
 RAD         0
 TAX         0
 PTRATIO     0
 B          25
 LSTAT       5
 MEDV        0
 dtype: int64)

In [7]:
# 对于存在异常值的特征，使用中位数进行替换
features_with_outliers = ['CRIM', 'ZN', 'RM', 'DIS', 'B', 'LSTAT']
for feature in features_with_outliers:
    median_value = boston_df[feature].median()
    # 只替换超出3个标准差的值
    boston_df[feature] = np.where(z_scores[feature] > threshold, median_value, boston_df[feature])

# 检查替换后的数据
boston_df.describe()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,2.738417,8.863636,11.136779,0.06917,0.554695,6.283372,68.574901,3.718598,9.549407,408.237154,18.455534,374.644269,12.412549,22.532806
std,5.057613,19.171611,6.860353,0.253994,0.115878,0.636122,28.148861,1.980432,8.707259,168.537116,2.164946,51.835781,6.758134,9.197104
min,0.00632,0.0,0.46,0.0,0.385,4.368,2.9,1.1296,1.0,187.0,12.6,83.45,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8895,45.025,2.100175,4.0,279.0,17.4,378.5225,6.95,17.025
50%,0.25519,0.0,9.69,0.0,0.538,6.2085,77.5,3.203325,5.0,330.0,19.05,391.445,11.35,21.2
75%,2.80872,0.0,18.1,0.0,0.624,6.6055,94.075,5.112625,24.0,666.0,20.2,396.225,16.57,25.0
max,28.6558,80.0,27.74,1.0,0.871,8.375,100.0,9.2229,24.0,711.0,22.0,396.9,34.02,50.0


In [9]:
# 创建分类目标变量，高于中位数的为1，低于或等于中位数的为0
median_value = boston_df['MEDV'].median()
boston_df['High_Value'] = (boston_df['MEDV'] > median_value).astype(int)

# 显示更新后的数据框架头几行
boston_df.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,High_Value
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,1
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,1
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,1


In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# 分割数据集为训练集和测试集
X = boston_df.drop(['MEDV', 'High_Value'], axis=1)
y_class = boston_df['High_Value']
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)

# 设置决策树分类器的参数网格
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# 创建GridSearchCV对象进行参数搜索
dtc = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5, scoring='accuracy')

# 训练模型
grid_search.fit(X_train, y_train)

# 最佳模型和参数
best_classifier = grid_search.best_estimator_
best_params = grid_search.best_params_

# 使用最佳模型在测试集上进行预测并评估结果
y_pred = best_classifier.predict(X_test)
classification_results = classification_report(y_test, y_pred)

best_params, classification_results


({'max_depth': None, 'min_samples_leaf': 10, 'min_samples_split': 2},
 '              precision    recall  f1-score   support\n\n           0       0.90      0.75      0.82        60\n           1       0.71      0.88      0.79        42\n\n    accuracy                           0.80       102\n   macro avg       0.81      0.82      0.80       102\nweighted avg       0.82      0.80      0.81       102\n')

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 使用相同的数据分割
y_regr = boston_df['MEDV']

# 设置决策树回归器的参数网格
param_grid_regr = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# 创建GridSearchCV对象进行参数搜索
dtr = DecisionTreeRegressor(random_state=42)
grid_search_regr = GridSearchCV(estimator=dtr, param_grid=param_grid_regr, cv=5, scoring='neg_mean_squared_error')

# 训练模型
grid_search_regr.fit(X_train, y_train)

# 最佳模型和参数
best_regressor = grid_search_regr.best_estimator_
best_params_regr = grid_search_regr.best_params_

# 使用最佳模型在测试集上进行预测并评估结果
y_pred_regr = best_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_regr)
r2 = r2_score(y_test, y_pred_regr)

best_params_regr, mse, r2


({'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2},
 0.08924297743465144,
 0.6315539931626535)