In [13]:
import numpy as np
import pandas as pd
from collections import Counter


sex: 性别 - 女性, 男性。  
age: 年龄 - 连续变量。  
workclass: 工作类别 - 私营部门, 自雇非公司, 自雇公司, 联邦政府, 地方政府, 州政府, 无薪, 从未工作。  
fnlwgt: 最终权重 - 连续变量。  
education: 教育程度 - 学士, 大学肄业, 11年级, 高中毕业, 专业学院, 副学士(学术类), 副学士(职业类), 9年级, 7-8年级, 12年级, 硕士, 1-4年级, 10年级, 博士, 5-6年级, 学前班。  
education-num: 教育编号 - 连续变量。  
marital-status: 婚姻状况 - 已婚(平民配偶), 离婚, 未婚, 分居, 丧偶, 已婚(配偶不在身边), 已婚(军人配偶)。  
occupation: 职业 - 技术支持, 手工艺维修, 其他服务, 销售, 高管管理, 专业领域, 清洁工, 机械操作检查, 行政文书, 农业/渔业, 运输搬运, 私人家庭服务, 保护服务, 武装部队。  
relationship: 家庭关系 - 妻子, 子女, 丈夫, 非家庭成员, 其他亲属, 未婚。  
race: 种族 - 白人, 亚太岛民, 美国印第安人/爱斯基摩人, 其他, 黑人。  
capital-gain: 资本收益 - 连续变量。  
capital-loss: 资本损失 - 连续变量。  
hours-per-week: 每周工作小时数 - 连续变量。  
native-country: 原籍国 - 美国, 柬埔寨, 英国, 波多黎各, 加拿大, 德国, 美国海外属地(关岛-美属维尔金群岛等), 印度, 日本, 希腊, 南部, 中国, 古巴, 伊朗, 洪都拉斯, 菲律宾, 意大利, 波兰, 牙买加, 越南, 墨西哥, 葡萄牙, 爱尔兰, 法国, 多米尼加共和国, 老挝, 厄瓜多尔, 台湾, 海地, 哥伦比亚, 匈牙利, 危地马拉, 尼加拉瓜, 苏格兰, 泰国, 南斯拉夫, 萨尔瓦多, 特立尼达和多巴哥, 秘鲁, 香港, 荷兰。  
income: 收入是否大于50K。  

In [14]:
# Column names for the two CSV files, which are read without a header row.
headers = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    'income'
]
train_data = pd.read_csv('adult.data',header=None)
test_data = pd.read_csv('adult.test',header=None)
train_data.columns = headers
test_data.columns = headers

# Encode the target as 1 (>50K) / 0 (<=50K). The keys keep the leading space
# that the string fields carry after being read verbatim.
income_map = {
    ' >50K': 1,
    ' <=50K': 0
}
train_data['income'] = train_data['income'].map(income_map)
# The test file spells the same labels with a trailing period.
income_map = {
   ' >50K.': 1,
   ' <=50K.': 0
}
test_data['income'] = test_data['income'].map(income_map)

# .map() turns any unrecognised label into NaN, and the later astype(np.int64)
# would convert that NaN into a meaningless integer instead of failing.
if train_data['income'].isna().any() or test_data['income'].isna().any():
    raise ValueError(
        "Unrecognised income label: expected ' >50K'/' <=50K' in adult.data "
        "and ' >50K.'/' <=50K.' in adult.test"
    )

# Select the columns whose dtype is object (the categorical string columns).
object_columns = train_data.select_dtypes(include=['object']).columns
# One-hot encode those columns in both frames.
train_data = pd.get_dummies(train_data, columns=object_columns)
test_data = pd.get_dummies(test_data,columns=object_columns)

# Give the test frame exactly the training columns, in the training order.
# A category that never occurs in the test file (here
# 'native-country_ Holand-Netherlands') is added as an all-False column.
# NOTE: a category present only in the test file would be dropped, because
# the models are trained on the training columns only.
test_data = test_data.reindex(columns=train_data.columns, fill_value=False)

# Feature matrices as float64 and label vectors as int64.
X_train = train_data.drop(columns='income').to_numpy().astype(np.float64)
y_train = train_data['income'].to_numpy().astype(np.int64)
X_test = test_data.drop(columns='income').to_numpy().astype(np.float64)
y_test = test_data['income'].to_numpy().astype(np.int64)

In [15]:
# Print a summary of the one-hot-encoded training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Columns: 109 entries, age to native-country_ Yugoslavia
dtypes: bool(102), int64(7)
memory usage: 4.9 MB


In [16]:
class Node:
    """A single node of a binary decision tree.

    An internal node stores the split (``feature`` index and ``threshold``)
    together with its ``left`` and ``right`` children. A leaf stores only the
    predicted ``value``, which must be passed by keyword.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description; left as None on leaves.
        self.feature, self.threshold = feature, threshold
        # Child subtrees; left as None on leaves.
        self.left, self.right = left, right
        # Predicted label; None marks an internal node.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node carries a prediction."""
        return not (self.value is None)

class DecisionTree:
    """Classification tree grown greedily by maximising information gain.

    Every internal node tests ``x[feature] <= threshold``; samples that pass
    go to the left child, the others to the right child.

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    max_depth : int
        A node at this depth becomes a leaf.
    n_features : int or None
        Number of features drawn (without replacement) as split candidates at
        each node. A falsy value means "all features of the X given to fit".

    Notes
    -----
    Labels must be non-negative integers because entropy is computed with
    ``np.bincount``.
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_features=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = None

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for one sample and return its value."""
        if node.is_leaf_node():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def fit(self, X, y):
        """Grow the tree on a 2-D feature array ``X`` and integer labels ``y``."""
        # Resolves (and overwrites) the hyperparameter: all features when it
        # was left unset, otherwise capped at the number of columns in X.
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        return [self._traverse_tree(x, self.root) for x in X]

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples in ``X``/``y``."""
        n_samples, n_features = X.shape
        n_labels = len(set(y))
        # Stop when deep enough, when the node is pure, or when it is too small.
        if (depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)
        feature_idxs = np.random.choice(n_features, self.n_features, replace=False)
        best_feature, best_threshold = self._best_split(X, y, feature_idxs)
        # No candidate feature has two distinct values here, so the samples
        # cannot be separated: predict the majority label instead of
        # recursing into an empty child.
        if best_feature is None:
            return Node(value=self._most_common_label(y))
        left_idxs, right_idxs = self._split(X[:, best_feature], best_threshold)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_threshold, left, right)

    def _best_split(self, X, y, feature_idxs):
        """Return ``(feature_index, threshold)`` of the highest-gain split.

        Only thresholds that leave samples on both sides are considered.
        Returns ``(None, None)`` when no such threshold exists for any of the
        candidate features.
        """
        best_gain = -1
        split_idx, split_threshold = None, None
        for idx in feature_idxs:
            X_column = X[:, idx]
            # np.unique returns sorted values. Splitting at the largest one
            # would send every sample left, so it is not a valid candidate.
            thresholds = np.unique(X_column)[:-1]
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = idx
                    split_threshold = threshold
        return split_idx, split_threshold

    def _information_gain(self, y, X_column, threshold):
        """Return parent entropy minus the size-weighted entropy of the children.

        Returns 0 when the threshold leaves one side empty.
        """
        parent_entropy = self._entropy(y)
        left_idxs, right_idxs = self._split(X_column, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        information_gain = parent_entropy - child_entropy
        return information_gain

    def _split(self, X_column, split_threshold):
        """Return the positions with value ``<= threshold`` and those ``> threshold``."""
        left_idxs = np.flatnonzero(X_column <= split_threshold)
        right_idxs = np.flatnonzero(X_column > split_threshold)
        return left_idxs, right_idxs

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the label distribution in ``y``."""
        hist = np.bincount(y)
        ps = hist / len(y)
        # Labels with zero count are skipped so log2(0) is never evaluated.
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

In [17]:
# Seed the global NumPy RNG: DecisionTree._grow_tree draws the candidate
# feature order with np.random.choice, so without a seed the choice between
# equally good splits can differ from run to run.
np.random.seed(42)
# Train a single shallow tree (depth 3) on the full training set.
clf = DecisionTree(max_depth=3)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import precision_score, recall_score, roc_curve, auc

# Score the single tree on the held-out test set; the first number printed
# is plain accuracy.
y_pred = clf.predict(X_test)
print(np.mean(y_pred==y_test))

# The ROC curve is built from hard 0/1 predictions, not from scores.
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"ROC-AUC 值: {roc_auc}")
print(f'精确率: {precision}')
print(f'召回率: {recall}')

0.8447884036607088
ROC-AUC 值: 0.7281382063485192
精确率: 0.7555211158465711
召回率: 0.5070202808112324


In [19]:
class RandomForest:
    """Ensemble of DecisionTree classifiers combined by majority vote.

    Each tree is trained on a bootstrap sample (rows drawn with replacement)
    of the training data. ``min_samples_split``, ``max_depth`` and
    ``n_features`` are passed unchanged to every tree.
    """

    def __init__(self, n_trees=100, min_samples_split=2, max_depth=100, n_features=None):
        self.trees = []
        self.n_trees = n_trees
        self.n_features = n_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Discard any previous trees and train ``n_trees`` new ones."""
        self.trees = []
        tree_kwargs = dict(
            min_samples_split=self.min_samples_split,
            max_depth=self.max_depth,
            n_features=self.n_features,
        )
        for _ in range(self.n_trees):
            X_boot, y_boot = self._bootstrap_samples(X, y)
            member = DecisionTree(**tree_kwargs)
            member.fit(X_boot, y_boot)
            self.trees.append(member)

    def predict(self, X):
        """Return an array holding the majority-vote label for each row of ``X``."""
        # Shape (n_trees, n_samples) -> (n_samples, n_trees): one row of votes per sample.
        per_tree = np.array([member.predict(X) for member in self.trees])
        per_sample = np.swapaxes(per_tree, 0, 1)
        return np.array([self._most_common_label(votes) for votes in per_sample])

    def _bootstrap_samples(self, X, y):
        """Draw as many rows as ``X`` has, with replacement, keeping X and y paired."""
        n_rows = X.shape[0]
        picked = np.random.choice(n_rows, n_rows, replace=True)
        return X[picked], y[picked]

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        return Counter(y).most_common(1)[0][0]

In [21]:
# Seed the global NumPy RNG: RandomForest draws its bootstrap samples with
# np.random.choice, so without a seed the metrics below change on every run.
np.random.seed(42)
rf = RandomForest(n_trees=5, min_samples_split=2, max_depth=5, n_features=None)
rf.fit(X_train, y_train)
# Predict on the held-out test set; the first number printed is plain accuracy.
y_pred = rf.predict(X_test)
print(np.mean(y_pred==y_test))
from sklearn.metrics import precision_score, recall_score, roc_curve, auc

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# The ROC curve is built from hard 0/1 predictions, not from scores.
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

print(f"ROC-AUC 值: {roc_auc}")
print(f'精确率: {precision}')
print(f'召回率: {recall}')

0.8469381487623611
ROC-AUC 值: 0.723170000382645
精确率: 0.7816139767054908
召回率: 0.48855954238169524
