In [13]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

In [45]:
class Data:
    """Load and preprocess the UCI Adult dataset into NumPy arrays.

    After construction the instance exposes:

    * ``train_dataset`` / ``test_dataset``: 2-D float arrays whose last column
      is the target and whose remaining columns are the encoded features.
    * ``train_features`` / ``train_label`` / ``test_features`` / ``test_label``:
      views onto the feature and target columns of those arrays.
    * ``num_of_train`` / ``num_of_test`` / ``num_of_features`` / ``num_of_label``.
    """

    # Column names, in the order the fields appear in the raw files.
    _COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship',
                'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label']
    # Raw columns parsed as integers.
    _INT_COLUMNS = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    # Raw columns kept as text; a '?' in any of them marks a missing value.
    _TEXT_COLUMNS = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
                     'native-country', 'sex', 'label']

    def __init__(self, train_path='./Adult/adult.data', test_path='./Adult/adult.test') -> None:
        """
        Read both splits, preprocess them and cache the resulting arrays.

        :param train_path: location of the training file (adult.data)
        :param test_path: location of the test file (adult.test)
        """
        # Read the raw data.
        train_frame = self.get_data(train_path, isTest=False)
        test_frame = self.get_data(test_path, isTest=True)
        # Encode / scale both splits into dense arrays.
        self.train_dataset, self.test_dataset = self.data_process(train_frame, test_frame)

        self.train_features = self.train_dataset[:, :-1]  # training X
        self.train_label = self.train_dataset[:, -1]  # training y
        self.test_features = self.test_dataset[:, :-1]  # test X
        self.test_label = self.test_dataset[:, -1]  # test y

        # Basic dataset statistics.
        self.num_of_train = self.train_dataset.shape[0]  # number of training rows
        self.num_of_test = self.test_dataset.shape[0]  # number of test rows
        self.num_of_features = self.train_features.shape[1]  # number of feature columns
        self.num_of_label = 2  # number of target classes

    @staticmethod
    def get_data(filepath, isTest=False):
        """
        Read one Adult data file.

        :param filepath: location of the adult.data / adult.test file
        :param isTest: whether the file is the test split; if so its first line
                       (a header in adult.test) is skipped
        :return: DataFrame with rows containing '?' removed, 'sex' encoded as
                 0 (Male) / 1 (otherwise) and 'label' encoded as
                 0 ('<=50K') / 1 (otherwise)
        """
        records = []
        with open(filepath) as lines:
            if isTest:
                # Skip the header line; the default avoids StopIteration on an empty file.
                next(lines, None)
            for line in lines:
                # Skip blank lines explicitly instead of blindly dropping the last
                # row, which would lose a real record when the file does not end
                # with an empty line.
                if not line.strip():
                    continue
                records.append([field.strip() for field in line.split(',')])

        data = pd.DataFrame(records, columns=Data._COLUMNS)

        for col in Data._INT_COLUMNS:
            data[col] = data[col].astype(int)

        for col in Data._TEXT_COLUMNS:
            cleaned = data[col].astype(str).str.strip()
            # '?' denotes a missing value; turn it into NaN so dropna removes the row.
            data[col] = cleaned.mask(cleaned == '?')

        data = data.dropna(axis='index', how='any')

        data['sex'] = (data['sex'] != 'Male').astype(int)

        # Labels in the test file carry a trailing '.', e.g. '<=50K.'.
        data['label'] = (data['label'].str.strip('.') != '<=50K').astype(int)

        return data

    @staticmethod
    def data_process(train_dataset, test_dataset):
        """
        Preprocess both splits:
        1. categorical attributes are one-hot encoded
        2. numerical attributes are Min-Max normalised

        The transformers are fitted on the training split only and then applied
        to the test split.

        :param train_dataset: training data as returned by get_data, DataFrame
        :param test_dataset: test data as returned by get_data, DataFrame
        :return: (train_dataset, test_dataset), both dense 2-D arrays whose last
                 column is the target
        """
        # NOTE(review): the 'sex' column is split off as the target here, while the
        # income 'label' column stays among the (scaled) features — confirm that
        # this is the intended prediction task.
        X_train_raw = train_dataset.drop(columns=['sex'])
        y_train = np.array(train_dataset['sex'])

        X_test_raw = test_dataset.drop(columns=['sex'])
        y_test = np.array(test_dataset['sex'])

        num_col = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'label']
        cat_col = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']

        full_pipeline = ColumnTransformer(
            [
                ("num", MinMaxScaler(), num_col),
                # Categories that only occur in the test split are encoded as all
                # zeros instead of raising during transform.
                ("cat", OneHotEncoder(handle_unknown='ignore'), cat_col),
            ],
            # Always return a dense array; with the default threshold the result is
            # sparse or dense depending on the data's density, so an unconditional
            # .toarray() is not safe.
            sparse_threshold=0,
        )

        X_train = full_pipeline.fit_transform(X_train_raw)
        X_test = full_pipeline.transform(X_test_raw)

        # Append the target as the last column.
        train_dataset = np.concatenate([X_train, y_train.reshape(-1, 1)], axis=1)
        test_dataset = np.concatenate([X_test, y_test.reshape(-1, 1)], axis=1)

        return train_dataset, test_dataset

    @staticmethod
    def load_array(data_arrays, batch_size, is_train=True):
        """
        Wrap arrays in a batched tf.data.Dataset.

        :param data_arrays: (X_array, y_array)
        :param batch_size: size of each batch
        :param is_train: whether the data is the training set; if so the samples
                         are shuffled (fixed seed 42) before batching
        :return: dataset that yields one batch per iteration
        """
        dataset = tf.data.Dataset.from_tensor_slices(data_arrays)
        if is_train:
            # len(data_arrays) is the number of arrays in the tuple (2), not the
            # number of samples; the buffer must span all samples for a full shuffle.
            first_array = data_arrays[0] if isinstance(data_arrays, tuple) else data_arrays
            num_samples = len(first_array)
            # shuffle() requires a positive buffer size.
            dataset = dataset.shuffle(buffer_size=max(num_samples, 1), seed=42)
        dataset = dataset.batch(batch_size)

        return dataset

In [46]:
# Build the dataset wrapper: the constructor reads the Adult train/test files and
# preprocesses them into NumPy arrays.
data = Data()

In [47]:
# Inspect the preprocessed training feature matrix (every column of
# data.train_dataset except the last one).
data.train_features

array([[0.30136986, 0.04333771, 0.8       , ..., 1.        , 0.        ,
        0.        ],
       [0.45205479, 0.04727738, 0.8       , ..., 1.        , 0.        ,
        0.        ],
       [0.28767123, 0.1372439 , 0.53333333, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.56164384, 0.09391367, 0.53333333, ..., 1.        , 0.        ,
        0.        ],
       [0.06849315, 0.1276201 , 0.53333333, ..., 1.        , 0.        ,
        0.        ],
       [0.47945205, 0.18638336, 0.53333333, ..., 1.        , 0.        ,
        0.        ]])

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [49]:
# Logistic-regression baseline; max_iter is raised above scikit-learn's default of
# 100 — presumably to let the solver converge on this data (TODO confirm).
log = LogisticRegression(max_iter=1000)

In [50]:
# 5-fold cross-validation of the baseline on the training split.
# NOTE(review): data.train_label is the encoded 'sex' column produced by
# Data.data_process (the income 'label' is one of the features) — confirm that this
# is the intended target before interpreting the scores.
cross_val_score(log, data.train_features, data.train_label, cv=5)

array([0.84352727, 0.85479861, 0.85361406, 0.84797745, 0.84996684])

In [None]:
# Data.load_array has a required batch_size parameter; calling it without one
# raises TypeError. 64 is a placeholder value — tune as needed.
data_iter = data.load_array((data.train_features, data.train_label), batch_size=64)