In [1]:
import pandas as pd

In [2]:
# Load and clean an Adult-format data file
def get_data(filepath, isTest=False):
    """Read a comma-separated Adult-format file into a cleaned DataFrame.

    Every non-blank line is split on commas into the 15 expected columns and
    each field is stripped of surrounding whitespace. Rows in which any
    string-valued column equals ``'?'`` are dropped. ``sex`` is encoded as
    0 for ``'Male'`` and 1 otherwise; ``label`` is encoded as 0 for
    ``'<=50K'`` (an optional surrounding ``'.'`` is ignored) and 1 otherwise.

    Parameters
    ----------
    filepath : str
        Path of the file to read.
    isTest : bool, default False
        When True, the first line of the file is skipped before parsing.

    Returns
    -------
    pandas.DataFrame
        Cleaned records. The index keeps each record's position among the
        parsed lines, so it has gaps where rows were dropped.
    """
    columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
               'marital-status', 'occupation', 'relationship',
               'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
               'native-country', 'label']
    int_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    str_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship',
                'race', 'native-country', 'sex', 'label']

    rows = []
    with open(filepath) as lines:
        if isTest:
            # Skip the leading non-data line; tolerate an empty file.
            next(lines, None)
        for line in lines:
            # Skip blank lines wherever they occur instead of assuming that
            # exactly one trails the file (which silently dropped a real
            # record when the file had no trailing blank line).
            if not line.strip():
                continue
            rows.append([field.strip() for field in line.split(',')])

    data = pd.DataFrame(rows, columns=columns)

    for col in int_cols:
        data[col] = data[col].astype(int)

    for col in str_cols:
        values = data[col].astype(str)
        # '?' marks a missing value; turn it into NaN so dropna removes the row.
        data[col] = values.mask(values == '?')

    data = data.dropna(axis='index', how='any')

    # assign() builds a new frame, so nothing is written into a slice.
    return data.assign(**{
        'sex': (data['sex'] != 'Male').astype('int64'),
        'label': (data['label'].str.strip('.') != '<=50K').astype('int64'),
    })

In [3]:
data = get_data('./Adult/adult.data', False)

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,0,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,1,0,0,40,Cuba,0


In [5]:
train = data.copy()

In [15]:
X_train_raw = train.drop(columns=['label'])

In [16]:
X_train_raw

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,0,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,1,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,1,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,0,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,1,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,0,0,0,20,United-States


In [9]:
y_train = train['label']

In [10]:
y_train

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: label, Length: 30162, dtype: int64

In [11]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np

In [12]:
# Columns passed to the scaler; 'sex' is listed here because get_data already encodes it as 0/1.
num_col = ['age', 'fnlwgt', 'education-num', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week']
# String-valued columns passed to the one-hot encoder.
cat_col = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']

In [18]:
# Preprocessing: min-max scale the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes transform() emit an all-zero
# block for a category that was not present when the encoder was fitted,
# instead of raising; the output for already-seen categories is unchanged.
full_pipeline = ColumnTransformer([
    ("num", MinMaxScaler(), num_col),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_col)])

In [19]:
X_train = full_pipeline.fit_transform(X_train_raw)

In [20]:
X_train

<30162x103 sparse matrix of type '<class 'numpy.float64'>'
	with 345148 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.base import clone

In [24]:
# Manual cross-validation
def eval_model(model_dic, X_train, y_train, isStratified=True):
    """Run shuffled 5-fold cross-validation for every model and report scores.

    For each fold a fresh clone of the model is fitted on the training part
    and evaluated on the held-out part. Accuracy is recorded per fold; the
    second column of ``predict_proba`` is collected out-of-fold for all
    samples and a single AUC is computed from it.

    Parameters
    ----------
    model_dic : dict
        Maps a display name to an unfitted estimator that provides ``fit``,
        ``predict`` and ``predict_proba``.
    X_train
        Feature matrix; must support ``X_train[indices, :]`` row indexing.
    y_train
        Labels; must support ``.iloc[indices]`` and expose ``.shape``.
    isStratified : bool, default True
        Use ``StratifiedKFold`` when True, plain ``KFold`` otherwise.

    Returns
    -------
    dict
        ``{model_name: {'accuracy': [per-fold accuracies], 'auc': float}}``.
        The same figures are also printed, one report per model.
    """
    splitter_cls = StratifiedKFold if isStratified else KFold
    kfolds = splitter_cls(n_splits=5, shuffle=True, random_state=3407)

    report = "{}分类器的5折交叉验证结果为: \n accuracy为: {}({})\n auc为: {}"
    results = {}

    for model_name, model in model_dic.items():
        fold_accuracy = []
        # Out-of-fold probabilities, filled in one validation fold at a time.
        y_pred_prb = np.zeros(shape=y_train.shape)

        for train_indices, valid_indices in kfolds.split(X_train, y_train):
            # Clone so the estimator passed in is never fitted or mutated.
            clone_clf = clone(model)
            clone_clf.fit(X_train[train_indices, :], y_train.iloc[train_indices])

            X_valid_fold = X_train[valid_indices, :]
            y_pred = clone_clf.predict(X_valid_fold)
            y_pred_prb[valid_indices] = clone_clf.predict_proba(X_valid_fold)[:, 1]

            fold_accuracy.append(accuracy_score(y_train.iloc[valid_indices], y_pred))

        scores = {'accuracy': fold_accuracy,
                  'auc': roc_auc_score(y_train, y_pred_prb)}

        print(report.format(model_name,
                            np.mean(scores['accuracy']),
                            np.std(scores['accuracy']),
                            scores['auc']))

        # Keep the metrics so callers can use them, not only read the printout.
        results[model_name] = scores

    return results

In [28]:
log_clf = LogisticRegression(max_iter=1000)

In [29]:
model_dic = {"log":log_clf}

In [30]:
eval_model(model_dic, X_train, y_train)

log分类器的5折交叉验证结果为: 
 accuracy为: 0.8459318795255626(0.006034754588434064)
 auc为: 0.9022206688663666


## 测试

In [32]:
test_data = get_data('./Adult/adult.test', isTest=True)

In [33]:
test = test_data.copy()

In [34]:
X_test_raw = test.drop(columns=['label'])

In [35]:
y_test = test['label']

In [36]:
X_test = full_pipeline.transform(X_test_raw)

In [37]:
def test_eval(model_fitted_dic, X_test, y_test):
    """Score already-fitted models on a held-out set.

    For each model the hard predictions give the accuracy, and the second
    column of ``predict_proba`` gives the AUC and the ROC curve. A short
    report is printed per model.

    Parameters
    ----------
    model_fitted_dic : dict
        Maps a display name to a fitted estimator providing ``predict`` and
        ``predict_proba``.
    X_test
        Features to predict on.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    dict
        ``{model_name: {'accuracy': ..., 'auc': ..., 'roc': ...}}`` where
        ``'roc'`` holds the value returned by ``roc_curve``.
    """
    report = "{}分类器在测试集上的结果为: \n accuracy为: {}\n auc为: {}"
    model_scores = {}

    for model_name, model in model_fitted_dic.items():
        hard_labels = model.predict(X_test)
        class1_proba = model.predict_proba(X_test)[:, 1]

        metrics = {
            'accuracy': accuracy_score(y_test, hard_labels),
            'auc': roc_auc_score(y_test, class1_proba),
            'roc': roc_curve(y_test, class1_proba),
        }

        print(report.format(model_name, metrics['accuracy'], metrics['auc']))
        model_scores[model_name] = metrics

    return model_scores

In [40]:
log_clf_trained = LogisticRegression(max_iter=1000)

In [41]:
log_clf_trained.fit(X_train, y_train)

In [42]:
model_fitted_dic = {"log": log_clf_trained}

In [43]:
test_eval(model_fitted_dic, X_test, y_test)

log分类器在测试集上的结果为: 
 accuracy为: 0.8460159362549801
 auc为: 0.9015326774838219


{'log': {'accuracy': 0.8460159362549801,
  'auc': 0.9015326774838219,
  'roc': (array([0.        , 0.        , 0.        , ..., 0.97077465, 0.9709507 ,
          1.        ]),
   array([0.00000000e+00, 2.70270270e-04, 2.18918919e-02, ...,
          1.00000000e+00, 1.00000000e+00, 1.00000000e+00]),
   array([2.00000000e+00, 9.99999999e-01, 9.99990337e-01, ...,
          1.28166381e-03, 1.27875945e-03, 6.90591988e-05]))}}

In [47]:
X_train

<30162x103 sparse matrix of type '<class 'numpy.float64'>'
	with 345148 stored elements in Compressed Sparse Row format>

In [48]:
X_test

<15060x103 sparse matrix of type '<class 'numpy.float64'>'
	with 172339 stored elements in Compressed Sparse Row format>

## 使用tensorflow构建的逻辑回归

In [243]:
def evaluate(y_pred, y):
    """Fraction of entries where ``y_pred`` thresholded at 0.5 equals ``y``.

    Note the argument order ``(y_pred, y)``: a later cell in this notebook
    defines another ``evaluate`` that takes ``(y_true, y_pred)`` and replaces
    this one once it has run.

    :param y_pred: tensor of scores; values above 0.5 count as 1, others as 0
    :param y: labels, reshaped to the shape of ``y_pred`` before comparing
    :return: matching entries divided by the first dimension of the reshaped
        labels, as a NumPy value
    """
    hit = tf.ones_like(y_pred)
    miss = tf.zeros_like(y_pred)

    predicted = tf.where(y_pred > 0.5, hit, miss)
    target = tf.reshape(y, predicted.shape)

    matches = tf.where(target == tf.cast(predicted, dtype=target.dtype), hit, miss)
    accuracy = tf.reduce_sum(matches) / target.shape[0]
    return accuracy.numpy()

In [244]:
import tensorflow as tf

In [265]:
class Config:
    """Hyper-parameters read by the TensorFlow training cells."""

    def __init__(self) -> None:
        # Samples per mini-batch handed to the data loader.
        self.batch_size: int = 256
        # Learning rate handed to the optimizer.
        self.learn_rate: int = 2
        # Number of passes over the training data.
        self.epochs: int = 200


config = Config()

In [266]:
def load_array(data_arrays, batch_size, is_train=True):
    """
    Wrap in-memory arrays in a batched ``tf.data.Dataset``.

    :param data_arrays: ``(X, y)`` tuple of array-likes sharing their first
        dimension (a single array is also accepted)
    :param batch_size: number of samples per batch
    :param is_train: if True, the samples are shuffled (seed 42) before batching
    :return: the batched dataset
    """
    dataset = tf.data.Dataset.from_tensor_slices(data_arrays)
    if is_train:
        # The shuffle buffer has to hold the whole dataset for a uniform
        # shuffle. len() of the (X, y) tuple is the number of components (2),
        # not the number of samples, so measure the first component instead.
        first_component = data_arrays[0] if isinstance(data_arrays, tuple) else data_arrays
        # shuffle() rejects a zero-sized buffer, hence the lower bound of 1.
        num_samples = max(len(first_component), 1)
        dataset = dataset.shuffle(buffer_size=num_samples, seed=42)
    dataset = dataset.batch(batch_size)
    return dataset

In [267]:
data_loader = load_array((X_train.toarray(), y_train), config.batch_size)

In [268]:
# Earlier single-layer variant, kept for reference:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(units=2, kernel_initializer=tf.keras.initializers.Zeros(), activation='sigmoid'))

# Current network, layers listed in the order they are applied:
# Dense(200, zero-initialised kernel) -> Dense(50, softmax) -> Dense(2) -> sigmoid.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=200, kernel_initializer=tf.keras.initializers.Zeros()),
    tf.keras.layers.Dense(
        units=50,
        kernel_initializer=tf.keras.initializers.RandomNormal(seed=42),
        activation='softmax',
    ),
    tf.keras.layers.Dense(2),
    tf.keras.layers.Activation('sigmoid'),
])

In [269]:
# Binary cross-entropy on probabilities (from_logits=False); the model's last layer is a sigmoid activation.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# Plain SGD; the step size is taken from config.learn_rate.
optimizer = tf.keras.optimizers.SGD(learning_rate=config.learn_rate)

In [270]:
# def evaluate(y_true, y_pred):
#     ones = tf.ones_like(y_pred)
#     zeros = tf.zeros_like(y_pred)
#     output = tf.where(y_pred > 0.5 , ones, zeros)
#     y = tf.reshape(y_true, output.shape)
#     res = tf.reduce_sum(tf.where(y == tf.cast(output, dtype=y.dtype), ones, zeros))/y.shape[0]
#     return res.numpy()
def evaluate(y_true, y_pred):
    """Accuracy of the arg-max class against integer labels.

    :param y_true: true class indices, one per sample
    :param y_pred: per-class scores; the arg-max along axis 1 is the prediction
    :return: mean of the element-wise matches, computed with ``np.mean``
    """
    predicted_class = tf.argmax(y_pred, axis=1)
    # Cast the labels to the arg-max dtype so tf.equal compares like with like.
    is_correct = tf.equal(tf.cast(y_true, predicted_class.dtype), predicted_class)
    return np.mean(is_correct)

In [271]:
# The dense test features and one-hot test labels do not change between
# epochs, so build them once instead of converting them on every epoch.
X_test_dense = X_test.toarray()
y_test_onehot = tf.one_hot(y_test, depth=2)

for epoch in range(config.epochs):
    train_loss_list = []
    train_acc_list = []
    for X, y in data_loader:
        # Record the forward pass so the loss can be differentiated.
        with tf.GradientTape() as tape:
            y_pred = model(X)
            batch_loss = loss(y_true=tf.one_hot(y, depth=2), y_pred=y_pred)
        grads = tape.gradient(batch_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_loss_list.append(batch_loss)
        # Accuracy uses a second forward pass, i.e. the weights after this step.
        train_acc_list.append(evaluate(y_true=y, y_pred=model(X)))
    train_loss = np.mean(train_loss_list)
    train_acc = np.mean(train_acc_list)

    # One forward pass over the test set feeds both the loss and the accuracy.
    test_pred = model(X_test_dense)
    test_loss = loss(y_pred=test_pred, y_true=y_test_onehot)
    test_acc = evaluate(y_pred=test_pred, y_true=y_test)
    print("epoch:{}, train_loss:{:.5f}, train_acc:{:.3f}, test_loss:{:.5f}, test_acc:{:.3f}".format(epoch+1, train_loss, train_acc, test_loss.numpy(), test_acc))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

epoch:1, trian_loss:0.56332, train_acc:0.751, test_loss:0.55517, test_acc:0.754
epoch:2, trian_loss:0.52205, train_acc:0.757, test_loss:0.42414, test_acc:0.808
epoch:3, trian_loss:0.40119, train_acc:0.824, test_loss:0.38736, test_acc:0.829
epoch:4, trian_loss:0.37716, train_acc:0.830, test_loss:0.37961, test_acc:0.827
epoch:5, trian_loss:0.36921, train_acc:0.834, test_loss:0.37525, test_acc:0.825
epoch:6, trian_loss:0.36398, train_acc:0.836, test_loss:0.37359, test_acc:0.822
epoch:7, trian_loss:0.36044, train_acc:0.838, test_loss:0.37120, test_acc:0.823
epoch:8, trian_loss:0.35722, train_acc:0.839, test_loss:0.36823, test_acc:0.823
epoch:9, trian_loss:0.35413, train_acc:0.840, test_loss:0.3