In [1]:
# 读取数据

import pandas as pd 
census = pd.read_csv("./census_data.csv")
# display the header of the data
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
# 转换label为数字，train_test_split
census['income_bracket'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [3]:
def label_fix(label):
    if label == ' <=50K':
        return 0
    else:
        return 1

census['income_bracket'] = census['income_bracket'].apply(label_fix)

In [4]:
# Perform a train_test_split on the data
from sklearn.model_selection import train_test_split
# 1. numeric colums
# 2. vocabulary list columns
# 3. hash buckets
new_x_data_columns = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week',
                     'gender',
                     'occupation', 'marital_status', 'relationship', 'education', 'workclass', 'native_country']

x_data = census[new_x_data_columns]
y_labels_raw = census['income_bracket']
# 转换成one hot 输出
y_labels = pd.get_dummies(y_labels_raw)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_labels, test_size=0.3, random_state=101)

In [5]:
y_labels

Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
32556,1,0
32557,0,1
32558,1,0
32559,1,0


In [6]:
# Define the number of hash buckets
num_buckets = 1000
# 分割数据集为不同的列类型
# Sample data
x_train_numeric = x_train.iloc[:, :5]  # first five columns are numeric
x_train_cat_vocabulary = x_train.iloc[:, 5:6]  # gender column is categorical with vocabulary list
x_train_cat_hash = x_train.iloc[:, 6:]  # Assuming remaining columns are categorical with hash buckets

x_test_numeric = x_test.iloc[:, :5]
x_test_cat_vocabulary = x_test.iloc[:, 5:6]
x_test_cat_hash = x_test.iloc[:, 6:]

In [7]:
import tensorflow as tf
# 定义不同类型列的处理方式
# Define preprocessing layers for numeric features
numeric_input = tf.keras.Input(shape=(5, ))
normalization_layer = tf.keras.layers.experimental.preprocessing.Normalization()
normalized_numeric = normalization_layer(numeric_input)

# Define preprocessing layers for categorical features with vocabulary list
cat_vocabulary_input = tf.keras.Input(shape=(1, ), dtype=tf.string)
vocabulary = ["Female", "Male"]  # Example vocabulary list
string_lookup_layer = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=vocabulary, mask_token=None)
encoded_cat_vocabulary = tf.cast(string_lookup_layer(cat_vocabulary_input), tf.float32)

# Define preprocessing layers for categorical features with hash buckets
cat_hash_input = tf.keras.Input(shape=(x_train_cat_hash.shape[1], ), dtype=tf.string)
hashing_layer = tf.keras.layers.experimental.preprocessing.Hashing(num_bins=num_buckets)
encoded_cat_hash = tf.cast(hashing_layer(cat_hash_input), tf.float32)

In [8]:
# Combine all preprocessing layers
preprocessing_layers = tf.keras.layers.Concatenate()([normalized_numeric, encoded_cat_vocabulary, encoded_cat_hash])

In [9]:
# Define the rest of your model architecture
dense_layer1 = tf.keras.layers.Dense(64, activation='relu')(preprocessing_layers)
output_layer = tf.keras.layers.Dense(2, activation='softmax')(dense_layer1)

In [10]:
# Build the model
model = tf.keras.Model(inputs=[numeric_input, cat_vocabulary_input, cat_hash_input], outputs=output_layer)

In [11]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [12]:
# 定义一个回调函数来保存验证集上表现最好的模型
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./best_model.h5',
    monitor='val_acc',  # 监控验证集上的精度函数
    save_best_only=True,  # 仅保存在验证集上表现最好的模型
    save_weights_only=False,  # 保存整个模型（包括模型架构）
    verbose=1  # 打印保存信息
)

# Train the model
history = model.fit([x_train_numeric, x_train_cat_vocabulary, x_train_cat_hash], y_train, epochs=50, batch_size=128,
                    validation_data=([x_test_numeric, x_test_cat_vocabulary, x_test_cat_hash], y_test),
                    callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50


Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [13]:
# Evaluate the model
test_loss, test_acc = model.evaluate([x_test_numeric, x_test_cat_vocabulary, x_test_cat_hash], y_test)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.8234210014343262


In [14]:
predictions = model.predict([x_test_numeric, x_test_cat_vocabulary, x_test_cat_hash])

In [19]:
preds_10 = tf.argmax(predictions[:10], axis=1)
print("preds_10:" + str(preds_10))
acts_10 = tf.argmax(y_test[:10], axis=1)
print("acts_10:" + str(acts_10))

preds_10:tf.Tensor([0 0 0 0 1 0 0 0 0 0], shape=(10,), dtype=int64)
acts_10:tf.Tensor([0 0 0 0 1 0 0 0 0 0], shape=(10,), dtype=int64)
