In [4]:

import tensorflow as tf
import tempfile
import pandas as pd
import urllib
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [66]:
COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
           "marital_status", "occupation", "relationship", "race", "gender",
           "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"]

LABEL_COLUMN = 'label'
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
                       "relationship", "race", "gender", "native_country"]
CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss",
                      "hours_per_week"]

In [67]:
df_train = pd.read_csv('../data/adult.data', names=COLUMNS, 
                       skipinitialspace=True)
df_test = pd.read_csv('../data/adult.test', names=COLUMNS, 
                      skipinitialspace=True, skiprows=1)

- 年龄：连续。 
- 工作类别：私人，无个人收入，无个人收入，联邦政府，地方政府，州政府，无薪，从未工作过。 
- fnlwgt：连续。 
- 教育程度：学士，部分大学，11年级，高中毕业生，教授学校，Assoc-acdm，Assoc-voc，9、7-8-8、12，硕士，1-4至10，博士学位，5-6至学前班。 
- 教育人数：连续。 
- 婚姻状况：已婚公民配偶，离婚，未婚，分居，丧偶，已婚配偶缺席，已婚AF配偶。 
- 职业：技术支持，工艺修理，其他服务，销售，执行管理，专业教授，装卸清洁员，机器操作检查员，行政助理，农业捕鱼，运输移动，私人住宅serv，保护- - serv，武装部队。 
- 关系：妻子，独生子女，丈夫，亲戚，其他亲戚，未婚。 
- 种族：白色，亚洲人-帕斯岛，亚美-印度-爱斯基摩人，其他，黑人。 
- 性别：女，男。 
- 资本收益：连续。 
- 资本损失：连续。 
- 每周小时：连续。 
- 祖国：美国，柬埔寨，英国，波多黎各，加拿大，德国，美国以外的关岛（关岛-USVI等），印度，日本，希腊，华南，中国，古巴，伊朗，洪都拉斯，菲律宾，意大利，波兰，牙买加，越南，墨西哥，葡萄牙，爱尔兰，法国，多米尼加共和国，老挝，厄瓜多尔，台湾，海地，哥伦比亚，匈牙利，危地马拉，尼加拉瓜，苏格兰，泰国，南斯拉夫，萨尔瓦多，特立尼达和多巴哥，秘鲁，洪，荷兰霍兰。



In [68]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [8]:
df_train['income_bracket']=df_train['income_bracket'].fillna('<=50K')
df_test['income_bracket']=df_test['income_bracket'].fillna('<=50K.')

In [69]:
df_train.loc[df_train['income_bracket']=='>50K','income_bracket'] = 1
df_train.loc[df_train['income_bracket']=='<=50K','income_bracket'] = 0
df_test.loc[df_test['income_bracket']=='>50K.','income_bracket'] = 1
df_test.loc[df_test['income_bracket']=='<=50K.','income_bracket'] = 0

In [70]:
df_train['income_bracket'].unique()

array([0, 1, nan], dtype=object)

## 将离散值 连续值进行转化和处理，编程tf可处理的变量

In [71]:
# 将从每个连续要素列名称（k）到存储在常量Tensor中的该列的值
continuous_cols = {k: tf.constant(df_train[k].values)
                       for k in CONTINUOUS_COLUMNS}

In [None]:

# 将离散值映射成mutil-hot   shape:(size,1)

# tf.SparseTensor

categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df_train[k].size)],
        values=df_train[k].values,
        dense_shape=[df_train[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}

In [None]:
feature_cols = dict(list(continuous_cols.items()) + list(categorical_cols.items()))
feature_cols

#### 改成函数型

In [86]:
def input_fn(df):
#     连续型变量
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    
#     离散型变量  改成mutil-hot
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df_train[k].size)],
        values=df_train[k].values,
        dense_shape=[df_train[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}
#     将离散特征和连续特征改成字典
    feature_cols = dict(list(continuous_cols.items()) + list(categorical_cols.items()))
#     label变成常量
    label = tf.constant(df[LABEL_COLUMN].values)
    return feature_cols, label


In [73]:
def train_input_n():
    return input_n(df_train)


def eval_input_n():
    return input_n(df_test)

In [74]:
df_train['race'].value_counts()

White                 22705
Black                  2524
Asian-Pac-Islander      830
Amer-Indian-Eskimo      260
Other                   222
Name: race, dtype: int64

In [76]:
df_train['workclass'].value_counts()

Private             18463
Self-emp-not-inc     2089
Local-gov            1732
?                    1490
State-gov            1054
Self-emp-inc          915
Federal-gov           784
Without-pay            10
Never-worked            5
Name: workclass, dtype: int64

### 特征处理

In [77]:
# Categorical base columns.

# tf.contrib.layers.sparse_column_with_keys 或者
# tf.feature_column.categorical_column_with_vocabulary_list
# 按照自定义的字典将类别特征映射到数值，适合特征种类较少时候使用

gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["Female", "Male"])
race = tf.contrib.layers.sparse_column_with_keys(column_name="race",
                                                 keys=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other",
                                                       "White"])

In [78]:
# tf.contrib.layers.sparse_column_with_hash_bucket  或者
# tf.feature_column.categorical_column_with_hash_bucket
# 自动将类别特征映射到数值，适合特征种类较多时候使用

education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)

In [79]:
# Continuous base columns.

# tf.contrib.layers.bucketized_column  或者
# tf.feature_column.bucketized_column
# 把连续特征按照区间映射为类别特征

age = tf.contrib.layers.real_valued_column("age")
# 分桶操作
age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
education_num = tf.contrib.layers.real_valued_column("education_num")
capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")

In [80]:
# tf.contrib.layers.crossed_column   或者
# tf.feature_column.crossed_column
# 特征相乘生成的交叉特征

wide_columns = [
    gender, native_country, education, occupation, workclass, relationship, age_buckets,
    tf.contrib.layers.crossed_column([education, occupation], hash_bucket_size=int(1e4)),
    tf.contrib.layers.crossed_column([native_country, occupation], hash_bucket_size=int(1e4)),
    tf.contrib.layers.crossed_column([age_buckets, education, occupation], hash_bucket_size=int(1e6))]

# 对于离散值  one-hot 值少的时候   embedding 词嵌入
deep_columns = [
    tf.contrib.layers.embedding_column(workclass, dimension=8),
    tf.contrib.layers.embedding_column(education, dimension=8),
    tf.contrib.layers.embedding_column(gender, dimension=8),
    tf.contrib.layers.embedding_column(relationship, dimension=8),
    tf.contrib.layers.embedding_column(native_country, dimension=8),
    tf.contrib.layers.embedding_column(occupation, dimension=8),
    age, education_num, capital_gain, capital_loss, hours_per_week]



#### 函数型

In [81]:
flags = tf.app.flags
FLAGS = flags.FLAGS


In [82]:
def build_estimator(model_dir,model_type):
    
    # 离散分类别的
    gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["female","male"])
    education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size = 1000)
    relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size = 100)
    workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
    occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
    native_country = tf.contrib.layers.sparse_column_with_hash_bucket( "native_country", hash_bucket_size=1000)

    # Continuous base columns.
    age = tf.contrib.layers.real_valued_column("age")
    education_num = tf.contrib.layers.real_valued_column("education_num")
    capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
    capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
    hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")
    #类别转换
    age_buckets = tf.contrib.layers.bucketized_column(age, boundaries= [18,25, 30, 35, 40, 45, 50, 55, 60, 65])

    wide_columns = [gender, native_country,education, occupation, workclass, relationship, age_buckets,
                    tf.contrib.layers.crossed_column([education, occupation], hash_bucket_size=int(1e4)),
                    tf.contrib.layers.crossed_column([age_buckets, education, occupation], hash_bucket_size=int(1e6)),
                    tf.contrib.layers.crossed_column([native_country, occupation],hash_bucket_size=int(1e4))]

    #embedding_column用来表示类别型的变量
    deep_columns = [tf.contrib.layers.embedding_column(workclass, dimension=8),
                    tf.contrib.layers.embedding_column(education, dimension=8),
                    tf.contrib.layers.embedding_column(gender, dimension=8),
                    tf.contrib.layers.embedding_column(relationship, dimension=8),
                    tf.contrib.layers.embedding_column(native_country,dimension=8),
                    tf.contrib.layers.embedding_column(occupation, dimension=8),
                    age,education_num,capital_gain,capital_loss,hours_per_week,]

    if model_type =="wide":
        m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,feature_columns=wide_columns)
    elif model_type == "deep":
        m = tf.contrib.learn.DNNClassifier(model_dir=model_dir, feature_columns=deep_columns, hidden_units=[100,50])
    else:
        m = tf.contrib.learn.DNNLinearCombinedClassifier(model_dir=model_dir, linear_feature_columns=wide_columns, dnn_feature_columns = deep_columns, dnn_hidden_units=[100,50])

    return m

In [83]:
# drop Not a number elements
df_train = df_train.dropna(how='any',axis=0)
df_test = df_test.dropna(how='any', axis=0)

In [88]:

m = build_estimator('./','wide_n_deep')
train_steps = 200
m.fit(input_fn=train_input_fn, steps=train_steps)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
# m.fit(input_fn=lambda: input_n(df_train),
#           steps=train_steps)
# results = m.evaluate(input_fn=lambda: input_n(df_test), steps=1)

predicts = m.predict(input_fn=eval_input_fn)

for key in sorted(results):
    print("%s: %s"%(key, results[key]))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001E7FB46DD88>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': './'}


KeyError: 'label'

[参考](https://zhuanlan.zhihu.com/p/43328492)