In [1]:
import tensorflow as tf


In [2]:
## fetching data from website

import tempfile
import urllib.request
train_file = tempfile.NamedTemporaryFile(delete=False)
test_file = tempfile.NamedTemporaryFile(delete=False)
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)

('C:\\Users\\umarv\\AppData\\Local\\Temp\\tmp58m34f_w',
 <http.client.HTTPMessage at 0x1ed0ed2a978>)

In [3]:
## reading file into csv format

import pandas as pd
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"]
df_train = pd.read_csv(train_file.name, names=CSV_COLUMNS, skipinitialspace=True)
df_test = pd.read_csv(test_file.name, names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)

In [4]:
df_train.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# we'll construct a label column named "label"(target variable) whose value is 1 if the income is over 50K, and 0 otherwise.

train_labels = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
test_labels = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

In [6]:
## Converting Data into Tensors

##When building a tf.estimator model, the input data is specified by means of an Input Builder function

def input_fn(data_file, num_epochs, shuffle):
  """Input builder function."""
  df_data = pd.read_csv(
      tf.gfile.Open(data_file),
      names=CSV_COLUMNS,
      skipinitialspace=True,
      engine="python",
      skiprows=1)
  # remove NaN elements
  df_data = df_data.dropna(how="any", axis=0)
  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
  return tf.estimator.inputs.pandas_input_fn(
      x=df_data,
      y=labels,
      batch_size=100,
      num_epochs=num_epochs,
      shuffle=shuffle,
      num_threads=5)

In [7]:
## Selecting and Engineering Features for the Model

# Base Categorical Feature Columns
# converting categorical feature into facter variables 
gender = tf.feature_column.categorical_column_with_vocabulary_list(
    "gender", ["Female", "Male"])
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    "occupation", hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    "marital_status", [
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
    ])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    "relationship", [
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    "workclass", [
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])
native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)
race = tf.feature_column.categorical_column_with_hash_bucket(
    "race", hash_bucket_size=1000)

In [8]:
# preparing base continuos feature

age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

In [9]:
# Making Continuous Features Categorical through Bucketization

age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

In [10]:
## Intersecting Multiple Columns with CrossedColumn

education_x_occupation = tf.feature_column.crossed_column(
    ["education", "occupation"], hash_bucket_size=1000)

age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(
    [age_buckets, "education", "occupation"], hash_bucket_size=1000)
age_buckets_x_race_x_occupation = tf.feature_column.crossed_column(
    [age_buckets, "race", "occupation"], hash_bucket_size=1000)

In [11]:
# now data has been prepared for traing model on them
## After processing the input data and defining all the feature columns, we're now ready to put them all together and build a Logistic Regression model.
## Defining The Logistic Regression Model
base_columns = [
    gender, native_country, education, occupation, workclass, relationship,
    age_buckets,
]
crossed_columns = [
    tf.feature_column.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000)
]

model_dir = tempfile.mkdtemp()
m = tf.estimator.LinearClassifier(
    model_dir=model_dir, feature_columns=base_columns + crossed_columns)
## The learned model files will be stored in model_dir.


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_keep_checkpoint_max': 5, '_save_summary_steps': 100, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_model_dir': 'C:\\Users\\umarv\\AppData\\Local\\Temp\\tmphj9rygdq', '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None}


In [25]:
## Training and Evaluating Our Model
## Training a model is just a one-liner using the tf.estimator API:
# set num_epochs to None to get infinite stream of data.
m.train(
    input_fn=input_fn(train_file.name, num_epochs=None, shuffle=True),
    steps=500)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt-2150
INFO:tensorflow:Saving checkpoints for 2151 into C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt.
INFO:tensorflow:loss = 40.6828, step = 2151
INFO:tensorflow:global_step/sec: 114.002
INFO:tensorflow:loss = 36.2864, step = 2251 (0.878 sec)
INFO:tensorflow:global_step/sec: 134.964
INFO:tensorflow:loss = 27.2409, step = 2351 (0.741 sec)
INFO:tensorflow:global_step/sec: 136.334
INFO:tensorflow:loss = 35.3101, step = 2451 (0.733 sec)
INFO:tensorflow:global_step/sec: 134.457
INFO:tensorflow:loss = 36.0771, step = 2551 (0.745 sec)
INFO:tensorflow:Saving checkpoints for 2650 into C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt.
INFO:tensorflow:Loss for final step: 37.1017.


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1ed0fd3b780>

In [26]:
results = m.evaluate(
    input_fn=input_fn(test_file.name, num_epochs=1, shuffle=False),
    steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
  print("%s: %s" % (key, results[key]))

INFO:tensorflow:Starting evaluation at 2017-08-29-14:19:46
INFO:tensorflow:Restoring parameters from C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt-2650
INFO:tensorflow:Finished evaluation at 2017-08-29-14:19:54
INFO:tensorflow:Saving dict for global step 2650: accuracy = 0.834224, accuracy_baseline = 0.763774, auc = 0.881216, auc_precision_recall = 0.692679, average_loss = 0.355393, global_step = 2650, label/mean = 0.236226, loss = 35.4978, prediction/mean = 0.239685
model directory = C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq
accuracy: 0.834224
accuracy_baseline: 0.763774
auc: 0.881216
auc_precision_recall: 0.692679
average_loss: 0.355393
global_step: 2650
label/mean: 0.236226
loss: 35.4978
prediction/mean: 0.239685


In [54]:
## regularization to avoid overfitting


m = tf.estimator.LinearClassifier(
    model_dir=model_dir, feature_columns=base_columns + crossed_columns,
    optimizer=tf.train.FtrlOptimizer(
      learning_rate=0.1,
      l1_regularization_strength=1.90,
      l2_regularization_strength=1.90)
    )

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_keep_checkpoint_max': 5, '_save_summary_steps': 100, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_model_dir': 'C:\\Users\\umarv\\AppData\\Local\\Temp\\tmphj9rygdq', '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None}


In [57]:
## training the regularized model model
m.train(
    input_fn=input_fn(train_file.name, num_epochs=None, shuffle=True),
    steps=200)



INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt-7050
INFO:tensorflow:Saving checkpoints for 7051 into C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt.
INFO:tensorflow:loss = 37.8441, step = 7051
INFO:tensorflow:global_step/sec: 118.882
INFO:tensorflow:loss = 22.6692, step = 7151 (0.844 sec)
INFO:tensorflow:Saving checkpoints for 7250 into C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt.
INFO:tensorflow:Loss for final step: 31.2684.


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1ed2244b9b0>

In [58]:
## cheking the prediction of regularised model on test data

results = m.evaluate(
    input_fn=input_fn(test_file.name, num_epochs=1, shuffle=False),
    steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
  print("%s: %s" % (key, results[key]))

INFO:tensorflow:Starting evaluation at 2017-08-29-14:51:55
INFO:tensorflow:Restoring parameters from C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq\model.ckpt-7250
INFO:tensorflow:Finished evaluation at 2017-08-29-14:52:04
INFO:tensorflow:Saving dict for global step 7250: accuracy = 0.835207, accuracy_baseline = 0.763774, auc = 0.883041, auc_precision_recall = 0.695703, average_loss = 0.351953, global_step = 7250, label/mean = 0.236226, loss = 35.1543, prediction/mean = 0.238741
model directory = C:\Users\umarv\AppData\Local\Temp\tmphj9rygdq
accuracy: 0.835207
accuracy_baseline: 0.763774
auc: 0.883041
auc_precision_recall: 0.695703
average_loss: 0.351953
global_step: 7250
label/mean: 0.236226
loss: 35.1543
prediction/mean: 0.238741


In [59]:
## Wide and deep learning

# ALL categorical columns are converted into embedding column to reduce the dimensions of sparse column,
# Then they are concatinated with cotinous columns

deep_columns = [
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(education),
    tf.feature_column.indicator_column(gender),
    tf.feature_column.indicator_column(relationship),
    # To show an example of embedding
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]


In [61]:

model_dir_nn = tempfile.mkdtemp()
m = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=model_dir_nn,
    linear_feature_columns=crossed_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_keep_checkpoint_max': 5, '_save_summary_steps': 100, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_model_dir': 'C:\\Users\\umarv\\AppData\\Local\\Temp\\tmp5i2zco74', '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None}


In [64]:
# After reading in the data, you can train and evaluate the model:

# set num_epochs to None to get infinite stream of data.
m.train(
    input_fn=input_fn(train_file.name, num_epochs=None, shuffle=True),
    steps=200)
# set steps to None to run evaluation until all data consumed.
results = m.evaluate(
    input_fn=input_fn(test_file.name, num_epochs=1, shuffle=False),
    steps=None)
print("model directory = %s" % model_dir_nn)
for key in sorted(results):
  print("%s: %s" % (key, results[key]))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from C:\Users\umarv\AppData\Local\Temp\tmp5i2zco74\model.ckpt-1300
INFO:tensorflow:Saving checkpoints for 1301 into C:\Users\umarv\AppData\Local\Temp\tmp5i2zco74\model.ckpt.
INFO:tensorflow:loss = 46.7423, step = 1301
INFO:tensorflow:global_step/sec: 112.272
INFO:tensorflow:loss = 40.5235, step = 1401 (0.892 sec)
INFO:tensorflow:Saving checkpoints for 1500 into C:\Users\umarv\AppData\Local\Temp\tmp5i2zco74\model.ckpt.
INFO:tensorflow:Loss for final step: 36.7199.
INFO:tensorflow:Starting evaluation at 2017-08-29-15:11:00
INFO:tensorflow:Restoring parameters from C:\Users\umarv\AppData\Local\Temp\tmp5i2zco74\model.ckpt-1500
INFO:tensorflow:Finished evaluation at 2017-08-29-15:11:09
INFO:tensorflow:Saving dict for global step 1500: accuracy = 0.819913, accuracy_baseline = 0.763774, auc = 0.85032, auc_precision_recall = 0.677864, average_loss = 0.409404, global_step = 1500, label/mean = 0.236226, loss = 40.89