In [3]:
!git clone https://github.com/GoogleCloudPlatform/training-data-analyst

Cloning into 'training-data-analyst'...
remote: Enumerating objects: 50569, done.[K
remote: Counting objects: 100% (2689/2689), done.[K
remote: Compressing objects: 100% (1230/1230), done.[K
remote: Total 50569 (delta 1578), reused 2264 (delta 1311), pack-reused 47880[K
Receiving objects: 100% (50569/50569), 580.05 MiB | 25.88 MiB/s, done.
Resolving deltas: 100% (31625/31625), done.
Checking out files: 100% (10070/10070), done.


In [6]:
!mkdir data 
!gsutil -m cp gs://cloud-samples-data/ml-engine/census/data/* data/

Copying gs://cloud-samples-data/ml-engine/census/data/adult.data.csv...
Copying gs://cloud-samples-data/ml-engine/census/data/adult.test.csv...         
Copying gs://cloud-samples-data/ml-engine/census/data/census.test.csv...        
Copying gs://cloud-samples-data/ml-engine/census/data/test.json...              
Copying gs://cloud-samples-data/ml-engine/census/data/census.train.csv...       
Copying gs://cloud-samples-data/ml-engine/census/data/test.csv...               
/ [6/6 files][ 10.7 MiB/ 10.7 MiB] 100% Done                                    
Operation completed over 6 objects/10.7 MiB.                                     


In [9]:
!ls data

adult.data.csv	census.test.csv   test.csv
adult.test.csv	census.train.csv  test.json


In [33]:
%env TRAIN_DATA=data/adult.data.csv
%env EVAL_DATA=data/adult.test.csv

env: TRAIN_DATA=data/adult.data.csv
env: EVAL_DATA=data/adult.test.csv


In [34]:
!head $TRAIN_DATA

39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K
28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K
37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K
49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K
31, 

In [56]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

DATA_URL = (
    'https://storage.googleapis.com/cloud-samples-data/ai-platform/census'
    '/data')
TRAINING_FILE = 'adult.data.csv'
EVAL_FILE = 'adult.test.csv'
TRAINING_URL = '%s/%s' % (DATA_URL, TRAINING_FILE)
EVAL_URL = '%s/%s' % (DATA_URL, EVAL_FILE)

columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_bracket'
]
target = 'income_bracket'

dtypes = {
    
    'workclass': pd.api.types.CategoricalDtype(categories=[
        'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc',
        'Self-emp-not-inc', 'State-gov', 'Without-pay'
    ]),
    'marital_status': pd.api.types.CategoricalDtype(categories=[
        'Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
        'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'
    ]),
    'occupation': pd.api.types.CategoricalDtype([
        'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
        'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
        'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv',
        'Sales', 'Tech-support', 'Transport-moving'
    ]),
    'relationship': pd.api.types.CategoricalDtype(categories=[
        'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried',
        'Wife'
    ]),
    'race': pd.api.types.CategoricalDtype(categories=[
        'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'
    ]),
    'native_country': pd.api.types.CategoricalDtype(categories=[
        'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic',
        'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece',
        'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong',
        'Hungary',
        'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos',
        'Mexico',
        'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines',
        'Poland',
        'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand',
        'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'
    ]),
    'income_bracket': pd.api.types.CategoricalDtype(categories=[
        '<=50K', '>50K'
    ])
}

df = pd.read_csv(os.environ['TRAIN_DATA'], 
                 names=columns, header=None, 
                 na_values='?', 
                 delimiter=', ').astype(dtypes)
df_test = pd.read_csv(os.environ['EVAL_DATA'], 
                 names=columns, header=None, 
                 na_values='?', 
                 delimiter=', ').astype(dtypes)

  return func(*args, **kwargs)


In [65]:
df[dtypes.keys()].apply(lambda x: x.cat.codes)#.apply(lambda x: x.astype(dtypes[x.]))

Unnamed: 0,workclass,marital_status,occupation,relationship,race,native_country,income_bracket
0,6,4,0,1,4,38,0
1,5,2,3,0,4,38,0
2,3,0,5,1,4,38,0
3,3,2,5,0,2,38,0
4,3,2,9,5,2,4,0
...,...,...,...,...,...,...,...
32556,3,2,12,5,4,38,0
32557,3,2,6,0,4,38,1
32558,3,6,0,4,4,38,0
32559,3,4,0,3,4,38,0


In [66]:
!mv src/data data