### Copy Training and Prediction Data
Create a Cloud Storage bucket to hold the training and prediction data.

In [1]:
# Some code to determine a unique bucket name for the purposes of the sample
from gcp.context import Context

# The bucket name is derived from the default project id.
project = Context.default().project_id
ml_bucket_name = project + '-mldata'
ml_bucket_path = 'gs://' + ml_bucket_name

# Census sample folder inside the bucket; it is also passed to the
# preprocess cells as their output location.
output_path = ml_bucket_path + '/sampledata/ml/census/'

# The input CSVs and the metadata file all sit directly under that folder.
train_data_path = output_path + 'census_train.csv'
test_data_path = output_path + 'census_test.csv'
predict_data_path = output_path + 'census_predict.csv'
metadata_path = output_path + 'metadata.yaml'

In [2]:
%%storage create --bucket $ml_bucket_path

Now copy the sample data into the new bucket.

In [3]:
%%storage copy --source gs://cloud-datalab/sampledata/ml/census/* --destination $ml_bucket_path

In [None]:
%%storage list --bucket $ml_bucket_path

### Browse and Explore Your CSV Data

In [8]:
%%csv view -i $train_data_path -n 10
columns: label, age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country

age,capital-gain,capital-loss,education,education-num,fnlwgt,hours-per-week,label,marital-status,native-country,occupation,race,relationship,sex,workclass
39,2174,0,Bachelors,13,77516,40,<=50K,Never-married,United-States,Adm-clerical,White,Not-in-family,Male,State-gov
50,0,0,Bachelors,13,83311,13,<=50K,Married-civ-spouse,United-States,Exec-managerial,White,Husband,Male,Self-emp-not-inc
38,0,0,HS-grad,9,215646,40,<=50K,Divorced,United-States,Handlers-cleaners,White,Not-in-family,Male,Private
53,0,0,11th,7,234721,40,<=50K,Married-civ-spouse,United-States,Handlers-cleaners,Black,Husband,Male,Private
28,0,0,Bachelors,13,338409,40,<=50K,Married-civ-spouse,Cuba,Prof-specialty,Black,Wife,Female,Private
37,0,0,Masters,14,284582,40,<=50K,Married-civ-spouse,United-States,Exec-managerial,White,Wife,Female,Private
49,0,0,9th,5,160187,16,<=50K,Married-spouse-absent,Jamaica,Other-service,Black,Not-in-family,Female,Private
52,0,0,HS-grad,9,209642,45,>50K,Married-civ-spouse,United-States,Exec-managerial,White,Husband,Male,Self-emp-not-inc
31,14084,0,Masters,14,45781,50,>50K,Never-married,United-States,Prof-specialty,White,Not-in-family,Female,Private
42,5178,0,Bachelors,13,159449,40,>50K,Married-civ-spouse,United-States,Exec-managerial,White,Husband,Male,Private


Get column statistics with `--profile`. `-n` is the number of lines to read; it is optional and defaults to 5.

In [None]:
%%csv view -i $train_data_path --profile -n 200
columns: label, age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country

### Infer Schema and Generate Feature Class
Run the following command; it writes the generated feature class definition into the input of the next cell.
Note that `--target` (and `--key`) can be either a column name or an index into the columns (0-based; -1 means the last column).

In [None]:
%%ml features --csv $train_data_path --target label
columns: label, age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country

### Define feature class
Now the feature class is generated. Modify it as appropriate, such as converting a column from text to categorical. Then execute the cell.

In [2]:
%%tensorflow feature

import google.cloud.ml.features as features


class CsvFeatures(features.CsvFeatureSet):
  """Feature definitions for the census CSV data used in this notebook.

  This class was scaffolded by the `%%ml features` cell of this notebook
  (the generator's own banner referred to the command as `%%ml csv-schema`).
  It is meant to be edited by hand: adjust the transform chosen for each
  column as appropriate before executing the cell.
  """

  def __init__(self):
    """Hands the CSV column names, in file order, to the base feature set."""
    # Same names and order as the `columns:` line given to the %%csv and %%ml cells.
    columns = 'label','age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country'
    super(CsvFeatures, self).__init__(columns)

  # The 'label' column is the prediction target, declared as a classification target.
  target = features.target('label').classification()
  attrs = [
      # Numeric columns; min_max_scale(-1.0, 1.0) presumably rescales each
      # into the [-1, 1] range -- TODO confirm against the features API.
      features.numeric('age').min_max_scale(-1.0, 1.0),
      features.numeric('capital-gain').min_max_scale(-1.0, 1.0),
      features.numeric('capital-loss').min_max_scale(-1.0, 1.0),
      features.numeric('education-num').min_max_scale(-1.0, 1.0),
      features.numeric('fnlwgt').min_max_scale(-1.0, 1.0),
      features.numeric('hours-per-week').min_max_scale(-1.0, 1.0),
      # Categorical columns; one_of_k() is presumably a one-hot style
      # encoding -- TODO confirm against the features API.
      features.categorical('education').one_of_k(),
      features.categorical('marital-status').one_of_k(),
      features.categorical('occupation').one_of_k(),
      features.categorical('race').one_of_k(),
      features.categorical('relationship').one_of_k(),
      features.categorical('sex').one_of_k(),
      features.categorical('workclass').one_of_k(),
  ]
  # 'native-country' is handled as free text (bag of words, vocab_size=10000).
  # NOTE(review): the sample rows show single-token values such as
  # 'United-States' or 'Cuba', so a categorical feature may fit better -- confirm.
  native_country = features.text('native-country').bag_of_words(vocab_size=10000)


### Preprocess Training and Testing Data
Output: preprocessed train data, test data, and metadata generated from train data

In [42]:
%%ml preprocess -o $output_path --cloud -n dfjobtrain
train: $train_data_path
test: $test_data_path

### Preprocessing Prediction Data
Use the metadata generated in the previous step to preprocess the prediction data.

In [46]:
%%ml preprocess -o $output_path --cloud -n dfjobpredict
predict: $predict_data_path
metadata: $metadata_path