In [34]:
import tensorflow as tf
import math

# Smoke test: evaluate a constant string tensor in a graph-mode session to
# confirm that TensorFlow is installed and working.
hello = tf.constant('Hello world!')
with tf.Session() as sess:
  hello_value = sess.run(hello)
  print(hello_value)

b'Hello world!'


In [0]:
# tf.enable_eager_execution()

The steps are:


1.   Import the training and test TFRecords from Google Cloud Storage (GCS).
2.   Parse them into a format the model can ingest.
3.   Decide on the model architecture: which framework to use and which CNN design choices to make.
4.   Produce interpretable results, including the loss, the learning curve, and accuracy/evaluation on the test set.



In [0]:
# Authenticate this Colab runtime so that the cells below can read from
# Google Cloud Storage (GCS).
from google.colab import auth
auth.authenticate_user()

In [0]:
# Create the service client for the 'storage' API, version 'v1'.
# The download cell below issues its requests through `gcs_service`.
from googleapiclient.discovery import build
gcs_service = build('storage', 'v1')

In [38]:
from apiclient.http import MediaIoBaseDownload

# Names of the gzipped TFRecord exports for training and testing. The local
# copies are stored under /tmp/ using these same names.
trainFile = 'py8_training_ee_export.tfrecord.gz'
testFile = 'py8_test_ee_export.tfrecord.gz'

def download_tfrecord(file_name, bucket='labelled_data'):
  """Download one object from a GCS bucket to /tmp/<file_name>.

  Args:
    file_name: Name of the object in the bucket. The same name is used for
      the local copy written under /tmp/.
    bucket: Name of the bucket to read from.

  Uses the module-level `gcs_service` client and prints a confirmation
  message once the whole object has been written.
  """
  local_path = '/tmp/' + file_name
  with open(local_path, 'wb') as f:
    # Request the object named by the argument, so that every call fetches
    # its own file rather than a fixed one.
    request = gcs_service.objects().get_media(bucket=bucket,
                                              object=file_name)
    media = MediaIoBaseDownload(f, request)

    done = False
    while not done:
      # The first element is a progress object, which is ignored here.
      _, done = media.next_chunk()

  print('Download complete')
  
# Fetch a local copy under /tmp/ for each of the two file names.
download_tfrecord(trainFile)
download_tfrecord(testFile)

Download complete
Download complete


In [39]:
# Input bands, plus the label column that is stored alongside them.
bands = ['VV', 'latitude', 'longitude']
label = 'constant'
featureNames = bands + [label]

# One parsing spec per name: each entry is a single float32 value.
columns = [tf.FixedLenFeature(shape=[1], dtype=tf.float32)
           for _ in range(len(featureNames))]

# Map each name to its parsing spec.
featuresDict = {name: spec for name, spec in zip(featureNames, columns)}
print(featuresDict)

{'VV': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None), 'latitude': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None), 'longitude': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None), 'constant': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None)}


In [40]:
# Inspect the TFRecords to check that they downloaded OK: open both gzipped
# files as datasets and print the first raw (still serialized) training record.
training_dataset = tf.data.TFRecordDataset('/tmp/'+trainFile, compression_type='GZIP')
test_dataset = tf.data.TFRecordDataset('/tmp/'+testFile, compression_type='GZIP')

# One-shot iterator over the training set; running `foo` yields the next record.
iterator = training_dataset.make_one_shot_iterator()
foo = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([foo]))

[b'\nS\n\x0e\n\x02VV\x12\x08\x12\x06\n\x04\x94\xb6.\xc0\n\x14\n\x08constant\x12\x08\x12\x06\n\x04\x00\x00\x00\x00\n\x14\n\x08latitude\x12\x08\x12\x06\n\x04,CWB\n\x15\n\tlongitude\x12\x08\x12\x06\n\x04D\x13\xbf\xbf']


In [41]:

def parse_tfrecord(example_proto):
  """Decode one serialized example into a (features, label) pair.

  Args:
    example_proto: A serialized example, parsed with the module-level
      `featuresDict` spec.

  Returns:
    A tuple `(features, label)`. `features` is the dict of parsed values with
    its 'constant' entry replaced by the int32 cast of that entry; `label` is
    the same 'constant' value cast to int32. The label is left in the feature
    dict rather than popped from it.
  """
  features = tf.parse_single_example(example_proto, featuresDict)
  label_as_int = tf.cast(features['constant'], tf.int32)
  features['constant'] = label_as_int
  return features, label_as_int

# Map the parsing function over the datasets
parsed_training_dataset = training_dataset.map(parse_tfrecord, num_parallel_calls=5)
parsed_test_dataset = test_dataset.map(parse_tfrecord, num_parallel_calls=5)

# Print the first parsed training example as a (features, label) pair to check
# the parsing. Note that this rebinds `iterator` and `foo`.
iterator = parsed_training_dataset.make_one_shot_iterator()
foo = iterator.get_next()
with tf.Session() as sess:
  print(sess.run([foo]))

[({'VV': array([-2.7298937], dtype=float32), 'constant': array([0], dtype=int32), 'latitude': array([53.815598], dtype=float32), 'longitude': array([-1.4927754], dtype=float32)}, array([0], dtype=int32))]


In [0]:

# https://datascience.stackexchange.com/questions/13567/ways-to-deal-with-longitude-latitude-feature

def get_real_xyz(lat, lon):
  """Map latitude/longitude given in degrees to a point on the unit sphere.

  The records store coordinates in degrees, while the trigonometric ops work
  in radians, so the inputs are converted before use.

  Args:
    lat: Latitude in degrees.
    lon: Longitude in degrees.

  Returns:
    A tuple `(x, y, z)` with
      x = cos(lat) * cos(lon)
      y = cos(lat) * sin(lon)
      z = sin(lat)
    evaluated on the converted (radian) angles.
  """
  deg_to_rad = math.pi / 180.0
  lat_rad = lat * deg_to_rad
  lon_rad = lon * deg_to_rad

  cos_lat = tf.math.cos(lat_rad)
  x = cos_lat * tf.math.cos(lon_rad)
  y = cos_lat * tf.math.sin(lon_rad)
  z = tf.math.sin(lat_rad)
  return x, y, z

def add_features(features, label):
  """Add a derived 'location' entry to a feature dict.

  Args:
    features: Dict holding at least 'latitude' and 'longitude'. It is
      modified in place.
    label: Passed through unchanged.

  Returns:
    The same `features` dict, now with 'location' set to the result of
    `get_real_xyz(latitude, longitude)`, together with `label`.
  """
  latitude = features['latitude']
  longitude = features['longitude']
  features['location'] = get_real_xyz(latitude, longitude)
  return features, label
  

In [0]:
def tfrecord_input_fn(fileName,
                      numEpochs=None,
                      shuffle=True,
                      batchSize=None):
  """Build the (features, labels) input tensors from a gzipped TFRecord file.

  Args:
    fileName: Path of the GZIP-compressed TFRecord file to read.
    numEpochs: Number of passes over the data; `None` repeats indefinitely.
    shuffle: Whether to shuffle the examples, using a buffer of
      `batchSize * 10` elements.
    batchSize: Number of examples per batch. Required: must be a positive
      integer, even though the signature defaults to `None`.

  Returns:
    The `(features, labels)` pair produced by a one-shot iterator over the
    parsed, optionally shuffled, batched and repeated dataset.

  Raises:
    ValueError: If `batchSize` is `None` or not positive.
  """
  # Fail early with a clear message; otherwise the default of None only
  # surfaces later as an opaque error inside the shuffle/batch calls.
  if batchSize is None or batchSize <= 0:
    raise ValueError(
        'batchSize must be a positive integer, got {!r}'.format(batchSize))

  dataset = tf.data.TFRecordDataset(fileName, compression_type='GZIP')

  dataset = dataset.map(parse_tfrecord, num_parallel_calls=5)
  # Additional derived features can be added here, e.g.:
  # dataset = dataset.map(add_features)

  if shuffle:
    dataset = dataset.shuffle(buffer_size=batchSize * 10)
  dataset = dataset.batch(batchSize)
  dataset = dataset.repeat(numEpochs)

  iterator = dataset.make_one_shot_iterator()
  features, labels = iterator.get_next()
  return features, labels

In [0]:
# def get_training_input(fileName):
#   dataset = tf.data.TFRecordDataset(fileName, compression_type='GZIP')
#   dataset = dataset.map(parse_tfrecord, num_parallel_calls=5)
#   dataset = dataset.map(add_features)
  
#   iterator = dataset.make_one_shot_iterator()
#   features, labels = iterator.get_next()
#   return features, labels

# get_training_input('/tmp/'+trainFile)

In [0]:
# Only show TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

In [0]:
# Start from a fresh default graph before building the model.
# NOTE(review): tensors created by earlier cells (e.g. `hello`, `foo`)
# presumably still belong to the previous graph after this call - confirm
# before reusing them.
tf.reset_default_graph()

In [0]:
# Stop any ngrok process that is still running before a new one is started below.
!pkill ngrok

In [0]:
# ! wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
# ! unzip ngrok-stable-linux-amd64.zip

In [0]:
# Directory that TensorBoard reads its event files from.
LOG_DIR = 'tmp/output/run31'

# Start TensorBoard in the background on port 6009.
tensorboard_cmd = 'tensorboard --logdir {} --host 0.0.0.0 --port 6009 &'.format(LOG_DIR)
get_ipython().system_raw(tensorboard_cmd)

# Start ngrok in the background, pointed at the same port.
ngrok_cmd = './ngrok http 6009 &'
get_ipython().system_raw(ngrok_cmd)

In [123]:
# Query the local tunnels endpoint on port 4040 and print the public URL of
# the first tunnel in the response.
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

http://85a654dd.ngrok.io


In [0]:
# Build and train a classifier. Change this for your own custom algorithms if you like.

# NOTE(review): 'constant' is also the value parse_tfrecord returns as the
# label, so this "cheat" column hands the answer to the model. That is only
# useful as a sanity check of the pipeline; drop it for a real model.
cheat_feature_column = tf.feature_column.categorical_column_with_identity(key='constant', num_buckets=4)
indicator_column = tf.feature_column.indicator_column(cheat_feature_column)

# Bucketize latitude (30-degree bands) and longitude (60-degree bands), cross
# the two into hashed buckets, and learn an embedding for the crossed column.
lat_fc = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('latitude'), [-90, -60, -30, 0, 30, 60, 90])
long_fc = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('longitude'), [-180, -120, -60, 0, 60, 120, 180])
number_of_categories = 5000
# Fourth root of the category count (about 8.4 here). It is not used below:
# the embedding column is built with a fixed dimension of 3.
embedding_dimensions = number_of_categories**0.25
embedded_col = tf.feature_column.embedding_column(
    tf.feature_column.crossed_column([lat_fc, long_fc], number_of_categories), 3)

# Use a list rather than a set so the columns have a stable, readable order.
inputColumns = [tf.feature_column.numeric_column('VV'), embedded_col, indicator_column]
learning_rate = 0.01
optimizer = tf.train.AdagradOptimizer(learning_rate)

# model_dir matches the directory TensorBoard is pointed at (LOG_DIR).
classifier = tf.estimator.DNNClassifier(feature_columns=inputColumns,
                                        hidden_units=[1, 2, 1],
                                        model_dir='tmp/output/run31',
                                        optimizer=optimizer,
                                        n_classes=2)

# The input function repeats the data indefinitely (numEpochs defaults to
# None), so training has to be bounded by an explicit number of steps;
# without it this cell never finishes. Tune `train_steps` as needed.
train_steps = 1000
classifier.train(
    input_fn=lambda: tfrecord_input_fn(fileName='/tmp/'+trainFile, batchSize=50),
    steps=train_steps)

In [0]:
def zscore(col, epsilon=1e-8):
  """Standardize `col` along axis 1 to zero mean and unit variance.

  Args:
    col: Tensor with at least two dimensions; statistics are computed over
      axis 1.
    epsilon: Small constant added to the standard deviation so that a
      zero-variance row yields zeros instead of a division by zero.

  Returns:
    A tensor of the same shape as `col` holding `(col - mean) / (std + epsilon)`.
  """
  # Keep the reduced axis so that mean and variance broadcast against `col`
  # row by row instead of being aligned with its last dimension.
  mean, var = tf.nn.moments(col, axes=[1], keep_dims=True)
  std = tf.math.sqrt(var)
  return (col - mean) / (std + epsilon)


In [0]:
# # Evaluate the model.
# eval_result = classifier.evaluate(
#     input_fn=lambda:tfrecord_input_fn(fileName='/tmp/'+testFile, batchSize=10, shuffle=False))