# Iceberg Classification Step 1: Create Feature Groups and Train/Eval datasets
This notebook will perform the following operations:
- Read the pre-processed data from a HopsFS dataset into a PySpark dataframe 
- Create a Feature Group named "iceberg"
- Create a training and test dataset with the Feature Store API

In [26]:
import os
import pandas as pd
import numpy as np
from hops import hdfs
# NOTE(review): this rebinds the name `pd`, shadowing `import pandas as pd` above.
# After this line `pd` refers to hops.pandas_helper, not pandas. Neither binding is
# used in the cells of this notebook -- consider a distinct alias for one of them.
from hops import pandas_helper as pd
from hops import featurestore

## Define relevant paths

In [27]:
# Folder under the project root that holds the iceberg JSON files.
DATA_FOLDER = 'eodata'

# Resolve the dataset folder once and derive every file path from it.
_data_root = os.path.join(hdfs.project_path(), DATA_FOLDER)
train_ds_path = os.path.join(_data_root, 'train.json')
train_preprocessed_all_ds_path = os.path.join(_data_root, 'train_preprocessed_all.json')
train_preprocessed_ds_path = os.path.join(_data_root, 'train_preprocessed.json')
test_preprocessed_ds_path = os.path.join(_data_root, 'test_preprocessed.json')

# Echo the resolved locations so the cell output records where data is read from.
for _label, _resolved_path in [
    ("train_ds_path:", train_ds_path),
    ("train_preprocessed_all_ds_path:", train_preprocessed_all_ds_path),
    ("train_preprocessed_ds_path:", train_preprocessed_ds_path),
    ("test_preprocessed_ds_path:", test_preprocessed_ds_path),
]:
    print(_label, _resolved_path)

train_ds_path: hdfs://127.0.0.1:8020/Projects/ExtremeEarth/eodata/train.json
train_preprocessed_all_ds_path: hdfs://127.0.0.1:8020/Projects/ExtremeEarth/eodata/train_preprocessed_all.json
train_preprocessed_ds_path: hdfs://127.0.0.1:8020/Projects/ExtremeEarth/eodata/train_preprocessed.json
test_preprocessed_ds_path: hdfs://127.0.0.1:8020/Projects/ExtremeEarth/eodata/test_preprocessed.json

In [8]:
# Load the pre-processed training data (JSON) into a Spark DataFrame.
# `spark` is not defined in this notebook; presumably it is the SparkSession
# injected by the PySpark kernel -- TODO confirm.
train_preprocessed_all_df = spark.read.json(train_preprocessed_all_ds_path)

In [9]:
# Print the schema Spark inferred from the JSON file (no explicit schema was supplied).
# NOTE(review): the recorded output shows `inc_angle` inferred as string rather than
# a numeric type -- confirm this is intended before it is stored as a feature.
train_preprocessed_all_df.printSchema()

root
 |-- band_1: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- band_2: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- band_avg: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- id: string (nullable = true)
 |-- inc_angle: string (nullable = true)
 |-- is_iceberg: long (nullable = true)

In [10]:
# Preview the first 5 rows; long array columns are truncated in the rendered table.
train_preprocessed_all_df.show(5)

+--------------------+--------------------+--------------------+--------+---------+----------+
|              band_1|              band_2|            band_avg|      id|inc_angle|is_iceberg|
+--------------------+--------------------+--------------------+--------+---------+----------+
|[-27.878361, -27....|[-27.154118, -29....|[-27.5162395, -28...|dfd5f913|  43.9239|         0|
|[-12.242375, -14....|[-31.506321, -27....|[-21.874348, -21....|e25388fd|  38.1562|         0|
|[-24.603676, -24....|[-24.870956, -24....|[-24.737316, -24....|58b2aaa0|  45.2859|         1|
|[-22.454607, -23....|[-27.889421, -27....|[-25.172014, -25....|4cfc3a18|  43.8306|         0|
|[-26.006956, -23....|[-27.206915, -30....|[-26.6069355, -26...|271f93f4|  35.6256|         0|
+--------------------+--------------------+--------------------+--------+---------+----------+
only showing top 5 rows

# Create and save features to the Feature Store

In [11]:
# Register the pre-processed training DataFrame as the "iceberg" feature group.
# The DataFrame is reused from the earlier cell that defines
# `train_preprocessed_all_df`; re-reading the same JSON file here would only
# repeat the load (and Spark's schema inference) without changing the data.
featurestore.create_featuregroup(
    train_preprocessed_all_df,
    "iceberg",
    description="Training dataset in Feature Store for iceberg classification",
    # All optional statistics computations are switched off for this feature group.
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)

Registering feature metadata...
Registering feature metadata... [COMPLETE]
Writing feature data to offline feature group (Hive)...
Running sql: use extremeearth_featurestore against offline feature store
Writing feature data to offline feature group (Hive)... [COMPLETE]
Feature group created successfully

# Train test split
Now that preprocessing is done, let's split the feature data into training and testing sets.

In [12]:
# Seed handed to randomSplit in a later cell so the split uses a fixed seed.
RAND_SEED = 42
# Weight of the training split; the test split receives 1 - TRAIN_SIZE.
TRAIN_SIZE = 0.8
# Fetch the "iceberg" feature group. Per the recorded log output this runs
# `SELECT * FROM iceberg_1` against the offline feature store.
icebergs_fg = featurestore.get_featuregroup("iceberg")

Running sql: use extremeearth_featurestore against offline feature store
SQL string for the query created successfully
Running sql: SELECT * FROM iceberg_1 against offline feature store

In [19]:
# Row count of the feature group, shown as a sanity check before splitting.
icebergs_fg.count()

1604

In [20]:
# Randomly split the feature group rows into a training and a test DataFrame.
# The TFRecord export happens in later cells, not here.
split_weights = [TRAIN_SIZE, 1-TRAIN_SIZE]
icebergs_train_df, icebergs_test_df = icebergs_fg.randomSplit(split_weights, seed=RAND_SEED)

In [22]:
# Count the rows of the training split, then report the number.
n_train_records = icebergs_train_df.count()
print("Training dataset contains {} records".format(n_train_records))

Training dataset contains 1284 records

In [23]:
# Count the rows of the test split, then report the number.
n_test_records = icebergs_test_df.count()
print("Testing dataset contains {} records".format(n_test_records))

Testing dataset contains 320 records

In [28]:
# Create a training dataset in TFRecord format from the training split.
# All optional statistics computations are switched off.
train_dataset_options = dict(
    data_format="tfrecords",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)
featurestore.create_training_dataset(
    icebergs_train_df,
    "train_tfrecords_iceberg_classification_dataset",
    **train_dataset_options
)

write feature frame, write_mode: overwrite
Training Dataset created successfully

In [29]:
# Store the held-out test split as a TFRecord dataset; it goes through the same
# create_training_dataset call as the training split.
# All optional statistics computations are switched off.
test_dataset_options = dict(
    data_format="tfrecords",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)
featurestore.create_training_dataset(
    icebergs_test_df,
    "test_tfrecords_iceberg_classification_dataset",
    **test_dataset_options
)

write feature frame, write_mode: overwrite
Training Dataset created successfully

# End of Step 1