# Iceberg Classification Step 2: Create Feature Groups and Train/Eval datasets
This notebook will perform the following operations:
- Read the pre-processed data from a HopsFS dataset into a PySpark dataframe 
- Create a Feature Group "iceberg"
- Create a training and test dataset with the Feature Store API

In [1]:
import os
import pandas as pd
import numpy as np
from hops import hdfs
# NOTE(review): this import rebinds `pd`, shadowing `import pandas as pd` above;
# after this line `pd` refers to hops.pandas_helper, not pandas.
from hops import pandas_helper as pd
import hsfs

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
30,application_1617271333718_0139,pyspark,idle,Link,Link


SparkSession available as 'spark'.


## Define relevant paths

In [2]:
# Folder under the project root that holds the iceberg JSON files.
DATA_FOLDER = 'eodata'


def _dataset_path(file_name):
    """Return the full path of `file_name` inside the project's DATA_FOLDER."""
    return os.path.join(hdfs.project_path(), DATA_FOLDER, file_name)


train_ds_path = _dataset_path('train.json')
train_preprocessed_all_ds_path = _dataset_path('train_preprocessed_all.json')
train_preprocessed_ds_path = _dataset_path('train_preprocessed.json')
test_preprocessed_ds_path = _dataset_path('test_preprocessed.json')

# Echo every resolved path so the reader can verify the locations.
for _label, _path in (
    ("train_ds_path:", train_ds_path),
    ("train_preprocessed_all_ds_path:", train_preprocessed_all_ds_path),
    ("train_preprocessed_ds_path:", train_preprocessed_ds_path),
    ("test_preprocessed_ds_path:", test_preprocessed_ds_path),
):
    print(_label, _path)

train_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train.json
train_preprocessed_all_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train_preprocessed_all.json
train_preprocessed_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train_preprocessed.json
test_preprocessed_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/test_preprocessed.json

In [3]:
# Load the pre-processed training JSON into a Spark DataFrame.
# Nothing is written to the feature store in this cell; that happens further down.
train_preprocessed_all_df = spark.read.json(train_preprocessed_all_ds_path)

In [4]:
# Inspect the schema Spark inferred from the JSON file.
# NOTE(review): the printed schema shows inc_angle as string rather than a
# numeric type - confirm this is intended before using it as a feature.
train_preprocessed_all_df.printSchema()

root
 |-- band_1: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- band_2: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- band_avg: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- id: string (nullable = true)
 |-- inc_angle: string (nullable = true)
 |-- is_iceberg: long (nullable = true)

In [5]:
# Preview the first 5 rows of the loaded DataFrame.
train_preprocessed_all_df.show(5)

+--------------------+--------------------+--------------------+--------+---------+----------+
|              band_1|              band_2|            band_avg|      id|inc_angle|is_iceberg|
+--------------------+--------------------+--------------------+--------+---------+----------+
|[-27.878361, -27....|[-27.154118, -29....|[-27.5162395, -28...|dfd5f913|  43.9239|         0|
|[-12.242375, -14....|[-31.506321, -27....|[-21.874348, -21....|e25388fd|  38.1562|         0|
|[-24.603676, -24....|[-24.870956, -24....|[-24.737316, -24....|58b2aaa0|  45.2859|         1|
|[-22.454607, -23....|[-27.889421, -27....|[-25.172014, -25....|4cfc3a18|  43.8306|         0|
|[-26.006956, -23....|[-27.206915, -30....|[-26.6069355, -26...|271f93f4|  35.6256|         0|
+--------------------+--------------------+--------------------+--------+---------+----------+
only showing top 5 rows

# Create and save features to the Feature Store

In [6]:
# Open a connection through hsfs; the cell output reminds us to call
# `.close()` on it when done.
conn = hsfs.connection()
# Feature store handle used by all later cells. No name is passed here -
# presumably this resolves to the project's default feature store (TODO confirm).
fs = conn.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

In [7]:
# Define the "iceberg" feature group: no time-travel format, and statistics
# computation switched off entirely.
icebergs_fg = fs.create_feature_group(
    "iceberg",
    description="Training dataset in Feature Store for iceberg classification",
    time_travel_format=None,
    statistics_config=hsfs.statistics_config.StatisticsConfig(
        enabled=False,
        correlations=False,
        histograms=False,
        columns=[],
    ),
)

In [8]:
# Write the pre-processed DataFrame into the "iceberg" feature group.
# The value displayed below the cell is the FeatureGroup object returned by save().
icebergs_fg.save(train_preprocessed_all_df)

<hsfs.feature_group.FeatureGroup object at 0x7fb2f7ea1910>

# Train test split
Now that preprocessing is done, let's split the feature data into training and testing sets.

In [9]:
# Seed passed to randomSplit so the train/test split is repeatable.
RAND_SEED = 42
# Weight of the training split; the test split gets the remainder (1 - TRAIN_SIZE).
TRAIN_SIZE = 0.8

In [10]:
# Number of rows read back from the feature group (sanity check before splitting).
icebergs_fg.read().count()

1604

In [11]:
# Read the feature group once and split it into train/test DataFrames.
# The TFRecord export itself happens in the later training-dataset cells.
#
# The two splits are lazy and are re-evaluated by every later action on them
# (the count() calls and the save() calls). Caching the source DataFrame keeps
# the input rows and their partitioning stable between those evaluations, so
# the seeded split yields the same disjoint train/test rows each time.
icebergs_df = icebergs_fg.read().cache()
icebergs_train_df, icebergs_test_df = icebergs_df.randomSplit(
    [TRAIN_SIZE, 1 - TRAIN_SIZE], seed=RAND_SEED
)

In [12]:
# Materialise the training split once to report its size.
train_record_count = icebergs_train_df.count()
print("Training dataset contains {} records".format(train_record_count))

Training dataset contains 1263 records

In [13]:
# Materialise the testing split once to report its size.
test_record_count = icebergs_test_df.count()
print("Testing dataset contains {} records".format(test_record_count))

Testing dataset contains 341 records

In [14]:
# Define a TFRecord-format dataset (statistics disabled) for the training split
# and write the split into it. icebergs_train_td is bound to whatever .save()
# returns, not directly to the create_training_dataset() result.
train_td_definition = fs.create_training_dataset(
    "train_tfrecords_iceberg_classification_dataset",
    data_format="tfrecords",
    statistics_config=hsfs.statistics_config.StatisticsConfig(
        enabled=False,
        correlations=False,
        histograms=False,
        columns=[],
    ),
)
icebergs_train_td = train_td_definition.save(icebergs_train_df)



In [15]:
# Define a TFRecord-format dataset (statistics disabled) for the *test* split
# and write the split into it. icebergs_test_td is bound to whatever .save()
# returns, not directly to the create_training_dataset() result.
test_td_definition = fs.create_training_dataset(
    "test_tfrecords_iceberg_classification_dataset",
    data_format="tfrecords",
    statistics_config=hsfs.statistics_config.StatisticsConfig(
        enabled=False,
        correlations=False,
        histograms=False,
        columns=[],
    ),
)
icebergs_test_td = test_td_definition.save(icebergs_test_df)



# End of Step 2