# Iceberg Classification Step 1: Preprocessing the data
This notebook will perform the following operations:
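- read `train.json` from the `eodata` dataset as the input file
- engineer a new feature, `band_avg`: the element-wise average of `band_1` and `band_2`
- save the resulting dataframe as `train_preprocessed_all.json` in the `eodata` Hopsworks dataset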

In [1]:
import os
import pandas as pd
import numpy as np
from hops import hdfs
# NOTE(review): this import rebinds `pd`, shadowing `import pandas as pd` above.
# Every later `pd.<...>` call in this notebook therefore resolves to
# hops.pandas_helper, not to pandas itself.
from hops import pandas_helper as pd
from hops import featurestore

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
28,application_1617271333718_0134,pyspark,idle,Link,Link


SparkSession available as 'spark'.


## Define relevant paths

In [2]:
# Folder under the project root that holds every file used by this notebook.
DATA_FOLDER = 'eodata'

# Resolve the dataset directory once; all file paths below are built from it.
data_dir = os.path.join(hdfs.project_path(), DATA_FOLDER)

# Input file, and the output file written at the end of this notebook.
train_ds_path = os.path.join(data_dir, 'train.json')
train_preprocessed_all_ds_path = os.path.join(data_dir, 'train_preprocessed_all.json')
# Not written by this notebook -- presumably consumed by a later step (TODO confirm).
train_preprocessed_ds_path = os.path.join(data_dir, 'train_preprocessed.json')
test_preprocessed_ds_path = os.path.join(data_dir, 'test_preprocessed.json')

# Echo the resolved locations so they can be checked against the project layout.
for path_label, path_value in (
    ("train_ds_path:", train_ds_path),
    ("train_preprocessed_all_ds_path:", train_preprocessed_all_ds_path),
    ("train_preprocessed_ds_path:", train_preprocessed_ds_path),
    ("test_preprocessed_ds_path:", test_preprocessed_ds_path),
):
    print(path_label, path_value)

train_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train.json
train_preprocessed_all_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train_preprocessed_all.json
train_preprocessed_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train_preprocessed.json
test_preprocessed_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/test_preprocessed.json

## Read the raw data

In [3]:
# Read the raw training data from the HDFS path defined above into a dataframe.
# NOTE: `pd` here is hops.pandas_helper (the last `... as pd` import wins), not
# plain pandas -- presumably it returns a regular pandas DataFrame, since later
# cells call .apply and .to_json on the result (TODO confirm).
raw_train_df = pd.read_json(train_ds_path)

In [4]:
# Inspect the freshly loaded frame (bare last expression, so the cell renders it).
raw_train_df

            id  ... is_iceberg
0     dfd5f913  ...          0
1     e25388fd  ...          0
2     58b2aaa0  ...          1
3     4cfc3a18  ...          0
4     271f93f4  ...          0
...        ...  ...        ...
1599  04e11240  ...          0
1600  c7d6f6f8  ...          0
1601  bba1a0f1  ...          0
1602  7f66bb44  ...          0
1603  9d8f326c  ...          0

[1604 rows x 5 columns]

## Create new feature band_avg

In [5]:
# Helper that averages the two bands of a single record, element by element.
def list_avg(row):
    """Return the element-wise average of ``row['band_1']`` and ``row['band_2']``.

    ``row`` can be anything indexable by those two keys (here: one dataframe
    row). The two sequences are paired with ``zip``, so the result has the
    length of the shorter one.
    """
    averages = []
    for pair in zip(row['band_1'], row['band_2']):
        averages.append(sum(pair) / 2)
    return averages

# Run list_avg on every row (axis=1) and attach the per-row lists as a new column.
band_avg_by_row = raw_train_df.apply(list_avg, axis=1)
raw_train_df['band_avg'] = band_avg_by_row

In [6]:
# Inspect the frame again to confirm the new band_avg column is present.
raw_train_df

            id  ...                                           band_avg
0     dfd5f913  ...  [-27.516239499999998, -28.346024, -29.84960749...
1     e25388fd  ...  [-21.874347999999998, -21.4524295, -20.7830205...
2     58b2aaa0  ...  [-24.737316, -24.348173, -22.762496, -21.28190...
3     4cfc3a18  ...  [-25.172013999999997, -25.301306500000003, -25...
4     271f93f4  ...  [-26.6069355, -26.712035999999998, -26.7120359...
...        ...  ...                                                ...
1599  04e11240  ...  [-29.4237985, -29.105365, -26.472991999999998,...
1600  c7d6f6f8  ...  [-27.437631500000002, -27.400965, -27.76694599...
1601  bba1a0f1  ...  [-21.723625, -23.7647725, -23.9906165, -22.930...
1602  7f66bb44  ...  [-24.262994499999998, -23.944199, -24.2661145,...
1603  9d8f326c  ...  [-22.1770305, -22.817203499999998, -23.9654685...

[1604 rows x 6 columns]

In [7]:
# Save the preprocessed frame (raw columns plus band_avg) into the dataset.
# The local file name is taken from train_preprocessed_all_ds_path (defined in the
# paths cell) instead of being typed out again, so the file written here and the
# configured HDFS location cannot drift apart.
local_json_name = os.path.basename(train_preprocessed_all_ds_path)
raw_train_df.to_json(path_or_buf=local_json_name, orient='records')
# Upload the local file into DATA_FOLDER; overwrite=True replaces an existing copy.
hdfs.copy_to_hdfs(local_json_name, DATA_FOLDER, overwrite=True)

Started copying local path train_preprocessed_all.json to hdfs path hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train_preprocessed_all.json

Finished copying

# End of Step 1