## FRUITS! Feature extraction notebook

In [1]:
###################################
##### Initialisation of Spark session ########################
###################################

spark.sparkContext.setLogLevel("WARN")
spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

spark


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1676555148927_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f6e26036cd0>

In [2]:
###################################
########## Libraries ##############
###################################
import pandas as pd
from PIL import Image
import numpy as np
import io
import os



os.environ['CUDA_VISIBLE_DEVICES'] ='-1'
os.environ['TF_CPP_MIN_LOG_LEVEL']="2"



import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras import Model
from pyspark.sql.functions import col, pandas_udf, PandasUDFType, element_at, split
from pyspark.sql import SparkSession

from pyspark.ml.functions import array_to_vector
from pyspark.ml import PipelineModel




FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:

###################################
##### PATH ########################
###################################


PATH_s3 = "s3://cd-p8-fruits/"
PATH_Pipeline = PATH_s3+"pipeline_trained"
PATH_Data = PATH_s3+"/Sample"
PATH_Result = PATH_s3+"/Results"

print('\nPATH_S3   ' +PATH_s3+
'\nPATH_Data:   ' +PATH_Data+
'\nPATH_Result: '+PATH_Result+
'\nPATH_Pipeline:   ' +PATH_Pipeline)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…


PATH_S3   s3://cd-p8-fruits/
PATH_Data:   s3://cd-p8-fruits//Sample
PATH_Result: s3://cd-p8-fruits//Results
PATH_Pipeline:   s3://cd-p8-fruits/pipeline_trained

In [4]:
###################################
########## Model Initialisation ###
###################################
model = MobileNetV2(weights='imagenet',
                        include_top=True,
                        input_shape=(224, 224, 3))
for layer in model.layers:
    layer.trainable = False
new_model = Model(inputs=model.input,
                    outputs=model.layers[-2].output)
brodcast_weights = sc.broadcast(new_model.get_weights())
new_model.set_weights(brodcast_weights.value)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224.h5

In [5]:
###################################
########## Functions ##############
###################################


def preprocess(content):
    """
    Preprocesses raw image bytes for prediction.
    """
    img = Image.open(io.BytesIO(content)).resize([224, 224])
    arr = img_to_array(img)
    return preprocess_input(arr)


def featurize_series(model, content_series):
    """
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    """
    input = np.stack(content_series.map(preprocess))
    preds = model.predict(input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    return pd.Series(output)


def model_fn():
    """
    Returns a MobileNetV2 model with top layer removed 
    and broadcasted pretrained weights.
    """
    model = MobileNetV2(weights='imagenet',
                        include_top=True,
                        input_shape=(224, 224, 3))
    for layer in model.layers:
        layer.trainable = False
    new_model = Model(inputs=model.input,
                      outputs=model.layers[-2].output)
    new_model.set_weights(brodcast_weights.value)
    return new_model


@pandas_udf('array<float>', PandasUDFType.SCALAR_ITER)
def featurize_udf(content_series_iter):
    '''
    This method is a Scalar Iterator pandas UDF wrapping our featurization function.
    The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).

    :param content_series_iter: This argument is an iterator over batches of data, where each batch
                              is a pandas Series of image data.
    '''
    # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
    # for multiple data batches.  This amortizes the overhead of loading big models.
    model = model_fn()
    for content_series in content_series_iter:
        yield featurize_series(model, content_series)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [6]:
###################################
##### Loading images ##############
###################################
images = spark.read.format("binaryFile") \
    .option("pathGlobFilter", "*.jpg") \
    .option("recursiveFileLookup", "true") \
    .load(PATH_Data)


images = images.withColumn('label', element_at(
    split(images['path'], '/'), -2))  # withColumn add a column,
print(images.printSchema())
print(images.select('path', 'label').show(5, False))


features_df = images.repartition(100).select(col("path"), col(
    "label"), featurize_udf("content").alias("features")).withColumn("features", array_to_vector("features"))

print(features_df.printSchema())
#print(features_df.select('path', 'label', 'features').show(5))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)

None
+---------------------------------------------------+--------------+
|path                                               |label         |
+---------------------------------------------------+--------------+
|s3://cd-p8-fruits/Sample/Lychee/1_100.jpg          |Lychee        |
|s3://cd-p8-fruits/Sample/Lychee/r_48_100.jpg       |Lychee        |
|s3://cd-p8-fruits/Sample/Lychee/r_5_100.jpg        |Lychee        |
|s3://cd-p8-fruits/Sample/Lychee/r_193_100.jpg      |Lychee        |
|s3://cd-p8-fruits/Sample/Apple Braeburn/r_3_100.jpg|Apple Braeburn|
+---------------------------------------------------+--------------+
only showing top 5 rows

None
root
 |-- path: string (nullable = true)
 |-- label: string (nullable = true)
 |-- features: vector (nullable = true)

None

In [7]:
###################################
##### Standard Scaling + PCA ######
###################################



pipeline_model = PipelineModel.load(PATH_Pipeline)

features_df = pipeline_model.transform(features_df)

features_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------+--------------------+--------------------+--------------------+
|                path|         label|            features|     Scaled_features|        pca_features|
+--------------------+--------------+--------------------+--------------------+--------------------+
|s3://cd-p8-fruits...|Apple Braeburn|[0.57977545261383...|[1.04470877966440...|[7.61116841483536...|
|s3://cd-p8-fruits...|        Lychee|[0.85327965021133...|[1.53754136720669...|[-10.574755284183...|
|s3://cd-p8-fruits...|        Lychee|[1.35203599929809...|[2.43626023233836...|[-11.817118383397...|
|s3://cd-p8-fruits...|        Lychee|[1.04481923580169...|[1.88268030990838...|[-14.591393143335...|
|s3://cd-p8-fruits...|Apple Braeburn|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|[1.64081596827364...|
|s3://cd-p8-fruits...|        Lychee|[1.28063607215881...|[2.30760330073921...|[-13.226224933598...|
|s3://cd-p8-fruits...|Apple Braeburn|[0.70797502994537...|[1.27571411696131...|[5.450458344

In [8]:
###################################
##### Export output ###############
###################################

#features_df.write.mode("overwrite").parquet(PATH_Result)

features_df.withColumn('features', col('features').cast('string')) \
.withColumn('Scaled_features', col('Scaled_features').cast('string')) \
.withColumn('pca_features', col('pca_features').cast('string')).repartition(1) \
.write.mode("overwrite").options(header=True, delimiter=';') \
.csv(PATH_Result+"/output")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…