In [1]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("example") \
    .getOrCreate()

# Quick pipeline

Before you parse some more complex data, your manager would like to see a simple pipeline example including the basic steps. For this example, you'll want to ingest a data file, filter a few rows, add an ID column to it, then write it out as JSON data.

In [2]:
# Import the data to a DataFrame
departures_df = spark.read.csv('dataset/AA_DFW_2015_Departures_Short.csv', header=True)

# Remove any duration of 0
departures_df = departures_df.filter(" 'Actual elapsed time (Minutes)' <> '0'")



In [3]:
import pyspark.sql.functions as F
# Add an ID column
departures_df = departures_df.withColumn('id', F.monotonically_increasing_id())

# Write the file out to JSON format
# departures_df.write.json('output.json')
departures_df.show(5)

+-----------------+-------------+-------------------+-----------------------------+---+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)| id|
+-----------------+-------------+-------------------+-----------------------------+---+
|       01/01/2015|         0005|                HNL|                          526|  0|
|       01/01/2015|         0007|                OGG|                          517|  1|
|       01/01/2015|         0023|                SFO|                          233|  2|
|       01/01/2015|         0027|                LAS|                          165|  3|
|       01/01/2015|         0029|                ONT|                            0|  4|
+-----------------+-------------+-------------------+-----------------------------+---+
only showing top 5 rows



# Pipeline data issue

After creating your quick pipeline, you provide the json file to an analyst on your team. After loading the data and performing a couple exploratory tasks, the analyst tells you there's a problem in the dataset while trying to sort the duration data. She's not sure what the issue is beyond the sorting operation not working as expected.
```
Date          Flight Number   Airport     Duration    ID

09/30/2015    2287            ANC         409         107962
12/28/2015    1408            OKC         41          141917
08/11/2015    2287            ANC         410         87978
```
After analyzing the data, which command would fix the issue?

- `departures_df = departures_df.withColumn('Duration', departures_df['Duration'].cast(IntegerType()))`

# Removing commented lines

Your boss would like you to perform some complex parsing on a new dataset. The data represents annotation data for the ImageNet dataset, but focusing specifically on dog breeds and identifying them in images. Before any actual analysis can occur, you'll need to clear out several components of invalid / incorrect data. The general schema of the document is unknown so you'd like to import the rows into a single column, allowing for quick analysis.

To start, you need to remove all commented rows in the dataset.

In [4]:
from pyspark.sql.functions import col
# Import the file to a DataFrame and perform a row count
annotations_df = spark.read.csv('dataset/AA_DFW_2017_Departures_Short.csv', sep='|')
full_count = annotations_df.count()

# Count the number of rows beginning with '#'
comment_count = annotations_df.filter(col('_c0').startswith('#')).count()

# Import the file to a new DataFrame, without commented rows
no_comments_df = spark.read.csv('dataset/AA_DFW_2017_Departures_Short.csv', sep='|', comment='#')

# Count the new DataFrame and verify the difference is as expected
no_comments_count = no_comments_df.count()
print("Full count: %d\nComment count: %d\nRemaining count: %d" % (full_count, comment_count, no_comments_count))

Full count: 139359
Comment count: 0
Remaining count: 139359


# Removing invalid rows

Now that you've successfully removed the commented rows, you have received some information about the general format of the data. There should be at minimum 5 tab separated columns in the DataFrame. Remember that your original DataFrame only has a single column, so you'll need to split the data on the tab (\t) characters.

In [5]:
initial_count = annotations_df.count()
# Split _c0 on the tab character and store the list in a variable
tmp_fields = F.split(annotations_df['_c0'], '\t')

# Create the colcount column on the DataFrame
annotations_df = annotations_df.withColumn('colcount', F.size(tmp_fields))

# Remove any rows containing fewer than 5 fields
annotations_df_filtered = annotations_df.filter(~ (annotations_df.colcount < 5))

# Count the number of rows
final_count = annotations_df_filtered.count()
print("Initial count: %d\nFinal count: %d" % (initial_count, final_count))

Initial count: 139359
Final count: 0


# Splitting into columns

You've cleaned up your data considerably by removing the invalid rows from the DataFrame. Now you want to perform some further transformations by generating specific meaningful columns based on the DataFrame content.

In [6]:
# Split the content of _c0 on the tab character (aka, '\t')
split_cols = F.split(annotations_df['_c0'], '\t')

# Add the columns folder, filename, width, and height
split_df = annotations_df.withColumn('folder', split_cols.getItem(0))
split_df = split_df.withColumn('filename', split_cols.getItem(1))
split_df = split_df.withColumn('width', split_cols.getItem(2))
split_df = split_df.withColumn('height', split_cols.getItem(3))

# Add split_cols as a column
split_df = split_df.withColumn('split_cols', split_cols)
split_df.show(2)

+--------------------+--------+--------------------+--------+-----+------+--------------------+
|                 _c0|colcount|              folder|filename|width|height|          split_cols|
+--------------------+--------+--------------------+--------+-----+------+--------------------+
|Date (MM/DD/YYYY)...|       1|Date (MM/DD/YYYY)...|    NULL| NULL|  NULL|[Date (MM/DD/YYYY...|
|"01/01/2017","000...|       1|"01/01/2017","000...|    NULL| NULL|  NULL|["01/01/2017","00...|
+--------------------+--------+--------------------+--------+-----+------+--------------------+
only showing top 2 rows



# Further parsing

You've molded this dataset into a significantly different format than it was before, but there are still a few things left to do. You need to prep the column data for use in later analysis and remove a few intermediary columns.

In [7]:
from pyspark.sql.types import ArrayType, StringType
def retriever(cols, colcount):
  # Return a list of dog data
  return cols[4:colcount]

# Define the method as a UDF
udfRetriever = F.udf(retriever, ArrayType(StringType()))

# Create a new column using your UDF
split_df = split_df.withColumn('dog_list', udfRetriever(split_df.split_cols, split_df.colcount))

# Remove the original column, split_cols, and the colcount
split_df = split_df.drop('_c0').drop('split_cols').drop('colcount')

In [8]:
split_df.count()
# split_df.show(3)


139359

# Validate rows via join

Another example of filtering data is using joins to remove invalid entries. You'll need to verify the folder names are as expected based on a given DataFrame named valid_folders_df. The DataFrame split_df is as you last left it with a group of split columns.

In [9]:
# # Rename the column in valid_folders_df
# valid_folders_df = valid_folders_df.withColumnRenamed('_c0','folder')

# # Count the number of rows in split_df
# split_count = split_df.count()

# # Join the DataFrames
# joined_df = split_df.join(F.broadcast(valid_folders_df), "folder")

# # Compare the number of rows remaining
# joined_count = joined_df.count()
# print("Before: %d\nAfter: %d" % (split_count, joined_count))

# Examining invalid rows

You've successfully filtered out the rows using a join, but sometimes you'd like to examine the data that is invalid. This data can be stored for later processing or for troubleshooting your data sources.

You want to find the difference between two DataFrames and store the invalid rows.

In [10]:
# # Determine the row counts for each DataFrame
# split_count = split_df.count()
# joined_count = joined_df.count()

# # Create a DataFrame containing the invalid rows
# invalid_df = split_df.join(F.broadcast(joined_df), 'folder', 'left_anti')

# # Validate the count of the new DataFrame is as expected
# invalid_count = invalid_df.count()
# print(" split_df:\t%d\n joined_df:\t%d\n invalid_df: \t%d" % (split_count, joined_count, invalid_count))

# # Determine the number of distinct folder rows removed
# invalid_folder_count = invalid_df.select('folder').distinct().count()
# print("%d distinct invalid folders found" % invalid_folder_count)

# Dog parsing

You've done a considerable amount of cleanup on the initial dataset, but now need to analyze the data a bit deeper. There are several questions that have now come up about the type of dogs seen in an image and some details regarding the images. You realize that to answer these questions, you need to process the data into a specific type. Before you can use it, you'll need to create a schema / type to represent the dog details.

In [11]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Select the dog details and show 10 untruncated rows
# print(joined_df.select("dog_list").show(10, truncate=False))

# Define a schema type for the details in the dog list
DogType = StructType([
	StructField("breed", StringType(), False),
    StructField("start_x", IntegerType(), False),
    StructField("start_y", IntegerType(), False),
    StructField("end_x", IntegerType(), False),
    StructField("end_y", IntegerType(), False)
])

# Per image count

Your next task in building a data pipeline for this dataset is to create a few analysis oriented columns. You've been asked to calculate the number of dogs found in each image based on your dog_list column created earlier. You have also created the DogType which will allow better parsing of the data within some of the data columns.

In [12]:
# # Create a function to return the number and type of dogs as a tuple
# def dogParse(doglist):
#   dogs = []
#   for dog in doglist:
#     (breed, start_x, start_y, end_x, end_y) = dog.split(',')
#     dogs.append((breed, int(start_x), int(start_y), int(end_x), int(end_y)))
#   return dogs

# # Create a UDF
# udfDogParse = F.udf(dogParse, ArrayType(DogType))

# # Use the UDF to list of dogs and drop the old column
# joined_df = joined_df.withColumn('dogs', udfDogParse('dog_list')).drop('dog_list')

# # Show the number of dogs in the first 10 rows
# joined_df.select(F.size('dogs')).show(10)

# Percentage dog pixels

The final task for parsing the dog annotation data is to determine the percentage of pixels in each image that represents a dog (or dogs). You'll need to use the various techniques you've learned in this course to help calculate this information and add it as columns for later analysis.

To calculate the percentage of pixels, first calculate the total number of pixels representing each dog then sum them for the image. You can calculate the bounding box with the formula:

(Xend - Xstart) * (Yend - Ystart)

In [13]:
# # Define a UDF to determine the number of pixels per image
# def dogPixelCount(doglist):
#   totalpixels = 0
#   for dog in doglist:
#     totalpixels += (dog[3] - dog[1]) * (dog[4] - dog[2])
#   return totalpixels

# # Define a UDF for the pixel count
# udfDogPixelCount = F.udf(dogPixelCount, IntegerType())
# joined_df = joined_df.withColumn('dog_pixels', udfDogPixelCount(joined_df.dogs))

# # Create a column representing the percentage of pixels
# joined_df = joined_df.withColumn('dog_percent', (col('dog_pixels') / (joined_df.width * joined_df.height)) * 100)

# # Show the first 10 annotations with more than 60% dog
# joined_df.where("dog_percent > 60").show(10)