In [1]:
# ---------------------------------------------------------
# Initialize Spark Session
# ---------------------------------------------------------
# Purpose:
# - Create a Spark entry point for DataFrame operations
# - Optimize for local Docker execution by reducing shuffle partitions
# ---------------------------------------------------------

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("NYC_Jobs_Data_Exploration") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

spark


In [3]:
# ---------------------------------------------------------
# Load NYC Jobs CSV Dataset
# ---------------------------------------------------------
# Assumptions:
# - CSV file is placed under /workspace/data/
# - Header is present
# - Schema inference is enabled for initial exploration
# ---------------------------------------------------------

df = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("../data/nyc-jobs.csv")

# Preview sample records
df.show(5, truncate=False)


+------+----------------------------+------------+--------------+----------------------------------------------------+------------------------------+-------------+-----+------------------------+-----------------------------+-----------------+---------------+----------------+----------------------------+------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
# ---------------------------------------------------------
# Inspect Data Schema
# ---------------------------------------------------------
# Objective:
# - Understand column names and data types
# - Identify numerical, categorical, and date-like fields
# - Detect potential data quality concerns early
# ---------------------------------------------------------

df.printSchema()


root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locat

In [5]:
# ---------------------------------------------------------
# Classify Columns by Data Type
# ---------------------------------------------------------
# Objective:
# - Separate string, numeric, and date columns
# - Helps decide:
#   • Aggregation candidates
#   • Feature engineering opportunities
#   • Columns to exclude from analytics
# ---------------------------------------------------------

from pyspark.sql.types import StringType, NumericType, DateType

string_columns = [
    field.name for field in df.schema.fields
    if isinstance(field.dataType, StringType)
]

numeric_columns = [
    field.name for field in df.schema.fields
    if isinstance(field.dataType, NumericType)
]

date_columns = [
    field.name for field in df.schema.fields
    if isinstance(field.dataType, DateType)
]

print("String Columns:", string_columns)
print("Numeric Columns:", numeric_columns)
print("Date Columns:", date_columns)


String Columns: ['Agency', 'Posting Type', 'Business Title', 'Civil Service Title', 'Title Code No', 'Level', 'Job Category', 'Full-Time/Part-Time indicator', 'Salary Frequency', 'Work Location', 'Division/Work Unit', 'Job Description', 'Minimum Qual Requirements', 'Preferred Skills', 'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1', 'Recruitment Contact', 'Residency Requirement', 'Posting Date', 'Post Until', 'Posting Updated', 'Process Date']
Numeric Columns: ['Job ID', '# Of Positions', 'Salary Range From', 'Salary Range To']
Date Columns: []


In [6]:
# ---------------------------------------------------------
# Null Value Analysis
# ---------------------------------------------------------
# Objective:
# - Quantify missing values per column
# - Identify columns with high null density
# - Support decisions on feature removal or imputation
# ---------------------------------------------------------

from pyspark.sql.functions import col, count, when

null_profile_df = df.select([
    count(when(col(c).isNull(), c)).alias(c)
    for c in df.columns
])

null_profile_df.show(truncate=False)


+------+------+------------+--------------+--------------+-------------------+-------------+-----+------------+-----------------------------+-----------------+---------------+----------------+-------------+------------------+---------------+-------------------------+----------------+----------------------+--------+-----------+---------------+-------------------+---------------------+------------+----------+---------------+------------+
|Job ID|Agency|Posting Type|# Of Positions|Business Title|Civil Service Title|Title Code No|Level|Job Category|Full-Time/Part-Time indicator|Salary Range From|Salary Range To|Salary Frequency|Work Location|Division/Work Unit|Job Description|Minimum Qual Requirements|Preferred Skills|Additional Information|To Apply|Hours/Shift|Work Location 1|Recruitment Contact|Residency Requirement|Posting Date|Post Until|Posting Updated|Process Date|
+------+------+------------+--------------+--------------+-------------------+-------------+-----+------------+---------

In [7]:
# ---------------------------------------------------------
# Categorical Cardinality Check
# ---------------------------------------------------------
# Objective:
# - Understand uniqueness of key categorical fields
# - Evaluate suitability for grouping and KPI generation
# ---------------------------------------------------------

agency_count = df.select("Agency").distinct().count()
job_category_count = df.select("Job Category").distinct().count()

print(f"Distinct Agencies: {agency_count}")
print(f"Distinct Job Categories: {job_category_count}")


Distinct Agencies: 52
Distinct Job Categories: 131


In [8]:
# ---------------------------------------------------------
# Salary Range Validation
# ---------------------------------------------------------
# Objective:
# - Validate min, max, and mean salary values
# - Detect outliers or invalid salary ranges
# - Confirm readiness for salary-based KPIs
# ---------------------------------------------------------

df.select(
    "Salary Range From",
    "Salary Range To"
).summary("min", "max", "mean").show()


+-------+------------------+-----------------+
|summary| Salary Range From|  Salary Range To|
+-------+------------------+-----------------+
|    min|               0.0|            10.36|
|    max|          218587.0|         234402.0|
|   mean|58904.139793856084|85535.71162739306|
+-------+------------------+-----------------+

