In [None]:
# Clean up any previous Spark installations
!rm -rf spark-3.4.1-bin-hadoop3 spark-3.4.1-bin-hadoop3.tgz

# Install Java (required for Spark)
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

In [None]:
# Download Spark
!wget https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz

# Confirm the file exists
!ls -lh spark-3.4.1-bin-hadoop3.tgz

# Extract Spark
!tar -xzf spark-3.4.1-bin-hadoop3.tgz

--2025-04-07 13:23:05--  https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 388341449 (370M) [application/x-gzip]
Saving to: ‘spark-3.4.1-bin-hadoop3.tgz’


2025-04-07 13:24:43 (3.77 MB/s) - ‘spark-3.4.1-bin-hadoop3.tgz’ saved [388341449/388341449]

-rw-r--r-- 1 root root 371M Jun 19  2023 spark-3.4.1-bin-hadoop3.tgz


In [None]:
# Install findspark
!pip install -q findspark

In [None]:
import os
import findspark

# Set environment paths.
# JAVA_HOME is where the apt openjdk-11 package installs on amd64 hosts.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# The `tar` step unpacks Spark into the notebook's working directory, so resolve
# the folder from there instead of hard-coding Colab's /content prefix. On Colab
# this evaluates to the same path as before.
spark_home = os.path.abspath("spark-3.4.1-bin-hadoop3")
if not os.path.isdir(spark_home):
    # Fail early with an actionable message rather than a cryptic findspark error.
    raise FileNotFoundError(
        f"Spark distribution not found at {spark_home}; run the download/extract cell first."
    )
os.environ["SPARK_HOME"] = spark_home

# Initialize findspark so that the pyspark bundled under SPARK_HOME is importable
findspark.init()

# Must come after findspark.init(): pyspark is only on sys.path from this point on.
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("FlightDelayPrediction")
    .getOrCreate()
)

spark

In [None]:
import pandas as pd

# Load datasets.
# `date` is a zero-padded digit string in the raw file (e.g. 01011245); letting
# pandas infer an integer silently drops the leading zero and corrupts the value,
# so it is read as text instead.
delays_pd = pd.read_csv(
    "https://raw.githubusercontent.com/databricks/LearningSparkV2/master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv",
    dtype={"date": str},
)
# The airport lookup table is tab-separated.
airports_pd = pd.read_csv(
    "https://raw.githubusercontent.com/databricks/LearningSparkV2/master/databricks-datasets/learning-spark-v2/flights/airport-codes-na.txt",
    sep="\t",
)

# Preview: display() renders each frame as its own table instead of the
# plain-text repr of a tuple.
display(delays_pd.head(), airports_pd.head())

(      date  delay  distance origin destination
 0  1011245      6       602    ABE         ATL
 1  1020600     -8       369    ABE         DTW
 2  1021245     -2       602    ABE         ATL
 3  1020605     -4       602    ABE         ATL
 4  1031245     -4       602    ABE         ATL,
          City State Country IATA
 0  Abbotsford    BC  Canada  YXX
 1    Aberdeen    SD     USA  ABR
 2     Abilene    TX     USA  ABI
 3       Akron    OH     USA  CAK
 4     Alamosa    CO     USA  ALS)

In [None]:
# Mirror both pandas frames as Spark DataFrames
delays_df = spark.createDataFrame(data=delays_pd)
airports_df = spark.createDataFrame(data=airports_pd)

# Print the inferred schema of each Spark DataFrame, flights first
for spark_frame in (delays_df, airports_df):
    spark_frame.printSchema()

root
 |-- date: long (nullable = true)
 |-- delay: long (nullable = true)
 |-- distance: long (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)



In [None]:
# Keep flights departing from SEA or SFO, discard rows containing nulls,
# and cap the result at 10000 rows for faster processing
filtered_df = (
    delays_df
    .where(delays_df["origin"].isin(["SEA", "SFO"]))
    .dropna()
    .limit(10000)
)

# Show the first few rows of the reduced dataset
filtered_df.show(n=5)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011425|   92|    1495|   SEA|        ORD|
|1010715|   -7|    2104|   SEA|        JFK|
|1010830|   -5|    1442|   SEA|        DFW|
|1012205|   -3|    2367|   SEA|        MIA|
|1010600|   -3|    1442|   SEA|        DFW|
+-------+-----+--------+------+-----------+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import when

# Binary target: 1 when the flight's delay exceeds 10, otherwise 0
is_delayed = filtered_df["delay"] > 10
labeled_df = filtered_df.withColumn("label", when(is_delayed, 1).otherwise(0))

# Peek at the label next to the columns it was derived from
labeled_df.select(["origin", "destination", "delay", "label"]).show(n=5)

+------+-----------+-----+-----+
|origin|destination|delay|label|
+------+-----------+-----+-----+
|   SEA|        ORD|   92|    1|
|   SEA|        JFK|   -7|    0|
|   SEA|        DFW|   -5|    0|
|   SEA|        MIA|   -3|    0|
|   SEA|        DFW|   -3|    0|
+------+-----------+-----+-----+
only showing top 5 rows



In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Encode origin and destination as category indices.
# handleInvalid="keep" sends airports that were absent when the indexer was
# fitted to an extra bucket; with the default ("error") transforming a split
# that contains an unseen destination aborts with an "Unseen label" exception.
origin_indexer = StringIndexer(
    inputCol="origin", outputCol="origin_index", handleInvalid="keep"
)
dest_indexer = StringIndexer(
    inputCol="destination", outputCol="dest_index", handleInvalid="keep"
)

# One-hot encode the indices. StringIndexer numbers categories by frequency, so
# feeding the raw index to a linear model would treat an arbitrary ranking as a
# numeric magnitude.
encoder = OneHotEncoder(
    inputCols=["origin_index", "dest_index"],
    outputCols=["origin_vec", "dest_vec"],
)

# Use distance as an additional numeric feature when the column is available
input_features = ["origin_vec", "dest_vec"]
if "distance" in labeled_df.columns:
    input_features.append("distance")

# Assemble all inputs into the single vector column the classifier reads
assembler = VectorAssembler(inputCols=input_features, outputCol="features")

# Define model
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Create pipeline: index -> one-hot -> assemble -> classify
pipeline = Pipeline(stages=[origin_indexer, dest_indexer, encoder, assembler, lr])

In [None]:
# Split data.
# labeled_df is built from limit() without an ordering, so Spark does not
# guarantee that re-evaluating it yields the same rows. randomSplit evaluates
# its input once per split; caching pins the rows so the 70/30 splits are drawn
# from one consistent dataset and cannot overlap.
train_data, test_data = labeled_df.cache().randomSplit([0.7, 0.3], seed=42)

# Train model on the 70% split
model = pipeline.fit(train_data)

In [None]:
# Score the held-out rows with the fitted pipeline
predictions = model.transform(test_data)

# Show the inputs alongside the true label and the model's output
preview_cols = ["origin", "destination", "delay", "label", "prediction", "probability"]
predictions.select(*preview_cols).show(n=10)

+------+-----------+-----+-----+----------+--------------------+
|origin|destination|delay|label|prediction|         probability|
+------+-----------+-----+-----+----------+--------------------+
|   SEA|        DEN|    7|    0|       0.0|[0.79761836102784...|
|   SEA|        SLC|    0|    0|       0.0|[0.80880401119766...|
|   SEA|        ANC|   -3|    0|       0.0|[0.79680179611274...|
|   SEA|        DFW|   -3|    0|       0.0|[0.78622802805093...|
|   SFO|        DFW|   -3|    0|       0.0|[0.82196867460839...|
|   SEA|        OAK|   -2|    0|       0.0|[0.81030399833031...|
|   SEA|        ORD|   -3|    0|       0.0|[0.79009604162233...|
|   SEA|        IAH|   13|    1|       0.0|[0.80991377166334...|
|   SEA|        SAN|    2|    0|       0.0|[0.81259286186706...|
|   SEA|        LAX|   -2|    0|       0.0|[0.79103634987026...|
+------+-----------+-----+-----+----------+--------------------+
only showing top 10 rows



In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the predictions by area under the ROC curve
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="label")
auc = evaluator.evaluate(predictions)

print(f"Updated AUC: {auc:.4f}")

# Interpretation: report the first band whose lower bound the AUC strictly
# exceeds, falling back to the "poor" message when it clears neither bound
auc_bands = (
    (0.8, "✅ The model performs well at distinguishing delays."),
    (0.6, "🟡 The model has moderate performance. Consider adding features."),
)
poor_message = "🔴 The model performs poorly. Needs improvement with better features or models."
print(next((message for floor, message in auc_bands if auc > floor), poor_message))

Updated AUC: 0.5380
🔴 The model performs poorly. Needs improvement with better features or models.
