In [None]:
pip install numpy pandas SQLAlchemy psycopg2-binary

In [None]:
# Import related libraries
import numpy as np
import pandas as pd
import os
from sqlalchemy import create_engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Cassandra/ScyllaDB connection settings are taken from the environment so that
# real credentials never have to be committed with the notebook. The fallback
# values are the local demo settings, so the notebook still runs unchanged when
# none of the variables are set.
cassandra_host = os.environ.get("CASSANDRA_HOST", "scylladb")
cassandra_port = os.environ.get("CASSANDRA_PORT", "9042")
cassandra_username = os.environ.get("CASSANDRA_USERNAME", "cassandra")
cassandra_password = os.environ.get("CASSANDRA_PASSWORD", "cassandra")

# Build (or reuse) the local Spark session. `spark.jars.packages` pulls in the
# PostgreSQL JDBC driver and the Spark Cassandra connector.
spark = (
    SparkSession.builder
    .appName("PySpark PostgreSQL and Cassandra Example")
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .config(
        "spark.jars.packages",
        "org.postgresql:postgresql:42.7.2,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0",
    )
    .config("spark.cassandra.connection.host", cassandra_host)
    .config("spark.cassandra.connection.port", cassandra_port)
    .config("spark.cassandra.auth.username", cassandra_username)
    .config("spark.cassandra.auth.password", cassandra_password)
    .getOrCreate()
)

# PostgreSQL connection settings come from the environment so the password is
# not tied to the notebook source. The fallbacks are the local demo values, so
# behaviour is unchanged when the variables are not set.
postgres_url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://postgres:5432/demo")
postgres_user = os.environ.get("POSTGRES_USER", "postgres")
postgres_password = os.environ.get("POSTGRES_PASSWORD", "1234qwer")

# Load the `bos_air_traffic` table over JDBC into a Spark DataFrame.
df_postgres = (
    spark.read
    .format("jdbc")
    .option("url", postgres_url)
    .option("dbtable", "bos_air_traffic")
    .option("user", postgres_user)
    .option("password", postgres_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

# Preview the loaded rows.
df_postgres.show()

In [None]:
# Load every CSV file under ./sensorInput into one DataFrame. The first line of
# each file is treated as the header and column types are inferred by Spark.
csv_reader = spark.read.format("csv").options(header=True, inferSchema="true")
df_csv = csv_reader.load("./sensorInput")

# Preview the loaded rows.
df_csv.show()

In [None]:

from pyspark.sql.functions import col, row_number, length
from pyspark.sql.window import Window
from pyspark.sql.functions import lit

# Uses df_postgres and df_csv loaded by the earlier cells.

# Stack the PostgreSQL rows and the CSV rows into a single DataFrame, matching
# columns by name (both inputs are expected to share the same column names).
df_combined = df_postgres.unionByName(df_csv)

# Within each callsign, order rows from the most recent last_contact to the oldest.
windowSpec = Window.partitionBy("callsign").orderBy(col("last_contact").desc())

# Number the rows inside each callsign partition; row_num == 1 is the newest one.
df_combined_with_row_number = df_combined.withColumn(
    "row_num", row_number().over(windowSpec)
)

# Row-level predicates: newest record per callsign, and a usable (non-null,
# non-empty) callsign.
is_newest_record = col("row_num") == 1
has_callsign = col("callsign").isNotNull() & (length(col("callsign")) > 0)

# Columns kept for the result.
latest_columns = ["callsign", "longitude", "latitude", "on_ground", "squawk"]

df_latest = (
    df_combined_with_row_number
    .where(is_newest_record & has_callsign)
    .select(*latest_columns)
)

# Display the result to verify the content.
df_latest.show()




In [None]:
# Append the deduplicated rows to the `latest_flight_data` table in the `demo`
# keyspace through the Spark Cassandra connector.
cassandra_writer = (
    df_latest.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .options(keyspace="demo", table="latest_flight_data")
)
cassandra_writer.save()