In [2]:
pip install cassandra-driver

Note: you may need to restart the kernel to use updated packages.


In [4]:
import csv
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Connect to Cassandra
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a secrets store before using this outside a demo setup.
auth_provider = PlainTextAuthProvider(username='admin', password='admin')
cluster = Cluster(['cassandra'], auth_provider=auth_provider)
session = cluster.connect()

# Create the keyspace BEFORE selecting it: set_keyspace() fails when the
# keyspace does not exist yet. IF NOT EXISTS makes the statement idempotent,
# so re-running this cell against an existing keyspace does not raise.
create_database_cql = "CREATE KEYSPACE IF NOT EXISTS demo WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};"
session.execute(create_database_cql)

# Select your keyspace (all unqualified table names below resolve inside it)
session.set_keyspace('demo')

# Define the CSV file location
csv_file_path = 'boston_air_traffic_hourly2.csv'

# Create the raw table (if not exists). One row per (icao24, last_contact);
# change data capture is enabled on this table via `cdc=true`.
create_table_cql = """
CREATE TABLE IF NOT EXISTS flight_data (
    icao24 text,
    callsign text,
    origin_country text,
    time_position bigint,
    last_contact bigint,
    longitude double,
    latitude double,
    baro_altitude double,
    on_ground boolean,
    velocity double,
    true_track double,
    vertical_rate double,
    sensors text,
    geo_altitude double,
    squawk text,
    spi boolean,
    position_source int,
    PRIMARY KEY (icao24, last_contact)
) WITH cdc=true;
"""
session.execute(create_table_cql)

# Create the result table (if not exists), keyed by callsign.
# This script only creates it; nothing below writes to it.
create_result_table_cql="""
CREATE TABLE IF NOT EXISTS latest_flight_data (
    callsign TEXT PRIMARY KEY,
    longitude DOUBLE,
    latitude DOUBLE,
    on_ground BOOLEAN,
    squawk TEXT
);
"""
session.execute(create_result_table_cql)

# CQL for inserting one flight_data row. Uses `?` bind markers because the
# statement is prepared once and then bound per row.
_INSERT_FLIGHT_DATA_CQL = """
    INSERT INTO flight_data (icao24, callsign, origin_country, time_position, last_contact, longitude, latitude, baro_altitude, on_ground, velocity, true_track, vertical_rate, sensors, geo_altitude, squawk, spi, position_source)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """

# Lazily-created prepared statement; reset to None whenever this cell is
# re-run, so it is always prepared against the current `session`.
_insert_flight_data_stmt = None


# Function to insert data into the table
def insert_data(row):
    """Insert one record into the ``flight_data`` table.

    The INSERT is prepared on first use and reused afterwards, so the
    server parses the query once instead of once per row.

    Args:
        row: Sequence of 17 values in the column order of the INSERT
            statement (icao24 ... position_source). Values must already
            be converted to the Python types matching the column types;
            ``None`` is written as null.
    """
    global _insert_flight_data_stmt
    if _insert_flight_data_stmt is None:
        _insert_flight_data_stmt = session.prepare(_INSERT_FLIGHT_DATA_CQL)
    session.execute(_insert_flight_data_stmt, row)

# Column positions (0-based, in CSV order) grouped by the type they must be
# converted to before insertion. Columns not listed here stay as strings.
_INT_COLUMNS = (3, 4, 16)
_FLOAT_COLUMNS = (5, 6, 7, 9, 10, 11, 13)
_BOOL_COLUMNS = (8, 15)


def _convert_row(row):
    """Return a copy of a raw CSV row with typed values.

    Empty numeric cells become ``None``; boolean cells are ``True`` only
    when the text equals "true" case-insensitively (an empty cell is
    therefore ``False``).
    """
    converted = list(row)
    for idx in _INT_COLUMNS:
        converted[idx] = int(row[idx]) if row[idx] else None
    for idx in _FLOAT_COLUMNS:
        converted[idx] = float(row[idx]) if row[idx] else None
    for idx in _BOOL_COLUMNS:
        converted[idx] = row[idx].lower() == 'true'
    return converted


# Read CSV and insert each row into Cassandra
try:
    with open(csv_file_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header row (tolerates an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to insert
                continue
            insert_data(_convert_row(row))
finally:
    # Close the connection even if reading or inserting failed part-way
    cluster.shutdown()
