In [1]:
!nodetool status
# run after docker compose up -d
# might take ~30 seconds

Datacenter: datacenter1
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
--  Address     Load       Tokens  Owns (effective)  Host ID                               Rack 
UN  172.21.0.4  99.39 KiB  16      77.2%             fc942320-2bfa-4c8b-8800-f74ed0f2b9b7  rack1
UN  172.21.0.2  70.26 KiB  16      58.8%             908a4bc2-9776-4515-8614-dbf57cdeb73b  rack1
UN  172.21.0.3  70.24 KiB  16      64.0%             2fc9abf7-4ae3-41ed-9b3d-7a9e3638c103  rack1



In [2]:
from cassandra.cluster import Cluster
# Connect to the Cassandra cluster
cluster = Cluster(['p6-db-1', 'p6-db-2', 'p6-db-3'])
cass = cluster.connect()

In [3]:
cass.execute("DROP KEYSPACE IF EXISTS weather")

<cassandra.cluster.ResultSet at 0x7f52bbaea7a0>

In [4]:
cass.execute("""
CREATE KEYSPACE weather
WITH REPLICATION = { 
   'class' : 'SimpleStrategy', 
   'replication_factor' : 3 
};
""")

<cassandra.cluster.ResultSet at 0x7f5290644c10>

In [5]:
cass.execute("USE weather")

<cassandra.cluster.ResultSet at 0x7f52bba57ca0>

In [6]:
cass.execute("CREATE TYPE station_record (tmin int, tmax int)")

<cassandra.cluster.ResultSet at 0x7f529176a6e0>

In [7]:
cass.execute("""
create table stations(
    id TEXT,
    name TEXT STATIC,
    date DATE,
    record weather.station_record,
    PRIMARY KEY ((id), date) 
) WITH CLUSTERING ORDER BY (date ASC)
""")

<cassandra.cluster.ResultSet at 0x7f52bba57df0>

In [10]:
#q1
#What is the Schema of stations?
cass.execute("describe table weather.stations").one().create_statement

"CREATE TABLE weather.stations (\n    id text,\n    date date,\n    name text static,\n    record station_record,\n    PRIMARY KEY (id, date)\n) WITH CLUSTERING ORDER BY (date ASC)\n    AND additional_write_policy = '99p'\n    AND bloom_filter_fp_chance = 0.01\n    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n    AND cdc = false\n    AND comment = ''\n    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}\n    AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n    AND memtable = 'default'\n    AND crc_check_chance = 1.0\n    AND default_time_to_live = 0\n    AND extensions = {}\n    AND gc_grace_seconds = 864000\n    AND max_index_interval = 2048\n    AND memtable_flush_period_in_ms = 0\n    AND min_index_interval = 128\n    AND read_repair = 'BLOCKING'\n    AND speculative_retry = '99p';"

In [11]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder
         .appName("p6")
         .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.0')
         .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
         .getOrCreate())
# first build spark session to read the input file, then dump the file into cassandra table

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c87fc3f3-d523-435a-b595-25b42cd4922a;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector_2.12;3.4.0 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.4.0 in central
	found com.datastax.oss#java-driver-core-shaded;4.13.0 in central
	found com.datastax.oss#native-protocol;1.5.0 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found com.github.stephenc.jcip#jcip-annotations;1.0-1 in central
	found com.gith

In [13]:
df = spark.read.text("ghcnd-stations.txt")

In [15]:
from pyspark.sql.functions import col, expr, rtrim

In [16]:
df2 = (df.withColumn("ID", expr("substring(value, 1, 11)"))
       .withColumn("STATE", expr("substring(value, 39, 2)"))
       .withColumn("NAME", rtrim(expr("substring(value, 42, 30)")))
       .drop("value"))
# refer to the documentation of "ghcnd-stations.txt
# trailing spaces after name need to be trimmed

In [18]:
filtered_df = df2.where(df2.STATE == "WI")

In [19]:
rows = filtered_df.collect()
len(rows) # 1313 rows with STATE == "WI"

1313

In [20]:
for row in rows:
    cass.execute("""
        INSERT INTO stations (ID, NAME)
        VALUES (%s, %s)
        """,(row.ID, row.NAME))
# weather_list is a list derived from spark
# this step is essentially moving data from spark to cassandra

In [21]:
cass.execute("SELECT COUNT(*) FROM weather.stations").one()[0]

1313

In [22]:
#q2
#What is the name corresponding to station ID USW00014837?
cass.execute("""
    SELECT NAME 
    FROM weather.stations 
    WHERE ID = 'USW00014837'
""").one()[0]

'MADISON DANE CO RGNL AP'

In [23]:
#q3
#what is the token for the USC00470273 station?
token_0273 = cass.execute("""
    SELECT TOKEN(ID)
    FROM weather.stations 
    WHERE ID = 'USC00470273'
""").one()[0]
token_0273

-9014250178872933741

In [24]:
import subprocess
# Use check_output to run nodetool ring
output = subprocess.check_output(['nodetool', 'ring'])

In [25]:
output_str = output.decode('utf-8')
lines = output_str.split('\n')
tokens = []
for line in lines:
    parts = line.split()
    if len(parts) > 1 and parts[-1].lstrip('-').isdigit(): # without lstrip, token with leading "-" will be removed
        tokens.append(int(parts[-1]))

In [26]:
#q4
#what is the first vnode token in the ring following the token for USC00470273?
for i, node in enumerate(tokens):
    if token_0273 > tokens[i] and i + 1 == len(tokens): # wrapping around
        ans = tokens[0]
        break
    elif token_0273 < tokens[i+1]:
        ans = tokens[i+1]
        break
ans

-8629239957855773006

In [27]:
import os
import shutil

folder_path = 'records.parquet'

# Check if the folder exists
if os.path.exists(folder_path) and os.path.isdir(folder_path):
    # Remove the folder
    shutil.rmtree(folder_path)

!unzip records.zip

Archive:  records.zip
replace records.parquet/part-00000-574ab704-2431-4c8b-9d88-6c635a467b99-c000.snappy.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [28]:
df_record = spark.read.parquet("records.parquet")

                                                                                

In [29]:
from pyspark.sql import functions as F
df2_record = (df_record.groupBy("station", "date")
                      .pivot("element", ["TMAX","TMIN"])
                      .agg(F.first("value"))
                      .orderBy("station"))
# Use Spark to load this and re-arrange the data so that there is 
# (a) one row per station/date combination, and (b) tmin and tmax columns
# Hint: The pivot function in PySpark might be useful to rearrange the data

In [30]:
df2_record.show(10)

                                                                                

+-----------+--------+------+------+
|    station|    date|  TMAX|  TMIN|
+-----------+--------+------+------+
|USR0000WDDG|20220806| 317.0| 217.0|
|USR0000WDDG|20220924| 161.0|  94.0|
|USR0000WDDG|20220628| 283.0| 161.0|
|USR0000WDDG|20220130| -33.0|-117.0|
|USR0000WDDG|20220919| 278.0| 139.0|
|USR0000WDDG|20220414|  50.0| -17.0|
|USR0000WDDG|20220629| 306.0| 150.0|
|USR0000WDDG|20221114|  17.0| -61.0|
|USR0000WDDG|20220712| 289.0| 156.0|
|USR0000WDDG|20220202|-106.0|-150.0|
+-----------+--------+------+------+
only showing top 10 rows



In [None]:
# cass.execute("""
# create table stations(
#     id TEXT,
#     name TEXT STATIC,
#     date DATE,
#     record weather.station_record,
#     PRIMARY KEY ((id), date) 
# ) WITH CLUSTERING ORDER BY (date ASC)
# """)

In [31]:
import grpc
import station_pb2 
import station_pb2_grpc 


rows = df2_record.collect()
channel = grpc.insecure_channel(f"localhost:5440") 
stub = station_pb2_grpc.StationStub(channel)

for row in rows:
    year = row.date[:4] # CQL requires that you insert date data in yyyy-mm-dd format
    month = row.date[4:6]
    day = row.date[6:]    
    request = station_pb2.RecordTempsRequest(
        station=row.station,
        date=str(year + "-" + month + "-" + day),
        tmin=int(row.TMIN),
        tmax=int(row.TMAX)
    )
    try:
        response = stub.RecordTemps(request)
        # print(response)
    except grpc.RpcError as e:
        print("gRPC call failed: {}".format(e))

In [32]:
result = cass.execute("""
SELECT *
FROM stations
WHERE id = 'USW00014837'
LIMIT 5
""")
for row in result:
    print(row)

Row(id='USW00014837', date=Date(18993), name='MADISON DANE CO RGNL AP', record=station_record(tmin=-99, tmax=-32))
Row(id='USW00014837', date=Date(18994), name='MADISON DANE CO RGNL AP', record=station_record(tmin=-166, tmax=-82))
Row(id='USW00014837', date=Date(18995), name='MADISON DANE CO RGNL AP', record=station_record(tmin=-177, tmax=-66))
Row(id='USW00014837', date=Date(18996), name='MADISON DANE CO RGNL AP', record=station_record(tmin=-88, tmax=-5))
Row(id='USW00014837', date=Date(18997), name='MADISON DANE CO RGNL AP', record=station_record(tmin=-116, tmax=-5))


In [33]:
#q5
#what is the max temperature ever seen for station USW00014837?
request = station_pb2.StationMaxRequest(station='USW00014837')
response = stub.StationMax(request)
response

tmax: 356

In [34]:
# already enabled CassandraSparkExtensions when creating your Spark session, so can create a Spark DataFrame corresponding to a Cassandra table like this
df = (spark.read.format("org.apache.spark.sql.cassandra")
      .option("spark.cassandra.connection.host", "p6-db-1,p6-db-2,p6-db-3")
      .option("keyspace", "weather")
      .option("table", "stations")
      .load())

In [35]:
# Create a temporary view in Spark named stations that corresponds to the stations table in Cassandra.
df.createOrReplaceTempView("stations")

In [36]:
#q6
#what tables/views are available in the Spark catalog?
spark.sql("DROP TABLE IF EXISTS id_state_name_table")
spark.catalog.listTables()

[Table(name='stations', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [37]:
#q7
#what is the average difference between tmax and tmin, for each of the four stations that have temperature records?
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

def get_tmax(record):
    return record.tmax if record is not None else None

def get_tmin(record):
    return record.tmin if record is not None else None

get_tmax_udf = udf(get_tmax, IntegerType())
get_tmin_udf = udf(get_tmin, IntegerType())

spark.udf.register("get_tmax", get_tmax_udf)
spark.udf.register("get_tmin", get_tmin_udf)

diff_df = spark.sql("""
SELECT id, AVG(get_tmax(record) - get_tmin(record)) as diff
FROM stations
WHERE id = "USW00014839"
OR id = "USR0000WDDG"
OR id = "USW00014837"
OR id = "USW00014898"
GROUP BY id
""")
rows = diff_df.collect()
result_dict = {row['id']: row['diff'] for row in rows}
result_dict

                                                                                

{'USW00014839': 89.6986301369863,
 'USR0000WDDG': 102.06849315068493,
 'USW00014837': 105.62739726027397,
 'USW00014898': 102.93698630136986}

23/11/20 23:57:49 WARN ChannelPool: [s0|p6-db-2/172.21.0.3:9042]  Error while opening new channel (ConnectionInitException: [s0|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=688ad012-73fc-4359-9b5a-1b1c0b87f2be, APPLICATION_NAME=Spark-Cassandra-Connector-local-1700521939242}): failed to send request (java.nio.channels.NotYetConnectedException))
23/11/20 23:57:56 WARN ChannelPool: [s0|p6-db-2/172.21.0.3:9042]  Error while opening new channel (ConnectionInitException: [s0|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=688ad012-73fc-4359-9b5a-1b1c0b87f2be, APPLICATION_NAME=Spark-Cassandra-Connector-local-1700521939242}): failed to send request (java.nio.channels.NotYetConnectedException))


In [39]:
#q8
#ran a docker command to kill the p6-db-2 container
#what does nodetool status output?
! nodetool status

Datacenter: datacenter1
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
--  Address     Load       Tokens  Owns (effective)  Host ID                               Rack 
UN  172.21.0.4  87.72 KiB  16      100.0%            fc942320-2bfa-4c8b-8800-f74ed0f2b9b7  rack1
UN  172.21.0.2  87.74 KiB  16      100.0%            908a4bc2-9776-4515-8614-dbf57cdeb73b  rack1
DN  172.21.0.3  87.73 KiB  16      100.0%            2fc9abf7-4ae3-41ed-9b3d-7a9e3638c103  rack1



23/11/21 00:00:14 WARN ChannelPool: [s0|p6-db-2/172.21.0.3:9042]  Error while opening new channel (ConnectionInitException: [s0|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=688ad012-73fc-4359-9b5a-1b1c0b87f2be, APPLICATION_NAME=Spark-Cassandra-Connector-local-1700521939242}): failed to send request (com.datastax.oss.driver.shaded.netty.channel.StacklessClosedChannelException))


In [44]:
#q9
#if you make a StationMax RPC call, what does the error field contain in StationMaxReply reply?
request = station_pb2.StationMaxRequest(station='USW00014837')
response = stub.StationMax(request)
response

error: "need 3 replicas, but only have 2"

23/11/21 00:19:14 WARN ChannelPool: [s0|p6-db-2/172.21.0.3:9042]  Error while opening new channel (ConnectionInitException: [s0|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=688ad012-73fc-4359-9b5a-1b1c0b87f2be, APPLICATION_NAME=Spark-Cassandra-Connector-local-1700521939242}): failed to send request (com.datastax.oss.driver.shaded.netty.channel.StacklessClosedChannelException))


In [45]:
#q10
#if you make a RecordTempsRequest RPC call, what does error contain in the RecordTempsReply reply?
request = station_pb2.RecordTempsRequest(
    station="UWMADISON",
    date=str("2023-4-20"),
    tmin=int(0),
    tmax=int(100)
)

response = stub.RecordTemps(request)
response



23/11/21 00:20:08 WARN ChannelPool: [s0|p6-db-2/172.21.0.3:9042]  Error while opening new channel (ConnectionInitException: [s0|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=688ad012-73fc-4359-9b5a-1b1c0b87f2be, APPLICATION_NAME=Spark-Cassandra-Connector-local-1700521939242}): failed to send request (com.datastax.oss.driver.shaded.netty.channel.StacklessClosedChannelException))
23/11/21 00:21:00 WARN ChannelPool: [s0|p6-db-2/172.21.0.3:9042]  Error while opening new channel (ConnectionInitException: [s0|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=688ad012-73fc-4359-9b5a-1b1c0b87f2be, APPLICATION_NAME=Spark-Cassandra-Connector-local-1700521939242}): failed to send request (com.datastax.oss.driver.shaded.netty.channel.StacklessClose