This is a helper notebook for [Project 4](https://docs.google.com/document/d/1-bpfQw_rlfHUfWkQyWwmPYfMuy931SpU/edit?usp=sharing&ouid=115686932443554238260&rtpof=true&sd=true).
It assumes that you have downloaded one of the files (`20241001realtime_zone_csv.zip`) from the NYISO dataset and placed it in a folder named `nyiso` at the top level of your Google Drive (i.e. `My Drive/nyiso`).

In [1]:
from google.colab import drive
import os
import zipfile

In [2]:
# Mount Google Drive at /content/drive so that its files are reachable under '/content/drive/My Drive'.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that holds the downloaded NYISO zip archive(s).
base_path = '/content/drive/My Drive/nyiso'
# Sub-folder of base_path under which each archive is extracted.
unzip_folder = 'unzipped'
# Archive file names to extract and stream; each is looked up directly under base_path.
zip_files = ['20241001realtime_zone_csv.zip']

In [4]:
def unzip_files(zip_files, base_path):
    """Extract each archive in ``zip_files`` into its own sub-folder.

    Every archive is read from ``base_path`` and unpacked into
    ``<base_path>/<unzip_folder>/<archive name with '.zip' removed>``, where
    ``unzip_folder`` is the module-level setting.

    Args:
        zip_files: Archive file names located directly under ``base_path``.
        base_path: Directory that contains the archives; the extracted
            files are written beneath it as well.
    """
    for archive_name in zip_files:
        # Name the output folder after the archive, minus the '.zip' text.
        folder_name = archive_name.replace('.zip', '')
        target_dir = os.path.join(base_path, unzip_folder, folder_name)
        source_path = os.path.join(base_path, archive_name)

        with zipfile.ZipFile(source_path, 'r') as archive:
            archive.extractall(target_dir)

# Extract every archive listed in zip_files into <base_path>/<unzip_folder>/<archive name>.
unzip_files(zip_files, base_path)

Process every CSV in each unzipped folder (one folder per downloaded monthly archive) and publish the records to a socket, sending all records that share a timestamp together, one timestamp per second.

In [5]:
import socket
import random
import json
from datetime import datetime
import time
import threading
import pandas as pd

# Address the replay server listens on. The Spark socket source later in the
# notebook connects to this same host and port.
host = "127.0.0.1"
port = 9999

server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# Allow the port to be re-bound immediately after a kernel restart; without
# this, bind() can fail with "Address already in use" while connections from
# the previous run are still in TIME_WAIT.
server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_socket.bind((host, port))
# Backlog of 1: at most one not-yet-accepted connection is queued.
server_socket.listen(1)


def publish_csv_files_in_directory(directory_path, client_socket):
    """Replay every CSV in a directory over a socket, one timestamp per second.

    Each row is sent as one newline-terminated JSON object. Consecutive rows
    that share the same ``'Time Stamp'`` value are sent back to back; a
    one-second pause follows each such group, so every one-second window
    carries exactly the records of one timestamp.

    Args:
        directory_path: Directory to scan; only entries whose names end in
            ``.csv`` are read. Each CSV must have a ``'Time Stamp'`` column.
        client_socket: Connected socket-like object providing ``sendall``.

    Raises:
        KeyError: If a CSV has no ``'Time Stamp'`` column.
        OSError: If the directory cannot be listed or the send fails
            (for example because the client disconnected).
    """
    # os.listdir returns entries in arbitrary order; sort so that multiple
    # files are always replayed in the same (name) order.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(directory_path, filename)
        print(f"Processing {filename}...")
        df = pd.read_csv(file_path)

        previous_timestamp = None
        for _, row in df.iterrows():
            current_timestamp = row['Time Stamp']

            # Pause *before* the first row of a new timestamp, so that row is
            # not delivered in the same one-second window as the previous
            # timestamp's records.
            if previous_timestamp is not None and current_timestamp != previous_timestamp:
                time.sleep(1)

            message_str = json.dumps(row.to_dict()) + "\n"
            client_socket.sendall(message_str.encode('utf-8'))
            previous_timestamp = current_timestamp

        # Close out the final group with its own pause so the cadence also
        # holds across file boundaries (skipped when the file had no rows).
        if previous_timestamp is not None:
            time.sleep(1)

def publish_message(client_socket):
    """Replay every extracted archive to one client, then close the connection.

    For each name in the module-level ``zip_files`` list, the matching
    extraction folder under ``<base_path>/<unzip_folder>/`` is streamed with
    ``publish_csv_files_in_directory``.

    Args:
        client_socket: Connected client socket. It is always closed before
            this function returns, whether the replay finished, the client
            disconnected, or another error occurred.
    """
    try:
        for zip_file in zip_files:
            extract_dir = os.path.join(base_path, unzip_folder, f"{zip_file.replace('.zip', '')}")
            publish_csv_files_in_directory(extract_dir, client_socket)
    except ConnectionError as exc:
        # The client went away mid-replay (broken pipe / connection reset).
        # That is an expected way for a replay to end, so report it briefly
        # instead of letting the thread die with a traceback.
        print(f"Client disconnected before the replay finished: {exc}")
    finally:
        # Release the socket so the descriptor is not leaked and the reader
        # sees end-of-stream once all data has been sent.
        client_socket.close()

def start_socket_server():
    """Accept clients forever, replaying the data to each on its own thread.

    Uses the module-level ``server_socket``. Every accepted connection is
    handed to a daemon thread running ``publish_message`` so that the accept
    loop is not blocked while a replay is in progress. The loop has no exit
    condition of its own.
    """
    print(f"Server started on {host}:{port}")
    while True:
        connection, peer_address = server_socket.accept()
        print(f"Connection established with {peer_address}")

        worker = threading.Thread(
            target=publish_message,
            args=(connection,),
            daemon=True,
        )
        worker.start()

# Run the accept loop on a daemon thread so that this cell returns immediately
# and the thread does not keep the process alive at shutdown.
socket_server_thread = threading.Thread(target=start_socket_server, daemon=True)
socket_server_thread.start()

print("Socket server is running in the background.")


Server started on 127.0.0.1:9999Socket server is running in the background.



In [6]:
def print_stream(streaming_df, output_mode="append"):
    """Print every micro-batch of a streaming DataFrame until it terminates.

    Starts a streaming query that calls ``show()`` on each micro-batch, then
    blocks until the query terminates or the call is interrupted. The query
    is always stopped before this function returns or raises.

    Args:
        streaming_df: Streaming DataFrame to display.
        output_mode: Output mode passed to the stream writer
            (``"append"`` by default).
    """

    def process_batch(df, epoch_id):
        # epoch_id is required by the foreachBatch callback signature but is
        # not needed for printing.
        df.show()

    query = (
        streaming_df.writeStream
        .foreachBatch(process_batch)
        .outputMode(output_mode)
        .start()
    )

    try:
        query.awaitTermination()
    except KeyboardInterrupt:
        # A manual interrupt is the normal way to end the demo; swallow it.
        pass
    finally:
        # Stop in `finally`, not only on KeyboardInterrupt: an interrupt can
        # surface from awaitTermination() as a Py4J error instead (see the
        # saved traceback in this notebook), which previously left the query
        # running in the background.
        query.stop()

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, to_timestamp
from pyspark.sql.types import StringType, StructType, StructField, DoubleType

# Local-mode Spark session for the streaming demo.
spark = (
    SparkSession.builder
    .appName("NYISOStreaming")
    .master("local")
    .getOrCreate()
)

# Read newline-delimited text from the replay server started earlier in this
# notebook; each received line becomes one row in a single `value` column.
streaming_df = (
    spark.readStream
    .format("socket")
    .option("host", "127.0.0.1")
    .option("port", 9999)
    .load()
)

# Blocks while batches are printed, until the query terminates or the cell is interrupted.
print_stream(streaming_df)

Connection established with ('127.0.0.1', 47308)
Processing 20241001realtime_zone.csv...
+-----+
|value|
+-----+
+-----+

+--------------------+
|               value|
+--------------------+
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
+--------------------+
only showing top 20 rows

+--------------------+
|               value|
+--------------------+
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|{"Time Stamp": "1...|
|

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=55>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in sen

Py4JError: An error occurred while calling o35.awaitTermination