# Project Draft

#### Requirements
- ```Python 3.9.18``` (conda env)
- ```pip freeze > requirements.txt```
- ```conda env export > environment.yml```

In [73]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
#from pyspark.sql.functions import *
from pyspark.sql.functions import window, sum as _sum

In [74]:
spark = SparkSession.builder.appName("StreamingFromCSV").getOrCreate()

# load df from csv
df = spark.read.csv("streaming_data_dir/drivedata-small.csv", header=True, inferSchema=True)

                                                                                

In [75]:
# show only rows where failure is not 0
df.where(df.failure != 0).show()




+-------------------+-------------+--------------------+-------+--------+------------------+-------------------------+---------------+-------------------+---------------------------+------------------+------------------------+-----------------+--------------------+---------------------+------------------------+--------------------------------+-------------------------+----------------------------------+--------------------+--------------------+----------------------------+-----------------------+----------------------------+----------------------------+------------------------+---------------------------+----------------------------+---------------------------+--------------------------+-------------------------+--------------------------+---------------+-----------------+---------------------+-----------------+----------------------+-----------------------+--------------------+
|               date|serial_number|               model|failure|vault_id|s1_read_error_rate|s2_throughput_pe

                                                                                

In [76]:
# show the column names and types
df.printSchema()


root
 |-- date: timestamp (nullable = true)
 |-- serial_number: string (nullable = true)
 |-- model: string (nullable = true)
 |-- failure: integer (nullable = true)
 |-- vault_id: integer (nullable = true)
 |-- s1_read_error_rate: integer (nullable = true)
 |-- s2_throughput_performance: integer (nullable = true)
 |-- s3_spin_up_time: integer (nullable = true)
 |-- s4_start_stop_count: integer (nullable = true)
 |-- s5_reallocated_sector_count: integer (nullable = true)
 |-- s7_seek_error_rate: long (nullable = true)
 |-- s8_seek_time_performance: integer (nullable = true)
 |-- s9_power_on_hours: integer (nullable = true)
 |-- s10_spin_retry_count: integer (nullable = true)
 |-- s12_power_cycle_count: integer (nullable = true)
 |-- s173_wear_leveling_count: long (nullable = true)
 |-- s174_unexpected_power_loss_count: integer (nullable = true)
 |-- s183_sata_downshift_count: integer (nullable = true)
 |-- s187_reported_uncorrectable_errors: integer (nullable = true)
 |-- s188_command_

In [77]:
# load cluster centers from csv
cluster_centers = spark.read.csv("cluster_data_dir/clusters.csv", header=True, inferSchema=True)

In [78]:
# show the column names and types
cluster_centers.printSchema()

root
 |-- cluster: integer (nullable = true)
 |-- s1_read_error_rate: double (nullable = true)
 |-- s2_throughput_performance: double (nullable = true)
 |-- s3_spin_up_time: double (nullable = true)
 |-- s4_start_stop_count: double (nullable = true)
 |-- s5_reallocated_sector_count: double (nullable = true)
 |-- s7_seek_error_rate: double (nullable = true)
 |-- s8_seek_time_performance: double (nullable = true)
 |-- s9_power_on_hours: double (nullable = true)
 |-- s10_spin_retry_count: double (nullable = true)
 |-- s12_power_cycle_count: double (nullable = true)
 |-- s173_wear_leveling_count: double (nullable = true)
 |-- s174_unexpected_power_loss_count: double (nullable = true)
 |-- s183_sata_downshift_count: double (nullable = true)
 |-- s187_reported_uncorrectable_errors: double (nullable = true)
 |-- s188_command_timeout: double (nullable = true)
 |-- s189_high_fly_writes: double (nullable = true)
 |-- s190_airflow_temperature_cel: double (nullable = true)
 |-- s191_g_sense_error_

In [79]:
# drop _c35: string (nullable = true) from cluster_centers
cluster_centers = cluster_centers.drop("_c35")

_c35: string (nullable = true) is just an empty cell

We use date and vault ID to cluster all features s1 through s242 and re-write the centers

In [80]:

schema = StructType([
    StructField("date", TimestampType(), True),
    StructField("serial_number", StringType(), True),
    StructField("model", StringType(), True),
    StructField("failure", IntegerType(), True),
    StructField("vault_id", IntegerType(), True),
    StructField("s1_read_error_rate", IntegerType(), True),
    StructField("s2_throughput_performance", IntegerType(), True),
    StructField("s3_spin_up_time", IntegerType(), True),
    StructField("s4_start_stop_count", IntegerType(), True),
    StructField("s5_reallocated_sector_count", IntegerType(), True),
    StructField("s7_seek_error_rate", IntegerType(), True),
    StructField("s8_seek_time_performance", IntegerType(), True),
    StructField("s9_power_on_hours", IntegerType(), True),
    StructField("s10_spin_retry_count", IntegerType(), True),
    StructField("s12_power_cycle_count", IntegerType(), True),
    StructField("s173_wear_leveling_count", IntegerType(), True),
    StructField("s174_unexpected_power_loss_count", IntegerType(), True),
    StructField("s183_sata_downshift_count", IntegerType(), True),
    StructField("s187_reported_uncorrectable_errors", IntegerType(), True),
    StructField("s188_command_timeout", IntegerType(), True),
    StructField("s189_high_fly_writes", IntegerType(), True),
    StructField("s190_airflow_temperature_cel", IntegerType(), True),
    StructField("s191_g_sense_error_rate", IntegerType(), True),
    StructField("s192_power_off_retract_count", IntegerType(), True),
    StructField("s193_load_unload_cycle_count", IntegerType(), True),
    StructField("s194_temperature_celsius", IntegerType(), True),
    StructField("s195_hardware_ecc_recovered", IntegerType(), True),
    StructField("s196_reallocated_event_count", IntegerType(), True),
    StructField("s197_current_pending_sector", IntegerType(), True),
    StructField("s198_offline_uncorrectable", IntegerType(), True),
    StructField("s199_udma_crc_error_count", IntegerType(), True),
    StructField("s200_multi_zone_error_rate", IntegerType(), True),
    StructField("s220_disk_shift", IntegerType(), True),
    StructField("s222_loaded_hours", IntegerType(), True),
    StructField("s223_load_retry_count", IntegerType(), True),
    StructField("s226_load_in_time", IntegerType(), True),
    StructField("s240_head_flying_hours", IntegerType(), True),
    StructField("s241_total_lbas_written", IntegerType(), True),
    StructField("s242_total_lbas_read", IntegerType(), True)
])

#read from csv file
#drivedata = spark.readStream.csv("streaming_data_dir/drivedata-small.csv", schema=schema, header=True)

# Read the CSV files as a data stream
streamingData = spark.readStream.schema(schema).option("maxFilesPerTrigger", 1).csv("streaming_data_dir")
# print true if both are true
print((streamingData.schema == schema)==( streamingData.isStreaming == True))

True


### First Query

In [15]:

# Apply a window function to the streaming data
windowedData = streamingData \
    .withWatermark("date", "31 days") \
    .groupBy(
        streamingData.vault_id,
        window(streamingData.date, "30 days", "1 day"),
        streamingData.model
    ) \
    .agg(_sum("failure").alias("total_failures"))

# Select only the date and vault_id fields
selectedData = windowedData.select("window.start", "vault_id", "total_failures")
filteredData = selectedData.filter(selectedData.total_failures > 0)

# Write the windowed data stream out to a memory sink for testing
query = filteredData \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .queryName("filteredData") \
    .option("numRows", 50) \
    .option("truncate", "false") \
    .start()



24/01/26 16:06:10 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/p0/qbnst5tj08g1z35zyllsm5vc0000gn/T/temporary-8e16d53e-195d-44ee-86c4-c9b438f75afc. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/26 16:06:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.




In [None]:
query.awaitTermination()

In [69]:
query.stop()

## Second query

In [82]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

# Load initial center points from a CSV file
centers_df = spark.read.csv("cluster_data_dir/clusters.csv", header=True, inferSchema=True)

# drop _c35: string (nullable = true) from cluster_centers
centers_df = centers_df.drop("_c35")

# Convert the DataFrame to a list of DenseVector
centers = [Vectors.dense(row) for row in centers_df.collect()]
# Define the model
model = StreamingKMeans(k=len(centers), decayFactor=1.0)


In [84]:
# create list of weights for each center point
weights = [1.0 for center in centers]


model.setInitialCenters(centers, weights)


<pyspark.mllib.clustering.StreamingKMeans at 0x7fcfc47dca90>

In [89]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the streaming data
streamingData = spark.readStream.schema(schema).csv("streaming_data_dir")# Define a query
query = streamingData.writeStream.outputMode("append").format("console").start()

# Wait for the query to start
query.awaitTermination()

# Now you can convert the streaming DataFrame to an RDD
rdd = streamingData.rdd
# Create a SparkContext
sc = SparkContext.getOrCreate()

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sc, 1)

# Assuming that your streaming data is in the form of DataFrame named streamingData
# Convert DataFrame to RDD
rdd = streamingData.rdd

# Convert RDD to DStream
queue = [rdd]
stream = ssc.queueStream(queue)

# Now you can train the model on the DStream
model.trainOn(stream)

24/01/26 17:35:37 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/p0/qbnst5tj08g1z35zyllsm5vc0000gn/T/temporary-78b93846-4b73-4690-865d-0185b053138d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/26 17:35:37 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+--------------+--------------------+-------+--------+------------------+-------------------------+---------------+-------------------+---------------------------+------------------+------------------------+-----------------+--------------------+---------------------+------------------------+--------------------------------+-------------------------+----------------------------------+--------------------+--------------------+----------------------------+-----------------------+----------------------------+----------------------------+------------------------+---------------------------+----------------------------+---------------------------+--------------------------+-------------------------+--------------------------+---------------+-----------------+---------------------+-----------------+----------------------+-----------------------+--------------------+
|         

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/jonas/Documents/Uni/Erasmus/Cloud Computing for Big Data/Cloud-Computing-and-Big-Data-Applications/.conda/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/jonas/Documents/Uni/Erasmus/Cloud Computing for Big Data/Cloud-Computing-and-Big-Data-Applications/.conda/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/jonas/Documents/Uni/Erasmus/Cloud Computing for Big Data/Cloud-Computing-and-Big-Data-Applications/.conda/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 