In [23]:
# Verify the installed PySpark version; this notebook expects 3.4.1.
import pyspark

installed_version = pyspark.__version__
print(installed_version)

3.4.1


In [24]:
# Point Spark at a local Java installation; we could not find another way to
# get Spark to run on this machine.
# (Java 17 is used because only Java 8, Java 11 and Java 17 work with pyspark.)
import os

JAVA_HOME = "C:\\Program Files\\Java\\jdk-17"
JAVA_BIN = JAVA_HOME + "\\bin"

os.environ["JAVA_HOME"] = JAVA_HOME

# Append the JDK's bin directory only if it is not already on PATH, so that
# re-running this cell does not keep growing PATH with duplicate entries.
# .get() avoids a KeyError when PATH is not set at all.
current_path = os.environ.get("PATH", "")
if JAVA_BIN not in current_path:
    # os.pathsep is ';' on Windows, which is the platform this path targets.
    os.environ["PATH"] = current_path + os.pathsep + JAVA_BIN if current_path else JAVA_BIN

In [25]:
from pyspark.sql import SparkSession
import json
from pyspark.sql.functions import col, expr

In [26]:
# Create (or reuse an already running) Spark session for this notebook.
session_builder = SparkSession.builder.appName("TemperatureAnalysis")
spark = session_builder.getOrCreate()

In [27]:
# Convert the raw text files to JSON files so that Spark can read them
# (we did not find another way to load them directly as text files).
# Each input line is parsed as a JSON object; every key/value pair of that
# object becomes one {"timestamp": key, <field>: value} record, and all
# records from one input line are written as a JSON array on one output line.
convert = [["data/tempm.txt", "data/tempm.json", "temperature"], ["data/hum.txt", "data/hum.json" , "humidity"]]
for source_path, target_path, field_name in convert:
    # Explicit encoding so the result does not depend on the platform default.
    with open(source_path, 'r', encoding='utf-8') as input_file, \
            open(target_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            # Skip blank lines (e.g. a trailing empty line); json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            new_data = [{"timestamp": key, field_name: value} for key, value in data.items()]
            json.dump(new_data, output_file)
            output_file.write('\n')


In [28]:

# Load the converted JSON files into Spark DataFrames.
tempm_data = spark.read.json("data/tempm.json")
hum_data = spark.read.json("data/hum.json")

# Preview both DataFrames. show() prints the table itself and returns None,
# so it is called directly; wrapping it in print() would additionally print
# a stray "None" after each table.
tempm_data.show()
hum_data.show()

+-----------+-------------------+
|temperature|          timestamp|
+-----------+-------------------+
|        3.0|2014-02-13T06:20:00|
|        7.0|2014-02-13T13:50:00|
|          2|2014-02-13T06:00:00|
|          3|2014-02-13T03:00:00|
|          6|2014-02-13T13:00:00|
|        4.0|2014-02-13T18:50:00|
|        6.0|2014-02-13T13:20:00|
|          6|2014-02-13T15:00:00|
|        4.0|2014-02-13T08:50:00|
|        4.0|2014-02-13T21:50:00|
|          3|2014-02-13T08:00:00|
|        3.0|2014-02-13T07:50:00|
|        4.0|2014-02-13T08:20:00|
|        3.0|2014-02-13T21:20:00|
|        6.0|2014-02-13T11:50:00|
|        6.0|2014-02-13T11:20:00|
|        5.0|2014-02-13T17:50:00|
|          6|2014-02-13T11:00:00|
|        2.0|2014-02-13T05:50:00|
|        3.0|2014-02-13T20:50:00|
+-----------+-------------------+
only showing top 20 rows

None
+--------+-------------------+
|humidity|          timestamp|
+--------+-------------------+
|      93|2014-02-13T06:20:00|
|      66|2014-02-13T13:50:00

In [29]:
# Add a "date" column to each DataFrame by casting its "timestamp" column to a date.
tempm_data = tempm_data.withColumn("date", col("timestamp").cast("date"))
hum_data = hum_data.withColumn("date", col("timestamp").cast("date"))

# Preview the result. show() prints the table itself and returns None, so it
# is not wrapped in print() (that would also print a stray "None").
tempm_data.show()
hum_data.show()

+-----------+-------------------+----------+
|temperature|          timestamp|      date|
+-----------+-------------------+----------+
|        3.0|2014-02-13T06:20:00|2014-02-13|
|        7.0|2014-02-13T13:50:00|2014-02-13|
|          2|2014-02-13T06:00:00|2014-02-13|
|          3|2014-02-13T03:00:00|2014-02-13|
|          6|2014-02-13T13:00:00|2014-02-13|
|        4.0|2014-02-13T18:50:00|2014-02-13|
|        6.0|2014-02-13T13:20:00|2014-02-13|
|          6|2014-02-13T15:00:00|2014-02-13|
|        4.0|2014-02-13T08:50:00|2014-02-13|
|        4.0|2014-02-13T21:50:00|2014-02-13|
|          3|2014-02-13T08:00:00|2014-02-13|
|        3.0|2014-02-13T07:50:00|2014-02-13|
|        4.0|2014-02-13T08:20:00|2014-02-13|
|        3.0|2014-02-13T21:20:00|2014-02-13|
|        6.0|2014-02-13T11:50:00|2014-02-13|
|        6.0|2014-02-13T11:20:00|2014-02-13|
|        5.0|2014-02-13T17:50:00|2014-02-13|
|          6|2014-02-13T11:00:00|2014-02-13|
|        2.0|2014-02-13T05:50:00|2014-02-13|
|        3

In [30]:
# Keep the readings whose temperature is between 18°C and 22°C (inclusive).
# The preview of this column mixes values such as "3.0" and "2", which
# indicates it was loaded as strings. Cast it to double explicitly so the
# range check is a real numeric comparison instead of depending on Spark's
# implicit string-vs-number coercion rules.
temperature_celsius = col("temperature").cast("double")
filtered_temp = tempm_data.filter(temperature_celsius.between(18, 22))

# Extract unique dates from the filtered DataFrame
unique_dates = filtered_temp.select("date").distinct()

# Count the number of unique dates
num_dates = unique_dates.count()

# Show the unique dates and the count
print("Number of dates with temperature between 18 and 22 C:", num_dates)
print("Unique Dates:")
unique_dates.show()

Number of dates with temperature between 18 and 22 C: 26
Unique Dates:
+----------+
|      date|
+----------+
|2014-06-03|
|2014-04-29|
|2014-05-24|
|2014-06-05|
|2014-04-27|
|2014-05-30|
|2014-04-20|
|2014-05-25|
|2014-05-26|
|2014-05-27|
|2014-05-21|
|2014-06-02|
|2014-05-18|
|2014-05-20|
|2014-04-30|
|2014-05-17|
|2014-04-22|
|2014-05-22|
|2014-06-04|
|2014-04-26|
+----------+
only showing top 20 rows



In [31]:
# Check that the temperature data was read correctly.
# show() prints the table itself and returns None, so it is not wrapped in
# print() (that would also print a stray "None").
tempm_data.show()

+-----------+-------------------+----------+
|temperature|          timestamp|      date|
+-----------+-------------------+----------+
|        3.0|2014-02-13T06:20:00|2014-02-13|
|        7.0|2014-02-13T13:50:00|2014-02-13|
|          2|2014-02-13T06:00:00|2014-02-13|
|          3|2014-02-13T03:00:00|2014-02-13|
|          6|2014-02-13T13:00:00|2014-02-13|
|        4.0|2014-02-13T18:50:00|2014-02-13|
|        6.0|2014-02-13T13:20:00|2014-02-13|
|          6|2014-02-13T15:00:00|2014-02-13|
|        4.0|2014-02-13T08:50:00|2014-02-13|
|        4.0|2014-02-13T21:50:00|2014-02-13|
|          3|2014-02-13T08:00:00|2014-02-13|
|        3.0|2014-02-13T07:50:00|2014-02-13|
|        4.0|2014-02-13T08:20:00|2014-02-13|
|        3.0|2014-02-13T21:20:00|2014-02-13|
|        6.0|2014-02-13T11:50:00|2014-02-13|
|        6.0|2014-02-13T11:20:00|2014-02-13|
|        5.0|2014-02-13T17:50:00|2014-02-13|
|          6|2014-02-13T11:00:00|2014-02-13|
|        2.0|2014-02-13T05:50:00|2014-02-13|
|        3

In [32]:
# Check that the humidity data was read correctly.
# show() prints the table itself and returns None, so it is not wrapped in
# print() (that would also print a stray "None").
hum_data.show()

+--------+-------------------+----------+
|humidity|          timestamp|      date|
+--------+-------------------+----------+
|      93|2014-02-13T06:20:00|2014-02-13|
|      66|2014-02-13T13:50:00|2014-02-13|
|      91|2014-02-13T06:00:00|2014-02-13|
|      84|2014-02-13T03:00:00|2014-02-13|
|      62|2014-02-13T13:00:00|2014-02-13|
|      75|2014-02-13T18:50:00|2014-02-13|
|      70|2014-02-13T13:20:00|2014-02-13|
|      56|2014-02-13T15:00:00|2014-02-13|
|      87|2014-02-13T08:50:00|2014-02-13|
|      75|2014-02-13T21:50:00|2014-02-13|
|      88|2014-02-13T08:00:00|2014-02-13|
|      93|2014-02-13T07:50:00|2014-02-13|
|      87|2014-02-13T08:20:00|2014-02-13|
|      81|2014-02-13T21:20:00|2014-02-13|
|      76|2014-02-13T11:50:00|2014-02-13|
|      76|2014-02-13T11:20:00|2014-02-13|
|      70|2014-02-13T17:50:00|2014-02-13|
|      69|2014-02-13T11:00:00|2014-02-13|
|      93|2014-02-13T05:50:00|2014-02-13|
|      81|2014-02-13T20:50:00|2014-02-13|
+--------+-------------------+----