In [34]:
 
from pyspark.sql import SparkSession

 

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("weather") \
    .getOrCreate()

df = spark.read.csv(
    "combined_paphos2024.csv",
    header=True,        # Use the first row as column names
    inferSchema=True,   # Automatically infer data types
    sep=",",            # Specify delimiter (default is ',')
    encoding="UTF-8"    # Handle encoding
)


In [35]:
df.show()

+-----------+-------------------+----+---------+----+--------+------+----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+--------------+-----------+-------+----------+--------------------+-------------------+--------------------+
|       name|           datetime|temp|feelslike| dew|humidity|precip|precipprob|preciptype|snow|snowdepth|windgust|windspeed|winddir|sealevelpressure|cloudcover|visibility|solarradiation|solarenergy|uvindex|severerisk|          conditions|               icon|            stations|
+-----------+-------------------+----+---------+----+--------+------+----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+--------------+-----------+-------+----------+--------------------+-------------------+--------------------+
|34.68,32.61|2024-12-08 00:00:00|15.0|     15.0|13.9|   92.91|   0.0|         0|      NULL|   0|        0|    16.6|      7.1|   69.0|          1017.0|      8

In [36]:

df.createOrReplaceTempView("weather")

In [37]:
df.columns

['name',
 'datetime',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk',
 'conditions',
 'icon',
 'stations']

In [38]:
sql = '''
select max(temp) from weather  WHERE MONTH(datetime) = 8
'''

In [39]:
result = spark.sql(sql)
result.show()

+---------+
|max(temp)|
+---------+
|     33.1|
+---------+



In [40]:
sql = '''
SELECT SUM(precip) AS total_precip, MONTH(datetime) AS month
FROM weather
GROUP BY MONTH(datetime)

''' 

In [41]:
result = spark.sql(sql)
result.show()

+------------------+-----+
|      total_precip|month|
+------------------+-----+
|205.63699999999983|   12|
|1.6430000000000005|    9|
|             0.008|    8|
|               0.0|    7|
|1.5000000000000002|   10|
|166.32799999999978|   11|
+------------------+-----+



In [42]:
sql = '''
SELECT max(temp) AS max_temp, MONTH(datetime) AS month
FROM weather
GROUP BY MONTH(datetime)
'''


In [43]:

result = spark.sql(sql)
result.show()


+--------+-----+
|max_temp|month|
+--------+-----+
|    22.6|   12|
|    32.1|    9|
|    33.1|    8|
|    34.6|    7|
|    28.8|   10|
|    27.0|   11|
+--------+-----+

