<a href="https://colab.research.google.com/github/werowe/HypatiaAcademy/blob/master/stats/consolidate_weather_data_run_sql_spark_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd

# Define the directory containing the CSV files
directory = 'drive/MyDrive/weather'

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through files in the directory
for filename in os.listdir(directory):
    # Check if the file starts with 'paphos2024' and ends with '.csv'
    if filename.startswith('paphos20') and filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dataframes.append(df)

# Combine all DataFrames into one
df_combined = pd.concat(dataframes, ignore_index=True)

# Drop duplicate rows
df_combined = df_combined.drop_duplicates()

# Save the combined DataFrame to a new CSV file (optional)
df_combined.to_csv('combined_weather.csv', index=False)

# Print a summary of the combined DataFrame
print(f"Combined DataFrame shape: {df_combined.shape}")

Combined DataFrame shape: (25226, 24)


In [3]:
from pyspark.sql import SparkSession


# Initialize SparkSession
spark = SparkSession.builder \
    .appName("weather") \
    .getOrCreate()

df = spark.read.csv(
    "combined_weather.csv",
    header=True,        # Use the first row as column names
    inferSchema=True,   # Automatically infer data types
    sep=",",            # Specify delimiter (default is ',')
    encoding="UTF-8"    # Handle encoding
)



In [4]:
df.columns

['name',
 'datetime',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk',
 'conditions',
 'icon',
 'stations']

In [5]:
df.createOrReplaceTempView("weather")

In [13]:

sql = '''
SELECT round(SUM(precip)) AS total_precip, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)
ORDER BY year, month

'''

result = spark.sql(sql)
result.show()

+------------+----+-----+
|total_precip|year|month|
+------------+----+-----+
|        69.0|2024|    1|
|        35.0|2024|    2|
|         9.0|2024|    3|
|       239.0|2024|    4|
|        18.0|2024|    5|
|         0.0|2024|    6|
|         0.0|2024|    7|
|         0.0|2024|    8|
|         2.0|2024|    9|
|         2.0|2024|   10|
|       166.0|2024|   11|
|       487.0|2024|   12|
|        64.0|2025|    1|
|       108.0|2025|    2|
|        45.0|2025|    3|
|        84.0|2025|    4|
|        58.0|2025|    5|
|         1.0|2025|    6|
+------------+----+-----+



In [23]:
# is it cloudier this year than last


sql = '''
SELECT CAST(SUM(cloudcover) AS INT) AS clouds, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)
HAVING MONTH(datetime) = 5
ORDER BY year, month

'''


result = spark.sql(sql)
result.show()

+------+----+-----+
|clouds|year|month|
+------+----+-----+
| 28069|2024|    5|
|101716|2025|    5|
+------+----+-----+



In [50]:
# is it rainier this year than last


sql = '''
SELECT CAST(SUM(precip) AS INT) AS precip, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)
HAVING MONTH(datetime) = 5
ORDER BY year, month

'''


result = spark.sql(sql)
result.show()

+------+----+-----+
|precip|year|month|
+------+----+-----+
|    18|2024|    5|
|    57|2025|    5|
+------+----+-----+



In [39]:
# maximum temperature


sql = '''
SELECT round(max(temp),2) AS max_temp, date_format(datetime, 'yy-MM-dd') as YYMMDD
FROM weather
GROUP BY YYMMDD
ORDER BY max_temp desc

'''

result = spark.sql(sql)
result.show()

+--------+--------+
|max_temp|  YYMMDD|
+--------+--------+
|    34.6|24-07-28|
|    33.5|24-07-21|
|    33.4|24-07-29|
|    33.3|24-04-24|
|    33.3|24-07-20|
|    33.3|24-07-27|
|    33.1|24-08-17|
|    33.0|24-08-01|
|    32.9|24-08-21|
|    32.9|24-08-12|
|    32.9|24-07-31|
|    32.9|24-08-14|
|    32.9|24-07-24|
|    32.8|24-08-16|
|    32.8|24-08-18|
|    32.8|24-08-26|
|    32.7|24-07-25|
|    32.6|24-07-15|
|    32.3|24-07-19|
|    32.2|24-07-22|
+--------+--------+
only showing top 20 rows



In [7]:
sql = '''
SELECT round(avg(temp),2) AS temp, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)
ORDER BY year, month

'''

result = spark.sql(sql)
result.show()

+-----+----+-----+
| temp|year|month|
+-----+----+-----+
|14.35|2024|    1|
|14.39|2024|    2|
|16.43|2024|    3|
|20.06|2024|    4|
|21.59|2024|    5|
|25.29|2024|    6|
|29.12|2024|    7|
|28.42|2024|    8|
|26.56|2024|    9|
|22.68|2024|   10|
|18.29|2024|   11|
|15.07|2024|   12|
|14.31|2025|    1|
|11.74|2025|    2|
|16.74|2025|    3|
|17.84|2025|    4|
| 21.4|2025|    5|
|21.92|2025|    6|
+-----+----+-----+



In [46]:
# average temperature by month and year

sql = '''
SELECT round(avg(temp),1) AS temp, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)
ORDER BY month, year

'''

result = spark.sql(sql)
result.show()

+----+----+-----+
|temp|year|month|
+----+----+-----+
|14.4|2024|    1|
|14.3|2025|    1|
|14.4|2024|    2|
|11.7|2025|    2|
|16.4|2024|    3|
|16.7|2025|    3|
|20.1|2024|    4|
|17.8|2025|    4|
|21.6|2024|    5|
|21.4|2025|    5|
|25.3|2024|    6|
|21.9|2025|    6|
|29.1|2024|    7|
|28.4|2024|    8|
|26.6|2024|    9|
|22.7|2024|   10|
|18.3|2024|   11|
|15.1|2024|   12|
+----+----+-----+



In [48]:
# average temperature particular month

sql = '''
SELECT YEAR(datetime) AS year, MONTH(datetime) AS month, round(AVG(temp),2) AS avg_temp
FROM weather
WHERE YEAR(datetime) IN (2025, 2024, 2023) AND MONTH(datetime) = 5
GROUP BY YEAR(datetime), MONTH(datetime);

'''

result = spark.sql(sql)
result.show()

+----+-----+--------+
|year|month|avg_temp|
+----+-----+--------+
|2025|    5|    21.4|
|2024|    5|   21.59|
+----+-----+--------+

