<a href="https://colab.research.google.com/github/werowe/HypatiaAcademy/blob/master/stats/consolidate_weather_data_run_sql_spark_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import os
import pandas as pd

# Define the directory containing the CSV files
directory = 'drive/MyDrive/weather'

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through files in the directory
for filename in os.listdir(directory):
    # Check if the file starts with 'paphos2024' and ends with '.csv'
    if filename.startswith('paphos20') and filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dataframes.append(df)

# Combine all DataFrames into one
df_combined = pd.concat(dataframes, ignore_index=True)

# Drop duplicate rows
df_combined = df_combined.drop_duplicates()

# Save the combined DataFrame to a new CSV file (optional)
df_combined.to_csv('combined_weather.csv', index=False)

# Print a summary of the combined DataFrame
print(f"Combined DataFrame shape: {df_combined.shape}")

Combined DataFrame shape: (11039, 24)


In [19]:
from pyspark.sql import SparkSession


# Initialize SparkSession
spark = SparkSession.builder \
    .appName("weather") \
    .getOrCreate()

df = spark.read.csv(
    "combined_weather.csv",
    header=True,        # Use the first row as column names
    inferSchema=True,   # Automatically infer data types
    sep=",",            # Specify delimiter (default is ',')
    encoding="UTF-8"    # Handle encoding
)



In [20]:
df.columns

['name',
 'datetime',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk',
 'conditions',
 'icon',
 'stations']

In [21]:
df.createOrReplaceTempView("weather")

In [22]:

sql = '''
SELECT SUM(precip) AS total_precip, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)

'''

result = spark.sql(sql)
result.show()

+------------------+----+-----+
|      total_precip|year|month|
+------------------+----+-----+
|               0.0|2024|    7|
|486.78400000000016|2024|   12|
|1.6430000000000005|2024|    9|
|1.5000000000000002|2024|   10|
|            23.965|2024|    1|
|166.32799999999978|2024|   11|
| 7.687999999999999|2025|    1|
|             0.008|2024|    8|
+------------------+----+-----+



In [29]:
sql = '''
SELECT round(avg(temp),2) AS temp, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)
sort by month

'''

result = spark.sql(sql)
result.show()

+-----+----+-----+
| temp|year|month|
+-----+----+-----+
|15.09|2024|    1|
|14.34|2025|    1|
|29.12|2024|    7|
|28.42|2024|    8|
|26.56|2024|    9|
|22.68|2024|   10|
|18.29|2024|   11|
|15.07|2024|   12|
+-----+----+-----+



In [28]:
sql = '''
SELECT avg(temp) AS temp, YEAR(datetime) AS year
FROM weather
GROUP BY YEAR(datetime)


'''

result = spark.sql(sql)
result.show()

+------------------+----+
|              temp|year|
+------------------+----+
|14.336621315192732|2025|
|  22.0299005611892|2024|
+------------------+----+



In [25]:
sql = '''
SELECT temp,  MONTH(datetime), YEAR(datetime)
FROM weather
where MONTH(datetime) = 1



'''

result = spark.sql(sql)
result.show()

+----+---------------+--------------+
|temp|month(datetime)|year(datetime)|
+----+---------------+--------------+
|11.9|              1|          2025|
|10.1|              1|          2025|
| 8.1|              1|          2025|
| 9.0|              1|          2025|
| 9.9|              1|          2025|
| 8.8|              1|          2025|
| 8.8|              1|          2025|
| 8.0|              1|          2025|
| 9.8|              1|          2025|
|12.8|              1|          2025|
|15.9|              1|          2025|
|18.7|              1|          2025|
|19.0|              1|          2025|
|18.0|              1|          2025|
|18.0|              1|          2025|
|18.0|              1|          2025|
|18.0|              1|          2025|
|14.2|              1|          2025|
|13.0|              1|          2025|
|11.8|              1|          2025|
+----+---------------+--------------+
only showing top 20 rows

