<a href="https://colab.research.google.com/github/werowe/HypatiaAcademy/blob/master/stats/consolidate_weather_data_run_sql_spark_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd

# Define the directory containing the CSV files
directory = 'drive/MyDrive/weather'

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through files in the directory
for filename in os.listdir(directory):
    # Check if the file starts with 'paphos2024' and ends with '.csv'
    if filename.startswith('paphos20') and filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        dataframes.append(df)

# Combine all DataFrames into one
df_combined = pd.concat(dataframes, ignore_index=True)

# Drop duplicate rows
df_combined = df_combined.drop_duplicates()

# Save the combined DataFrame to a new CSV file (optional)
df_combined.to_csv('combined_weather.csv', index=False)

# Print a summary of the combined DataFrame
print(f"Combined DataFrame shape: {df_combined.shape}")

Combined DataFrame shape: (11200, 24)


In [3]:
from pyspark.sql import SparkSession


# Initialize SparkSession
spark = SparkSession.builder \
    .appName("weather") \
    .getOrCreate()

df = spark.read.csv(
    "combined_weather.csv",
    header=True,        # Use the first row as column names
    inferSchema=True,   # Automatically infer data types
    sep=",",            # Specify delimiter (default is ',')
    encoding="UTF-8"    # Handle encoding
)



In [4]:
df.columns

['name',
 'datetime',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'solarradiation',
 'solarenergy',
 'uvindex',
 'severerisk',
 'conditions',
 'icon',
 'stations']

In [5]:
df.createOrReplaceTempView("weather")

In [6]:

sql = '''
SELECT SUM(precip) AS total_precip, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)

'''

result = spark.sql(sql)
result.show()

+------------------+----+-----+
|      total_precip|year|month|
+------------------+----+-----+
|               0.0|2024|    7|
|486.78400000000016|2024|   12|
|1.6430000000000005|2024|    9|
|1.5000000000000002|2024|   10|
|            23.965|2024|    1|
|166.32799999999978|2024|   11|
| 7.687999999999999|2025|    1|
|             0.008|2024|    8|
+------------------+----+-----+



In [7]:
sql = '''
SELECT round(avg(temp),2) AS temp, YEAR(datetime) AS year, MONTH(datetime) AS month
FROM weather
GROUP BY YEAR(datetime), MONTH(datetime)
sort by month

'''

result = spark.sql(sql)
result.show()

+-----+----+-----+
| temp|year|month|
+-----+----+-----+
|15.28|2024|    1|
|14.41|2025|    1|
|29.12|2024|    7|
|28.42|2024|    8|
|26.56|2024|    9|
|22.68|2024|   10|
|18.29|2024|   11|
|15.07|2024|   12|
+-----+----+-----+



In [8]:
sql = '''
SELECT avg(temp) AS temp, YEAR(datetime) AS year
FROM weather
GROUP BY YEAR(datetime)


'''

result = spark.sql(sql)
result.show()

+------------------+----+
|              temp|year|
+------------------+----+
|14.406673209028435|2025|
|22.018947058245622|2024|
+------------------+----+



In [9]:
sql = '''
SELECT YEAR(datetime) AS year, MONTH(datetime) AS month, AVG(temp) AS avg_temp
FROM weather
WHERE YEAR(datetime) IN (2025, 2024) AND MONTH(datetime) = 1
GROUP BY YEAR(datetime), MONTH(datetime);

'''

result = spark.sql(sql)
result.show()

+----+-----+------------------+
|year|month|          avg_temp|
+----+-----+------------------+
|2024|    1|15.282291666666676|
|2025|    1|14.406673209028435|
+----+-----+------------------+



In [10]:
# skip sea as runs slow

seago=True

if seago==False:
  exit()


#Sea

In [11]:
import json


ga=[]

for filename in os.listdir(directory):
       if filename.endswith('waves.json'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        fo = open(filepath)
        fs=fo.read()
        jf=json.loads(fs)
        for s in jf['hours']:
          f=[s['time'], s['waterTemperature']['noaa'],s['waveHeight']['noaa'],s['swellHeight']['noaa'],s['windWaveHeight']['noaa']]
          g=pd.DataFrame(f).T
          g.columns=['time','temp','height', 'swell', 'wind']
          g['time'] = pd.to_datetime(g['time'])
          g.set_index("time", inplace=True)
          ga.append(g)
        fo.close()



# Combine all DataFrames into one
sdf = pd.concat(ga, ignore_index=False)

# Drop duplicate rows
sdfc = sdf.drop_duplicates()



In [20]:
f

['2024-01-21T02:00:00+00:00', 17.71, 0.65, 0.46, 0.27]

In [18]:
s.keys()

dict_keys(['currentDirection', 'currentSpeed', 'gust', 'swellDirection', 'swellHeight', 'swellPeriod', 'time', 'waterTemperature', 'waveDirection', 'waveHeight', 'wavePeriod', 'windDirection', 'windWaveDirection', 'windWaveHeight', 'windWavePeriod'])

In [12]:
sdfc = sdfc.reset_index()

spark_df = spark.createDataFrame(sdfc)

In [13]:
spark_df.createOrReplaceTempView("sea")

In [27]:
sql = '''
SELECT
    round(MAX(temp),2) AS max_temp,
    round(MIN(temp),2) AS min_temp,
    round(AVG(temp),2) AS ave_temp,
    YEAR(time) AS year,
    MONTH(time) AS month
FROM sea
WHERE HOUR(time) = 12
GROUP BY YEAR(time), MONTH(time)
ORDER BY max_temp desc


'''

result = spark.sql(sql)
result.show()

+--------+--------+--------+----+-----+
|max_temp|min_temp|ave_temp|year|month|
+--------+--------+--------+----+-----+
|   29.78|   28.04|    29.0|2024|    8|
|   29.71|   29.05|   29.33|2024|    7|
|   29.58|   27.14|   28.25|2023|    8|
|   29.27|   27.01|   27.73|2023|    9|
|   29.25|   26.55|   27.63|2024|    9|
|    29.2|   28.67|    28.9|2023|    7|
|   27.65|   24.35|   25.78|2023|   10|
|   27.15|   24.49|    25.9|2024|   10|
|   25.34|    21.3|   23.62|2023|   11|
|    24.5|   21.15|   22.92|2024|   11|
|   21.35|   19.27|   20.47|2023|   12|
|   21.22|   19.14|   20.33|2024|   12|
|   19.62|   18.26|   18.94|2025|    1|
|   19.38|   17.74|   18.71|2024|    1|
+--------+--------+--------+----+-----+



In [24]:
sql = '''
SELECT
    round(MAX(height),2) AS max_height,
    round(AVG(height),2) AS avg_height,
    YEAR(time) AS year,
    MONTH(time) AS month
FROM sea
WHERE HOUR(time) = 12
GROUP BY YEAR(time), MONTH(time)
ORDER BY avg_height desc


'''

result = spark.sql(sql)
result.show()

+----------+----------+----------+----+-----+
|max_height|min_height|avg_height|year|month|
+----------+----------+----------+----+-----+
|      2.61|      0.14|      1.26|2024|    1|
|      4.26|      0.19|      0.97|2023|   11|
|      2.45|      0.09|      0.95|2024|   12|
|      1.25|      0.65|      0.87|2024|    7|
|      2.02|      0.17|      0.85|2024|    9|
|      1.14|       0.4|      0.82|2023|    7|
|      1.28|      0.47|      0.81|2023|    8|
|      2.46|       0.1|      0.79|2023|   12|
|      3.14|      0.22|      0.78|2024|   11|
|      1.25|      0.41|      0.72|2024|    8|
|      1.16|       0.2|      0.62|2023|    9|
|      1.69|      0.18|      0.56|2023|   10|
|      2.07|      0.18|      0.53|2024|   10|
|      1.04|      0.07|      0.52|2025|    1|
+----------+----------+----------+----+-----+



In [15]:
sql = '''
SELECT * from sea
order by YEAR(time) desc, MONTH(time) desc


'''

result = spark.sql(sql)
result.show()

+-------------------+-----+------+-----+----+
|               time| temp|height|swell|wind|
+-------------------+-----+------+-----+----+
|2025-01-01 00:00:00|19.14|  0.83| 0.76|0.27|
|2025-01-01 20:00:00|19.18|  0.42| 0.42|0.34|
|2025-01-01 01:00:00|19.17|  0.81| 0.74|0.24|
|2025-01-01 02:00:00| 19.2|  0.78| 0.73|0.21|
|2025-01-01 03:00:00|19.23|  0.76| 0.71|0.18|
|2025-01-01 04:00:00|19.22|  0.73| 0.51|0.16|
|2025-01-01 05:00:00| 19.2|  0.69| 0.31|0.15|
|2025-01-01 06:00:00|19.19|  0.66| 0.11|0.13|
|2025-01-01 07:00:00|19.19|  0.63| 0.26|0.12|
|2025-01-01 08:00:00|19.19|  0.61| 0.41|0.11|
|2025-01-01 09:00:00|19.19|  0.58| 0.56| 0.1|
|2025-01-01 10:00:00|19.22|  0.56| 0.54|0.27|
|2025-01-01 11:00:00|19.24|  0.53| 0.52|0.45|
|2025-01-01 12:00:00|19.27|  0.51|  0.5|0.62|
|2025-01-01 13:00:00|19.23|   0.5| 0.49|0.59|
|2025-01-01 14:00:00| 19.2|  0.49| 0.48|0.56|
|2025-01-01 15:00:00|19.16|  0.48| 0.47|0.53|
|2025-01-01 16:00:00|19.18|  0.47| 0.46|0.48|
|2025-01-01 17:00:00| 19.2|  0.46|