<a href="https://colab.research.google.com/github/werowe/HypatiaAcademy/blob/master/stats/consolidate_sea_data_run_sql_spark_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on the drive can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import pandas as pd

# Directory on the mounted Google Drive that contains the CSV files.
# Absolute, so the cell does not depend on the kernel's working directory
# ('/content/drive' is the mount point used by drive.mount()).
directory = '/content/drive/MyDrive/weather'

# Select every file whose name starts with 'paphos' and ends with 'sea.csv'.
# os.listdir() returns names in arbitrary order, so sort them: the row order
# of the combined file is then the same on every run.
sea_files = sorted(
    name for name in os.listdir(directory)
    if name.startswith('paphos') and name.endswith('sea.csv')
)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; stop early with a message that says what was looked for and where.
if not sea_files:
    raise FileNotFoundError(
        f"No files matching 'paphos*sea.csv' found in {directory!r}"
    )

# Read each matching CSV file into its own DataFrame
dataframes = [pd.read_csv(os.path.join(directory, name)) for name in sea_files]

# Stack all files into one DataFrame and drop rows that appear more than once
df_combined = pd.concat(dataframes, ignore_index=True).drop_duplicates()

# Save the combined data; the Spark cell reads this file back in
df_combined.to_csv('combined_sea.csv', index=False)

# Print a summary of the combined DataFrame
print(f"Combined DataFrame shape: {df_combined.shape}")

Combined DataFrame shape: (1152, 5)


In [5]:
from pyspark.sql import SparkSession

# Get the current Spark session, or start one named "weather" if none exists
spark = SparkSession.builder.appName("weather").getOrCreate()

# Options for reading the combined CSV file
csv_options = dict(
    header=True,        # first row holds the column names
    inferSchema=True,   # let Spark work out each column's data type
    sep=",",            # field delimiter
    encoding="UTF-8",   # character encoding of the file
)

df = spark.read.csv("combined_sea.csv", **csv_options)



In [6]:
# Show the column names Spark took from the CSV header row
df.columns

['time', 'temp', 'height', 'swell', 'wind']

In [7]:
# Register the DataFrame as a temporary view named "sea" so it can be
# queried with spark.sql()
df.createOrReplaceTempView("sea")

In [21]:
# Mean of `temp`, rounded to a whole number, per (year, month, week).
# "week" is a week-of-month bucket, not a calendar week: the day of the month
# is split into 7-day groups (days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3,
# 22-28 -> 4, 29-31 -> 5), so bucket 5 covers at most three days.
sql = '''
SELECT round(mean(temp)) AS temp, YEAR(time) AS year, MONTH(time) AS month, ((dayofmonth(time) - 1) DIV 7) + 1 AS week
FROM sea
GROUP BY year, month, week
ORDER BY year, month, week

'''

# Run the query against the "sea" view and print the first rows of the result
result = spark.sql(sql)
result.show()

+----+----+-----+----+
|temp|year|month|week|
+----+----+-----+----+
|24.0|2025|   10|   3|
|24.0|2025|   10|   4|
|24.0|2025|   10|   5|
|24.0|2025|   11|   1|
|24.0|2025|   11|   2|
|23.0|2025|   11|   3|
|22.0|2025|   11|   4|
|22.0|2025|   11|   5|
|22.0|2025|   12|   1|
|21.0|2025|   12|   2|
|20.0|2025|   12|   3|
|20.0|2025|   12|   4|
|20.0|2025|   12|   5|
+----+----+-----+----+

