In [0]:
import requests
import pandas as pd
import calendar
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year as spark_year, month as spark_month
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, LongType, IntegerType

# Spark entry point for this ingestion notebook; getOrCreate() hands back the
# already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("SeismicDataIngestion")
    .getOrCreate()
)

# Target schema for the seismic-event rows, listed as
# (column name, Spark type, nullable) triples. Only event_id is non-nullable.
schema = StructType([
    StructField(column_name, spark_type, is_nullable)
    for column_name, spark_type, is_nullable in (
        ("event_id", StringType(), False),
        ("magnitude", DoubleType(), True),
        ("place", StringType(), True),
        ("time", TimestampType(), True),
        ("updated", LongType(), True),
        ("timezone", IntegerType(), True),
        ("url", StringType(), True),
        ("detail", StringType(), True),
        ("felt", IntegerType(), True),
        ("cdi", DoubleType(), True),
        ("mmi", DoubleType(), True),
        ("alert", StringType(), True),
        ("status", StringType(), True),
        ("tsunami", IntegerType(), True),
        ("significance", IntegerType(), True),
        ("network", StringType(), True),
        ("code", StringType(), True),
        ("ids", StringType(), True),
        ("sources", StringType(), True),
        ("types", StringType(), True),
        ("nst", IntegerType(), True),
        ("dmin", DoubleType(), True),
        ("rms", DoubleType(), True),
        ("gap", DoubleType(), True),
        ("magType", StringType(), True),
        ("propertyType", StringType(), True),
        ("title", StringType(), True),
        ("longitude", DoubleType(), True),
        ("latitude", DoubleType(), True),
        ("depth", DoubleType(), True),
        ("source", StringType(), True),
    )
])

# Define Year Range for Backfilling
# range() excludes its stop value, so this backfills 2018 through 2023 inclusive.
years = range(2018, 2024)
#years = range(2023, 2024)
batch_size = 1  # Not read by the loop below, which always requests one calendar month at a time

USGS_QUERY_URL = "https://earthquake.usgs.gov/fdsnws/event/1/query"
BRONZE_TABLE = "tabular.dataexpert.usgs_seismic_events_bronze"
REQUEST_TIMEOUT_SECONDS = 120  # give up on a stalled request instead of hanging the whole backfill


def _build_query_url(start_time, end_time):
    """Return the USGS GeoJSON event-query URL for the given start/end dates."""
    return f"{USGS_QUERY_URL}?format=geojson&starttime={start_time}&endtime={end_time}"


def _get(url):
    """GET `url` with a timeout; return the response, or None if the request itself failed."""
    try:
        return requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Request error for {url}: {exc}")
        return None


def _feature_to_record(feature):
    """Flatten one GeoJSON feature into a dict keyed by the bronze-table column names."""
    props = feature["properties"]
    coordinates = feature["geometry"]["coordinates"]
    return {
        "event_id": feature["id"],
        "magnitude": props.get("mag"),
        "place": props.get("place"),
        "time": props.get("time"),  # Epoch time
        "updated": props.get("updated"),
        "timezone": props.get("tz"),
        "url": props.get("url"),
        "detail": props.get("detail"),
        "felt": props.get("felt"),
        "cdi": props.get("cdi"),
        "mmi": props.get("mmi"),
        "alert": props.get("alert"),
        "status": props.get("status"),
        "tsunami": props.get("tsunami"),
        "significance": props.get("sig"),
        "network": props.get("net"),
        "code": props.get("code"),
        "ids": props.get("ids"),
        "sources": props.get("sources"),
        "types": props.get("types"),
        "nst": props.get("nst"),
        "dmin": props.get("dmin"),
        "rms": props.get("rms"),
        "gap": props.get("gap"),
        "magType": props.get("magType"),
        "propertyType": props.get("type"),
        "title": props.get("title"),
        "longitude": coordinates[0],
        "latitude": coordinates[1],
        "depth": coordinates[2],
        "source": "USGS"
    }


# Windows (month or weekly chunk) that could not be fetched; reported after the loop.
failed_windows = []

# Loop through Years & Months
for year in years:
    for month in range(1, 13):
        last_day = calendar.monthrange(year, month)[1]
        start_time = f"{year}-{month:02d}-01"
        # A date-only endtime is read by the service as 00:00:00 of that day, so
        # ending on the month's last day would drop that whole day. End on the
        # first day of the following month instead.
        next_year, next_month = (year + 1, 1) if month == 12 else (year, month + 1)
        end_time = f"{next_year}-{next_month:02d}-01"

        print(f"Fetching data from {start_time} to {end_time}")

        # USGS API Call (Batch Fetching)
        usgs_url = _build_query_url(start_time, end_time)
        response = _get(usgs_url)
        if response is None:
            failed_windows.append((start_time, end_time))
            data = None
        elif response.status_code == 200:
            data = response.json()
        elif response.status_code == 400:
            print(f"Bad Request: {response.text}")
            # Retry the month in weekly chunks. Each chunk ends where the next
            # one starts (the last chunk ends at end_time) so no day is skipped.
            chunk_data = []
            for day in range(1, last_day + 1, 7):
                chunk_start = f"{year}-{month:02d}-{day:02d}"
                next_chunk_day = day + 7
                if next_chunk_day > last_day:
                    chunk_end = end_time
                else:
                    chunk_end = f"{year}-{month:02d}-{next_chunk_day:02d}"
                print(f"Fetching chunk: {chunk_start} to {chunk_end}")
                chunk_url = _build_query_url(chunk_start, chunk_end)
                chunk_response = _get(chunk_url)
                if chunk_response is not None and chunk_response.status_code == 200:
                    chunk_data.extend(chunk_response.json()["features"])
                else:
                    chunk_status = chunk_response.status_code if chunk_response is not None else "no response"
                    print(f"Failed to fetch data for {chunk_start} to {chunk_end} using {chunk_url}: {chunk_status}")
                    failed_windows.append((chunk_start, chunk_end))
            data = {"features": chunk_data}
        else:
            print(f"Failed to retrieve data from {start_time} to {end_time} using {usgs_url}: {response.status_code}")
            failed_windows.append((start_time, end_time))
            data = None

        # Extract Data
        # Adjacent windows share their boundary instant, so an event stamped
        # exactly on a chunk boundary may be returned twice; keep the first copy.
        earthquake_records = []
        seen_event_ids = set()
        for feature in (data["features"] if data else []):
            if feature["id"] in seen_event_ids:
                continue
            seen_event_ids.add(feature["id"])
            earthquake_records.append(_feature_to_record(feature))

        # An empty record list yields a column-less DataFrame, and the "time"
        # lookup below would raise KeyError, so skip the month instead.
        if not earthquake_records:
            print(f"No events to load for {start_time} to {end_time}")
            continue

        # Convert to Pandas DataFrame
        df_usgs = pd.DataFrame(earthquake_records)
        df_usgs["time"] = pd.to_datetime(df_usgs["time"], unit='ms')

        # Convert to PySpark DataFrame
        # NOTE(review): nullable integer fields (e.g. felt, nst) containing None
        # presumably become float NaN in pandas; confirm the Spark conversion in
        # this environment accepts that for IntegerType columns.
        df_spark = spark.createDataFrame(df_usgs, schema=schema)

        # Partition by Year & Month
        df_spark = df_spark.withColumn("year", spark_year(col("time"))).withColumn("month", spark_month(col("time")))

        # Append to Delta Table in Partitions
        # mode("append") adds rows on every run: re-running this cell for a
        # period that is already loaded will duplicate its events.
        df_spark.write.format("delta").mode("append").partitionBy("year", "month", "source").saveAsTable(BRONZE_TABLE)

        print(f"Data from {start_time} to {end_time} loaded successfully!")

if failed_windows:
    print(f"WARNING: {len(failed_windows)} window(s) could not be fetched and are missing from the table:")
    for window_start, window_end in failed_windows:
        print(f"  {window_start} to {window_end}")

count_df = spark.sql(f"SELECT COUNT(*) AS total_count FROM {BRONZE_TABLE}")
#display(count_df)
# Interpolating count_df itself would print the DataFrame's repr, not the number;
# read the value out of the single result row.
total_count = count_df.first()["total_count"]
print(f"USGS Backfill Completed with {total_count} records")




In [0]:
# Re-count the rows in the bronze table after the backfill.
count_df = spark.sql("SELECT COUNT(*) AS total_count FROM tabular.dataexpert.usgs_seismic_events_bronze")
# Formatting count_df directly would print the DataFrame's repr rather than the
# number, so pull the value out of the single result row first.
total_count = count_df.first()["total_count"]
print(f"USGS Backfill Completed with {total_count} records")
display(count_df)