In [10]:
!pip install transformers torch pandas requests azure-storage-blob




In [24]:
# @title
import os
from getpass import getpass

# Secrets are taken from the environment when already set and otherwise prompted for
# with hidden input, so real keys are never written into the notebook source and an
# existing, correctly configured environment variable is never overwritten.
for _secret_name in ('WEATHERSTACK_API_KEY', 'AZURE_STORAGE_CONNECTION_STRING'):
    if os.environ.get(_secret_name):
        continue
    try:
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")
    except (EOFError, NotImplementedError):
        # No interactive input is available (e.g. a headless run): leave the variable
        # unset and tell the user how to proceed instead of failing this cell.
        print(f"⚠️ {_secret_name} is not set and cannot be prompted for; "
              "set it in the environment before running the ETL.")


In [25]:
import os
from datetime import datetime, timezone

import pandas as pd
import requests
from azure.storage.blob import BlobServiceClient
from transformers import pipeline

# Loaded summarization pipelines, keyed by model name, so repeated ETL runs
# (e.g. one per city) do not reload the model on every call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model_name="t5-small"):
    """Return the summarization pipeline for ``model_name``, creating it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = pipeline("summarization", model=model_name)
    return _SUMMARIZER_CACHE[model_name]


def run_weather_etl(city="New York"):
    """Fetch current weather for ``city``, summarize it, save a CSV and upload it to Azure.

    Steps:
      1. Extract: query the Weatherstack ``current`` endpoint.
      2. Transform: read temperature, humidity and the first weather description.
      3. Summarize the formatted reading with the ``t5-small`` pipeline.
      4. Write a one-row DataFrame to ``weather_summary_<city>.csv`` (spaces in the
         city name become underscores).
      5. Upload that file to the ``weather-insights`` blob container, overwriting
         any blob with the same name.

    Args:
        city: Location string sent to Weatherstack as the ``query`` parameter.

    Returns:
        pandas.DataFrame: the one-row frame that was written to the CSV.

    Raises:
        ValueError: if ``WEATHERSTACK_API_KEY`` or ``AZURE_STORAGE_CONNECTION_STRING``
            is missing, or the API response has no ``current`` section.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # Step 1: Extract
    api_key = os.getenv('WEATHERSTACK_API_KEY')
    if not api_key:
        raise ValueError("WEATHERSTACK_API_KEY environment variable is not set")

    # NOTE(review): plain HTTP exposes the access key in transit; switch to https
    # if the Weatherstack plan in use supports it.
    # `params` lets requests URL-encode the city (spaces, '&', non-ASCII names), and
    # the timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(
        "http://api.weatherstack.com/current",
        params={"access_key": api_key, "query": city},
        timeout=10,
    )
    payload = response.json()

    if "current" not in payload:
        raise ValueError(f"Error fetching data: {payload.get('error', {}).get('info', 'Unknown error')}")

    # Step 2: Transform
    current = payload['current']
    temp = current['temperature']
    humidity = current['humidity']
    description = current['weather_descriptions'][0]
    # Timezone-aware replacement for the deprecated datetime.utcnow(); same text format.
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    # Format text for summarization
    weather_text = f"Temperature: {temp}°C, Humidity: {humidity}%, Sky: {description}."

    # Step 3: Summarize using Hugging Face (pipeline is cached across calls)
    summarizer = _get_summarizer("t5-small")
    summary = summarizer(weather_text, max_length=30, min_length=5, do_sample=False)[0]['summary_text']

    # Step 4: Create DataFrame
    df = pd.DataFrame([{
        "City": city,
        "Temperature (°C)": temp,
        "Humidity (%)": humidity,
        "Sky Description": description,
        "Weather Summary": summary,
        "Timestamp (UTC)": timestamp
    }])

    # Save locally
    filename = f"weather_summary_{city.replace(' ', '_')}.csv"
    df.to_csv(filename, index=False)
    print(f"✅ Local file saved: {filename}")

    # Step 5: Upload to Azure Blob
    azure_connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    if not azure_connection_string:
        raise ValueError("AZURE_STORAGE_CONNECTION_STRING environment variable is not set")
    container_name = "weather-insights"

    blob_service_client = BlobServiceClient.from_connection_string(azure_connection_string)

    # Ensure the container exists. Checking explicitly (instead of swallowing every
    # exception from create_container) lets authentication and network errors surface.
    container_client = blob_service_client.get_container_client(container_name)
    if not container_client.exists():
        container_client.create_container()

    blob_client = blob_service_client.get_blob_client(container=container_name, blob=filename)

    # Separate name for the file handle so it does not shadow the API payload.
    with open(filename, "rb") as csv_file:
        blob_client.upload_blob(csv_file, overwrite=True)

    print(f"☁️ Uploaded to Azure Blob: {container_name}/{filename}")

    return df


In [26]:
# Run the pipeline once for a single city; the returned DataFrame is the cell's output.
run_weather_etl(city="New York")


Device set to use cpu
Your max_length is set to 30, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


✅ Local file saved: weather_summary_New_York.csv
☁️ Uploaded to Azure Blob: weather-insights/weather_summary_New_York.csv


Unnamed: 0,City,Temperature (°C),Humidity (%),Sky Description,Weather Summary,Timestamp (UTC)
0,New York,9,19,Sunny,"temperature: 9°C, Humidity: 19%, Sky: Sunny .",2025-03-23 22:41:49


In [28]:
# Offer the generated CSV as a browser download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    # Outside Colab the helper module does not exist; skip the download rather than
    # aborting a full notebook run.
    print("google.colab is not available; skipping browser download.")
else:
    files.download("weather_summary_New_York.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
import os

# Cities to run the weather pipeline for
cities = [
    "New York",
    "Los Angeles",
    "Chicago",
    "Houston",
    "Miami",
]


def _etl_or_report(target_city):
    """Run the ETL for one city, printing any error instead of raising it."""
    try:
        run_weather_etl(target_city)
    except Exception as failure:
        print(f"❌ Error for {target_city}: {str(failure)}")


# Each city is processed independently, so one failure does not stop the remaining runs.
for city in cities:
    print(f"\n🚀 Running ETL for: {city}")
    _etl_or_report(city)



🚀 Running ETL for: New York


Device set to use cpu
Your max_length is set to 30, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


✅ Local file saved: weather_summary_New_York.csv
☁️ Uploaded to Azure Blob: weather-insights/weather_summary_New_York.csv

🚀 Running ETL for: Los Angeles


Device set to use cpu
Your max_length is set to 30, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


✅ Local file saved: weather_summary_Los_Angeles.csv
☁️ Uploaded to Azure Blob: weather-insights/weather_summary_Los_Angeles.csv

🚀 Running ETL for: Chicago


Device set to use cpu
Your max_length is set to 30, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


✅ Local file saved: weather_summary_Chicago.csv
☁️ Uploaded to Azure Blob: weather-insights/weather_summary_Chicago.csv

🚀 Running ETL for: Houston


Device set to use cpu
Your max_length is set to 30, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


✅ Local file saved: weather_summary_Houston.csv
☁️ Uploaded to Azure Blob: weather-insights/weather_summary_Houston.csv

🚀 Running ETL for: Miami


Device set to use cpu
Your max_length is set to 30, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


✅ Local file saved: weather_summary_Miami.csv
☁️ Uploaded to Azure Blob: weather-insights/weather_summary_Miami.csv
