In [1]:
# pip install pandas pyarrow fsspec s3fs

In [7]:
import requests
import pandas as pd
from datetime import datetime
import pytz

# API endpoint and parameters
WEATHER_ENDPOINT = "https://api.openweathermap.org/data/2.5/weather"
API_KEY = "e5571f88fb32067fe934a6793b8b3108"  # Replace with your actual API key


provinces = {
    "Pathum Thani":{
        "lat": 14.0134,
        "lon": 100.5304
    },
    "Bangkok":{
            "lat": 13.7367,
            "lon": 100.5232
    },
    "Chiang Mai":{
        "lat": 18.7883,
        "lon": 98.9853
    },
    "Phuket":{
        "lat": 7.9519,
        "lon": 98.3381
    }
}
# Function to fetch and process weather data
def get_weather_data(province='Pathum Thani'):
    
    params = {
        "lat": provinces[province]['lat'],
        "lon": provinces[province]['lon'],
        "appid": API_KEY,
        "units": "metric"
    }
    try:
        # Make API request
        response = requests.get(WEATHER_ENDPOINT, params=params)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        
        # Convert timestamp to datetime
        # created_at = datetime.fromtimestamp(data['dt'])

        dt = datetime.now()
        thai_tz = pytz.timezone('Asia/Bangkok')
        created_at = dt.replace(tzinfo=thai_tz)


        timestamp = datetime.now()
        
        # Create dictionary with required fields
        weather_dict = {
            'timestamp': timestamp,
            'year': timestamp.year,
            'month': timestamp.month,
            'day': timestamp.day,
            'hour': timestamp.hour,
            'minute': timestamp.minute,
            'created_at': created_at,
            'requested_province':province,
            'location': data['name'],
            'weather_main': data['weather'][0]['main'],
            'weather_description': data['weather'][0]['description'],
            'main.temp': data['main']['temp']
        }
        
        # Create DataFrame
        # df = pd.DataFrame([weather_dict])
        
        # return df
        return weather_dict
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None
    except KeyError as e:
        print(f"Error processing data: Missing key {e}")
        return None

In [8]:
df=pd.DataFrame([get_weather_data(p) for p in list(provinces.keys())])
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype                       
---  ------               --------------  -----                       
 0   timestamp            4 non-null      datetime64[ns]              
 1   year                 4 non-null      int64                       
 2   month                4 non-null      int64                       
 3   day                  4 non-null      int64                       
 4   hour                 4 non-null      int64                       
 5   minute               4 non-null      int64                       
 6   created_at           4 non-null      datetime64[ns, Asia/Bangkok]
 7   requested_province   4 non-null      object                      
 8   location             4 non-null      object                      
 9   weather_main         4 non-null      object                      
 10  weather_description  4 non-null      objec

Unnamed: 0,timestamp,year,month,day,hour,minute,created_at,requested_province,location,weather_main,weather_description,main.temp
0,2025-04-10 08:44:01.068884,2025,4,10,8,44,2025-04-10 09:02:01.068798+07:00,Pathum Thani,Pathum Thani,Rain,light rain,34.99
1,2025-04-10 08:44:01.308434,2025,4,10,8,44,2025-04-10 09:02:01.308405+07:00,Bangkok,Pathum Wan,Clouds,overcast clouds,34.88
2,2025-04-10 08:44:01.549741,2025,4,10,8,44,2025-04-10 09:02:01.549728+07:00,Chiang Mai,Chiang Mai,Clouds,few clouds,37.16
3,2025-04-10 08:44:01.797601,2025,4,10,8,44,2025-04-10 09:02:01.797582+07:00,Phuket,Kathu,Clouds,scattered clouds,27.92


In [9]:

dt = datetime.now()
thai_tz = pytz.timezone('Asia/Bangkok')
dt = dt.replace(tzinfo=thai_tz)
print(dt) 

2025-04-10 08:44:02.676506+06:42


In [10]:
import pandas as pd

# lakeFS credentials from your docker-compose.yml
ACCESS_KEY = "access_key"
SECRET_KEY = "secret_key"

# lakeFS endpoint (running locally)
lakefs_endpoint = "http://lakefs-dev:8000/"

# lakeFS repository, branch, and file path
repo = "weather"
branch = "main"
path = "weather.parquet"

# Construct the full lakeFS S3-compatible path
lakefs_s3_path = f"s3a://{repo}/{branch}/{path}"

# Configure storage_options for lakeFS (S3-compatible)
storage_options = {
    "key": ACCESS_KEY,
    "secret": SECRET_KEY,
    "client_kwargs": {
        "endpoint_url": lakefs_endpoint
    }
}

In [11]:
df.to_parquet(
    lakefs_s3_path,
    storage_options=storage_options,
    partition_cols=['year','month','day','hour'],
    
)

# test read parquet files

In [84]:
path_all_partition = 's3a://weather3/main/weather.parquet'

df2=pd.read_parquet(    
    path=path_all_partition,
    storage_options=storage_options
)
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   timestamp            4 non-null      datetime64[ns]
 1   minute               4 non-null      int64         
 2   created_at           4 non-null      datetime64[ns]
 3   requested_province   4 non-null      object        
 4   location             4 non-null      object        
 5   weather_main         4 non-null      object        
 6   weather_description  4 non-null      object        
 7   main.temp            4 non-null      float64       
 8   year                 4 non-null      category      
 9   month                4 non-null      category      
 10  day                  4 non-null      category      
 11  hour                 4 non-null      category      
dtypes: category(4), datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 788.0+ bytes


Unnamed: 0,timestamp,minute,created_at,requested_province,location,weather_main,weather_description,main.temp,year,month,day,hour
0,2025-04-10 07:47:13.822721,47,2025-04-10 07:47:13,Pathum Thani,Pathum Thani,Clouds,overcast clouds,34.99,2025,4,10,7
1,2025-04-10 07:47:14.092639,47,2025-04-10 07:47:14,Bangkok,Pathum Wan,Clouds,overcast clouds,34.88,2025,4,10,7
2,2025-04-10 07:47:14.366322,47,2025-04-10 07:45:25,Chiang Mai,Chiang Mai,Clouds,few clouds,37.1,2025,4,10,7
3,2025-04-10 07:47:14.637560,47,2025-04-10 07:47:14,Phuket,Kathu,Clouds,few clouds,27.92,2025,4,10,7


In [86]:
path_single_partition = 's3a://weather3/main/weather.parquet/year=2025/month=4/day=10/hour=7/70cf855c5c5e4659ae01aba885c731c3-0.parquet'

df2=pd.read_parquet(    
    path=path_single_partition,
    storage_options=storage_options,
)
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   timestamp            4 non-null      datetime64[ns]
 1   minute               4 non-null      int64         
 2   created_at           4 non-null      datetime64[ns]
 3   requested_province   4 non-null      object        
 4   location             4 non-null      object        
 5   weather_main         4 non-null      object        
 6   weather_description  4 non-null      object        
 7   main.temp            4 non-null      float64       
 8   year                 4 non-null      category      
 9   month                4 non-null      category      
 10  day                  4 non-null      category      
 11  hour                 4 non-null      category      
dtypes: category(4), datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 788.0+ bytes


Unnamed: 0,timestamp,minute,created_at,requested_province,location,weather_main,weather_description,main.temp,year,month,day,hour
0,2025-04-10 07:47:13.822721,47,2025-04-10 07:47:13,Pathum Thani,Pathum Thani,Clouds,overcast clouds,34.99,2025,4,10,7
1,2025-04-10 07:47:14.092639,47,2025-04-10 07:47:14,Bangkok,Pathum Wan,Clouds,overcast clouds,34.88,2025,4,10,7
2,2025-04-10 07:47:14.366322,47,2025-04-10 07:45:25,Chiang Mai,Chiang Mai,Clouds,few clouds,37.1,2025,4,10,7
3,2025-04-10 07:47:14.637560,47,2025-04-10 07:47:14,Phuket,Kathu,Clouds,few clouds,27.92,2025,4,10,7


In [91]:
print(dt)

2025-04-10 07:50:44.311326+06:42


# Test Duck and Dask

In [88]:
# pip install duckdb
import duckdb
import pandas as pd

# Connect to an in-memory DuckDB instance.
con = duckdb.connect(database=':memory:')

storage_options = {
    "key": ACCESS_KEY,
    "secret": SECRET_KEY,
    "client_kwargs": {
        "endpoint_url": lakefs_endpoint
    }
}

# Use DuckDB's read_parquet() function to read the dataset.
# Assume the dataset is stored in the directory "output_parquet_dataset" with hive partitions:
# output_parquet_dataset/year=2025/month=02/day=17/...
query = """
INSTALL httpfs;
LOAD httpfs;

SET s3_endpoint='lakefs-dev:8000';
SET s3_access_key_id='access_key'; 
SET s3_secret_access_key='secret_key'; 
SET s3_url_style='path';
SET s3_use_ssl=false;

SELECT *
FROM read_parquet('s3a://weather3/main/weather.parquet')
"""

df_duck = con.execute(query).df()

print("DuckDB Parquet Query Result:")
df_duck.info()
df_duck.head(20)

HTTPException: HTTP Error: Unable to connect to URL "http://lakefs-dev:8000/weather3/main/weather.parquet": 404 (Not Found).

In [89]:
# pip install dask
import dask.dataframe as dd
df2 = dd.read_parquet(
    path=path_all_partition,
    storage_options=storage_options,
    dtype_backend='pyarrow'
)  

ImportError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: pyarrow>=10.0.1 is required for PyArrow backed StringArray.