## Imports et config

In [60]:
import pandas as pd
import io
import os
import requests
import datetime as dt
from dotenv import load_dotenv
import boto3
import re
from tqdm import tqdm
from collections import Counter

# Notebook-wide constants.
# Station ID used to filter the station list further down (7510 is
# BORDEAUX-MERIGNAC in resources/data/stations.csv).
CURRENT_STAT_ID: int = 7510
# Key prefix (without trailing slash) under which the raw CSV files sit in the bucket.
S3_RAW_PATH: str = "esquilaplu/raw/meteofrance"

# Load variables from a .env file into the environment; the S3 cells below read
# S3_BUCKET, ACCESS_KEY_ID and SECRET_ACCESS_KEY through os.getenv().
load_dotenv()

## Reformatage des noms de fichiers bruts dans S3

In [6]:
# Bucket name and credentials are read from environment variables; each value
# is None when the corresponding variable is not set.
aws_s3_bucket, aws_access_key_id, aws_secret_access_key = (
    os.getenv(env_name)
    for env_name in ("S3_BUCKET", "ACCESS_KEY_ID", "SECRET_ACCESS_KEY")
)

# Single S3 client shared by every cell that talks to the bucket.
s3_credentials = {
    "aws_access_key_id": aws_access_key_id,
    "aws_secret_access_key": aws_secret_access_key,
}
s3_client = boto3.client("s3", **s3_credentials)

In [8]:
# List every raw CSV stored under S3_RAW_PATH, as names relative to that prefix.
#
# - list_objects_v2 returns at most 1000 keys per call, so a paginator is used
#   to walk all result pages instead of reading a single response.
# - The prefix is removed by slicing: str.lstrip() takes a *set of characters*,
#   not a prefix, and would also eat the start of any file name that begins
#   with one of the characters of S3_RAW_PATH.
# - page.get("Contents", []) keeps the cell working when nothing matches.
raw_prefix = f"{S3_RAW_PATH}/"
paginator = s3_client.get_paginator("list_objects_v2")
files = [
    content["Key"][len(raw_prefix):]
    for page in paginator.paginate(Bucket=aws_s3_bucket, Prefix=raw_prefix)
    for content in page.get("Contents", [])
    if content["Key"].endswith(".csv")
]
files[:10]


['2023-05-01-0.csv',
 '2023-05-01-12.csv',
 '2023-05-01-15.csv',
 '2023-05-01-18.csv',
 '2023-05-01-21.csv',
 '2023-05-01-3.csv',
 '2023-05-01-6.csv',
 '2023-05-01-9.csv',
 '2023-05-02-0.csv',
 '2023-05-02-12.csv']

In [35]:
def format_hour(hour: str) -> str:
    """Return ``hour`` as text, left-padded with zeros to at least two characters.

    The argument is converted with ``str()`` first, so integers are accepted
    as well. Values that are already two or more characters long come back
    unchanged.
    """
    hour_text = str(hour)
    return hour_text.zfill(2)

In [43]:
# Dry run of the renaming logic on the first listed file (no S3 call is made).
file = files[0]

# Capture the trailing hour component of "<YYYY-MM-DD>-<H or HH>.csv".
hour_match = re.search(r"\d{4}-\d{2}-\d{2}-(\d{1,2})\.csv", file)
hour = hour_match.group(1)
formatted_hour = format_hour(hour)
print(formatted_hour)

# Rebuild the name with the zero-padded hour, e.g. 2023-05-02-0.csv -> 2023-05-02-00.csv
date_match = re.search(r"\d{4}-\d{2}-\d{2}", file)
date = date_match.group(0)
new_file = re.sub(
    r"\d{4}-\d{2}-\d{2}-\d{1,2}\.csv",
    f"{date}-{formatted_hour}.csv",
    file,
)
new_file

00


'2023-05-01-00.csv'

In [61]:
# Rename every raw file whose hour is not zero-padded ("-3.csv" -> "-03.csv").
#
# S3 has no rename operation: each object is copied to its new key and the old
# key is deleted only after the copy call has returned without raising. Running
# the delete without the copy (as this cell previously did, with the copy
# commented out) destroys the data on a fresh run.
#
# `counter` tallies the renamed files by their original, unpadded hour.
counter = Counter()
for file in tqdm(files):
    hour_match = re.search(r"\d{4}-\d{2}-\d{2}-(\d{1,2})\.csv", file)
    if hour_match is None:
        # Not a "<date>-<hour>.csv" name: leave the object untouched.
        continue

    hour = hour_match.group(1)
    formatted_hour = format_hour(hour)

    if hour != formatted_hour:
        counter.update([hour])
        date = re.search(r"\d{4}-\d{2}-\d{2}", file).group(0)
        new_file = re.sub(r"\d{4}-\d{2}-\d{2}-\d{1,2}\.csv", f"{date}-{formatted_hour}.csv", file)

        old_key = f"{S3_RAW_PATH}/{file}"
        new_key = f"{S3_RAW_PATH}/{new_file}"

        # Copy first; an exception here stops the loop before anything is deleted.
        s3_client.copy_object(
            Bucket=aws_s3_bucket,
            CopySource={"Bucket": aws_s3_bucket, "Key": old_key},
            Key=new_key,
        )
        s3_client.delete_object(Bucket=aws_s3_bucket, Key=old_key)

print(counter)

100%|██████████| 292/292 [00:14<00:00, 19.80it/s]

Counter({'0': 32, '3': 31, '6': 31, '9': 31})





## Récupération des données

In [22]:
def load_dataset(date: dt.datetime, timeout: float = 30.0) -> pd.DataFrame:
    """Download one Météo-France SYNOP CSV file and parse it into a DataFrame.

    The file fetched is ``synop.<YYYYMMDDHH>.csv``, where the timestamp is
    built from ``date``.

    Parameters
    ----------
    date:
        Date and hour of the observation file to fetch.
    timeout:
        Seconds to wait for the server before ``requests`` gives up. Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    pd.DataFrame
        The ``;``-separated file content, with the ``date`` column parsed as
        datetimes. Other columns are left as pandas infers them.
        NOTE(review): the cell outputs show the literal string ``mq`` in many
        columns, presumably a missing-value marker; it is not converted here.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    hour = date.strftime("%H")
    date_id = date.strftime("%Y%m%d")
    dt_id = date_id + hour

    url = f"https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/synop.{dt_id}.csv"
    # Browser-like headers, including a Referer pointing at the download page
    # for the same date/hour. NOTE(review): presumably needed for the site to
    # serve the file; confirm before trimming them.
    headers = {
        "Referer": f"https://donneespubliques.meteofrance.fr/?fond=donnee_libre&prefixe=Txt%2FSynop%2Fsynop&extension=csv&date={date_id}&reseau={hour}",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
        "Host": "donneespubliques.meteofrance.fr",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
    }

    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    # A freshly created StringIO is already positioned at offset 0.
    return pd.read_csv(io.StringIO(resp.text), sep=";", parse_dates=["date"])


# Smoke test: fetch and display the file for 2023-05-10 at 12:00.
sample_timestamp = dt.datetime(year=2023, month=5, day=10, hour=12)
load_dataset(sample_timestamp)

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4,Unnamed: 59
0,7005,2023-05-10 12:00:00,101490,110,1,270,6.900000,286.250000,283.950000,86,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
1,7015,2023-05-10 12:00:00,101300,30,1,250,3.000000,288.850000,283.350000,70,...,5,mq,840,6,mq,1020,mq,9,mq,
2,7020,2023-05-10 12:00:00,101660,60,0,250,7.500000,286.650000,284.550000,87,...,3,3,3000,3,0,9000,mq,mq,mq,
3,7027,2023-05-10 12:00:00,101650,20,1,300,2.500000,287.450000,286.150000,92,...,6,mq,1110,7,mq,1680,mq,9,mq,
4,7037,2023-05-10 12:00:00,101560,50,1,300,4.500000,287.150000,283.150000,77,...,3,mq,780,7,mq,1440,mq,mq,mq,
5,7072,2023-05-10 12:00:00,101370,-30,8,290,1.800000,288.650000,283.650000,72,...,5,mq,5520,7,mq,6600,mq,8,mq,
6,7110,2023-05-10 12:00:00,101980,30,1,290,5.500000,287.750000,282.550000,71,...,8,mq,1800,mq,mq,mq,mq,mq,mq,
7,7117,2023-05-10 12:00:00,101870,40,1,250,9.300000,287.650000,283.450000,76,...,2,6,1000,2,3,2000,1,0,6000,
8,7130,2023-05-10 12:00:00,101840,-30,7,330,2.400000,287.550000,284.650000,83,...,6,mq,1320,8,mq,1620,mq,9,mq,
9,7139,2023-05-10 12:00:00,101670,-30,8,240,4.800000,287.950000,285.050000,83,...,4,mq,1200,5,mq,1470,mq,mq,mq,


Chargement de la liste des stations météo

In [13]:
# Station reference table, stored locally as a ";"-separated CSV.
station_filepath = "resources/data/stations.csv"

stations = pd.read_csv(filepath_or_buffer=station_filepath, delimiter=";")
stations.head(5)

Unnamed: 0,ID,Nom,Latitude,Longitude,Altitude
0,7005,ABBEVILLE,50.136,1.834,69
1,7015,LILLE-LESQUIN,50.57,3.0975,47
2,7020,PTE DE LA HAGUE,49.725167,-1.939833,6
3,7027,CAEN-CARPIQUET,49.18,-0.456167,67
4,7037,ROUEN-BOOS,49.383,1.181667,151


In [14]:
# Show the row(s) of the station list whose ID matches CURRENT_STAT_ID.
stations[stations["ID"].eq(CURRENT_STAT_ID)]

Unnamed: 0,ID,Nom,Latitude,Longitude,Altitude
26,7510,BORDEAUX-MERIGNAC,44.830667,-0.691333,47


In [4]:
# `resp` exists only as a local variable inside load_dataset(), so reading it
# here fails with NameError on a fresh kernel. Go through the function instead;
# it performs the same ";"-separated parse with the `date` column as datetimes.
# The timestamp is the one shown in this cell's saved output.
df = load_dataset(dt.datetime(2023, 5, 12, 18))
df


Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4,Unnamed: 59
0,7005,2023-05-12 18:00:00,101870,110,3,30,2.8,286.350000,285.550000,95,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
1,7015,2023-05-12 18:00:00,101850,130,3,10,2.1,288.850000,286.350000,85,...,6,mq,1980,8,mq,2400,mq,9,mq,
2,7020,2023-05-12 18:00:00,102200,-10,7,30,6.3,285.750000,283.850000,88,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
3,7027,2023-05-12 18:00:00,102070,-20,6,340,5.0,283.950000,283.450000,97,...,8,mq,240,mq,mq,mq,mq,mq,mq,
4,7037,2023-05-12 18:00:00,101820,-30,5,360,4.5,284.550000,284.250000,98,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
5,7072,2023-05-12 18:00:00,101640,40,3,320,3.5,287.950000,285.450000,85,...,6,mq,2040,8,mq,2400,mq,9,mq,
6,7110,2023-05-12 18:00:00,102360,20,0,20,4.6,287.050000,283.050000,77,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
7,7117,2023-05-12 18:00:00,102310,-20,8,280,2.6,285.550000,283.450000,87,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
8,7130,2023-05-12 18:00:00,102120,90,1,350,5.3,285.950000,281.450000,74,...,3,mq,1680,7,mq,2100,mq,mq,mq,
9,7139,2023-05-12 18:00:00,101950,-10,8,320,5.8,284.450000,282.150000,86,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
