In [2]:
# Full province/territory name -> two-letter code used in the report file names.
provincial_name_to_code_map = {
    'Alberta': 'AB',
    'British Columbia': 'BC',
    'Manitoba': 'MB',
    'Newfoundland and Labrador': 'NF',
    'Nova Scotia': 'NS',
    'Northwest Territories': 'NT',
    'Nunavut': 'NU',
    'Ontario': 'ON',
    'Quebec': 'QC',
    'Yukon': 'YT',
}
# Codes in the same (alphabetical) order as the mapping above.
provincial_code = list(provincial_name_to_code_map.values())
# One precipitation report CSV per province code, in the same order.
provincial_report = [f'{code}_ASOS_Precipitation.csv' for code in provincial_code]

In [3]:
import pandas as pd
import os

# Load each provincial report CSV into a DataFrame, keyed by province code.
reports = {
    code: pd.read_csv(report)
    for report, code in zip(provincial_report, provincial_code)
}

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  reports[code] = pd.read_csv(report)


In [4]:
# Parse the textual 'valid' column of every report into a datetime column 'valid_dt'.
for province_code, report_df in reports.items():
    report_df['valid_dt'] = pd.to_datetime(report_df['valid'], format='%Y-%m-%d %H:%M')

In [5]:
# Display the report loaded for province code 'AB', now including the parsed 'valid_dt' column.
reports['AB']

Unnamed: 0,station,valid,tmpc,dwpc,relh,drct,sknt,mslp,p01m,vsby,...,TS,SN,RA,Hail,GS,IP,DZ,FZRA/FZDZ,precipitation_category,valid_dt
0,CYQF,2022-12-16 00:00,-8.00,-11.00,79.00,240.00,5.00,1032.60,0.0,9.00,...,False,False,False,False,False,False,False,False,MISSING,2022-12-16 00:00:00
1,CYQL,2022-12-16 00:00,-6.00,-9.00,79.31,250.00,4.00,1034.00,0.0,25.00,...,False,False,False,False,False,False,False,False,MISSING,2022-12-16 00:00:00
2,CYLL,2022-12-16 00:00,-15.00,-17.00,84.68,280.00,6.00,1030.50,0.0,6.00,...,False,False,False,False,False,False,False,False,,2022-12-16 00:00:00
3,CYOJ,2022-12-16 00:00,-12.00,-13.00,92.25,180.00,2.00,1028.70,0.0,10.00,...,False,True,False,False,False,False,False,False,SNOW_ONLY,2022-12-16 00:00:00
4,CYZU,2022-12-16 00:00,-4.00,-7.00,79.62,290.00,4.00,1030.40,0.0,15.00,...,False,False,False,False,False,False,False,False,MISSING,2022-12-16 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64197,CYQL,2024-01-23 23:00,4.00,-1.00,69.88,260.00,14.00,1008.90,0.0,60.00,...,False,False,False,False,False,False,False,False,MISSING,2024-01-23 23:00:00
64198,CYOJ,2024-01-23 23:00,-17.00,-20.00,77.49,360.00,5.00,1013.90,0.0,15.00,...,False,False,False,False,False,False,False,False,MISSING,2024-01-23 23:00:00
64199,CYZU,2024-01-23 23:00,-2.00,-6.00,74.08,290.00,3.00,1009.50,0.0,12.00,...,False,False,False,False,False,False,False,False,MISSING,2024-01-23 23:00:00
64200,CYQF,2024-01-23 23:00,-4.00,-8.00,73.70,0.00,0.00,1011.10,0.0,6.00,...,False,False,False,False,False,False,False,False,,2024-01-23 23:00:00


In [6]:
# province code -> station -> list of per-report entries (in report row order).
# Each entry remembers the report row label and timestamp; the image fields
# start as None and are filled in by the image-matching step.
provincial_stations_info = {}
for province_code, report_df in reports.items():
    # One empty list per station present in this province's report.
    station_entries = {station: [] for station in report_df['station'].unique()}
    for index, station, valid_dt in zip(report_df.index, report_df['station'], report_df['valid_dt']):
        station_entries[station].append({
            'index': index,
            'valid_dt': valid_dt,
            'file_name': None,
            'folder_name': None,
            'timedelta': None,
        })
    provincial_stations_info[province_code] = station_entries

In [7]:
import os
import datetime
from tqdm import tqdm

def extract_info_from_images_multiple_folders(folder_paths):
    """
    Collect station ID and capture time for every image file in the given folders.

    File names are split on '-'; names with fewer than four parts are skipped
    silently. The first part is taken as the IATA ID and the parts from the
    third onwards (without the last one when there are more than four parts)
    form the timestamp, e.g. 'CYOJ-NE-2022_12_16-19_54_30.jpg' is read as
    2022-12-16 19:54:30.

    :param folder_paths: Iterable of directory paths to scan (non-recursive).
    :return: List of dicts with keys 'IATA_ID', 'datetime', 'file_name' and
             'folder_name' (the folder path the file was found in).
             Files whose timestamp cannot be parsed are reported with a
             printed message and left out of the result.
    """
    # Imported locally under a private alias: a later cell executes
    # `from datetime import datetime, timedelta`, which rebinds the notebook-level
    # name `datetime` to the class. Looking up `datetime.datetime` through that
    # global would then raise AttributeError if this function were called again.
    import datetime as _datetime

    image_extensions = ('.jpg', '.jpeg', '.png')
    extracted_info = []

    for folder_path in tqdm(folder_paths):
        for file_name in os.listdir(folder_path):
            # Only image files are considered (case-insensitive extension check).
            if not file_name.lower().endswith(image_extensions):
                continue

            parts = file_name.split('-')
            if len(parts) < 4:
                continue

            IATA_ID = parts[0]
            date_time_str = '-'.join(parts[2:-1]) if len(parts) > 4 else '-'.join(parts[2:])
            # Strip the extension (if still present) and normalise '_' to '-'.
            date_time_str = date_time_str.rsplit('.', 1)[0].replace('_', '-')
            try:
                date_time = _datetime.datetime.strptime(date_time_str, "%Y-%m-%d-%H-%M-%S")
            except ValueError as e:
                print(f"Error parsing date and time from '{file_name}': {e}")
                continue

            extracted_info.append({
                "IATA_ID": IATA_ID,
                "datetime": date_time,
                "file_name": file_name,
                "folder_name": folder_path
            })

    return extracted_info

In [8]:
import os

def list_subdirectories(path):
    """
    List the immediate subdirectories of a directory.

    :param path: String, the directory to inspect.
    :return: List of strings; each is `path` joined with the name of an entry
             that is itself a directory. When `path` is not a directory a
             message is printed and an empty list is returned.
    """
    # Guard clause: nothing to list when the path is not a directory.
    if not os.path.isdir(path):
        print(f"The provided path: {path} is not a valid directory.")
        return []

    # Build the full path of every entry, then keep only the directories.
    entry_paths = (os.path.join(path, entry) for entry in os.listdir(path))
    return [entry_path for entry_path in entry_paths if os.path.isdir(entry_path)]

In [9]:
# Root directory whose immediate subdirectories are scanned for camera images.
storage_path = '/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002/NavCan_WxCams_Sample_v4'
# Full paths of every immediate subdirectory of storage_path.
extract_paths = list_subdirectories(storage_path)

In [10]:
# Drop every path that contains the substring 'images'.
# NOTE(review): the test runs against the full path, not only the directory
# name, so it relies on storage_path itself not containing 'images' — confirm.
extract_paths = [path for path in extract_paths if 'images' not in path]

In [11]:
# Scan every remaining folder and collect one record per image file.
og_info = extract_info_from_images_multiple_folders(extract_paths)

# Preview the first five records; the placeholders are the record's own keys.
preview_template = "IATA_ID: {IATA_ID}, datetime: {datetime}, file_name: {file_name}, folder_name: {folder_name}"
for item in og_info[:5]:
    print(preview_template.format(**item))

100%|██████████| 11369/11369 [00:47<00:00, 241.55it/s]


In [11]:
import pandas as pd

# Directory that holds the camera metadata workbook.
root_path = "/space/hall5/sitestore/eccc/mrd/rpnarmp/snow000/NavCan_WxCams"
# Camera metadata table; its 'IATA_ID' and 'Province/Territory' columns are
# read by get_provincial_code_from_iata_id.
df_navcan = pd.read_excel(os.path.join(root_path, 'navcan_cameras_to_archive.xlsx'), engine='openpyxl')

In [12]:
# Display the camera metadata table.
df_navcan

Unnamed: 0,IATA_ID,direction,source,Climate_ID,Station_ID,WMO_ID,TC_ID,MSC_ID,AUTO/MAN,Name,Latitude,Longitude,Elevation(m),Data_Provider,Province/Territory,First Year,Last Year
0,CWWQ,N,navCan2,1195251,52158,,WWQ,1195251,MAN,Muncho Lake,59.0167,-125.767,836.5,NAV CANADA,British Columbia,2014,2022
1,CYAB,SE,navCan2,2400401,50497,,YAB,2400401,MAN,Arctic Bay,73.00639,-85.04732,21.9,NAV CANADA,Nunavut,2012,2022
2,CYBD,SW,navCan2,1060848,52978,,YBD,1060845,MAN,Bella Coola,52.38763,-126.59577,35.7,NAV CANADA,British Columbia,2014,2022
3,CYBV,W,navCan2,5030204,53318,,YBV,5030201,MAN,Berens River,52.35889,-97.01794,222.2,NAV CANADA,Manitoba,2015,2022
4,CYCY,N,navCan2,2400804,51077,71090.0,YCY,2400803,MAN,Clyde River,70.48592,-68.51694,26.5,NAV CANADA,Nunavut,2013,2022
5,CYDB,S,navCan2,2100181,49650,71001.0,YDB,2100180,MAN,Burwash,61.37066,-139.03995,806.2,NAV CANADA,Yukon,2011,2022
6,CYEK,W,navCan2,2300427,50557,71174.0,YEK,2300428,MAN,Arviat,61.09397,-94.07159,10.4,NAV CANADA,Nunavut,2006,2022
7,CYFO,SW,navCan2,5050973,52961,,YFO,5050961,MAN,Flin Flon,54.67807,-101.68189,304.2,NAV CANADA,Manitoba,2014,2022
8,CYGE,N,navCan2,1173214,52980,,YGE,1173209,MAN,Golden,51.29917,-116.9825,784.9,NAV CANADA,British Columbia,2014,2022
9,CYGQ,W,navCan2,6042717,53519,71834.0,YGQ,6042717,MAN,Geraldton,49.77869,-86.93855,348.4,NAV CANADA,Ontario,2015,2022


In [13]:
def get_provincial_code_from_iata_id(iata_id):
    """
    Return the province/territory code for a camera's IATA ID.

    Takes the first row of `df_navcan` whose 'IATA_ID' equals `iata_id` and maps
    its 'Province/Territory' name through `provincial_name_to_code_map`.

    :param iata_id: String, the IATA ID to look up.
    :return: String, the mapped province/territory code.
    """
    matches_station = df_navcan['IATA_ID'] == iata_id
    province_name = df_navcan.loc[matches_station, 'Province/Territory'].iloc[0]
    return provincial_name_to_code_map[province_name]

# Example: look up the province code for IATA ID 'CWWQ'.
get_provincial_code_from_iata_id('CWWQ')

'BC'

In [14]:
from datetime import datetime, timedelta
from tqdm import tqdm

def find_closest_index(lst, target, time_window=timedelta(minutes=10)):
    """
    Find the index of the entry whose 'valid_dt' is closest to the target datetime.

    The previous implementation kept searching to the left after any hit inside
    the window, so it returned the leftmost entry within the window rather than
    the closest one. This version locates the insertion point of `target` and
    compares the two neighbouring entries.

    :param lst: List of dicts with a 'valid_dt' key, sorted by 'valid_dt' in
                ascending order (required for the binary search).
    :param target: Datetime to match.
    :param time_window: Maximum allowed absolute difference (inclusive).
    :return: Index of the closest entry within `time_window`, or None when no
             entry is close enough (including when `lst` is empty). When two
             entries are equally close, the earlier one is returned.
    """
    # Lower-bound binary search: first position whose valid_dt is >= target.
    low, high = 0, len(lst)
    while low < high:
        mid = (low + high) // 2
        if lst[mid]['valid_dt'] < target:
            low = mid + 1
        else:
            high = mid

    # The closest entry is either the last one before the target or the first
    # one at/after it; every other entry is at least as far away.
    best_index = None
    best_difference = None
    for candidate in (low - 1, low):
        if not 0 <= candidate < len(lst):
            continue
        difference = abs(target - lst[candidate]['valid_dt'])
        if difference > time_window:
            continue
        # Strict '<' keeps the earlier entry on an exact tie.
        if best_difference is None or difference < best_difference:
            best_index = candidate
            best_difference = difference

    return best_index

# Attach each image to the report entry that find_closest_index selects for the
# image's station; an entry keeps only the image with the smallest time offset.
for item in tqdm(og_info):
    # Named `province_code` rather than `provincial_code`: the latter is the list
    # of province codes defined at the top of the notebook and used when the
    # report CSVs are loaded, so it must not be overwritten with a single string.
    province_code = get_provincial_code_from_iata_id(item['IATA_ID'])
    province_stations = provincial_stations_info[province_code]
    if item['IATA_ID'] not in province_stations:
        # No report rows exist for this camera's station.
        continue

    station_info = province_stations[item['IATA_ID']]
    closest_index = find_closest_index(station_info, item['datetime'])
    if closest_index is None:
        continue

    report_entry = station_info[closest_index]
    time_difference = abs(item['datetime'] - report_entry['valid_dt'])
    # Replace a previously linked image only when this one is strictly closer.
    if (report_entry['timedelta'] is None) or (time_difference < report_entry['timedelta']):
        report_entry['file_name'] = item['file_name']
        report_entry['folder_name'] = item['folder_name']
        report_entry['timedelta'] = time_difference

100%|██████████| 1443444/1443444 [07:30<00:00, 3207.40it/s]


In [15]:
from tqdm import tqdm

# Count the report entries that ended up with an image attached.
num_items = 0
for province_code in tqdm(list(provincial_stations_info.keys())):
    stations_of_province = provincial_stations_info[province_code]
    for station in tqdm(list(stations_of_province.keys())):
        num_items += sum(
            1 for entry in stations_of_province[station] if entry['file_name'] is not None
        )

print(f"{num_items} images linked with corresponding report")

  0%|          | 0/10 [00:00<?, ?it/s]
100%|██████████| 5/5 [00:00<00:00, 257.11it/s]

100%|██████████| 4/4 [00:00<00:00, 359.36it/s]

100%|██████████| 4/4 [00:00<00:00, 387.28it/s]

100%|██████████| 1/1 [00:00<00:00, 350.81it/s]

100%|██████████| 1/1 [00:00<00:00, 354.46it/s]

100%|██████████| 6/6 [00:00<00:00, 956.33it/s]

100%|██████████| 7/7 [00:00<00:00, 246.81it/s]

100%|██████████| 2/2 [00:00<00:00, 360.89it/s]

100%|██████████| 1/1 [00:00<00:00, 404.35it/s]
 90%|█████████ | 9/10 [00:00<00:00, 87.22it/s]
100%|██████████| 2/2 [00:00<00:00, 492.00it/s]
100%|██████████| 10/10 [00:00<00:00, 91.72it/s]

290452 images linked with corresponding report





In [16]:
# Give every report two image-link columns, initially filled with None.
for province_code, report_df in reports.items():
    for column_name in ('matched_image_file', 'matched_image_folder'):
        report_df[column_name] = None

In [17]:
from tqdm import tqdm

# Copy the linked image file/folder of every matched entry into its report row.
# Values are written once per station with list-based .loc assignments instead
# of two scalar .loc assignments per row, which is far slower on large reports.
for province_code in tqdm(list(provincial_stations_info.keys())):
    report_df = reports[province_code]
    for station in tqdm(list(provincial_stations_info[province_code].keys())):
        matched_entries = [
            entry
            for entry in provincial_stations_info[province_code][station]
            if entry['file_name'] is not None
        ]
        if not matched_entries:
            # Nothing to write for this station.
            continue

        # Row labels recorded when the station entries were built from the report.
        row_labels = [entry['index'] for entry in matched_entries]
        report_df.loc[row_labels, 'matched_image_file'] = [entry['file_name'] for entry in matched_entries]
        report_df.loc[row_labels, 'matched_image_folder'] = [entry['folder_name'] for entry in matched_entries]

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:04,  1.19s/it][A
 40%|████      | 2/5 [00:02<00:03,  1.09s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.07s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.10s/it][A
100%|██████████| 5/5 [00:05<00:00,  1.01s/it][A
 10%|█         | 1/10 [00:05<00:45,  5.05s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:01<00:03,  1.17s/it][A
 50%|█████     | 2/4 [00:01<00:01,  1.29it/s][A
 75%|███████▌  | 3/4 [00:02<00:00,  1.12it/s][A
100%|██████████| 4/4 [00:03<00:00,  1.25it/s][A
 20%|██        | 2/10 [00:08<00:31,  3.97s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:01<00:04,  1.34s/it][A
 50%|█████     | 2/4 [00:01<00:01,  1.16it/s][A
 75%|███████▌  | 3/4 [00:02<00:00,  1.10it/s][A
100%|██████████| 4/4 [00:03<00:00,  1.28it/s][A
 30%|███       | 3/10 [00:11<00:25,  3.58s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/

In [18]:
# Rows of the 'AB' report that have a matched image file.
reports['AB'][reports['AB']['matched_image_file'].notna()]

Unnamed: 0,station,valid,tmpc,dwpc,relh,drct,sknt,mslp,p01m,vsby,...,RA,Hail,GS,IP,DZ,FZRA/FZDZ,precipitation_category,valid_dt,matched_image_file,matched_image_folder
151,CYOJ,2022-12-16 19:50,-14.00,-15.00,92.12,340.00,7.00,1029.70,0.0,0.75,...,False,False,False,False,False,False,SNOW_ONLY,2022-12-16 19:50:00,CYOJ-NE-2022_12_16-19_54_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
152,CYQL,2022-12-16 20:00,-3.00,-7.00,73.89,240.00,15.00,1022.90,0.0,60.00,...,False,False,False,False,False,False,SNOW_ONLY,2022-12-16 20:00:00,CYQL-SW-2022_12_16-20_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
153,CYQF,2022-12-16 20:00,-1.00,-7.00,63.75,240.00,5.00,1021.20,0.0,9.00,...,False,False,False,False,False,False,MISSING,2022-12-16 20:00:00,CYQF-W-2022_12_16-20_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
154,CYOJ,2022-12-16 20:00,-14.00,-15.00,92.12,330.00,10.00,1030.00,0.0,0.62,...,False,False,False,False,False,False,SNOW_ONLY,2022-12-16 20:00:00,CYOJ-NE-2022_12_16-20_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
155,CYLL,2022-12-16 20:00,-14.00,-16.00,84.80,110.00,10.00,1022.70,0.0,5.00,...,False,False,False,False,False,False,SNOW_ONLY,2022-12-16 20:00:00,CYLL-NW-2022_12_16-20_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64197,CYQL,2024-01-23 23:00,4.00,-1.00,69.88,260.00,14.00,1008.90,0.0,60.00,...,False,False,False,False,False,False,MISSING,2024-01-23 23:00:00,CYQL-SW-2024_01_23-23_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
64198,CYOJ,2024-01-23 23:00,-17.00,-20.00,77.49,360.00,5.00,1013.90,0.0,15.00,...,False,False,False,False,False,False,MISSING,2024-01-23 23:00:00,CYOJ-NE-2024_01_23-23_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
64199,CYZU,2024-01-23 23:00,-2.00,-6.00,74.08,290.00,3.00,1009.50,0.0,12.00,...,False,False,False,False,False,False,MISSING,2024-01-23 23:00:00,CYZU-SE-2024_01_23-23_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...
64200,CYQF,2024-01-23 23:00,-4.00,-8.00,73.70,0.00,0.00,1011.10,0.0,6.00,...,False,False,False,False,False,False,,2024-01-23 23:00:00,CYQF-W-2024_01_23-23_04_30.jpg,/space/hall5/sitestore/eccc/mrd/rpnarmp/hol002...


In [19]:
import os

# Write each report, including the matched image columns, to its own CSV
# (row index omitted).
for province_code, report_df in reports.items():
    output_file = f"{province_code}_ASOS_Matched.csv"
    report_df.to_csv(output_file, index=False)