In [31]:
import json 
import os
import pandas as pd 
from datetime import datetime, timedelta
from pathlib import Path

In [32]:

def split_data(data_to_split:list, by:int) -> list:
    """Cut a sequence into consecutive chunks of at most `by` items.

    Args:
        data_to_split (list): sequence to cut; it is sliced, never modified
        by (int): chunk size; 0 makes range() raise ValueError
    Returns:
        list: the chunks in original order, the last one possibly shorter
    """
    chunks = []
    for start in range(0, len(data_to_split), by):
        chunks.append(data_to_split[start:start + by])
    return chunks

In [33]:
def get_json_file_content(json_file_path:str, key:str=None) -> object:
    """Load a JSON file and optionally return one of its top-level entries.

    Args:
        json_file_path (str): json file path
        key (str, optional): name of the top-level entry to return; when
            omitted (or empty) the whole document is returned
    Returns:
        object: the parsed document, or the value stored under `key`
            (None when the key is absent). None is also returned, after
            printing an error, when the file does not exist.
    """
    if not Path(json_file_path).exists():
        print('error', 'file does not found')
        return None

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(json_file_path, 'r', encoding='utf-8') as openfile:
        file_content = json.load(openfile)

    if file_content and key:
        if isinstance(file_content, dict):
            return file_content.get(key)
        # Only JSON objects have keys: report the mismatch and hand back the
        # whole document instead of failing with AttributeError on `.get`.
        print('error', f"key {key!r} requested but the JSON content is not an object")
    return file_content

In [34]:
def combine_file_content(file_path:str, file_type:str) -> list:
    match file_type:
        case 'json':
            files = os.listdir(file_path)
            files = list(filter(lambda f: f.endswith('.json'), files))
            print(files)
            files = [f"{file_path + x}" for x in files]
            file_contents = []
            for file in files:
                print(file)
                file_content = get_json_file_content(f"{file}")
                file_contents += file_content
            return file_contents
        case 'csv':
            pass

In [35]:
def get_saturdays(start_date, end_date):
    """List every Saturday between two dates, both ends included.

    Args:
        start_date: first day of the range, in any form pandas.bdate_range accepts
        end_date: last day of the range, same forms as start_date
    Returns:
        list: the Saturdays as 'YYYY-MM-DD' strings, in chronological order
    """
    # Custom business-day calendar whose only working day is Saturday.
    saturday_index = pd.bdate_range(start=start_date, end=end_date, freq='C', weekmask='Sat')
    formatted = saturday_index.strftime('%Y-%m-%d')
    return formatted.tolist()

In [36]:
def normalize_urls(urls, dates:list) -> list:
    """Append stay-date query parameters to every URL, for every start date.

    Args:
        urls: base URLs to expand
        dates (list): start dates as 'YYYY-MM-DD' strings; the matching end
            date is the start date plus 6 days
    Returns:
        list: one URL per (date, url) pair, grouped by date in input order
    """
    def _with_stay(url, first_day, last_day):
        # URLs that do not contain '.php' get a fresh query string plus the
        # residence key, i.e. the text between the last '_' and the next '.'.
        if '.php' not in url:
            residence_cle = url.split('_')[-1].split('.')[0]
            return f"{url}?date_debut={first_day}&date_fin={last_day}&residence_cle={residence_cle}"
        # '.php' URLs are extended with '&', so they are expected to carry a query already.
        return f"{url}&date_debut={first_day}&date_fin={last_day}"

    stays = (
        (day, (datetime.strptime(day, '%Y-%m-%d') + timedelta(days=6)).strftime('%Y-%m-%d'))
        for day in dates
    )
    return [
        _with_stay(url, first_day, last_day)
        for first_day, last_day in stays
        for url in urls
    ]

In [37]:
# Date window to cover. ISO 'YYYY-MM-DD' is used because slash-separated dates are
# ambiguous for pandas: '03/04/2025' is read month-first, whereas '30/10/2025' can
# only be read day-first, so the two bounds could be parsed with different rules.
start_date = "2025-03-03"
end_date = "2025-10-30"
# Input folder (every *.json file in it is loaded) and output folder for the generated files.
# NOTE(review): absolute, machine-specific paths; consider a configurable base directory.
dest_folder_path = "/home/keller/Documents/Jobdev/G2A/Pricing/dests/maeva/setups/new_urls/normal/"
output_path = "/home/keller/Documents/Jobdev/G2A/demo/"
# Prefix of the generated files: <name>_1.json, <name>_2.json, ...
name = "dest"
# Divisor used to size the output chunks (spelling kept: other cells use this name).
devided_by = 35

In [38]:
# Load every JSON file of the destination folder and merge the contents into one list.
dest_data = combine_file_content(dest_folder_path, 'json')

['all_normal_urls.json']
/home/keller/Documents/Jobdev/G2A/Pricing/dests/maeva/setups/new_urls/normal/all_normal_urls.json


In [39]:
# Size check: (raw URLs x Saturdays) spread over `devided_by` parts. The raw list is
# used here, i.e. before query strings are stripped and duplicates removed.
len(dest_data) * len(get_saturdays(start_date, end_date)) // devided_by

44533

In [40]:
# NOTE(review): 18 is a magic number with no named constant; presumably an alternative
# number of parts being sized up. Confirm, or remove this scratch cell.
len(dest_data) // 18

2546

In [41]:
# Reduce every URL to its base form by dropping the trailing parameters:
# '.php' URLs keep everything before the first '&', the others everything before '?'.
cleaned_urls = [
    url.split('&')[0] if '.php' in url else url.split('?')[0]
    for url in dest_data
]

In [42]:
# Drop duplicate URLs. dict.fromkeys keeps the first-seen order, so the chunks built
# from this list are identical on every run; list(set(...)) orders strings
# differently from one kernel session to the next because of hash randomization.
print(len(cleaned_urls))
cleaned_urls = list(dict.fromkeys(cleaned_urls))
print(len(cleaned_urls))

45843
15314


In [43]:
# Every Saturday of the configured window, as 'YYYY-MM-DD' strings.
saturdays = get_saturdays(start_date, end_date)
print(saturdays)

['2025-03-08', '2025-03-15', '2025-03-22', '2025-03-29', '2025-04-05', '2025-04-12', '2025-04-19', '2025-04-26', '2025-05-03', '2025-05-10', '2025-05-17', '2025-05-24', '2025-05-31', '2025-06-07', '2025-06-14', '2025-06-21', '2025-06-28', '2025-07-05', '2025-07-12', '2025-07-19', '2025-07-26', '2025-08-02', '2025-08-09', '2025-08-16', '2025-08-23', '2025-08-30', '2025-09-06', '2025-09-13', '2025-09-20', '2025-09-27', '2025-10-04', '2025-10-11', '2025-10-18', '2025-10-25']


In [44]:
# Chunk size giving at most `devided_by` chunks: round up rather than down (floor
# division leaves a remainder that spills into one extra chunk) and never go below 1,
# because split_data() cannot step by 0 when there are fewer URLs than `devided_by`.
number_data_per_file = max(1, -(-len(cleaned_urls) // devided_by))

In [45]:
# Cut the URL list into consecutive chunks of number_data_per_file items (the last one may be shorter).
splited_datas = split_data(cleaned_urls, number_data_per_file)
# NOTE(review): this bare expression renders the whole nested list in the cell output;
# consider displaying only len(splited_datas) or a small slice.
splited_datas

[['https://www.maeva.com/fr-fr/superbe-appartement-cosy-avec-balcon_921331.html',
  'https://maeva.com/fr-fr/residence-l-horizon_502814.html',
  'https://maeva.com/fr-fr/vvf-club-intense-l-aure-pyreneen_2938.html',
  'https://www.maeva.com/fr-fr/location-de-vacances-morzine-114_108021.html',
  'https://www.maeva.com/fr-fr/location-de-vacances-le-grand-bornand-230_157814.html',
  'https://maeva.com/fr-fr/residence-parc-de-la-mer_12089.html',
  'https://www.maeva.com/fr-fr/appartements-a-les-gets_1056011.html',
  'https://maeva.com/fr-fr/residence-champfleuri-2_691134.html',
  'https://www.maeva.com/fr-fr/village-club-du-soleil-morzine_57925.html',
  'https://www.maeva.com/fr-fr/residence-malinvern-b11_1063631.html',
  'https://www.maeva.com/fr-fr/sowell-family-les-bergers_56471.html',
  'https://www.maeva.com/fr-fr/residence-orr-des-forets_436004.html',
  'https://maeva.com/fr-fr/residence-la-villa-bellevue-penvenan_533764.html',
  'https://www.maeva.com/fr-fr/location-de-vacances-avori

In [46]:
# Expand every chunk into dated URLs: one URL per (Saturday, base URL) pair.
new_splited_datas = [normalize_urls(data, saturdays) for data in splited_datas]

In [47]:
# Write each chunk to <output_path>/<name>_<n>.json, numbering the files from 1.
output_dir = Path(output_path)
# open(..., 'w') does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)
for file_number, urls_chunk in enumerate(new_splited_datas, start=1):
    with open(output_dir / f"{name}_{file_number}.json", 'w', encoding='utf-8') as outfile:
        json.dump(urls_chunk, outfile)

last changed