In [1]:
import csv
import os
from datetime import datetime, timedelta
import copy

def get_ordered_booking_file_paths(root_dir):
    """Return the booking CSV paths found under ``root_dir``, oldest first.

    Only directories whose own name parses as ``DD-MM-YYYY`` are considered;
    inside them, every file named ``booking_1j*.csv`` is collected.

    Args:
        root_dir: Directory that is walked recursively.

    Returns:
        List of file paths sorted by the date of their containing directory.
        Files sharing the same date are ordered by path.
    """
    files_with_dates = []

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        try:
            dir_date = datetime.strptime(os.path.basename(dirpath), '%d-%m-%Y')
        except ValueError:
            # Directory name is not a date: nothing to collect at this level.
            continue

        for filename in filenames:
            if filename.startswith('booking_1j') and filename.endswith('.csv'):
                files_with_dates.append((os.path.join(dirpath, filename), dir_date))

    # os.walk lists directory entries in arbitrary order, so ties on the date
    # are broken on the path to keep the result identical from run to run.
    files_with_dates.sort(key=lambda item: (item[1], item[0]))
    print('Ordered Files :',files_with_dates)

    return [file_path for file_path, _ in files_with_dates]

def fixed_date_range(year):
    """Return the pair (1 April, 3 April) of ``year`` as ``datetime`` objects."""
    first_of_april, third_of_april = (datetime(year, 4, day) for day in (1, 3))
    return first_of_april, third_of_april

def parse_date(date_str):
    """Parse a date string written in one of the supported layouts.

    Supported layouts, tried in this order: ``YYYY-MM-DD HH:MM:SS``,
    ``YYYY-MM-DD``, ``DD-MM-YYYY`` and ``DD/MM/YYYY``.

    Args:
        date_str: Raw text (or ``None``). Surrounding whitespace is ignored.

    Returns:
        The parsed ``datetime``, or ``None`` when ``date_str`` is ``None``,
        blank, or contains any alphabetic character.

    Raises:
        ValueError: If the text is non-blank, letter-free and matches none of
            the supported layouts.
    """
    if date_str is None:
        return None

    # strptime does not tolerate padding, so trim before matching.
    date_str = date_str.strip()
    if not date_str or any(c.isalpha() for c in date_str):
        return None

    date_formats = [
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%d-%m-%Y',
        '%d/%m/%Y',
    ]

    for fmt in date_formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue

    raise ValueError(f"Date format not recognized: {date_str}")
def add_days(nb_days, date_initiale_str):
    """Shift a date string by ``nb_days`` days and return it as ``DD/MM/YYYY``.

    The input is parsed with ``parse_date``; the result is always formatted
    day/month/year, whatever layout the input used.
    """
    shifted = parse_date(date_initiale_str) + timedelta(days=nb_days)
    return shifted.strftime('%d/%m/%Y')

In [None]:


# Main execution block
#root_directory = './Booking'

root_directory = './MAEVAS'

file_paths = get_ordered_booking_file_paths(root_directory)

# Load data from files: one dict per file, mapping the listing key
# "typologie-nom-localite" to the list of its rows in file order.
annonces_par_fichier = []
missing_dates_dict = {}

for file_path in file_paths:
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        continue

    annonces = {}
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        print(file_path)
        for row in csv.DictReader(csvfile):
            key = f"{row['typologie']}-{row['nom']}-{row['localite']}"
            # Keep both the parsed dates (for arithmetic) and the raw row
            # (for writing back out unchanged).
            annonces.setdefault(key, []).append({
                'date_debut': parse_date(row['date_debut']),
                'date_fin': parse_date(row['date_fin']),
                'row': row
            })
    annonces_par_fichier.append(annonces)

# Process missing keys and update dates
print('Processing missing dates...')
year_of_interest = 2024
start_date, end_date = fixed_date_range(year_of_interest)




In [3]:
# NOTE(review): this repeats the add_days definition from the first cell and
# shadows it once this cell runs.
def add_days(nb_days, date_initiale_str):
    """Return ``date_initiale_str`` moved by ``nb_days`` days, as ``DD/MM/YYYY``."""
    base_date = parse_date(date_initiale_str)
    return (base_date + timedelta(days=nb_days)).strftime('%d/%m/%Y')

In [4]:
# Carry forward annonces that disappear from one file to the next.
# For every key present in two consecutive files, the entries of the previous
# file that lie beyond the current file's length are copied into the current
# file with both dates moved 7 days later, and recorded in missing_annonces.
# NOTE: annonces_par_fichier is modified in place, so re-running this cell
# without reloading the data may not reproduce the same result.
missing_annonces = []
one_week = timedelta(days=7)
for i in range(1, len(annonces_par_fichier)):
    print('File', i, '  and File', i-1)
    prev_annonce = annonces_par_fichier[i - 1]
    curr_annonce = annonces_par_fichier[i]

    # Walk the previous file's keys in insertion order so the order of
    # missing_annonces is reproducible (set iteration order is not).
    for key, prev_entries in prev_annonce.items():
        if key not in curr_annonce:
            # Key vanished entirely: register it with no entries.
            curr_annonce[key] = []
            continue

        curr_entries = curr_annonce[key]
        # The range is fixed before the loop, so appending below does not
        # change which previous entries are visited.
        for index in range(len(curr_entries), len(prev_entries)):
            source = prev_entries[index]
            date_debut = source['date_debut']
            date_fin = source['date_fin']
            if date_debut is None or date_fin is None:
                # Both parsed dates are needed to shift the entry; a missing
                # date_fin used to raise TypeError here.
                continue

            # Copy only once the entry is known to be kept.
            annonce = copy.deepcopy(source)
            annonce['date_debut'] = date_debut + one_week
            annonce['date_fin'] = date_fin + one_week
            annonce['row']['date_debut'] = add_days(7, annonce['row']['date_debut'])
            annonce['row']['date_fin'] = add_days(7, annonce['row']['date_fin'])
            missing_annonces.append(annonce)
            curr_entries.append(annonce)


File 1   and File 0
File 2   and File 1
File 3   and File 2
File 4   and File 3
File 5   and File 4
File 6   and File 5
File 7   and File 6
File 8   and File 7
File 9   and File 8
File 10   and File 9
File 11   and File 10
File 12   and File 11
File 13   and File 12
File 14   and File 13
File 15   and File 14
File 16   and File 15
File 17   and File 16
File 18   and File 17
File 19   and File 18
File 20   and File 19
File 21   and File 20
File 22   and File 21
File 23   and File 22
File 24   and File 23
File 25   and File 24
File 26   and File 25
File 27   and File 26
File 28   and File 27
File 29   and File 28
File 30   and File 29
File 31   and File 30
File 32   and File 31
File 33   and File 32
File 34   and File 33
File 35   and File 34
File 36   and File 35
File 37   and File 36
File 38   and File 37
File 39   and File 38
File 40   and File 39
File 41   and File 40
File 42   and File 41
File 43   and File 42
File 44   and File 43
File 45   and File 44
File 46   and File 45
File 47

In [5]:
def write_missing_annonces(missing_annonces, file_name):
    """Write the ``row`` dict of every annonce to a CSV file.

    The column list is taken from the first annonce's row, leaving out the
    ``'Unnamed: 12'`` column and the ``None`` key under which
    ``csv.DictReader`` stores overflow cells. Keys of later rows that are not
    in that column list are ignored; columns a row lacks are written empty.

    Args:
        missing_annonces: Sequence of dicts, each holding a ``'row'`` mapping.
        file_name: Destination path; overwritten if it already exists.

    Returns:
        None. Nothing is written when ``missing_annonces`` is empty.
    """
    print('Missings ', len(missing_annonces))
    if not missing_annonces:
        # No first row to take the header from (indexing [0] used to raise
        # IndexError here).
        return

    header = [
        field
        for field in missing_annonces[0]['row']
        if field is not None and field != 'Unnamed: 12'
    ]
    print(header)

    # Writing to CSV
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        # extrasaction='ignore' drops any key that is not in the header.
        writer = csv.DictWriter(file, fieldnames=header, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(item['row'] for item in missing_annonces)

In [6]:
# Export the carried-forward annonces collected in missing_annonces.
write_missing_annonces(missing_annonces, file_name='booking_3j_missings.csv')

Missings  15847892
['web-scraper-order', 'date_price', 'date_debut', 'date_fin', 'prix_init', 'prix_actuel', 'typologie', 'n_offre', 'nom', 'localite', 'date_debut-jour', 'Nb semaines']


In [None]:
# Export the carried-forward annonces collected in missing_annonces.
write_missing_annonces(missing_annonces, file_name='maeva_missings.csv')

In [None]:
# Creating hosting DB for test
# Dump one listing (the first key of the first file) from every loaded file
# into test/<key>_<i>.csv, one CSV per file index.
os.makedirs('test', exist_ok=True)
key = None
safe_key = None
for i, snapshot in enumerate(annonces_par_fichier):
    if i == 0:
        # First key in insertion order: unlike picking from a set, this is the
        # same key on every run. None when the first file holds no annonce.
        key = next(iter(snapshot), None)
        if key is not None:
            # A path separator inside the key would point into a
            # sub-directory that does not exist.
            safe_key = key.replace(os.sep, '_')
            if os.altsep:
                safe_key = safe_key.replace(os.altsep, '_')
    print('File:',i)

    entries = snapshot.get(key, [])
    if not entries:
        # Nothing to dump for this file (key absent or no rows).
        continue

    header = [
        field
        for field in entries[0]['row']
        if field is not None and field != 'Unnamed: 12'
    ]

    # Open once per file in write mode so re-running the cell does not append
    # duplicate rows to an existing CSV.
    with open(f'test/{safe_key}_{i}.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(entry['row'] for entry in entries)