Basically, when it failed in the middle of a batch, it might already have written out the boats (because a generator expression was accidentally used instead of a list comprehension).
This produced some duplicate rows. Luckily, they are easy to spot, because the boats for a given day are always written consecutively. So we just keep the last set of consecutive boats for each day.

In [26]:
import pandas as pd
import os

path = "D:\\Results"


def find_date_runs(dates):
    """Map each date to the consecutive runs of rows that carry it.

    Args:
        dates: Iterable of date values in file order (e.g. ``df['date']``).

    Returns:
        A dict mapping each distinct date to a list of ``(start, end)``
        positional index pairs (both inclusive), one pair per consecutive
        run, in order of appearance. A date with more than one pair occurs
        in several separate places in the file.
    """
    values = list(dates)
    runs = {}
    run_start = 0
    run_date = None
    for position, current in enumerate(values):
        if current != run_date:
            # A new run begins here; close the previous one (if any).
            if run_date is not None:
                runs.setdefault(run_date, []).append((run_start, position - 1))
            run_start = position
            run_date = current
    if run_date is not None:
        # Close the run that extends to the end of the data.
        runs.setdefault(run_date, []).append((run_start, len(values) - 1))
    return runs


def keep_last_run_per_date(df):
    """Return a copy of *df* holding only the last consecutive run per date.

    Rows are numbered by run (the counter increases every time the ``date``
    value differs from the previous row); for each date only the rows that
    belong to its highest-numbered run are kept. *df* itself is not modified.

    NOTE: ``groupby`` drops missing keys by default, so rows whose ``date``
    is missing are not part of the result.
    """
    run_id = (df['date'] != df['date'].shift()).cumsum()
    last_run_id = run_id.groupby(df['date']).transform('max')
    return df[run_id == last_run_id].copy()


for root, dirs, files in os.walk(path):
    print(root, dirs)
    # Skip any directory with more than one sub-directory (in the recorded
    # run this is the top-level results folder).
    if len(dirs) > 1:
        continue
    for file in files:
        if file.endswith(".new.csv"):
            # NOTE(review): presumably left over from an earlier version of
            # this clean-up that wrote side-by-side copies -- confirm.
            os.remove(os.path.join(root, file))
            continue
        if not file.endswith(".csv"):
            continue

        csv_path = os.path.join(root, file)
        print(f"Processing {file}")
        # NOTE(review): the dates are parsed here and re-serialised by
        # to_csv below, so the on-disk date format may change; one input
        # triggered pandas' dayfirst warning -- confirm this is intended.
        df = pd.read_csv(csv_path, parse_dates=['date'])

        # Report every date that shows up in more than one separate run.
        for ranges in find_date_runs(df['date']).values():
            if len(ranges) > 1:
                for start, end in ranges:
                    print(f"  {start} - {end}. Length: {end-start+1}")

        # Keep only the last run of each date and overwrite the file in place.
        df_last_group = keep_last_run_per_date(df)
        df_last_group.to_csv(csv_path, index=False)
        print(f"  Saved {file}")

            



D:\Results ['UDM', '20_21', '16_17_18_19_22_hpc_incomplete', '2023_07-2024_07_incomplete']
D:\Results\UDM []
D:\Results\20_21 []
Processing coverage.csv
  Saved coverage.csv
Processing orders.csv
  Saved orders.csv
Processing boat_detections.csv
  Saved boat_detections.csv
D:\Results\16_17_18_19_22_hpc_incomplete []
Processing coverage.csv
  Saved coverage.csv
Processing orders.csv
  Saved orders.csv
Processing boat_detections.csv
  Saved boat_detections.csv
D:\Results\2023_07-2024_07_incomplete []
Processing coverage.csv
  Saved coverage.csv
Processing orders.csv
  Saved orders.csv
Processing boat_detections.csv
  Saved boat_detections.csv


D:\Results ['UDM', '20_21', '16_17_18_19_22_hpc_incomplete', '2023_07-2024_07_incomplete']
D:\Results\UDM []
D:\Results\20_21 []
Processing coverage.csv
  Saved coverage.csv
Processing orders.csv
  Saved orders.csv
Processing boat_detections.csv
  71212 - 71401. Length: 190
  71505 - 71694. Length: 190
  71402 - 71429. Length: 28
  71695 - 71722. Length: 28
  71430 - 71504. Length: 75
  71723 - 71797. Length: 75
  Saved boat_detections.csv
D:\Results\16_17_18_19_22_hpc_incomplete []
Processing coverage.csv
  Saved coverage.csv
Processing orders.csv
  Saved orders.csv
Processing boat_detections.csv
C:\Users\turner30\AppData\Local\Temp\ipykernel_28600\1675975194.py:13: UserWarning: Parsing dates in %d/%m/%Y format when dayfirst=False (the default) was specified. Pass `dayfirst=True` or specify a format to silence this warning.
  df = pd.read_csv(os.path.join(root, file), parse_dates=['date'])
  23350 - 24250. Length: 901
  26109 - 27009. Length: 901
  24251 - 24841. Length: 591
  27010 - 27600. Length: 591
  24842 - 24846. Length: 5
  27601 - 27605. Length: 5
  24847 - 25202. Length: 356
  27606 - 27984. Length: 379
  25203 - 25916. Length: 714
  27985 - 28698. Length: 714
  25917 - 26108. Length: 192
  28699 - 28890. Length: 192
  50271 - 51096. Length: 826
  56888 - 57713. Length: 826
  51097 - 52326. Length: 1230
  57714 - 58943. Length: 1230
  52327 - 52538. Length: 212
  58944 - 59155. Length: 212
  52539 - 53510. Length: 972
  59156 - 60127. Length: 972
  53511 - 53529. Length: 19
  60128 - 60146. Length: 19
  53530 - 54496. Length: 967
  60147 - 61113. Length: 967
  54497 - 54897. Length: 401
  61114 - 61514. Length: 401
  54898 - 55787. Length: 890
  61515 - 62404. Length: 890
  55788 - 56887. Length: 1100
  62405 - 63504. Length: 1100
  74200 - 74780. Length: 581
  79004 - 79584. Length: 581
  74781 - 75089. Length: 309
  79585 - 79893. Length: 309
  75090 - 75371. Length: 282
  79894 - 80175. Length: 282
  75372 - 75547. Length: 176
  80176 - 80351. Length: 176
  75548 - 75787. Length: 240
  80352 - 80591. Length: 240
  75788 - 76598. Length: 811
  80592 - 81402. Length: 811
  76599 - 77528. Length: 930
  81403 - 82332. Length: 930
  77529 - 78287. Length: 759
  82333 - 83091. Length: 759
  78288 - 78334. Length: 47
  83092 - 83138. Length: 47
  78335 - 79003. Length: 669
  83139 - 84157. Length: 1019
  121621 - 121870. Length: 250
  122510 - 122759. Length: 250
  121871 - 121992. Length: 122
  122760 - 122881. Length: 122
  121993 - 122257. Length: 265
  122882 - 123153. Length: 272
  122258 - 122496. Length: 239
  123154 - 123804. Length: 651
  122497 - 122509. Length: 13
  123805 - 125221. Length: 1417
  135370 - 136010. Length: 641
  141108 - 141748. Length: 641
  136011 - 136376. Length: 366
  141749 - 142114. Length: 366
  136377 - 137332. Length: 956
  142115 - 143070. Length: 956
  137333 - 137765. Length: 433
  143071 - 143503. Length: 433
  137766 - 138727. Length: 962
  143504 - 144465. Length: 962
  138728 - 140405. Length: 1678
  144466 - 146143. Length: 1678
  140406 - 141107. Length: 702
  146144 - 146845. Length: 702
  189595 - 189971. Length: 377
  200711 - 201087. Length: 377
  189972 - 190218. Length: 247
  201088 - 201334. Length: 247
  190219 - 190812. Length: 594
  201335 - 201928. Length: 594
  190813 - 191016. Length: 204
  201929 - 202132. Length: 204
  191017 - 192402. Length: 1386
  202133 - 203518. Length: 1386
  192403 - 193473. Length: 1071
  203519 - 204589. Length: 1071
  193474 - 194696. Length: 1223
  204590 - 205812. Length: 1223
  194697 - 194740. Length: 44
  205813 - 205856. Length: 44
  194741 - 195132. Length: 392
  205857 - 206248. Length: 392
  315616 - 316323. Length: 708
  316683 - 317390. Length: 708
  316324 - 316682. Length: 359
  317391 - 317749. Length: 359
  Saved boat_detections.csv
D:\Results\2023_07-2024_07_incomplete []
Processing coverage.csv
  Saved coverage.csv
Processing orders.csv
  Saved orders.csv
Processing boat_detections.csv
  Saved boat_detections.csv