1. Filtering Files by Area and Copying Them

We will create a method to filter files based on a specified geographical area and ensure vessels have at least 10 consecutive entries in the area.

In [39]:
import matplotlib.pyplot as plt
from matplotlib.patches import FancyArrow
import os
import pandas as pd
import numpy as np

def filter_files_by_area(input_folder, output_folder, area_bounds):
    """
    Filters vessel files to include only those with at least 10 consecutive entries in a specified area.
    Copies these files to the output folder and removes rows outside the area.
    """
    os.makedirs(output_folder, exist_ok=True)
    min_lat, max_lat, min_lon, max_lon = area_bounds

    files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
    total_files = len(files)
    print(f"Processing {total_files} files from folder: {input_folder}")

    for idx, file in enumerate(files):
        print(f"({idx + 1}/{total_files}) Processing file: {file}")
        file_path = os.path.join(input_folder, file)
        
        try:
            # Read the file with specified dtypes
            df = pd.read_csv(
                file_path,
                dtype={
                    'Timestamp': str,
                    'Type of mobile': str,
                    'MMSI': str,
                    'Latitude': float,
                    'Longitude': float,
                    'Navigational status': str,
                    'ROT': float,
                    'SOG': float,
                    'COG': float,
                    'Heading': float,
                    'IMO': str,
                    'Callsign': str,
                    'Name': str,
                    'Ship type': str,
                    'Cargo type': str,
                    'Width': float,
                    'Length': float,
                    'Type of position fixing device': str,
                    'Draught': float,
                    'Destination': str,
                    'ETA': str,
                    'Data source type': str,
                    'A': str,
                    'B': str,
                    'C': str,
                    'D': str,
                },
                low_memory=False
            )
            print(f"Loaded file: {file}, Rows: {len(df)}")
            
            # Print the exact raw column names to check for issues
            print(f"Raw column names: {list(df.columns)}")  # Debug: raw column names

            # Strip leading "#" characters and any extra whitespace
            df.columns = df.columns.str.lstrip('#').str.strip()
            print(f"Cleaned column names: {list(df.columns)}")  # Debug: cleaned column names
            
            # Filter rows within the specified area
            df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
            df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')

            in_area = (
                (df['Latitude'] >= min_lat) & 
                (df['Latitude'] <= max_lat) & 
                (df['Longitude'] >= min_lon) & 
                (df['Longitude'] <= max_lon)
            )
            print(f"Rows matching area condition: {in_area.sum()}")  # Debug: print number of rows matching area
            df_area = df[in_area].copy()  # Make a copy to avoid SettingWithCopyWarning
            print(f"Rows in area for {file}: {len(df_area)}")

            # Check for at least 10 consecutive entries in the area
            if not df_area.empty:
                # Sort by Timestamp (assuming consecutive entries are based on this field)
                df_area = df_area.sort_values(by='Timestamp').reset_index(drop=True)

                # Print a few sample Timestamps to check if they are consistent
                print(f"Sample Timestamps from {file}:")
                print(df_area['Timestamp'].head())  # Debug: print first few Timestamps
                
                # Convert Timestamp to datetime with the correct format
                try:
                    df_area['Timestamp'] = pd.to_datetime(df_area['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
                except Exception as e:
                    print(f"Error converting Timestamp in {file}: {e}")
                    continue

                # Check for any invalid Timestamps
                if df_area['Timestamp'].isna().any():
                    print(f"Error: Some Timestamps could not be parsed in {file}")
                    print("Invalid Timestamps:", df_area[df_area['Timestamp'].isna()])  # Debug: show rows with invalid Timestamps
                    continue  # Skip this file if conversion fails
                
                # Calculate the difference between consecutive timestamps
                df_area['time_diff'] = df_area['Timestamp'].diff().fillna(pd.Timedelta(seconds=0))

                # Define what you consider "consecutive" (e.g., within 1 minute)
                consecutive_threshold = pd.Timedelta(minutes=1)  # Adjust based on your criteria
                df_area['consecutive'] = (df_area['time_diff'] <= consecutive_threshold).cumsum()

                # Find the maximum number of consecutive entries
                max_consecutive_count = df_area['consecutive'].value_counts().max()
                print(f"Max consecutive entries for {file}: {max_consecutive_count}")

                # If there are at least 10 consecutive entries, save the file
                if max_consecutive_count >= 10:
                    output_path = os.path.join(output_folder, file)
                    df_area.drop(columns=['time_diff', 'consecutive'], inplace=True)  # Clean up temporary columns
                    df_area.to_csv(output_path, index=False)
                    print(f"File saved: {output_path}")
                else:
                    print(f"Skipped {file}: Insufficient consecutive entries in area.")
            else:
                print(f"Skipped {file}: No rows in specified area.")

        except Exception as e:
            print(f"Error processing {file}: {e}")


2. Visualizing Data for a Specified Timeslot

We will generate 10 map snippets, each showing positions and vector arrows for one vessel during the specified timeslot.

In [40]:
def visualize_timeslot(folder, timeslot, output_folder="map_snippets"):
    """
    Visualizes vessel positions and vectors for a specified timeslot.
    
    :param folder: Folder containing filtered vessel files
    :param timeslot: Tuple specifying start and end times (e.g., "10:30", "11:00")
    :param output_folder: Folder to save the map snippets
    """
    os.makedirs(output_folder, exist_ok=True)
    start_time, end_time = timeslot

    files = [f for f in os.listdir(folder) if f.endswith('.csv')][:10]  # Take up to 10 files
    for i, file in enumerate(files):
        file_path = os.path.join(folder, file)
        df = pd.read_csv(file_path)
        
        # Filter rows by timeslot
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df_timeslot = df[(df['Timestamp'].dt.time >= pd.to_datetime(start_time).time()) &
                         (df['Timestamp'].dt.time <= pd.to_datetime(end_time).time())]

        if df_timeslot.shape[0] > 10:
            # Take 10 evenly spread rows
            df_timeslot = df_timeslot.iloc[::len(df_timeslot) // 10][:10]

        # Create map snippet
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.set_title(f"Vessel: {file} | Timeslot: {start_time}-{end_time}")

        for _, row in df_timeslot.iterrows():
            # Plot position
            ax.plot(row['Longitude'], row['Latitude'], 'bo')  # Blue point for position
            ax.text(row['Longitude'], row['Latitude'], row['Timestamp'].strftime('%H:%M'), fontsize=8)

            # Add vectors for COG/SOG
            cog_x = row['Longitude'] + 0.01 * row['SOG'] * np.cos(np.radians(row['COG']))
            cog_y = row['Latitude'] + 0.01 * row['SOG'] * np.sin(np.radians(row['COG']))
            ax.add_patch(FancyArrow(row['Longitude'], row['Latitude'], 
                                    cog_x - row['Longitude'], cog_y - row['Latitude'], 
                                    width=0.0002, color='red', label='COG/SOG'))

            # Add vectors for Heading
            heading_x = row['Longitude'] + 0.01 * np.cos(np.radians(row['Heading']))
            heading_y = row['Latitude'] + 0.01 * np.sin(np.radians(row['Heading']))
            ax.add_patch(FancyArrow(row['Longitude'], row['Latitude'], 
                                    heading_x - row['Longitude'], heading_y - row['Latitude'], 
                                    width=0.0002, color='green', label='Heading'))

        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        ax.legend(["Position", "COG/SOG", "Heading"], loc='upper left')

        # Save the map snippet
        snippet_path = os.path.join(output_folder, f"snippet_{i+1}.png")
        plt.savefig(snippet_path)
        plt.close(fig)
        print(f"Map snippet saved: {snippet_path}")


In [41]:
# Example usage:

# 1. Filter files by area
area_bounds = (53.0, 57.0, 7.0, 12.0)  # Define area (min_lat, max_lat, min_lon, max_lon)
input_folder = "grouped_files"
output_folder = "area_vessels"
filter_files_by_area(input_folder, output_folder, area_bounds)

# 2. Visualize timeslot
timeslot = ("10:30", "11:00")  # Define timeslot
visualize_timeslot(output_folder, timeslot)


Processing 1880 files from folder: grouped_files
(1/1880) Processing file: mmsi_219012563.csv
Loaded file: mmsi_219012563.csv, Rows: 216
Raw column names: ['# Timestamp', 'Type of mobile', 'MMSI', 'Latitude', 'Longitude', 'Navigational status', 'ROT', 'SOG', 'COG', 'Heading', 'IMO', 'Callsign', 'Name', 'Ship type', 'Cargo type', 'Width', 'Length', 'Type of position fixing device', 'Draught', 'Destination', 'ETA', 'Data source type', 'A', 'B', 'C', 'D']
Cleaned column names: ['Timestamp', 'Type of mobile', 'MMSI', 'Latitude', 'Longitude', 'Navigational status', 'ROT', 'SOG', 'COG', 'Heading', 'IMO', 'Callsign', 'Name', 'Ship type', 'Cargo type', 'Width', 'Length', 'Type of position fixing device', 'Draught', 'Destination', 'ETA', 'Data source type', 'A', 'B', 'C', 'D']
Rows matching area condition: 0
Rows in area for mmsi_219012563.csv: 0
Skipped mmsi_219012563.csv: No rows in specified area.
(2/1880) Processing file: mmsi_219263000.csv
Loaded file: mmsi_219263000.csv, Rows: 1318
Raw co

Max consecutive entries for mmsi_354340000.csv: 6
Skipped mmsi_354340000.csv: Insufficient consecutive entries in area.
(22/1880) Processing file: mmsi_636018508.csv
Loaded file: mmsi_636018508.csv, Rows: 2926
Raw column names: ['# Timestamp', 'Type of mobile', 'MMSI', 'Latitude', 'Longitude', 'Navigational status', 'ROT', 'SOG', 'COG', 'Heading', 'IMO', 'Callsign', 'Name', 'Ship type', 'Cargo type', 'Width', 'Length', 'Type of position fixing device', 'Draught', 'Destination', 'ETA', 'Data source type', 'A', 'B', 'C', 'D']
Cleaned column names: ['Timestamp', 'Type of mobile', 'MMSI', 'Latitude', 'Longitude', 'Navigational status', 'ROT', 'SOG', 'COG', 'Heading', 'IMO', 'Callsign', 'Name', 'Ship type', 'Cargo type', 'Width', 'Length', 'Type of position fixing device', 'Draught', 'Destination', 'ETA', 'Data source type', 'A', 'B', 'C', 'D']
Rows matching area condition: 2461
Rows in area for mmsi_636018508.csv: 2461
Sample Timestamps from mmsi_636018508.csv:
0    26/11/2024 19:28:40
1  