In [1]:
import pandas as pd
import os

# Input sample and destination directory for the per-vessel files.
csv_file_path = 'AIS first 2000.csv'
output_folder = 'grouped_files'  # e.g., 'output_files/'

# Read the whole sample into memory.
df = pd.read_csv(csv_file_path)

# Keep only the rows whose 'Navigational status' is not 'Unknown value'.
has_known_status = df['Navigational status'].ne('Unknown value')
df_filtered = df[has_known_status]

# Create the destination directory if it is missing.
os.makedirs(output_folder, exist_ok=True)

# One group per distinct MMSI value in the filtered rows.
grouped = df_filtered.groupby('MMSI')

# Write every group to "<output_folder>/mmsi_<MMSI>.csv" and report each file.
for mmsi, group in grouped:
    output_filename = os.path.join(output_folder, f"mmsi_{mmsi}.csv")
    group.to_csv(path_or_buf=output_filename, index=False)
    print(f"Saved data for MMSI {mmsi} to {output_filename}")


Saved data for MMSI 148 to grouped_files/mmsi_148.csv
Saved data for MMSI 205770000 to grouped_files/mmsi_205770000.csv
Saved data for MMSI 209239000 to grouped_files/mmsi_209239000.csv
Saved data for MMSI 209275000 to grouped_files/mmsi_209275000.csv
Saved data for MMSI 209325000 to grouped_files/mmsi_209325000.csv
Saved data for MMSI 209525000 to grouped_files/mmsi_209525000.csv
Saved data for MMSI 209535000 to grouped_files/mmsi_209535000.csv
Saved data for MMSI 209903000 to grouped_files/mmsi_209903000.csv
Saved data for MMSI 210046000 to grouped_files/mmsi_210046000.csv
Saved data for MMSI 210174000 to grouped_files/mmsi_210174000.csv
Saved data for MMSI 210185000 to grouped_files/mmsi_210185000.csv
Saved data for MMSI 210510000 to grouped_files/mmsi_210510000.csv
Saved data for MMSI 210731000 to grouped_files/mmsi_210731000.csv
Saved data for MMSI 211101000 to grouped_files/mmsi_211101000.csv
Saved data for MMSI 211187510 to grouped_files/mmsi_211187510.csv
Saved data for MMSI 21

In [3]:
import pandas as pd
import os
from tqdm import tqdm  # For progress bars

# Split a large AIS CSV into one CSV per MMSI, reading it in chunks so the
# parser never has to hold the whole raw file at once. Rows whose
# 'Navigational status' equals 'Unknown value' are dropped.

# Specify the path to the CSV file and the output folder
csv_file_path = './temp_ais_data/aisdk-2024-11-26.csv'
output_folder = 'grouped_files'  # e.g., 'output_files/'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Number of rows pandas parses per chunk
chunk_size = 100000

# Final result: MMSI -> DataFrame with all kept rows for that MMSI.
# It is filled while saving (see below) so the name and value type stay the
# same for any later cell that reads it.
mmsi_dict = {}

# Per-MMSI list of chunk pieces. Appending to a list and concatenating once
# avoids re-copying the accumulated frame on every chunk, which is what
# repeated pd.concat([accumulated, group]) did.
mmsi_parts = {}

# Count physical lines so the chunk progress bar has a total. The `with`
# block closes the handle (a bare open() inside a generator expression never
# did), and binary mode skips text decoding, which is irrelevant for counting.
with open(csv_file_path, 'rb') as line_source:
    total_rows = max(sum(1 for _ in line_source) - 1, 0)  # Subtract 1 for the header

# Ceiling division: `total_rows // chunk_size + 1` overshoots by one chunk
# whenever total_rows is an exact multiple of chunk_size.
total_chunks = (total_rows + chunk_size - 1) // chunk_size

# Read the file in chunks and collect the filtered rows per MMSI
with tqdm(total=total_chunks, desc="Processing chunks", unit="chunk") as chunk_progress:
    for chunk in pd.read_csv(csv_file_path, chunksize=chunk_size):
        # Filter out rows where 'Navigational status' is 'Unknown value'
        chunk_filtered = chunk[chunk['Navigational status'] != 'Unknown value']

        # leave=False removes the inner bar once the chunk is done, so the
        # cell output is not flooded with one finished bar per chunk.
        with tqdm(
            total=chunk_filtered['MMSI'].nunique(),
            desc="Processing MMSI",
            unit="MMSI",
            leave=False,
        ) as mmsi_progress:
            for mmsi, group in chunk_filtered.groupby('MMSI'):
                mmsi_parts.setdefault(mmsi, []).append(group)
                mmsi_progress.update(1)  # Update the MMSI progress
        chunk_progress.update(1)  # Update the chunk progress

# Concatenate each MMSI's pieces once, then save to a separate CSV file.
with tqdm(total=len(mmsi_parts), desc="Saving MMSI files", unit="file") as save_progress:
    # Iterate over a snapshot of the keys because entries are popped as we go;
    # popping releases the per-chunk pieces as soon as they are merged.
    for mmsi in list(mmsi_parts):
        group = pd.concat(mmsi_parts.pop(mmsi))
        mmsi_dict[mmsi] = group
        output_filename = os.path.join(output_folder, f"mmsi_{mmsi}.csv")
        group.to_csv(output_filename, index=False)
        save_progress.update(1)  # Update progress after saving each file

print("All MMSI files have been saved successfully!")


Processing MMSI: 100%|██████████| 1387/1387 [00:00<00:00, 22755.19MMSI/s]
Processing MMSI: 100%|██████████| 1395/1395 [00:00<00:00, 1977.10MMSI/s]
Processing MMSI: 100%|██████████| 1393/1393 [00:00<00:00, 2277.71MMSI/s]
Processing MMSI: 100%|██████████| 1403/1403 [00:00<00:00, 2085.42MMSI/s]
Processing MMSI: 100%|██████████| 1405/1405 [00:00<00:00, 2250.50MMSI/s]
Processing MMSI: 100%|██████████| 1416/1416 [00:00<00:00, 2090.99MMSI/s]
Processing MMSI: 100%|██████████| 1422/1422 [00:00<00:00, 2068.18MMSI/s]
Processing MMSI: 100%|██████████| 1410/1410 [00:00<00:00, 1838.76MMSI/s]
Processing MMSI: 100%|██████████| 1408/1408 [00:00<00:00, 2093.58MMSI/s]
Processing MMSI: 100%|██████████| 1407/1407 [00:00<00:00, 2032.99MMSI/s]
Processing MMSI: 100%|██████████| 1420/1420 [00:00<00:00, 1962.69MMSI/s]
Processing MMSI: 100%|██████████| 1414/1414 [00:00<00:00, 1560.62MMSI/s]
Processing MMSI: 100%|██████████| 1405/1405 [00:00<00:00, 1677.51MMSI/s]
Processing MMSI: 100%|██████████| 1411/1411 [00:00

All MMSI files have been saved successfully!



