In [1]:
import pandas as pd
import re

# Load dataset
file_path = '/Users/allig/ads/508/simplified_collision_data.csv'
df = pd.read_csv(file_path)

# Function to clean object names and extract activity status
def extract_status(obj_name):
    """Extracts activity status and cleans the object name."""
    obj_name = str(obj_name)  # Ensure string type
    is_active = 1 if "[+]" in obj_name else 0
    cleaned_name = re.sub(r"[\[\]\+\-]", "", obj_name).strip()  # Remove [+] and [-]
    return cleaned_name, is_active

# Apply extraction function to both object columns
df[['OBJECT_NAME_1_Clean', 'OBJECT_1_Active']] = df['OBJECT_NAME_1'].apply(lambda x: pd.Series(extract_status(x)))
df[['OBJECT_NAME_2_Clean', 'OBJECT_2_Active']] = df['OBJECT_NAME_2'].apply(lambda x: pd.Series(extract_status(x)))

# Create Activity_Status column based on activity status
df['Activity_Status'] = df['OBJECT_1_Active'] + (df['OBJECT_2_Active'] * 2)

# Drop original object columns and rename cleaned ones
df = df.drop(columns=['OBJECT_NAME_1', 'OBJECT_NAME_2'])
df = df.rename(columns={'OBJECT_NAME_1_Clean': 'OBJECT_NAME_1', 'OBJECT_NAME_2_Clean': 'OBJECT_NAME_2'})

# Save cleaned data
cleaned_file_path = '/Users/allig/ads/508/cleaned_collision_data.csv'
df.to_csv(cleaned_file_path, index=False)

print("Data cleaned and saved to:", cleaned_file_path)
print(df.head())


Data cleaned and saved to: /Users/allig/ads/508/cleaned_collision_data.csv
   NORAD_CAT_ID_1  DSE_1  NORAD_CAT_ID_2  DSE_2                      TCA  \
0           48379  7.424           33314  7.726  2025-03-18 10:36:24.650   
1           56155  7.671           56156  7.671  2025-03-18 06:42:02.541   
2           45394  6.801           49027  6.104  2025-03-17 13:58:44.363   
3           40030  7.650           45388  7.710  2025-03-18 12:43:07.735   
4           51057  5.305           56205  4.570  2025-03-16 01:29:22.126   

   TCA_RANGE  TCA_RELATIVE_SPEED  MAX_PROB  DILUTION        OBJECT_NAME_1  \
0      0.018               6.833  0.243800     0.005         STARLINK2612   
1      0.027               0.001  0.005994     0.006             PIESAT C   
2      0.034              10.515  0.077890     0.011         STARLINK1292   
3      0.040              10.447  0.057940     0.013              DTUSAT2   
4      0.045              15.169  0.000788     0.029  KEPLER16 (ASTRAEUS)   

   OB

Summary of Changes:
Extracts Active Status (1 for active, 0 for inactive).
Creates Activity_Status Column:
0 → Neither object active
1 → Only OBJECT_NAME_1 active
2 → Only OBJECT_NAME_2 active
3 → Both active
Removes [+] and [-] from OBJECT_NAME_1 and OBJECT_NAME_2.