In [1]:
import pandas as pd

# Constants

In [2]:
# ecco fields deemed not important by aaron
DROP_ECCO = ['User Edit Time', 'Type', 'Associated UUID', 'Note', 'Combat ID', 
             'Creator', 'Editor', 'AZ', 'Range', '2525', 'Closest Distance', 
             'Course', 'AIS MMSI', 'AIS IMO', 'AIS Call Sign', 'AIS Ship Type', 
             'AIS Destination', 'AIS ETA', 'Fused', 'Fused Tracks']

TIME_COLS = ['Create Time', 'Update Time', 'Deleted Time']


# Helper Functions

In [3]:
def printInfo(dfs):
    for df in dfs:
        print(df.info())
        print('\n======================================================\n')

def formatTime(orig_time):
    # remove the date and the trailing Z (indicates timezone)
    milTime = orig_time[11:-1]
    return milTime

# 2022-06-16T17:54:45.692026Z

# Load ECCO and Simulated Datasets

In [4]:
# load ecco dataset
df_ecco = pd.read_csv('Data/ecco.csv')
df_ecco.name = 'ecco.csv'

# load simulated dataset
df_simu = pd.read_csv('Data/simulatedDroneData.csv')
df_simu.name = 'simulatedDroneData.csv'

# display a summary of the fields for each dataset
printInfo([df_ecco, df_simu])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17587 entries, 0 to 17586
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   UUID                       9553 non-null   object 
 1   Name                       9553 non-null   object 
 2   Create Time                9553 non-null   object 
 3   Update Time                9553 non-null   object 
 4   User Edit Time             62 non-null     object 
 5   Type                       9553 non-null   object 
 6   Associated UUID            9553 non-null   object 
 7   Note                       0 non-null      float64
 8   Source Name                9553 non-null   object 
 9   Source Class               9553 non-null   object 
 10  Source LID                 9553 non-null   float64
 11  Combat ID                  9553 non-null   object 
 12  Object ID                  9553 non-null   object 
 13  Range To Contact           9553 non-null   flo

  df_ecco = pd.read_csv('Data/ecco.csv')


# Drop Rows and Columns from ECCO

In [5]:
# drop non-important features from ecco dataset
print(f'dropping the following fields:\n{DROP_ECCO}...\n')
df_ecco = df_ecco.drop(columns=DROP_ECCO)

# display a summary of the fields for each dataset
printInfo([df_ecco])

# drop all rows with nan
print(f'rows before dropping nan: {len(df_ecco)}')
df_ecco = df_ecco.dropna()
print(f'rows after dropping nan: {len(df_ecco)}')


dropping the following fields:
['User Edit Time', 'Type', 'Associated UUID', 'Note', 'Combat ID', 'Creator', 'Editor', 'AZ', 'Range', '2525', 'Closest Distance', 'Course', 'AIS MMSI', 'AIS IMO', 'AIS Call Sign', 'AIS Ship Type', 'AIS Destination', 'AIS ETA', 'Fused', 'Fused Tracks']...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17587 entries, 0 to 17586
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   UUID                       9553 non-null   object 
 1   Name                       9553 non-null   object 
 2   Create Time                9553 non-null   object 
 3   Update Time                9553 non-null   object 
 4   Source Name                9553 non-null   object 
 5   Source Class               9553 non-null   object 
 6   Source LID                 9553 non-null   float64
 7   Object ID                  9553 non-null   object 
 8   Range To Contact           9553 non-nu

# Reformat Time Columns in ECCO

In [6]:
print('Time columns before reformatting:')
print(df_ecco[TIME_COLS].head())

for tc in TIME_COLS:
    df_ecco[tc] = df_ecco[tc].apply(formatTime)

print('\n======================================================\n')
print('Time columns after reformatting:')
print(df_ecco[TIME_COLS].head())


Time columns before reformatting:
                   Create Time                  Update Time  \
0  2022-06-16T17:52:01.898832Z  2022-06-16T17:54:45.692026Z   
1  2022-06-16T17:53:00.654140Z  2022-06-16T17:54:45.692520Z   
2  2022-06-16T17:52:01.898832Z  2022-06-16T17:54:45.725468Z   
3  2022-06-16T17:52:01.898832Z  2022-06-16T17:54:45.829479Z   
4  2022-06-16T17:52:06.462136Z  2022-06-16T17:54:45.840658Z   

                  Deleted Time  
0  2022-06-16T18:00:30.308269Z  
1  2022-06-16T17:58:09.980982Z  
2  2022-06-16T18:00:30.308269Z  
3  2022-06-16T18:00:30.308269Z  
4  2022-06-16T17:58:16.230071Z  


Time columns after reformatting:
       Create Time      Update Time     Deleted Time
0  17:52:01.898832  17:54:45.692026  18:00:30.308269
1  17:53:00.654140  17:54:45.692520  17:58:09.980982
2  17:52:01.898832  17:54:45.725468  18:00:30.308269
3  17:52:01.898832  17:54:45.829479  18:00:30.308269
4  17:52:06.462136  17:54:45.840658  17:58:16.230071
