## Individual Challenge: Data Cleaning Expert

### Manuel Cardoso 56274

#### Setup

In [6]:
import pandas as pd
import numpy as np
import json

Duplicated dataset: NTSB

In [7]:
with open("../data_sources/ntsb-us-2003-2023.json", 'r', encoding='utf-8') as f: #changed path from the individual challenge delivery
    ic_ntsb = json.load(f)

##### THE CODE ON THE START OF THE CELL BELOW IS COPIED FROM THE START OF THE PROJECT JUST TO HAVE AN EQUAL STARTING DATASET, THE INDIVIDUAL CHALLENGE WILL BE DONE WITH THE DATA THAT REMAINS

We did not have to do much data cleaning on phase 1 as it is a good dataset, it's simple data cleaning like taking out unnecessary columns, type conversion and standardization to lowercase, which would be required for this challenge.

In [8]:
# Each record is one accident/incident entry in a list

# Convert to a DataFrame (this will flatten top-level fields)
# For nested fields like 'Vehicles', we might do a separate flatten later
messy_ntsb = pd.json_normalize(ic_ntsb, 
                            meta=[
                                'Oid','MKey','Closed','CompletionStatus','HasSafetyRec',
                                'HighestInjury','IsStudy','Mode','NtsbNumber',
                                'OriginalPublishedDate','MostRecentReportType','ProbableCause',
                                'City','Country','EventDate','State','Agency','BoardLaunch',
                                'BoardMeetingDate','DocketDate','EventType','Launch','ReportDate',
                                'ReportNum','ReportType','AirportId','AirportName','AnalysisNarrative',
                                'FactualNarrative','PrelimNarrative','FatalInjuryCount','MinorInjuryCount',
                                'SeriousInjuryCount','InvestigationClass','AccidentSiteCondition',
                                'Latitude','Longitude','DocketOriginalPublishDate'
                            ],
                            record_path=['Vehicles'],  # This flattens out the 'Vehicles' array
                            record_prefix='Vehicles.'
                           )

# combines all injury counts to 1 column
messy_ntsb['TotalInjuryCount'] = messy_ntsb[['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']].sum(axis=1)

# dropping unnecessary columns
messy_ntsb.drop(columns=['AnalysisNarrative','FactualNarrative','PrelimNarrative','InvestigationClass','BoardLaunch'
                      ,'BoardMeetingDate','Launch','IsStudy','OriginalPublishedDate','DocketOriginalPublishDate'
                      ,'ReportType','ReportNum','ReportDate','MostRecentReportType','FatalInjuryCount','MinorInjuryCount'
                      ,'SeriousInjuryCount','DocketDate','Mode','HasSafetyRec','CompletionStatus','Closed'
                      ,'Vehicles.AircraftCategory','Vehicles.AmateurBuilt','Vehicles.EventID','Vehicles.AirMedical'
                      ,'Vehicles.AirMedicalType','Vehicles.flightScheduledType','Vehicles.flightServiceType'
                      ,'Vehicles.flightTerminalType','Vehicles.RegisteredOwner','Vehicles.RegulationFlightConductedUnder'
                      ,'Vehicles.RepGenFlag','Vehicles.RevenueSightseeing','Vehicles.SecondPilotPresent','Vehicles.Damage'
                      ,'AccidentSiteCondition'], inplace=True) 

# dropping NaT entries from EventDate
messy_ntsb = messy_ntsb.dropna(subset=['EventDate'])

# Type Conversion
messy_ntsb['EventDate'] = pd.to_datetime(messy_ntsb['EventDate']).dt.tz_localize(None)
messy_ntsb['Vehicles.VehicleNumber'] = pd.to_numeric(messy_ntsb['Vehicles.VehicleNumber'], errors='coerce').astype(int)
messy_ntsb['MKey'] = pd.to_numeric(messy_ntsb['MKey'], errors='coerce').astype(int)
messy_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(messy_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(float) # only change from the original Data Cleaning so it was easier to manipulate
messy_ntsb['Latitude'] = pd.to_numeric(messy_ntsb['Latitude'], errors='coerce').astype(float)
messy_ntsb['Longitude'] = pd.to_numeric(messy_ntsb['Longitude'], errors='coerce').astype(float)
messy_ntsb['TotalInjuryCount'] = pd.to_numeric(messy_ntsb['TotalInjuryCount'], errors='coerce').astype(int)

categorical_cols = [
    'Vehicles.DamageLevel',
    'Vehicles.ExplosionType',
    'Vehicles.FireType',
    'HighestInjury',
    'EventType',
    'AccidentSiteCondition'
]

for col in categorical_cols:
    if col in messy_ntsb.columns:
        messy_ntsb[col] = messy_ntsb[col].astype('category')

messy_ntsb = messy_ntsb.map(lambda x: x.lower() if isinstance(x, str) else x) # make all appropriate values lowercase

#################################################################################### CODE ABOVE IS COPIED FROM PROJECT PHASE 1 ####################################################################################

starting_entries = len(messy_ntsb)
print(f"Number of starting entries: {starting_entries}")

# Manipulating records
seed = 5 # for reproducibility

# Introducing missing values on Vehicles.Make column
missing_count = messy_ntsb['Vehicles.Make'].isna().sum()
print(missing_count)
np.random.seed(seed) 
n = starting_entries
n_missing = int(np.floor(0.1 * n)) # 0.1 = 10% missing values
missing_indices = np.random.choice(messy_ntsb.index, n_missing, replace=False)
messy_ntsb.loc[missing_indices, "Vehicles.Make"] = np.nan
print(f"Number of entries after introducing missing values: {len(messy_ntsb)}")

# Introducing duplicate records
n_dup = int(np.floor(0.05 * n)) # 0.05 = 5% duplicated records
# Randomly choose rows to duplicate
dup_indices = np.random.choice(messy_ntsb.index, n_dup, replace=False)
duplicates = messy_ntsb.loc[dup_indices].copy()
# Append duplicates to original DataFrame
messy_ntsb = pd.concat([messy_ntsb, duplicates], ignore_index=True)
print(f"Number of entries after introducing duplicate records: {len(messy_ntsb)}")
n = len(messy_ntsb) # reset

# Introducing negative and incorrect values for Vehicles.NumberOfEngines
n_invalid = int(np.floor(0.05 * n)) # 0.05 = 5% induced negatives and incorrect 
# Randomly choose rows
invalid_indices = np.random.choice(messy_ntsb.index, n_invalid, replace=False)
# Flip values to negative (ensure they're numeric first)
messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"] = -messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"].abs()
# Randomly choose rows
invalid_indices = np.random.choice(messy_ntsb.index, n_invalid, replace=False)
# Flip values to negative (ensure they're numeric first)
messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"] = (messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"] + (0.01*np.random.rand())) # add decimals to the Number of Engines
print(f"Number of entries after introducing invalid values: {len(messy_ntsb)}")

# Introducing Outliers in 'Longitude' and 'Latitude' columns
n_outliers = int(np.floor(0.025 * n)) # 0.025 = 2.5% induced outliers
# Longitude: valid range ~ -180 to 180
outlier_indices = np.random.choice(messy_ntsb.index, n_outliers, replace=False)
messy_ntsb.loc[outlier_indices, 'Longitude'] = (messy_ntsb.loc[outlier_indices, 'Longitude'] + np.random.uniform(400, 500, size=n_outliers)) # clearly invalid, just to induce
# Latitude: valid range ~ -90 to 90
outlier_indices = np.random.choice(messy_ntsb.index, n_outliers, replace=False)
messy_ntsb.loc[outlier_indices, 'Latitude'] = (messy_ntsb.loc[outlier_indices, 'Latitude'] + np.random.uniform(200, 300, size=n_outliers)) # clearly invalid, just to induce
print(f"Number of entries after introducing outliers in 'Longitude' and 'Latitude': {len(messy_ntsb)}")

messy_ntsb

Number of starting entries: 23403
1
Number of entries after introducing missing values: 23403
Number of entries after introducing duplicate records: 24573
Number of entries after introducing invalid values: 24573
Number of entries after introducing outliers in 'Longitude' and 'Latitude': 24573


Unnamed: 0,Vehicles.VehicleNumber,Vehicles.DamageLevel,Vehicles.ExplosionType,Vehicles.FireType,Vehicles.SerialNumber,Vehicles.Make,Vehicles.Model,Vehicles.NumberOfEngines,Vehicles.RegistrationNumber,Vehicles.FlightOperationType,...,Country,EventDate,State,Agency,EventType,AirportId,AirportName,Latitude,Longitude,TotalInjuryCount
0,1,substantial,none,none,0702,cirrus design corp,sr22t,1.0,n773gb,pers,...,usa,2023-12-31 17:40:00,ia,ntsb,acc,dvn,,41.610278,-90.588361,0
1,1,substantial,none,none,004ce,golden avio s r l,f30 brio,-1.0,n37ga,pers,...,usa,2023-12-31 16:13:00,va,ntsb,acc,hwy,warrenton/fauquier,38.586285,-77.710631,0
2,1,substantial,none,none,4692139,piper aircraft inc,pa46r-350t,1.0,n539ma,pers,...,usa,2023-12-31 14:13:00,nc,ntsb,acc,14a,lake norman airpark,35.624637,-80.912255,1
3,1,substantial,none,none,79-30941,bellanca,17-30a,1.0,n678mj,pers,...,usa,2023-12-30 17:00:00,mo,ntsb,acc,fyg,washington regional,38.587583,-90.993806,1
4,1,substantial,none,none,7452c,maule,m-6-235,1.0,n71ms,aobv,...,usa,2023-12-29 16:27:00,tx,ntsb,acc,bpt,jack brooks rgnl,30.070603,-94.215837,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24568,1,substantial,,none,24-0480,mooney,m20j,1.0,n201le,pers,...,usa,2007-08-08 20:30:00,mo,,acc,sus,spirit of st louis airport,38.661945,-90.653335,0
24569,1,substantial,none,none,17258976,cessna,172k,1.0,n7276g,pers,...,usa,2011-10-30 16:20:00,mt,ntsb,acc,6s5,ravalli county,46.611667,-114.046669,0
24570,1,substantial,none,none,22393,cessna,305a,1.0,n1831,pers,...,usa,2019-07-21 10:20:00,mo,ntsb,acc,m48,houston memorial airport,37.333332,-91.966667,0
24571,1,substantial,none,none,15064656,cessna,150g,1.0,n4606x,inst,...,usa,2005-06-01 11:00:00,ct,ntsb,acc,ijd,windham,41.743888,-72.180000,0


### Data Cleaning
#### For this challenge, I'm going to assume that the user noticed the errors on the specific columns and dealt with them (Qualitative Cleaning - "Manual crafting of rules and transform function")

In [9]:
# This block of code needs the previous block of code to be ran first
cleaning_messy_ntsb = messy_ntsb.copy()
# First, start by dropping duplicate rows (this will have to be done again at the end, we do it at the start anyway to minimize computing needs)
cleaning_messy_ntsb = cleaning_messy_ntsb.drop_duplicates()
print(f"Number of entries after dropping duplicates: {len(cleaning_messy_ntsb)}")

print("----------")

# Fixing missing values in column Vehicles.Make
# Filter rows where 'Vehicles.Make' is not missing
model_dict = {}
missing_count = cleaning_messy_ntsb['Vehicles.Make'].isna().sum()
print(f"Number of NaN: {missing_count}")
for index, row in cleaning_messy_ntsb.iterrows():
    make = row['Vehicles.Make']
    model = row['Vehicles.Model']
    
    if pd.notna(make):  # Only build dict from known makes
        model_dict[model] = make

# Iterate again to replace NaNs
for index, row in cleaning_messy_ntsb.iterrows():
    make = row['Vehicles.Make']
    model = row['Vehicles.Model']
    
    if pd.isna(make):  # Only build dict from known makes
        cleaning_messy_ntsb.loc[index, 'Vehicles.Make'] = model_dict.get(model, None)


print(f"Number of entries after fixing Vehicles.Make: {len(cleaning_messy_ntsb)}")

missing_count = cleaning_messy_ntsb['Vehicles.Make'].isna().sum()
print(f"Number of NaN after trying to fix Vehicles.Make: {missing_count}") 
# We can check that not all was fixed, I tried to fix by checking with other entries that had the same Model
# but if a Model never has a Make to begin with, this can't be done

print("----------")

# Fixing invalid values in Vehicles.NumberOfEngines
engines_dict = {}
negative_count = (cleaning_messy_ntsb['Vehicles.NumberOfEngines'] < 0).sum()
decimal_count = (cleaning_messy_ntsb['Vehicles.NumberOfEngines'] % 1 != 0).sum() - cleaning_messy_ntsb['Vehicles.NumberOfEngines'].isna().sum() # NaNs count here if not for the subtraction
print(f"Number of negatives: {negative_count}")
print(f"Number of numbers with decimal parts: {decimal_count}")
for index, row in cleaning_messy_ntsb.iterrows():
    engines = row['Vehicles.NumberOfEngines']
    model = row['Vehicles.Model']

    if pd.notna(engines) and engines >= 1 and engines == int(engines):
        engines_dict[model] = engines

    elif engines < 0:
        cleaning_messy_ntsb.loc[index, 'Vehicles.NumberOfEngines'] = abs(engines)

# Iterate again to check for the ones with decimal parts and previously negative
for index, row in cleaning_messy_ntsb.iterrows():
    model = row['Vehicles.Model']

    if model in engines_dict:
        cleaning_messy_ntsb.loc[index, 'Vehicles.NumberOfEngines'] = engines_dict[model]
    else:
        cleaning_messy_ntsb.loc[index, 'Vehicles.NumberOfEngines'] = np.nan # I could round the number to the closet whole number but that wouldn't be trustworthy 
                                                                 # so I think it's better to replace it with None


negative_count = (cleaning_messy_ntsb['Vehicles.NumberOfEngines'] < 0).sum()
decimal_count = (cleaning_messy_ntsb['Vehicles.NumberOfEngines'] % 1 != 0).sum() - cleaning_messy_ntsb['Vehicles.NumberOfEngines'].isna().sum()
print(f"Number of negatives after trying to fix Vehicles.NumberOfEngines: {negative_count}")
print(f"Number of numbers with decimal parts after trying to fix Vehicles.NumberOfEngines: {decimal_count}")

cleaning_messy_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(cleaning_messy_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(int) # fix Type Conversion

print("----------")

# Removing outliers from Longitude and Latitude ~ -180 to 180
longitude_count = (cleaning_messy_ntsb['Longitude'] < -180).sum() + (cleaning_messy_ntsb['Longitude'] > 180).sum()
latitude_count = (cleaning_messy_ntsb['Latitude'] < -90).sum() + (cleaning_messy_ntsb['Latitude'] > 90).sum()
print(f"Number of outliers in Longitude: {longitude_count}")
print(f"Number of outliers in Latitude: {latitude_count}")
for index, row in cleaning_messy_ntsb.iterrows():
    long = row['Longitude']
    lat = row['Latitude']

    if pd.notna(long) and (long < -180 or long > 180):
        cleaning_messy_ntsb.loc[index, 'Longitude'] = np.nan

    if pd.notna(lat) and (lat < -90 or lat > 90):
        cleaning_messy_ntsb.loc[index, 'Latitude'] = np.nan

longitude_count = (cleaning_messy_ntsb['Longitude'] < -180).sum() + (cleaning_messy_ntsb['Longitude'] > 180).sum()
latitude_count = (cleaning_messy_ntsb['Latitude'] < -90).sum() + (cleaning_messy_ntsb['Latitude'] > 90).sum()
print(f"Number of outliers in Longitude after fix: {longitude_count}")
print(f"Number of outliers in Latitude after fix: {latitude_count}")

print("----------")

cleaning_messy_ntsb = cleaning_messy_ntsb.drop_duplicates()
print(f"Number of entries after dropping duplicates at the end: {len(cleaning_messy_ntsb)}") # we do this again because there may exist rows that had records manipulated and weren't 
                                                                                    # duplicated because of that, but at the end of the Data Cleaning could be duplicated again
print(f"Number of starting entries for comparison: {starting_entries}") # this value being different is natural, as the order of the manipulation has the duplication happening before other
                                                                        # data issue insertions,, I tested with the duplication as the last manipulation and the number of entries
                                                                        # at the end of the Data Cleaning coincides with the starting entries (23403)

cleaning_messy_ntsb

Number of entries after dropping duplicates: 23700
----------
Number of NaN: 2365
Number of entries after fixing Vehicles.Make: 23700
Number of NaN after trying to fix Vehicles.Make: 154
----------
Number of negatives: 1198
Number of numbers with decimal parts: 1226
Number of negatives after trying to fix Vehicles.NumberOfEngines: 0
Number of numbers with decimal parts after trying to fix Vehicles.NumberOfEngines: 0
----------
Number of outliers in Longitude: 613
Number of outliers in Latitude: 608
Number of outliers in Longitude after fix: 0
Number of outliers in Latitude after fix: 0
----------
Number of entries after dropping duplicates at the end: 23522
Number of starting entries for comparison: 23403


Unnamed: 0,Vehicles.VehicleNumber,Vehicles.DamageLevel,Vehicles.ExplosionType,Vehicles.FireType,Vehicles.SerialNumber,Vehicles.Make,Vehicles.Model,Vehicles.NumberOfEngines,Vehicles.RegistrationNumber,Vehicles.FlightOperationType,...,Country,EventDate,State,Agency,EventType,AirportId,AirportName,Latitude,Longitude,TotalInjuryCount
0,1,substantial,none,none,0702,cirrus design corp,sr22t,1,n773gb,pers,...,usa,2023-12-31 17:40:00,ia,ntsb,acc,dvn,,41.610278,-90.588361,0
1,1,substantial,none,none,004ce,golden avio s r l,f30 brio,0,n37ga,pers,...,usa,2023-12-31 16:13:00,va,ntsb,acc,hwy,warrenton/fauquier,38.586285,-77.710631,0
2,1,substantial,none,none,4692139,piper aircraft inc,pa46r-350t,1,n539ma,pers,...,usa,2023-12-31 14:13:00,nc,ntsb,acc,14a,lake norman airpark,35.624637,-80.912255,1
3,1,substantial,none,none,79-30941,bellanca,17-30a,1,n678mj,pers,...,usa,2023-12-30 17:00:00,mo,ntsb,acc,fyg,washington regional,38.587583,-90.993806,1
4,1,substantial,none,none,7452c,maule,m-6-235,1,n71ms,aobv,...,usa,2023-12-29 16:27:00,tx,ntsb,acc,bpt,jack brooks rgnl,30.070603,-94.215837,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24537,1,minor,none,none,30280,boeing,737,2,n551wn,,...,usa,2008-10-01 10:55:00,il,ntsb,inc,mdw,chicago midway,41.889202,,0
24541,1,substantial,none,none,1993p,flightstar sportplanes,flightstar ii,1,n194pg,pers,...,usa,2011-05-22 19:45:00,ga,ntsb,acc,cco,newnan-coweta,33.258609,-84.844444,0
24553,1,substantial,none,none,21062182,cessna,210,1,n761dv,aobv,...,usa,2018-03-16 15:45:00,vt,ntsb,acc,,,44.380554,-73.227500,0
24561,1,substantial,none,none,17263883,cessna,172m,1,n20965,inst,...,usa,2006-08-03 08:30:00,ga,ntsb,acc,kags,augusta regional-bush field,33.369720,,0
