In [12]:
import pandas as pd
import numpy as np

In [13]:
# Load
# All files use pandas' default parsing except the airports table. That table
# carries an 'iso_country_code' column, and the ISO alpha-2 code for Namibia is
# the literal string "NA", which read_csv's default NA list would turn into a
# missing value. For that file only empty fields are treated as missing.
flights = pd.read_csv('Dataset/Flight_Level_Data.csv')
pnr_flight = pd.read_csv('Dataset/PNR_Flight_Level_Data.csv')
pnr_remarks = pd.read_csv('Dataset/PNR_Remark_Level_Data.csv')
airports = pd.read_csv(
    'Dataset/Airports_Data.csv',
    keep_default_na=False,  # do not treat "NA", "NULL", "n/a", ... as missing
    na_values=[''],         # an empty field is still a missing value
)
bags = pd.read_csv('Dataset/Bag_Level_Data.csv')

In [14]:
# ------------------ PNR FLIGHT LEVEL DATA ------------------
# Drop exact duplicate rows, then print the column list so the cleaning steps
# below can be checked against the real schema.
pnr_flight = pnr_flight.drop_duplicates()
print("PNR_Flight columns:", pnr_flight.columns.tolist())  # Inspect for cleaning

# Textual yes/no spellings mapped to 1/0. The numeric spellings ('1', '0',
# '1.0', '0.0') keep this cell idempotent: re-running it on already-converted
# flags returns the same 1/0 values instead of NaN.
flag_map = {
    'true': 1, 'false': 0,
    'yes': 1, 'no': 0,
    'y': 1, 'n': 0,
    't': 1, 'f': 0,
    '1': 1, '0': 0,
    '1.0': 1, '0.0': 0,
}

# Count columns: coerce to numbers; unparseable or missing values become 0.
# 'basic_economy_pax' is not in the printed schema but is kept so a file that
# does carry it is still cleaned.
count_cols = ['total_pax', 'lap_child_count', 'basic_economy_pax']
for col in count_cols:
    if col in pnr_flight.columns:
        pnr_flight[col] = pd.to_numeric(pnr_flight[col], errors='coerce').fillna(0)

# The printed schema shows the basic-economy column is 'basic_economy_ind'.
# Numeric values are kept as-is, textual yes/no values go through flag_map,
# and anything still unresolved becomes 0 (same fill as the count columns).
if 'basic_economy_ind' in pnr_flight.columns:
    raw_ind = pnr_flight['basic_economy_ind']
    ind_numeric = pd.to_numeric(raw_ind, errors='coerce')
    ind_flags = raw_ind.astype(str).str.strip().str.lower().map(flag_map)
    pnr_flight['basic_economy_ind'] = ind_numeric.fillna(ind_flags).fillna(0)

# Boolean-like flags -> 1/0. Values are stripped and lower-cased before the
# lookup; anything not present in flag_map becomes NaN.
for col in ['is_child', 'is_stroller_user']:
    if col in pnr_flight.columns:
        pnr_flight[col] = pnr_flight[col].astype(str).str.strip().str.lower().map(flag_map)

# Carrier code as upper-cased text with surrounding whitespace removed.
if 'company_id' in pnr_flight.columns:
    pnr_flight['company_id'] = pnr_flight['company_id'].astype(str).str.upper().str.strip()

PNR_Flight columns: ['company_id', 'flight_number', 'scheduled_departure_date_local', 'scheduled_departure_station_code', 'scheduled_arrival_station_code', 'record_locator', 'pnr_creation_date', 'total_pax', 'is_child', 'basic_economy_ind', 'is_stroller_user', 'lap_child_count']


In [15]:
# ------------------ FLIGHT LEVEL DATA ------------------
# Work on a frame with exact duplicate rows removed.
flights = flights.drop_duplicates()

# Timestamp columns are parsed to datetimes; values that cannot be parsed
# become NaT.
date_cols = [
    'scheduled_departure_datetime_local',
    'scheduled_arrival_datetime_local',
    'actual_departure_datetime_local',
    'actual_arrival_datetime_local',
]
for stamp_col in date_cols:
    if stamp_col not in flights.columns:
        continue
    flights[stamp_col] = pd.to_datetime(flights[stamp_col], errors='coerce')

# Numeric columns: coerce to numbers, then fill the gaps (including values
# that failed to parse) with that column's median.
num_cols = [
    'total_seats',
    'scheduled_ground_time_minutes',
    'actual_ground_time_minutes',
    'minimum_turn_minutes',
]
for metric_col in num_cols:
    if metric_col not in flights.columns:
        continue
    as_number = pd.to_numeric(flights[metric_col], errors='coerce')
    flights[metric_col] = as_number.fillna(as_number.median())

# Identifier columns become upper-cased text with surrounding whitespace removed.
for id_col in ('company_id', 'flight_number'):
    if id_col in flights.columns:
        id_text = flights[id_col].astype(str)
        flights[id_col] = id_text.str.upper().str.strip()

In [16]:
# ------------------ PNR REMARKS DATA ------------------
# Remove exact duplicate rows first.
pnr_remarks = pnr_remarks.drop_duplicates()

# One cleaner per column, applied in this order and only when the column exists:
#   record_locator / flight_number -> text with surrounding whitespace removed
#   special_service_request        -> missing values replaced by 'NONE'
#   pnr_creation_date              -> datetime, unparseable values become NaT
remark_cleaners = {
    'record_locator': lambda values: values.astype(str).str.strip(),
    'special_service_request': lambda values: values.fillna('NONE'),
    'flight_number': lambda values: values.astype(str).str.strip(),
    'pnr_creation_date': lambda values: pd.to_datetime(values, errors='coerce'),
}
for remark_col, clean in remark_cleaners.items():
    if remark_col in pnr_remarks.columns:
        pnr_remarks[remark_col] = clean(pnr_remarks[remark_col])

In [17]:
# ------------------ AIRPORTS DATA ------------------
# Remove exact duplicate rows, then store both code columns as upper-cased
# text with surrounding whitespace removed. Columns that are absent are skipped.
airports = airports.drop_duplicates()
for code_col in ('airport_iata_code', 'iso_country_code'):
    if code_col not in airports.columns:
        continue
    code_text = airports[code_col].astype(str)
    airports[code_col] = code_text.str.upper().str.strip()

In [18]:
# ------------------ BAG LEVEL DATA ------------------
# Remove exact duplicate rows before touching individual columns.
bags = bags.drop_duplicates()

# Bag type as title-cased text with surrounding whitespace removed.
if 'bag_type' in bags.columns:
    bag_type_text = bags['bag_type'].astype(str)
    bags['bag_type'] = bag_type_text.str.title().str.strip()

# Tag issue date as datetime; unparseable values become NaT.
if 'bag_tag_issue_date' in bags.columns:
    bags['bag_tag_issue_date'] = pd.to_datetime(bags['bag_tag_issue_date'], errors='coerce')

# Carrier code as upper-cased text with surrounding whitespace removed.
if 'company_id' in bags.columns:
    carrier_text = bags['company_id'].astype(str)
    bags['company_id'] = carrier_text.str.upper().str.strip()

# Give the columns shared by several tables the same representation in each
# of them. Every normaliser runs only on frames that have the column.
shared_normalisers = {
    'flight_number': lambda values: values.astype(str).str.strip(),
    'company_id': lambda values: values.astype(str).str.upper().str.strip(),
    'scheduled_departure_date_local': lambda values: pd.to_datetime(values, errors='coerce'),
}
for df in (flights, pnr_flight, pnr_remarks, bags):
    for shared_col, normalise in shared_normalisers.items():
        if shared_col in df.columns:
            df[shared_col] = normalise(df[shared_col])

In [19]:
# Report the (rows, columns) shape of each cleaned table, one line per table.
shape_report = [
    ("Flight Data Clean:", flights),
    ("PNR Flight Data Clean:", pnr_flight),
    ("PNR Remarks Clean:", pnr_remarks),
    ("Airports Data Clean:", airports),
    ("Bags Data Clean:", bags),
]
for label, frame in shape_report:
    print(label, frame.shape)

Flight Data Clean: (8099, 15)
PNR Flight Data Clean: (687878, 12)
PNR Remarks Clean: (51698, 4)
Airports Data Clean: (5612, 2)
Bags Data Clean: (686952, 8)
