### Preliminary data cleaning:
- Combine the datasets into one
- Clean the column names
- Convert the Aboard and Fatalities columns into dictionaries
- Replace "?" with NaN
- Convert the Date column to datetime

In [1]:
import pandas as pd 
import numpy as np
from functions.extraction_and_cleaning import read_bronze


In [2]:
# Load the raw ("bronze") records through the project helper imported above.
# NOTE(review): presumably read_bronze() combines the separate raw datasets
# into a single frame — confirm in functions/extraction_and_cleaning.
df = read_bronze()
df

Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,AC\n Type,Registration,cn / ln,Aboard,Fatalities,Ground,Summary
0,"January 05, 1953",2139,"Belfast, Northern Ireland",British European Airways,?,Northolt - Belfast,Vickers Viking 610-1B,G-AJDL,262,35 (passengers:31 crew:4),27 (passengers:24 crew:3),0,Crashed into approach lights while attempting ...
1,"January 07, 1953",2055,"Issaquah, Washington",Flying Tiger Line,841,Seattle - Cheyenne - Fort Jackson,Douglas C-54B-10-DO,N86574,18350,7 (passengers:3 crew:4),7 (passengers:3 crew:4),0,The plane struck a tree on a mountain ridge at...
2,"January 07, 1953",0412,"Fish Haven, Idaho",Associated Air Transport,1-6-6A,Boeing Field - Cheyenne,Curtiss C-46,N1648M,22395,40 (passengers:37 crew:3),40 (passengers:37 crew:3),0,The plane crashed into mountains while en rout...
3,"January 15, 1953",0448,"25 nm off Agrigento, Italy",Military -Royal Air Force,-,?,Vickers Valetta Mk1 / Avero Lancaster,VX562 / TX270,?,26 (passengers:16 crew:10),26 (passengers:16 crew:10),0,Both aircraft crashed after a midair collision...
4,"January 26, 1953",1144,"Sinnai, Sardinia, Italy",Linee Aeree Italiane,?,Cagliari - Rome,Douglas DC-3,I-LAIL,4308,19 (passengers:15 crew:4),19 (passengers:15 crew:4),0,Crashed 10 miles east of Cagliari in the Sinna...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5032,"December 17, 1976",1940,"Ust-Kut, Russia",Aeroflot,?,Ust-Kut - Kirensk,Yakovlev 40,CCCP-88208,9631349,7 (passengers:4 crew:3),7 (passengers:4 crew:3),0,The cargo plane crashed into trees on takeoff....
5033,"December 17, 1976",21:30,"Kiev, USSR",Aeroflot,H-36,Chernivtsi - Kiev,Antonov AN-24RV,CCCP-46672,47309604,55 (passengers:50 crew:5),48 (passengers:44 crew:4),0,In heavy fog the crew continued their descend...
5034,"December 18, 1976",1656,"Nnear Yuzhno-Sakhalinsk, Russia",Aeroflot,?,Petropavlovsk-Kamchatsky - Yuzhno-Sakhalinsk,Ilyushin IL-14M,CCCP-61752,147001247,8 (passengers:3 crew:5),8 (passengers:3 crew:5),0,The survey fllight deviated from the approach ...
5035,"December 25, 1976",0345,"Near Bangkok, Thailand",EgyptAir,864,Cairo - Bangkok,Boeing B-707-366C,SU-AXA,20763,53 (passengers:44 crew:9),53 (passengers:44 crew:9),19,The aircraft crashed into an industrial area d...


First, let's fix the column names.

In [3]:
# Normalise every header in one pass: lowercase, trim, turn newlines and
# slashes into spaces, spell "#" out as "num", collapse whitespace runs into
# single underscores, and finally title-case the result
# (e.g. "AC\n Type" -> "Ac_Type", "Flight #" -> "Flight_Num").
new_names = [
    "_".join(
        col.lower()
        .strip()
        .replace("\n", " ")
        .replace("/", " ")
        .replace("#", "num")
        .split()
    ).title()
    for col in df.columns
]

In [4]:
# Swap in the cleaned headers; new_names was built by iterating df.columns in
# order, so each new name lines up with its original column.
df.columns = new_names

In [5]:
def clean_aboard(x):
    """Parse a head-count string into a ``total``/``passengers``/``crew`` dict.

    The expected input looks like ``"35 (passengers:31 crew:4)"``: a total
    followed by a parenthesised passenger/crew breakdown. Each of the three
    fields is converted independently, so one unknown field does not prevent
    the remaining fields from being converted to ``int``.

    Parameters
    ----------
    x : str or missing value
        Raw cell value. ``None`` and float NaN are treated as "all unknown".

    Returns
    -------
    dict
        Keys ``"total"``, ``"passengers"`` and ``"crew"``. Each value is an
        ``int`` when the field holds an integer and ``float("nan")``
        otherwise (the ``"?"`` placeholder, an absent field, or any other
        non-integer text).
    """
    missing = float("nan")

    def _to_count(token):
        # "passengers:31" -> "31"; a bare total such as "35" contains no ":"
        # and passes through the split unchanged.
        try:
            return int(token.split(":")[-1])
        except ValueError:
            return missing

    # A missing cell has nothing to parse. NaN is the only float that is not
    # equal to itself, which avoids needing an extra import for the check.
    if x is None or (isinstance(x, float) and x != x):
        tokens = []
    else:
        tokens = x.replace("(", "").replace(")", "").split()

    # Pad short inputs (e.g. a lone "?") so all three fields always exist,
    # and ignore anything beyond the first three tokens.
    tokens = (tokens + ["?"] * 3)[:3]
    total, passengers, crew = (_to_count(tok) for tok in tokens)
    return {"total": total, "passengers": passengers, "crew": crew}

In [6]:
# Both head-count columns share the "N (passengers:P crew:C)" layout, so the
# same parser is applied to each of them in turn.
for count_col in ("Aboard", "Fatalities"):
    df[count_col] = df[count_col].apply(clean_aboard)
df.head(3)

Unnamed: 0,Date,Time,Location,Operator,Flight_Num,Route,Ac_Type,Registration,Cn_Ln,Aboard,Fatalities,Ground,Summary
0,"January 05, 1953",2139,"Belfast, Northern Ireland",British European Airways,?,Northolt - Belfast,Vickers Viking 610-1B,G-AJDL,262,"{'total': 35, 'passengers': 31, 'crew': 4}","{'total': 27, 'passengers': 24, 'crew': 3}",0,Crashed into approach lights while attempting ...
1,"January 07, 1953",2055,"Issaquah, Washington",Flying Tiger Line,841,Seattle - Cheyenne - Fort Jackson,Douglas C-54B-10-DO,N86574,18350,"{'total': 7, 'passengers': 3, 'crew': 4}","{'total': 7, 'passengers': 3, 'crew': 4}",0,The plane struck a tree on a mountain ridge at...
2,"January 07, 1953",412,"Fish Haven, Idaho",Associated Air Transport,1-6-6A,Boeing Field - Cheyenne,Curtiss C-46,N1648M,22395,"{'total': 40, 'passengers': 37, 'crew': 3}","{'total': 40, 'passengers': 37, 'crew': 3}",0,The plane crashed into mountains while en rout...


In [7]:
# Blank out every cell that holds the "?" placeholder, in place.
unknown_mask = df.eq("?")
df[unknown_mask] = np.nan
# Parse the textual dates (e.g. "January 05, 1953") into datetime values.
df["Date"] = pd.to_datetime(df["Date"])
df.head(3)

Unnamed: 0,Date,Time,Location,Operator,Flight_Num,Route,Ac_Type,Registration,Cn_Ln,Aboard,Fatalities,Ground,Summary
0,1953-01-05,2139,"Belfast, Northern Ireland",British European Airways,,Northolt - Belfast,Vickers Viking 610-1B,G-AJDL,262,"{'total': 35, 'passengers': 31, 'crew': 4}","{'total': 27, 'passengers': 24, 'crew': 3}",0,Crashed into approach lights while attempting ...
1,1953-01-07,2055,"Issaquah, Washington",Flying Tiger Line,841,Seattle - Cheyenne - Fort Jackson,Douglas C-54B-10-DO,N86574,18350,"{'total': 7, 'passengers': 3, 'crew': 4}","{'total': 7, 'passengers': 3, 'crew': 4}",0,The plane struck a tree on a mountain ridge at...
2,1953-01-07,412,"Fish Haven, Idaho",Associated Air Transport,1-6-6A,Boeing Field - Cheyenne,Curtiss C-46,N1648M,22395,"{'total': 40, 'passengers': 37, 'crew': 3}","{'total': 40, 'passengers': 37, 'crew': 3}",0,The plane crashed into mountains while en rout...


In [8]:
# Persist the cleaned table; index=False leaves the row index out of the file.
# NOTE(review): the Aboard/Fatalities dict cells will presumably be written as
# their string representation — confirm downstream readers parse them back.
df.to_csv("data/silver/1920_2024.csv", index=False)