In [105]:
#Airline crash data processing and visualization code
#====================================================
#
#Change History:
#
#====================================================

# Dependencies
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#Function to put data values in bar chart
def autolabel(rects, ax, label_position=1):
    """Write each bar's height, truncated to an integer, as a text label.

    Parameters
    ----------
    rects : iterable
        Bar objects exposing ``get_height()``, ``get_x()`` and ``get_width()``.
    ax : object
        Axes the labels are drawn on; must expose ``text()`` and ``get_ylim()``.
    label_position : float or None, optional
        Y data coordinate at which the bottom edge of every label is anchored.
        The default of 1 keeps the fixed position this helper has always used.
        Pass ``None`` to put each label just above its own bar, offset by 1%
        of the current y-axis span.
    """
    # The y-axis span is only needed when labels follow the bar tops.
    if label_position is None:
        (y_bottom, y_top) = ax.get_ylim()
        y_height = y_top - y_bottom

    for rect in rects:
        height = rect.get_height()
        if label_position is None:
            label_y = height + (y_height * 0.01)
        else:
            label_y = label_position

        # Centre the label horizontally over the bar.
        ax.text(rect.get_x() + rect.get_width()/2., label_y,
                '%d' % int(height),
                ha='center', va='bottom')




In [50]:
#Load the cleaned crash records from disk and preview the first rows
cleaned_file = "data/plane_crash_cleaned.csv"
oper_data = pd.read_csv(filepath_or_buffer=cleaned_file)

oper_data.head()

Unnamed: 0,Decade,Year,Date,Location,Operator,Op_Code,Route,Type,Make_Type,Aboard,Fatalities,Accident_Code,Summary
0,1900-1910,1908,1908-09-17,"Fort Myer, Virginia",Military - U.S. Army,Military,Demonstration,Wright Flyer III,Wright,2.0,1.0,Human Error,"During a demonstration flight, a U.S. Army fly..."
1,1911-1920,1912,1912-12-07,"AtlantiCity, New Jersey",Military - U.S. Navy,Military,Test flight,Dirigible,Dirigibl,5.0,5.0,War,First U.S. dirigible Akron exploded just offsh...
2,1911-1920,1913,1913-06-08,"Victoria, British Columbia, Canada",Private,Passenger,,Curtiss seaplane,Curtiss,1.0,1.0,Weather,The first fatal airplane accident in Canada oc...
3,1911-1920,1913,1913-09-09,Over the North Sea,Military - German Navy,Military,,Zeppelin L-1 (airship),Zeppelin,20.0,14.0,Landing/Takeoff,The airship flew into a thunderstorm and encou...
4,1911-1920,1913,1913-10-17,"Near Johannisthal, Germany",Military - German Navy,Military,,Zeppelin L-2 (airship),Zeppelin,30.0,30.0,War,Hydrogen gas which was being vented was sucked...


In [51]:
#Pie chart of the share of crashes falling under each Accident_Code

ac_data = oper_data.groupby("Accident_Code")
ac_counts = ac_data.Accident_Code.count()
actotal = ac_counts.sum()
ac_prct = ac_counts.div(actotal).mul(100)

#Wedge styling, kept together so it is easy to adjust
slice_colors = ["Gold", "lightskyblue","lightcoral","red","c","magenta","yellow","cyan"]
slice_offsets = (0, 0, 0, 0.1, 0, 0.1, 0, 0)   #4th and 6th wedges are pulled out
slice_border = dict(linewidth=2, edgecolor="black")

plt.pie(
    ac_prct,
    explode=slice_offsets,
    labels=ac_prct.index,
    colors=slice_colors,
    autopct="%1.1f%%",
    pctdistance=0.6,
    shadow=True,
    labeldistance=1.1,
    startangle=100,
    wedgeprops=slice_border,
)
plt.title("% of Flight crashes by Categories")

#Resize the current figure to 8x8 inches
fig = plt.gcf()
fig.set_size_inches(8, 8)

#Write the chart to disk
plt.savefig("data/PieChart_Prcnt_of_crash_by_categories.png")

<IPython.core.display.Javascript object>

In [52]:
#Crash counts per flight category: group on "Op_Code" and count the "Year" entries

oper_data_grp = oper_data.groupby("Op_Code")
oper_data1 = oper_data_grp.Year.count()

#Single-column frame ("Year" holds the count) indexed by Op_Code
df_machine = pd.DataFrame(oper_data1)
df_machine.head()

Unnamed: 0_level_0,Year
Op_Code,Unnamed: 1_level_1
Military,781
Others,91
Passenger,4396


In [53]:
#Visualization of data by flight category (doughnut chart with call-out labels)

values_x=df_machine["Year"]
labels_y=df_machine.index

colors=["Red","Blue","Green"]

fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
wedges, texts = ax.pie(values_x, colors=colors, wedgeprops=dict(width=0.5), startangle=0)

bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)

kw = dict(xycoords='data', textcoords='data', arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

for i, p in enumerate(wedges):
    #Mid-angle of the wedge and the matching point on the unit circle
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    #Side of the chart the call-out goes on; x == 0 counts as the right side
    #so a zero sign can no longer raise a KeyError in the lookup below
    side = 1 if x >= 0 else -1
    horizontalalignment = {-1: "right", 1: "left"}[side]
    connectionstyle = "angle,angleA=0,angleB={}".format(ang)
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(labels_y[i], xy=(x, y), xytext=(1.35*side, 1.4*y),
                 horizontalalignment=horizontalalignment, **kw)

#Legend entries are built from the wedges drawn above (the previous code used
#undefined names) and ordered by crash count, largest first, when requested
sort_legend = True
legend_entries = list(zip(wedges, labels_y, values_x))
if sort_legend:
    legend_entries.sort(key=lambda entry: entry[2], reverse=True)
patches = [entry[0] for entry in legend_entries]
labels = [entry[1] for entry in legend_entries]

plt.legend(patches, labels,loc='best', bbox_to_anchor=(.01, 1.), fontsize=8)

plt.title("Crashes by Flight Category")
#Save before show: on backends where show() releases the figure, saving
#afterwards would write an empty image
plt.savefig("data/category_doughnut.png")
plt.show()

<IPython.core.display.Javascript object>

In [54]:
#Number of crash records in each year

df_crash_per_yr = oper_data.groupby("Year")
df = df_crash_per_yr.Year.count()

#Counts go in "Total"; the year is also copied from the index into a regular column
df_crashyr = df.to_frame(name='Total')
df_crashyr['Year'] = df_crashyr.index

df_crashyr.head()


Unnamed: 0_level_0,Total,Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1908,1,1908
1912,1,1912
1913,3,1913
1915,2,1915
1916,5,1916


In [57]:
#Bar chart of crash counts per year

#One bar slot per year; bars are edge-aligned, so ticks are shifted right by 0.4
x_axis = np.arange(len(df_crashyr))
tick_locations = (x_axis + 0.4).tolist()

#Wide canvas so every year label has room
plt.figure(figsize=(28,7))
plt.bar(x_axis, df_crashyr["Total"], align="edge", alpha=0.5, color='r')
plt.xticks(tick_locations, df_crashyr["Year"], rotation="vertical")

#Titles, axis labels and grid
plt.title("Flight Crashes per year ")
plt.xlabel('Year Of Crash', fontsize = 10)
plt.ylabel('Number of Flights')
plt.grid()

#Save to disk, then display
plt.savefig('data/Bar_allFlights.png')
plt.show()

<IPython.core.display.Javascript object>

In [61]:
#Crash counts per operator

df_crash_by_type = oper_data.groupby("Operator")
df_operator = df_crash_by_type.Operator.count()

#Counts go in "Total"; the operator name is also copied from the index into a column
crash_operator = df_operator.to_frame(name='Total')
crash_operator['Operator'] = crash_operator.index

#Too many operators to plot them all: order by count, largest first, and keep 20 rows
oper_tot_sort = crash_operator.sort_values(by="Total", ascending=False).iloc[:20]
oper_tot_sort.head()

Unnamed: 0_level_0,Total,Operator
Operator,Unnamed: 1_level_1,Unnamed: 2_level_1
Aeroflot,179,Aeroflot
Military - U.S. Air Force,176,Military - U.S. Air Force
Air France,70,Air France
Deutsche Lufthansa,65,Deutsche Lufthansa
United Air Lines,44,United Air Lines


In [78]:
#Bar chart of crash counts for the operators held in oper_tot_sort

#One bar slot per operator; tick marks sit 0.1 to the right of each bar's left edge
xlength = len(oper_tot_sort)
x_axis = np.arange(xlength)
tick_locations = (x_axis + 0.1).tolist()

#Tall canvas to leave room for the vertical operator names
plt.figure(figsize=(10,15))
plt.bar(x_axis, oper_tot_sort["Total"], align="edge", color='maroon')
plt.xticks(tick_locations, oper_tot_sort["Operator"], rotation="vertical", fontsize=5)

#Titles, axis labels and grid
plt.title("Flight Crashes by Flight Operators- (Top {})".format(xlength))
plt.xlabel('Name of Operators', fontsize = 10)
plt.ylabel('Number of Flights')
plt.grid()

#Save to disk, then display
plt.savefig('data/Bar_allFlights_Operator.png')
plt.show()

<IPython.core.display.Javascript object>

In [86]:
#Data Analysis and visualization for total Aboard and Fatalities per Year

#Both columns are selected with a list: indexing a groupby with a bare
#"Aboard","Fatalities" tuple was deprecated and is rejected by pandas 2.0+
var_yr=oper_data.groupby("Year", as_index=False)[["Aboard","Fatalities"]].sum()

plt.figure(figsize=(10,5))

#Each line carries a label so plt.legend() has entries to display
plt.plot(var_yr["Year"],var_yr["Aboard"], color = 'blue', label="Aboard")
plt.plot(var_yr["Year"],var_yr["Fatalities"], color = 'red', label="Fatalities")

plt.grid()
plt.title("Number of Fatalities")
plt.xlabel("Year")
plt.ylabel("Passengers")
plt.legend(loc="upper left")
#Lower-case "data/" matches the directory every other cell saves into (the old
#"Data/" spelling fails on case-sensitive filesystems); the explicit .png is the
#extension savefig appended by default
plt.savefig("data/aboardvsfatalities_by_yr.png")
plt.show()

<IPython.core.display.Javascript object>

In [110]:
#Plot Graph for Highest Crashes by Plane Type (1908-2007)

#Non-null record counts per manufacturer code
type_pd = oper_data.groupby("Make_Type").count().reset_index()
#Expand the bare "de" code to the full manufacturer name. The pattern is anchored
#to the whole value; the previous unanchored 'de' also rewrote any other
#Make_Type that merely contained the letters "de".
#NOTE(review): presumably "de" is how "de Havilland" was abbreviated during cleaning - confirm
type_pd["Make_Type"] = type_pd["Make_Type"].replace(regex=r'^\s*de\s*$', value='De Havilland')
type_pd = type_pd.sort_values('Type',ascending=False)

#Only the ten manufacturers with the most records are plotted
top_types = type_pd.head(10)

#Bars are edge-aligned, so ticks are shifted right by 0.4 to sit under the bar centres
x_axis = np.arange(len(top_types))
tick_locations = [value+0.4 for value in x_axis]
#Set figure size to leave room for the rotated x labels
plt.figure(figsize=(10,9))

plt.bar(x_axis,top_types.Type, alpha=0.5, align="edge")
plt.xticks(tick_locations, top_types.Make_Type, rotation=45)
plt.grid()
plt.title("Highest Crashes by Plane Type (1908-2007)",fontdict ={'fontweight':'bold','fontsize':16})
plt.xlabel('Plane Types', fontdict ={'fontweight':'bold','fontsize':13})
plt.ylabel('Number of Planes Crashed',fontdict ={'fontweight':'bold','fontsize':13})
plt.savefig('data/Bar_EngineTypes.png')
plt.show()

<IPython.core.display.Javascript object>

In [99]:
#Data visualization by crash location (1908-2007)

#Non-null record counts per location, ordered by the "Date" count, largest first
type_pd = oper_data.groupby("Location").count().reset_index()
type_pd = type_pd.sort_values(by='Date', ascending=False)

#Only the first ten locations are plotted
top_locations = type_pd.head(10)

#Bars are edge-aligned, so ticks are shifted right by 0.4
x_axis = np.arange(len(top_locations))
tick_locations = (x_axis + 0.4).tolist()

plt.figure(figsize=(10,6))
plt.bar(x_axis, top_locations.Date, align="edge", alpha=0.5)
plt.xticks(tick_locations, top_locations.Location, rotation=45)

#Titles, axis labels and grid
plt.title("Most Plane Crashes based on Location (1908-2007)",fontdict ={'fontweight':'bold','fontsize':16})
plt.xlabel('Location', fontdict ={'fontweight':'bold','fontsize':10})
plt.ylabel('Number of Planes Crashed',fontdict ={'fontweight':'bold','fontsize':13})
plt.grid()

#Fit the rotated labels inside the canvas, then save and display
plt.tight_layout()
plt.savefig('data/Location_type.png')
plt.show()

<IPython.core.display.Javascript object>

In [103]:
#Data visualization for passenger travelling (1970-2008)

#Per-year totals of the numeric columns. numeric_only=True keeps text columns
#out of the sum, which newer pandas would otherwise try to add up as well.
years = oper_data.groupby("Year").sum(numeric_only=True)
#NOTE(review): positional trimming - presumably the first 58 rows are the years
#before 1970 and the last row is an incomplete final year; confirm against the data
years =years.drop(years.index[0:58])
years =years.drop(years.index[-1]).reset_index()

#Using second data file with total passenger travelled information
all_flights = pd.read_csv("data/All_flights.csv",encoding="ISO-8859-1")
all_flights = all_flights.drop(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],axis=1)
all_flights = all_flights.replace(np.nan, 0)
#Column-wise sum gives one total per remaining column; the result is indexed
#by the column labels
all_flights = pd.DataFrame(all_flights.sum())
all_flights = all_flights.drop(all_flights.index[0:10])
all_flights = all_flights['1970':'2008']
all_flights.columns = ['Sum']
all_flights.index.name = 'Year'
all_flights = all_flights.reset_index()

#Look up each crash year's passenger total in one vectorised step. The year
#labels in all_flights are strings, so the crash years are converted to match.
passengers_by_year = all_flights.set_index("Year")["Sum"]
year_keys = years["Year"].astype("int").astype(str)
matched_total = year_keys.map(passengers_by_year)
has_match = matched_total.notna()

#Years with no passenger figure keep 0 in both derived columns
years["Total_Passenger"] = matched_total.fillna(0)
years["Percentage_dead"] = ((years["Fatalities"] / matched_total) * 100).where(has_match, 0)

plt.figure(figsize=(10,8))
plt.plot(years.Year,years.Percentage_dead,marker='o')
plt.title("Percentage of Fatalities by Total passengers travelled (1970-2008)",fontdict ={'fontweight':'bold','fontsize':16})
plt.xlabel('Years', fontdict ={'fontweight':'bold','fontsize':13})
#The y axis shows a percentage, not a passenger count
plt.ylabel('Fatalities (% of Passengers Travelled)',fontdict ={'fontweight':'bold','fontsize':13})
plt.savefig('Scatter_Percentage_of_Fatalities_from_total_travelled.png')


plt.figure(figsize=(10,8))
plt.plot(years.Year,years.Total_Passenger,marker='o')
plt.title("Total Passengers Travelled (1970-2008)",fontdict ={'fontweight':'bold','fontsize':16})
plt.xlabel('Years', fontdict ={'fontweight':'bold','fontsize':13})
plt.ylabel('Number of Passengers Travelled',fontdict ={'fontweight':'bold','fontsize':13})
plt.savefig('total_travelled.png')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [111]:
#Plot Graph for Highest Fatalities vs Aboard by Plane Type (1908-2007)

#Per-manufacturer totals of the numeric columns. numeric_only=True keeps text
#columns out of the sum, which newer pandas would otherwise try to add up as well.
type_pd = oper_data.groupby("Make_Type").sum(numeric_only=True).reset_index()
#Expand the bare "de" code to the full manufacturer name. The pattern is anchored
#to the whole value; the previous unanchored 'de' also rewrote any other
#Make_Type that merely contained the letters "de".
#NOTE(review): presumably "de" is how "de Havilland" was abbreviated during cleaning - confirm
type_pd["Make_Type"] = type_pd["Make_Type"].replace(regex=r'^\s*de\s*$', value='De Havilland')
type_pd = type_pd.sort_values('Fatalities',ascending=False)

#Only the ten manufacturers with the most fatalities are plotted
top_types = type_pd.head(10)

plt.figure(figsize=(10,8))
plt.grid()
plt.plot(top_types.Make_Type,top_types.Fatalities,marker='o',label="Fatalities",color='r')
plt.plot(top_types.Make_Type,top_types.Aboard,marker='o',label="Aboard",color='g')
plt.legend(loc="best",fontsize = 'xx-large')
plt.title("Highest Fatalities vs Aboard by Plane Type (1908-2007)",fontdict ={'fontweight':'bold','fontsize':16})
plt.xlabel('Plane Types', fontdict ={'fontweight':'bold','fontsize':13})
#Axis label spelling corrected ("Broad" -> "Board")
plt.ylabel('Number of Passengers on Board',fontdict ={'fontweight':'bold','fontsize':13})
plt.savefig('Scatter_Fatalities_Abroad_EngineTypes.png')

<IPython.core.display.Javascript object>

In [123]:
#Bar chart of crash counts per decade, with each bar's value written on it

#Non-null record counts per decade
crash_df_decade = oper_data.groupby(["Decade"]).count().reset_index()

#Bars are edge-aligned, so ticks are shifted right by 0.4
x_axis = np.arange(len(crash_df_decade))
tick_locations = (x_axis + 0.4).tolist()

fig, ax = plt.subplots(figsize=(10,7))

plt1 = ax.bar(x_axis, crash_df_decade["Date"], align="edge", color="r", alpha=0.5)
plt.xticks(tick_locations, crash_df_decade["Decade"], rotation=30, fontweight ='normal')
plt.yticks(fontsize=8)

#Write the count of each bar on the chart
autolabel(plt1, ax)

#Titles, axis labels and grid on the same axes
ax.set_title("Flight Crashes per Decade")
ax.set_xlabel("Decade")
ax.set_ylabel("Crash Count")
ax.grid()
plt.show()

<IPython.core.display.Javascript object>

In [118]:
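# #Data visualization by decade by operator type
# #NOTE: this cell is disabled because ax.bar raises the ValueError shown below: `ind` has 11 positions, but crash_military has rows for only 10 decades (no 2001-2010).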

# #Data for Passenger
# crash_pass = oper_data.loc[(oper_data.Op_Code == "Passenger")]

# crash_pass = crash_pass.groupby(["Decade"]).count()

# #Data for Others
# crash_other = oper_data.loc[(oper_data.Op_Code == "Others")]

# crash_other = crash_other.groupby(["Decade"]).count()

# #Data for Military
# crash_military = oper_data.loc[(oper_data.Op_Code == "Military")]

# crash_military = crash_military.groupby(["Decade"]).count()

# ind = np.arange(11)
# width = 0.25
# ax = plt.subplot()



# p1 = ax.bar(ind+width,crash_military["Op_Code"], width=width,color='r', alpha=0.5, align="center")
# p2 = ax.bar(ind,crash_pass["Op_Code"],width=width, color='b', alpha=0.5, align="center")
# p3 = ax.bar(ind-width,crash_other["Op_Code"],width=width, color='g', alpha=0.5, align="center")

# ax.set_xticks(ind + width/2)
# ax.set_xticklabels(("1900-1910","1911-1920","1921-1930","1931-1940","1941-1950","1951-1960","1961-1970","1971-1980","1981-1990","1991-2000","2001-2010"),rotation=45)
# ax.legend((p1[0], p2[0],p3[0]), ("Military","Passengers","Others"))
# plt.show()



ValueError: shape mismatch: objects cannot be broadcast to a single shape

In [122]:
#Per-decade non-null counts of every column for military crash records.
#The frame is built in this cell so it runs on a fresh kernel; previously the
#name was only assigned inside a commented-out cell, leading to a NameError.
crash_military = oper_data.loc[(oper_data.Op_Code == "Military")].groupby(["Decade"]).count()
crash_military

Unnamed: 0_level_0,Year,Date,Location,Operator,Op_Code,Route,Type,Make_Type,Aboard,Fatalities,Accident_Code,Summary
Decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1900-1910,1,1,1,1,1,1,1,1,1,1,1,1
1911-1920,19,19,19,19,19,1,19,19,19,19,19,19
1921-1930,7,7,7,7,7,4,6,7,7,7,7,6
1931-1940,5,5,5,5,5,1,5,5,5,5,5,5
1941-1950,149,149,149,149,149,67,145,149,148,148,149,132
1951-1960,92,92,92,92,92,42,92,92,92,92,92,88
1961-1970,126,126,126,126,126,38,126,126,126,126,126,115
1971-1980,92,92,91,92,92,43,92,92,92,92,92,87
1981-1990,119,119,118,119,119,44,119,119,119,119,119,112
1991-2000,103,103,103,103,103,60,102,103,102,102,103,99
