In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from collections import Counter
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

# Reading in the dataset

In [None]:
# Location of the crash dataset inside the Kaggle input directory
DATA_PATH = "/kaggle/input/airplane-crashes-since-1908/Airplane_Crashes_and_Fatalities_Since_1908.csv"

# Load the CSV into a DataFrame and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

# Objective:
I'd like to analyze and predict airplane crashes and the likelihood of surviving such a crash (although I couldn't find any data on passenger seating so far). The goal is to answer the question of whether flying really is the safest way to travel.

# 1. data cleaning:
# 1.1. remove columns with 80 or more percent of Null-values

In [None]:
# Overview of row count, dtypes and non-null counts per column
df.info()

# Columns whose share of null values reaches this threshold are candidates for removal
NULL_SHARE_THRESHOLD = 0.8

# Fraction of missing values per column, computed from the data instead of
# hand-typed row counts, so the check stays valid if the dataset changes
null_share = df.isnull().mean().sort_values(ascending=False)
cols_to_delete = null_share[null_share >= NULL_SHARE_THRESHOLD].index.tolist()

# Per the original manual check (5268 rows, `Flight #` with 1069 non-null values),
# no column reaches the threshold; `Flight #` was a close call.
print("cols to delete:", cols_to_delete)
null_share

No columns to delete, although `Flight #` was a close call.

# 1.2: check the `Ground` column to see if it only contains `0` and `1`. If so convert it to a Boolean type

In [None]:
# Number of distinct values in `Ground`; a missing value counts as one distinct value
df["Ground"].nunique(dropna=False)

## there are 51 unique values in the `Ground` column. Unfortunately the dataset doesn't provide information about this column and its values

# questions to answer:
# 2.1: on avg. how many plane crashes happen per year?

In [None]:
# Parse the Date column into pandas datetime values so the `.dt` accessor can be used
parsed_dates = pd.to_datetime(df["Date"])
df["Date"] = parsed_dates

# Confirm the new dtype of the column
df.info()

In [None]:
# Number of recorded crashes per calendar year
crashes_per_year = df["Date"].dt.year.value_counts()

# Count the documented years from the data rather than hard-coding it (98 in the
# original run). Years without any recorded crash do not appear here.
documented_years = crashes_per_year.size

# Average number of crashes per documented year
yearly_avg_crashes = len(df) / documented_years
yearly_avg_crashes

## on avg. there occurred 53 crashes each year. However, we need to take into consideration that the number of flights also increased as time progressed! Thus the average is pretty skewed.

## 2.1.1: which year was the one with the least/most crashes?

In [None]:
# Crashes per year, most frequent year first (the default ordering of value_counts)
df["Date"].dt.year.value_counts()

## 1908 and 1912 were the years with the least (only 1) crashes --> to no surprise, since aviation was still in its infancy then
## 1972 was the year with the most crashes (104)

# 2.2: on avg. how many months are in between crashes?

In [None]:
# Sort chronologically. sort_values returns a new frame, so the result has to be
# assigned back; otherwise the diff below runs on the unsorted rows.
df = df.sort_values(by="Date", ascending=True).reset_index(drop=True)

# Time elapsed since the previous crash (NaT for the very first row)
df["Time between crashes"] = df["Date"].diff()

# Average gap between two consecutive crashes
df["Time between crashes"].mean()

## On average a plane crashes every 6-7 days

## 2.2.1: what was the longest period in between crashes?

In [None]:
# Longest crash-free interval, taken from the data instead of a number copied
# from a previous output (7020 days in the original run)
longest_gap = df["Time between crashes"].max()

# Convert the whole days of that interval into (365-day) years
years_between_crashes = longest_gap.days / 365
years_between_crashes

## the longest period in between crashes was 7020 days or 19 years!

In [None]:
# Locate the row that ends the longest gap without hard-coding the gap length or
# the row position, then show it together with the row directly before it.
gaps = df["Time between crashes"]
gap_end_pos = df.index.get_loc(gaps.idxmax())  # idxmax skips the NaT of the first row
df.iloc[max(gap_end_pos - 1, 0):gap_end_pos + 1]

# the longest period between crashes was 1971 - 1991

# 2.3: which Operator suffered the most crashes in history?

In [None]:
# Crashes per operator, highest count first (the default ordering of value_counts)
df["Operator"].value_counts()

## Aeroflot (179 crashes) and the U.S. Air Force (176 crashes) suffered the most crashes in history. Being Military operators, that's no surprise. The civil operator with the most crashes is Air France

# 3.1: from the Route column extract start and destination and create new columns for each

In [None]:
# Rows without a route before filtering (1706 in the original run)
df["Route"].isna().sum()

# Keep only the rows that have a route
df = df.loc[df["Route"].notna()]

# Sanity check: no missing routes should remain
df["Route"].isna().sum()

In [None]:
# Boolean mask: True where the route text contains a "-" between its parts
divider = df["Route"].str.contains("-")

df.loc[divider, "Route"]   # 3362 rows that use "-" as divider
df.loc[~divider, "Route"]  # 200 rows that don't use the above pattern

# Still no missing routes after the check
df["Route"].isna().sum()

In [None]:
# Split every route on "-" into one column per part (columns labelled 0, 1, 2, ...)
route = df["Route"].str.split("-", expand=True)

# Compare the shapes of the split result and the source frame
print(route.shape)
print(df.shape)

# Put the split columns next to the original ones (concat aligns rows on the index)
# and name the first two parts
flights = pd.concat([df, route], axis=1).rename(columns={0: "Start", 1: "Destination"})
flights.head()

In [None]:
# Fill missing Destination values from the remaining split columns (2, 3, ...), in order.
# The columns are discovered from the frame instead of being hard-coded, so the cell
# does not fail when the split produced fewer (or more) parts.
extra_route_cols = sorted(
    col for col in flights.columns
    if isinstance(col, (int, np.integer)) and col >= 2
)

# Build the filled column and assign it back explicitly. A chained
# `flights.Destination.fillna(..., inplace=True)` is not guaranteed to write through
# to the frame (pandas Copy-on-Write).
destination = flights["Destination"]
for col in extra_route_cols:
    destination = destination.fillna(flights[col])
flights["Destination"] = destination

flights[["Start", "Destination"]].head()

In [None]:
# Count the destinations that are still missing after the fill step
flights["Destination"].isna().sum()

## these are the 200 rows we earlier identified as not using `-` as delimiter, thus not having a "from - to" route

In [None]:
# Preview a few of the rows whose destination could not be derived from the route
flights.loc[flights["Destination"].isna()].head()

### we can now drop columns 2, 3, 4 and 5

In [None]:
# Remove the leftover split columns. errors="ignore" keeps the cell re-runnable:
# the original in-place drop raised a KeyError once the columns were already gone.
flights = flights.drop(columns=[2, 3, 4, 5], errors="ignore")

In [None]:
# Preview the first five rows of the cleaned frame
flights.head(5)

# 3.2: what´s the most common weekday for crashes?

In [None]:
# Crashes per weekday, most frequent first (pandas numbers weekdays Monday=0 ... Sunday=6)
flights["Date"].dt.weekday.value_counts()

## most crashes happen on Wednesdays, Thursdays and Fridays

# 4.1: on avg. how many passengers survive a crash (if any)? Create a new column called `survival rate`

In [None]:
# Summary statistics for the number of people aboard and the number of fatalities
flights.loc[:, ["Aboard", "Fatalities"]].describe()

In [None]:
# Share of the people aboard who survived each crash.
# Rows with nobody aboard are masked to NaN first: dividing by zero would otherwise
# produce inf/-inf, and a single -inf would wreck the mean of the column.
aboard = flights["Aboard"].where(flights["Aboard"] > 0)
flights["survival rate"] = 1 - flights["Fatalities"] / aboard

In [None]:
# Average survival rate across all crashes, expressed in percent
100 * flights["survival rate"].mean()

## the average survival rate is 17.79%

# Bonus
## - What were the most common causes for crashes? Could be extracted from the Summary column (e.g. most common words)

In [None]:
# split all the paragraphs of a column into a list of words:
#word_list = flights.Summary.str.split()

# the list contains floats (presumably NaN for missing summaries), we need to remove them or else the counter will throw an error:
#no_integers = [x for x in word_list if not isinstance(x, float)]

# flatten the list of lists:
#flat_list = []
#for sublist in no_integers:
#    str(sublist)
#    flat_list.append(sublist)     

In [None]:
# Pass the flat_list to an instance of the Counter class.
#count_dict = {}
#for item in flat_list:
 #   zahler = Counter(item)
  #  count_dict[zahler] = item
# most_common() produces k frequently encountered 
# input values and their respective counts. 
#most_occur = count_dict.most_common(100) 
  
#len(most_occur)