In [90]:
import pandas as pd
import numpy as np

In [91]:
df = pd.read_csv(
    "../data/celebrity_deaths_2016.csv", usecols=["dateofdeath", "age", "causeofdeath"]
)
df.head()

Unnamed: 0,dateofdeath,age,causeofdeath
0,2016-01-01,71,brain cancer
1,2016-01-01,74,cancer
2,2016-01-01,79,cancer
3,2016-01-01,45,complications of a stroke
4,2016-01-01,83,heart failure


In [92]:
# how many of the different columns have na values?
total_records = df.shape[0]
na_dates = df.loc[df["dateofdeath"].isna()].shape[0]
na_age = df.loc[df["age"].isna()].shape[0]
na_cause = df.loc[df["causeofdeath"].isna()].shape[0]
print(f"Null stats of {total_records}:")
print(f"\tDates: {na_dates}")
print(f"\tAges: {na_age}")
print(f"\tCause: {na_cause}")

Null stats of 6543:
	Dates: 0
	Ages: 27
	Cause: 5008


In [93]:
# extract the month from the dateofdeath column and set it as the index
df["monthofdeath"] = df["dateofdeath"].str.slice(5, 7).astype(np.int8)
df = df.set_index("monthofdeath").sort_index()
df.head()

Unnamed: 0_level_0,dateofdeath,age,causeofdeath
monthofdeath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2016-01-01,71,brain cancer
1,2016-01-21,47,heart disease
1,2016-01-21,87,pneumonia
1,2016-01-21,90,pulmonary distress
1,2016-01-21,73,stroke


# finding the average age of death
My approach without reference to the book first

In [94]:
# average age of death for February to July (2-7) dropping any null values
# but, we have some bad data
# let's look at what values are not good first
good_ages = df["age"].astype(str).str.strip().str.isdigit()
# df[~good_ages]

# now we know which ages have numerical data in them we can
# calculate the mean
ages = df[good_ages].loc[slice(2, 7), "age"].astype(np.int16)
# which is a huge mean of 110, and there were some numbers in there
# that looked like "between X and Y" but had no separator so didn't
# get filled out
ages.describe()

count    3322.000000
mean      110.058098
std       475.118996
min         9.000000
25%        69.000000
50%        82.000000
75%        89.000000
max      9394.000000
Name: age, dtype: float64

In [95]:
# book approach
# uses the pd.to_numeric(df["age"]) function
# "coerce" turns exceptions on non-numerics into NaN instead
df["book_age"] = pd.to_numeric(df["age"], errors="coerce")
# recommends this for filtering ages, but since the four digit ages
# seem like ranges, could also be compared to some further massaging
# of those ranges into values halfway between the two
df_filtered = df.loc[df["book_age"] < 120]
df_filtered.loc[slice(2, 7), "book_age"].describe()

count    3304.000000
mean       77.178874
std        16.292004
min         9.000000
25%        69.000000
50%        81.000000
75%        89.000000
max       116.000000
Name: book_age, dtype: float64

In [96]:
# let's look at causes of death
df["causeofdeath"].dropna().value_counts().head(15)

# somewhat clunky way of looking at how many pneumonia deaths there were by month
# have to reset the index to be able to access the month column again
# df.reset_index()[df.reset_index()["causeofdeath"].str.strip() == "pneumonia"]["monthofdeath"].value_counts()

causeofdeath
cancer                                 248
heart attack                           125
traffic collision                       56
lung cancer                             51
pneumonia                               50
heart failure                           49
shot                                    42
stroke                                  36
pancreatic cancer                       35
brain cancer                            33
injuries sustained in a plane crash     23
Alzheimer's disease                     22
prostate cancer                         17
leukemia                                17
cardiac arrest                          17
Name: count, dtype: int64

# Extension questions
1. Add a new column, `day` from the day of the month in which the celebrity died. Then create a multi-index from `month` and `day`. What was the average age of death from Feb 15 through July 15?
2. The CSV file contains `causeofdeath`. Load it into a data frame and find the five most common causes of death. Now replace any NaN values in that column with the string `"unknown"` and find the five most common causes of death.
3. If someone asked whether cancer is in the top 10 causes what would you say? Can you be more specific than that?

In [97]:
# 1. reloading and creating a multi-index
df = pd.read_csv(
    "../data/celebrity_deaths_2016.csv", usecols=["dateofdeath", "age", "causeofdeath"]
)
df["month"] = df["dateofdeath"].str.slice(5, 7).astype(np.int8)
df["day"] = df["dateofdeath"].str.slice(8, 10).astype(np.int8)
df = df.set_index(["month", "day"]).sort_index()
# update the age to numeric
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,dateofdeath,age,causeofdeath
month,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,29,2016-12-29,83.0,
12,29,2016-12-29,88.0,
12,29,2016-12-29,88.0,
12,29,2016-12-29,89.0,
12,29,2016-12-29,53.0,cancer


In [None]:
df2 = df.dropna(subset=["age"])
df2 = df2.loc[df2["age"] < 120]
df2.loc[(slice(2, 7), slice(1, 15)), "age"].describe()


count    2040.000000
mean       76.919118
std        16.248524
min        11.000000
25%        70.000000
50%        81.000000
75%        88.000000
max       116.000000
Name: age, dtype: float64

In [99]:
# 2. top causes of death (without replacing NaN)
df["causeofdeath"].dropna().value_counts().head(5)

causeofdeath
cancer               248
heart attack         125
traffic collision     56
lung cancer           51
pneumonia             50
Name: count, dtype: int64

In [100]:
# now filling na with "unknown"
df["causeofdeath"].fillna("unknown").value_counts().head(5)

causeofdeath
unknown               5008
 cancer                248
 heart attack          125
 traffic collision      56
 lung cancer            51
Name: count, dtype: int64

In [101]:
has_cancer = df["causeofdeath"].dropna()
has_cancer = has_cancer[has_cancer.str.contains("cancer")]
print(f"Causes containing 'cancer': {has_cancer.shape[0]}")
has_cancer.value_counts()

Causes containing 'cancer': 529


causeofdeath
 cancer                               248
 lung cancer                           51
 pancreatic cancer                     35
 brain cancer                          33
 prostate cancer                       17
                                     ... 
 gastrointestinal cancer                1
complications from bladder cancer       1
 complications from bladder cancer      1
 abdominal cancer                       1
liver cancer                            1
Name: count, Length: 64, dtype: int64

You could say that cancer and cancer related causes were the top known cause of death in the data set.
Of those, lung cancer was the most common specified type of cancer.

In [None]:
# this is the book version of the first question, that leaves the month and day as strings
# - not quite sure why the answer is different - need to dig into the sorting methods
filename = "../data/celebrity_deaths_2016.csv"

df = pd.read_csv(filename, usecols=["dateofdeath", "age"])
# Clean all non-integers from the "age" column
# (1) Remove all NaNs
df = df.dropna(subset=["age"])

# (2) Remove all non-digits
df = df[df["age"].str.isdigit()]
df["age"] = df["age"].astype(np.int64)

# Get the month, in slice [5:7]
df["month"] = df["dateofdeath"].str.slice(5, 7)

# Get the day, in slice [8:]
df["day"] = df["dateofdeath"].str.slice(8, None)

# Set a multi-index
df = df.set_index(["month", "day"])

# Sort the index
df = df.sort_index()

# Get the rows from Feb 15th through July 15th, and the 'age' column, then the average
df.loc[("02", "15") : ("07", "15"), "age"].mean()


np.float64(77.05183037332367)