In [None]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic passenger records from CSV and preview the first five rows
titanic_data = pd.read_csv("titanic_data.csv")
titanic_data.head()

In [None]:
# Clean the data in one chain:
#   1. parse 'age' as a number (entries that cannot be parsed become NaN),
#   2. drop every row that still contains a missing value,
#   3. keep only the rows whose age is greater than 1.
# NOTE(review): dropna() with no arguments removes rows with a NaN in ANY column,
# not just 'age' -- confirm that this is intended.
titanic_data = (
    titanic_data
    .assign(age=pd.to_numeric(titanic_data['age'], errors='coerce'))
    .dropna()
    .loc[lambda frame: frame['age'] > 1]
)
titanic_data.head(10)

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage
titanic_data.info()

#### Quick Introduction to histograms....

In [None]:
# Size the current figure to 10x5 inches, then draw the age histogram onto it
fig = plt.gcf()
fig.set_size_inches(10, 5)
_ = plt.hist(titanic_data['age'], bins=10)

# Label both axes in a single call on the current Axes
_ = plt.gca().set(xlabel='age', ylabel='number of people')

- Note that the histogram is very hard to read
- Note that plt.hist returns three objects (the bin counts, the bin edges, and the drawn patches), but we only want the plot, so we assign the return value to a dummy variable `_`
- The bin edges were computed automatically by plt.hist(); the bins=10 we passed is the same as its default number of bins
- We can specify the bins explicitly to modify the plot and make it more readable


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
titanic_data.describe()

In [None]:
# Quartiles of passenger age: the 25th, 50th (median) and 75th percentiles
quartile_levels = [25, 50, 75]
age_percentiles = np.percentile(titanic_data['age'], quartile_levels)

# Show the three percentile values
print(age_percentiles)

In [None]:
# Draw a box plot of the age data

# Create the box plot with Seaborn's default settings and keep the Axes it returns
ax = sns.boxplot(data=titanic_data, x='age')

# Label the x-axis
ax.set_xlabel('age')

# Show the plot
plt.show()

In [None]:
### Outliers

In [None]:
# Remove outliers: keep only the rows whose age is below 65.
# .copy() makes the result an independent DataFrame, so assigning to one of its
# columns later does not trigger pandas' SettingWithCopyWarning.
df_no_outliers = titanic_data[titanic_data['age'] < 65].copy()

# Summary statistics (count, mean, std, quartiles, ...) with the outliers removed
df_no_outliers.describe()

In [None]:
# Size the current figure to 10x5 inches, then draw the histogram of the
# outlier-free ages onto it
fig = plt.gcf()
fig.set_size_inches(10, 5)
_ = plt.hist(df_no_outliers['age'], bins=10)

# Label both axes in a single call on the current Axes
_ = plt.gca().set(xlabel='age', ylabel='number of people')

In [None]:
### Missing Values

In [None]:
# Simulate missing data: treat every age equal to 29 as missing (NaN)
# First build a series of ages in which every value equal to 29 is replaced with NaN
age_wo_29 = titanic_data['age'].replace(29, np.nan)

# Swap the new series in as the 'age' column. assign() aligns the series on the
# index (rows that are not in df_no_outliers are ignored) and returns a new
# DataFrame, so we never write into a filtered slice of titanic_data and pandas
# does not raise a SettingWithCopyWarning.
df_no_outliers = df_no_outliers.assign(age=age_wo_29)

df_no_outliers.head(100)

In [None]:
# Recalculate the summary statistics of df_no_outliers
# (describe() leaves NaN values out of its calculations)
df_no_outliers.describe()

In [None]:
### Date Time

In [None]:
# Import datetime
import datetime as dt

# Load the data that contains date columns (appointment data from a doctor's office)
# and preview the first five rows
date_df = pd.read_csv("datetime_data.csv")
date_df.head()

In [None]:
# Print each column's dtype and non-null count to see how pandas parsed the file
date_df.info()

In [None]:
- Note that date features like "AppointmentDay" and "ScheduledDay" show up as "object" instead of datetime.
- This is the case because pandas is currently treating these values as strings instead of dates
- To convert these features into datetime we can simply use the pandas function to_datetime

In [None]:
# Convert AppointmentDay into datetime
date_df['AppointmentDay'] = pd.to_datetime(date_df['AppointmentDay'])

# Convert ScheduledDay into datetime. Parse ScheduledDay's own values: parsing
# AppointmentDay here would overwrite ScheduledDay with the appointment dates.
date_df['ScheduledDay'] = pd.to_datetime(date_df['ScheduledDay'])

# Both columns should now be listed with a datetime dtype
date_df.info()

In [None]:
Now that you have these values with datetime type, you can use them to create a better analysis.
- datetime64 has several properties in pandas that we can now use 

link:https://pandas.pydata.org/pandas-docs/version/0.23/api.html#datetimelike-properties

In [None]:
# Work from the ScheduledDay column for both derived fields
scheduled_on = date_df['ScheduledDay']

# year_set: the calendar year of ScheduledDay
date_df['year_set'] = scheduled_on.dt.year

# day_num: the day of the year of ScheduledDay
date_df['day_num'] = scheduled_on.dt.dayofyear

# Show the first 26 rows when ordered by Age (ascending)
date_df.sort_values('Age').head(26)