In [58]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Import titanic dataset
# Reads "titanic_data.csv" (path relative to the working directory) into a DataFrame.
titanic_data = pd.read_csv("titanic_data.csv")
# Bare last expression: the notebook renders the frame as the cell output.
titanic_data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S,3,?,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S,10,?,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S,?,?,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S,D,?,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,?,C,?,22,"Montevideo, Uruguay"


In [59]:
# Print each column's dtype and non-null count
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [60]:
# Convert every value in the 'sex' column to a Python string
titanic_data['sex']= titanic_data['sex'].map(str)

## Filling Missing Data
Let's replace the missing values in the age column with the mean value.


In [61]:
# Raise the row display limit first, so the full list of values below is not truncated
pd.options.display.max_rows = 4000

# Check for missing values: list every distinct value in 'age' with its frequency.
# This must be the last expression in the cell; a notebook only displays the value
# of the final expression, so placing it earlier shows nothing.
titanic_data.age.value_counts()

In [62]:
# Is '?' considered a null value? Count the nulls pandas detects in each column.
titanic_data.isna().sum()

pclass       0
survived     0
name         0
sex          0
age          0
sibsp        0
parch        0
ticket       0
fare         0
cabin        0
embarked     0
boat         0
body         0
home.dest    0
dtype: int64

In [63]:
# Swap every '?' placeholder for None so pandas treats it as missing
titanic_data = titanic_data.replace({'?': None})

# Change the type of the age column to numeric (unparseable entries become NaN)
titanic_data['age'] = pd.to_numeric(titanic_data['age'], errors='coerce')

# Count the null values in each column again
titanic_data.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

Let's replace the missing age values with the average age of the people of the same gender (the `sex` column).

In [64]:
# Count how many passengers fall into each category of the 'sex' column
titanic_data['sex'].value_counts()

male      843
female    466
Name: sex, dtype: int64

In [65]:
# Create a dictionary mapping each sex to the mean age of its passengers.
# Series.to_dict() returns plain Python floats as values; dict(series) would keep
# NumPy scalars, whose repr differs between NumPy versions.
gender_ages = titanic_data.groupby('sex')['age'].mean().to_dict()
gender_ages

{'female': 28.6870706185567, 'male': 30.585232978723408}

In [None]:
# Look up the mean age stored under the 'male' key
gender_ages['male']

In [66]:
# Create a column holding, for every row, the mean age of that passenger's sex.
# Series.map with a dict does the lookup without a Python-level lambda.
titanic_data['age_mean'] = titanic_data['sex'].map(gender_ages)

# Replace all missing ages with the value in this column.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that is chained in-place assignment, which emits a warning in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
titanic_data['age'] = titanic_data['age'].fillna(titanic_data['age_mean'])

# Check the age column again
titanic_data['age'].value_counts()

30.585233    185
28.687071     78
24.000000     47
22.000000     43
21.000000     41
30.000000     40
18.000000     39
25.000000     34
28.000000     32
36.000000     31
26.000000     30
27.000000     30
29.000000     30
19.000000     29
23.000000     26
32.000000     24
20.000000     23
31.000000     23
35.000000     23
33.000000     21
45.000000     21
17.000000     20
39.000000     20
16.000000     19
42.000000     18
40.000000     18
34.000000     16
50.000000     15
38.000000     14
48.000000     14
47.000000     14
2.000000      12
41.000000     11
4.000000      10
1.000000      10
44.000000     10
54.000000     10
9.000000      10
49.000000      9
43.000000      9
37.000000      9
51.000000      8
14.000000      8
55.000000      8
3.000000       7
60.000000      7
8.000000       6
6.000000       6
58.000000      6
46.000000      6
15.000000      6
52.000000      6
64.000000      5
57.000000      5
62.000000      5
5.000000       5
61.000000      5
13.000000      5
56.000000     

In [None]:
# Exercise 1
# Think about other ways to replace the missing values in the age column.
# What other information could be used to fill in the missing values?


## Skewed Data

Creating a histogram

In [None]:
%matplotlib inline
# Create a histogram
plt.hist(titanic_data['age'], density=True, bins=25)
plt.title("Age: Feature Distribution")
plt.xlabel("Age")
plt.ylabel("No of records")


In [None]:
# Summary statistics of the dataset, to compare against the histogram above
titanic_data.describe()

In [None]:
# Create a copy of our dataset so the original ages stay untouched
titanic_data_transformed = titanic_data.copy()

# Apply the log transformation log(1 + age) to the age column.
# np.log1p works on the whole column at once (no row-by-row lambda) and is
# accurate for ages close to zero.
titanic_data_transformed['age'] = np.log1p(titanic_data_transformed['age'])

# Visualize the new log distribution.
# The x-axis now holds log(age + 1), and density=True makes the y-axis a density.
plt.hist(titanic_data_transformed['age'], density=True, bins=30)
plt.title("Age: Feature Distribution")
plt.xlabel("log(Age + 1)")
plt.ylabel("Density")
plt.show()


In [None]:
# Summary statistics of the copy whose 'age' column was log-transformed
titanic_data_transformed.describe()

## Outliers

In [None]:
# Compute the 25th, 50th and 75th percentiles (the quartiles) of the age column
age_percentiles = np.percentile(titanic_data['age'], q=[25, 50, 75])

# Show the three values
print(age_percentiles)

- These percentiles split our data into buckets and show where the most common ages lie.

In [None]:
# import seaborn
import seaborn as sns

# Create box plot with Seaborn's default settings.
# Keep the returned Axes in a named variable instead of `_`: in IPython, `_` holds
# the previous cell output, and assigning to it stops that from being updated.
ax = sns.boxplot(x='age', data=titanic_data)

# Label the x-axis
ax.set_xlabel('age')

# Show the plot
plt.show()

In [None]:
# Summary statistics of the dataset, for reference alongside the box plot above
titanic_data.describe()

In [None]:
# Remove outliers: keep only the rows whose age is strictly below 65
df_no_outliers = titanic_data.loc[titanic_data['age'].lt(65)]

# Frequency of each remaining age value
df_no_outliers['age'].value_counts()

In [None]:
# Create box plot with Seaborn's default settings, this time on the filtered data.
# Keep the returned Axes in a named variable instead of `_`: in IPython, `_` holds
# the previous cell output, and assigning to it stops that from being updated.
ax = sns.boxplot(x='age', data=df_no_outliers)

# Label the x-axis
ax.set_xlabel('age')

# Show the plot
plt.show()

In [None]:
# Calculate the new summary statistics on the frame with the outliers removed
df_no_outliers.describe()

In [None]:
# Exercise 2
# Remove the outliers from the 'fare' feature

## DateTime Manipulation

In [None]:
# Import datetime
import datetime as dt

# Import data with datetime (appointment data from a doctor's office)
# Reads "datetime_data.csv" (path relative to the working directory) into a DataFrame.
date_df = pd.read_csv("datetime_data.csv")
# Preview the first rows of the loaded frame
date_df.head()

In [None]:
# Print each column's dtype and non-null count
date_df.info()

- Note that date features like "AppointmentDay" and "ScheduledDay" have the dtype "object" instead of a datetime type.
- This is the case because pandas is currently treating these values as strings instead of dates.
- To convert these features into datetime we can simply use the pandas function `to_datetime`.

In [None]:
# Convert AppointmentDay into datetime
date_df['AppointmentDay'] = pd.to_datetime(date_df['AppointmentDay'])

# Convert ScheduledDay into datetime.
# Parse the column's own values: reading from 'AppointmentDay' here would
# overwrite every scheduling date with the appointment date.
date_df['ScheduledDay'] = pd.to_datetime(date_df['ScheduledDay'])

# Confirm that both columns now have a datetime dtype
date_df.info()

Now that you have these values with datetime type, you can use them to create a better analysis.
- datetime64 has several properties in pandas that we can now use 

Link: https://pandas.pydata.org/pandas-docs/version/0.23/api.html#datetimelike-properties

In [None]:
# Add the calendar year in which each appointment was scheduled
date_df['year_set'] = date_df['ScheduledDay'].dt.year

# Add the ordinal day of the year on which each appointment was scheduled
date_df['day_num'] = date_df['ScheduledDay'].dt.dayofyear

# Show the 26 rows with the lowest 'Age' values, including the two new columns
date_df.sort_values('Age').head(26)

In [None]:
# Exercise 3
# Find the week of the year corresponding to the 'ScheduledDay' feature
# (Hint: use the link above to find the method necessary to solve this exercise)