In [1]:
import seaborn as sns
import pandas as pd

titanic_df = sns.load_dataset("titanic")

Some usefull functions for a quick exploration are:
- `head()` shows you the first 5 rows
- `info()` shows the missing values per variable
- `describe()` shows some basic statistics for all the numaric variables

In [2]:
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [4]:
titanic_df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## Transforming Data

Now we will show some example on how to transform your data for exploration

### Filtering

In [5]:
males = titanic_df[titanic_df['sex']=='male']
males.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [6]:
males = titanic_df[titanic_df.sex=='male']
males.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [7]:
pclass_12 = titanic_df[titanic_df['pclass']!=3]
pclass_12.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [8]:
class_list = [1, 2]
pclass_3 = titanic_df[~titanic_df['pclass'].isin(class_list)]
pclass_3.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [9]:
not_null = titanic_df[titanic_df['age'].notnull()]
not_null.shape

(714, 15)

In [10]:
titanic_df['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [11]:
titanic_df[titanic_df['age']>30 & 
           (titanic_df['embarked']=='S')].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Sorting

In [12]:
titanic_df.sort_values(by=['age']).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False
755,1,2,male,0.67,1,1,14.5,S,Second,child,False,,Southampton,yes,False
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False


In [13]:
titanic_df.sort_values(by=['age'], ascending=False).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [14]:
titanic_df.sort_values(by=['pclass', 'age'], ascending=[True, False]).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
745,0,1,male,70.0,1,1,71.0,S,First,man,True,B,Southampton,no,False
54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False


### Slicing

In [15]:
titanic_df.loc[1]

survived               1
pclass                 1
sex               female
age                   38
sibsp                  1
parch                  0
fare             71.2833
embarked               C
class              First
who                woman
adult_male         False
deck                   C
embark_town    Cherbourg
alive                yes
alone              False
Name: 1, dtype: object

In [16]:
titanic_df.loc[1:3]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False


In [17]:
titanic_df.loc[1:4, ['survived', 'parch']]

Unnamed: 0,survived,parch
1,1,0
2,1,0
3,1,0
4,0,0


### Mutating

In [18]:
titanic_df['age_10'] = titanic_df['age'] + 10
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_10
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,32.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,48.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,36.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,45.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,45.0


In [19]:
titanic_df['age_fare'] = titanic_df['age'] + titanic_df['fare']
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_10,age_fare
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,32.0,29.25
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,48.0,109.2833
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,36.0,33.925
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,45.0,88.1
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,45.0,43.05


In [20]:
titanic_sliced = titanic_df.iloc[:, :-3]
titanic_sliced.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no


### Aggregation and Grouping

Aggregation | Description
--- | ---
`count()` | Total number of items
`first()`, `last()` | First and last item
`mean()`, `median()` | Mean and median
`min()`, `max()` | Minimum and maximum
`std()`, `var()` | Standard deviation and variance
`mad()` | Mean absolute deviation
`prod()` | Product of all items
`sum()` | Sum of all items

In [21]:
titanic_df.sum()['survived']

342

In [22]:
titanic_df.groupby('sex')['survived'].sum()

sex
female    233
male      109
Name: survived, dtype: int64

In [23]:
titanic_df.groupby('sex')['age'].agg([('mean_age', 'mean'), ('min_age', 'min')])

Unnamed: 0_level_0,mean_age,min_age
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,27.915709,0.75
male,30.726645,0.42


### Use your own functions

In [26]:
import numpy as np

def my_function(x):
    return np.sum(x) + 10

titanic_df.groupby('sex')['age'].agg(my_function)

sex
female     7296.00
male      13929.17
Name: age, dtype: float64

In [27]:
titanic_df.groupby('sex')['age'].agg(lambda x: np.sum(x) + 10)

sex
female     7296.00
male      13929.17
Name: age, dtype: float64

### Multiple output Trick

The default behaviour of jupyter notebook is that is only show the last output. If you want to see all the outputs from a cell you can run the following two lines of code:

In [28]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### reset_index()

A usefull function if you want to use the index column for sorting for example.

In [29]:
res = titanic_df.groupby('sex').size()
type(res)
res

pandas.core.series.Series

sex
female    314
male      577
dtype: int64

In [30]:
res = titanic_df.groupby('sex').size().reset_index(name="counts")
type(res)
res

pandas.core.frame.DataFrame

Unnamed: 0,sex,counts
0,female,314
1,male,577
