## Today, we will be learning how to use four essential pandas functions to generate descriptive data statistics.

This tutorial pairs with a Medium article by Cornellius Yudha Wijaya (June 12, 2020)

In [1]:
## first, import all your necessary libraries

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
## code from @pylang at https://github.com/jupyter/notebook/issues/2790 on behalf of jupyter nb tweet embedding
class Tweet(object):
    def __init__(self, embed_str=None):
        self.embed_str = embed_str

    def _repr_html_(self):
        return self.embed_str

## the tweet referenced in the bit of text below
s = ("""
<blockquote class="twitter-tweet"><p lang="en" dir="ltr">🐧🐧🐧<br><br>This penguin data is a great alternative to iris &amp; available for use by CC0 🤩 Thank you Dr. Kristen Gorman w/ <a href="https://twitter.com/UAFcfos?ref_src=twsrc%5Etfw">@UAFcfos</a>, Marty Downs w/ <a href="https://twitter.com/USLTER?ref_src=twsrc%5Etfw">@USLTER</a>, &amp; <a href="https://twitter.com/PalmerLTER?ref_src=twsrc%5Etfw">@PalmerLTER</a> for help, info &amp; making it available for use 🎉<br><br>Data, examples, &amp; use info here: <a href="https://t.co/dSIqWNFlVw">https://t.co/dSIqWNFlVw</a> 🧵 1/6 <a href="https://t.co/2Eu4AxoeZl">pic.twitter.com/2Eu4AxoeZl</a></p>&mdash; Allison Horst (@allison_horst) <a href="https://twitter.com/allison_horst/status/1270046399418138625?ref_src=twsrc%5Etfw">June 8, 2020</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
""")

## This data is newly released in response to a popular teaching dataset controversially having eugenics affiliations...always dig into your sources, no matter how attractive!

In [3]:
Tweet(s)

In [4]:
## access the data from Allison's github linked in her tweet. We want the csv for pandas manipulation.
data_url = 'https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/data-raw/penguins_raw.csv'
data = pd.read_csv(data_url,sep=",")

## let's preview the data
data.head(5)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,2007-11-11,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,2007-11-16,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


In [5]:
## get an overall idea of columns' datatypes and sparsity

data.head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 17 columns):
studyName              5 non-null object
Sample Number          5 non-null int64
Species                5 non-null object
Region                 5 non-null object
Island                 5 non-null object
Stage                  5 non-null object
Individual ID          5 non-null object
Clutch Completion      5 non-null object
Date Egg               5 non-null object
Culmen Length (mm)     4 non-null float64
Culmen Depth (mm)      4 non-null float64
Flipper Length (mm)    4 non-null float64
Body Mass (g)          4 non-null float64
Sex                    4 non-null object
Delta 15 N (o/oo)      3 non-null float64
Delta 13 C (o/oo)      3 non-null float64
Comments               2 non-null object
dtypes: float64(6), int64(1), object(10)
memory usage: 808.0+ bytes


## 1. pipe()

In [6]:
%%time

## function to extract the binomial nomenclature, i.e. scientific species name, as its own column
def extract_binomial_nomenclature(df):
    df['binomial_nomenclature'] = df['Species'].str.split('(').str.get(1).str.slice(stop=-1)
    return df

## function to extract the simplified species name i.e. first descriptor term of a given species
def extract_simple_species(df):
    df['species_simplified'] = df['Species'].str.split(' ').str.get(0)
    return df

## function to make recombined species name that no longer includes 'Penguin'
def add_binomial_to_simplified(df):
    df['species_recombined'] = df['species_simplified'] + " (" + df['binomial_nomenclature'] + ")"

## method stringing in pandas
add_binomial_to_simplified(extract_simple_species(extract_binomial_nomenclature(data)))
data.head()

CPU times: user 9.1 ms, sys: 7.53 ms, total: 16.6 ms
Wall time: 25.1 ms


Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments,binomial_nomenclature,species_simplified,species_recombined
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,2007-11-11,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,2007-11-16,,,,,,,,Adult not sampled.,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)


In [7]:
%%time

data.pipe(extract_binomial_nomenclature).pipe(extract_simple_species).pipe(add_binomial_to_simplified)

CPU times: user 7.66 ms, sys: 583 µs, total: 8.25 ms
Wall time: 9.82 ms


## 2. apply()

In [8]:
#Selecting only the numerical columns then applying mean function to each column

data.select_dtypes('number').apply(np.mean)

Sample Number            63.151163
Culmen Length (mm)       43.921930
Culmen Depth (mm)        17.151170
Flipper Length (mm)     200.915205
Body Mass (g)          4201.754386
Delta 15 N (o/oo)         8.733382
Delta 13 C (o/oo)       -25.686292
dtype: float64

In [9]:
def mm_to_in(col):
    ## 25.4 mm in an inch
    return (col.mean())/25.4

data.loc[:, data.columns.str.endswith(" (mm)")].apply(mm_to_in)

Culmen Length (mm)     1.729210
Culmen Depth (mm)      0.675243
Flipper Length (mm)    7.910047
dtype: float64

## 3. agg()

In [10]:
data.loc[:, data.columns.str.endswith(" (mm)")].agg(['mean', 'std', mm_to_in])

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm)
mean,43.92193,17.15117,200.915205
std,5.459584,1.974793,14.061714
mm_to_in,1.72921,0.675243,7.910047


In [11]:
data[['Culmen Length (mm)', 'Body Mass (g)']].agg({'Culmen Length (mm)': ['mean',mm_to_in], 'Body Mass (g)': 'std'})

Unnamed: 0,Culmen Length (mm),Body Mass (g)
mean,43.92193,
mm_to_in,1.72921,
std,,801.954536


## 4. applymap()

In [12]:
data.applymap(lambda x: len(str(x)))

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments,binomial_nomenclature,species_simplified,species_recombined
0,7,1,35,6,9,18,4,3,10,4,4,5,6,4,3,3,30,18,6,27
1,7,1,35,6,9,18,4,3,10,4,4,5,6,6,7,9,3,18,6,27
2,7,1,35,6,9,18,4,3,10,4,4,5,6,6,17,9,3,18,6,27
3,7,1,35,6,9,18,4,3,10,3,3,3,3,3,3,3,18,18,6,27
4,7,1,35,6,9,18,4,3,10,4,4,5,6,6,7,9,3,18,6,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,7,2,41,6,5,18,5,3,10,4,4,5,6,4,7,9,3,21,9,33
340,7,2,41,6,5,18,5,2,10,4,4,5,6,6,7,19,37,21,9,33
341,7,2,41,6,5,18,5,2,10,4,4,5,6,4,6,9,37,21,9,33
342,7,2,41,6,5,18,6,3,10,4,4,5,6,4,7,9,3,21,9,33


In [13]:
data

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments,binomial_nomenclature,species_simplified,species_recombined
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,2007-11-11,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,2007-11-16,,,,,,,,Adult not sampled.,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,,Pygoscelis adeliae,Adelie,Adelie (Pygoscelis adeliae)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,PAL0910,64,Chinstrap penguin (Pygoscelis antarctica),Anvers,Dream,"Adult, 1 Egg Stage",N98A2,Yes,2009-11-19,55.8,19.8,207.0,4000.0,MALE,9.70465,-24.53494,,Pygoscelis antarctica,Chinstrap,Chinstrap (Pygoscelis antarctica)
340,PAL0910,65,Chinstrap penguin (Pygoscelis antarctica),Anvers,Dream,"Adult, 1 Egg Stage",N99A1,No,2009-11-21,43.5,18.1,202.0,3400.0,FEMALE,9.37608,-24.40753,Nest never observed with full clutch.,Pygoscelis antarctica,Chinstrap,Chinstrap (Pygoscelis antarctica)
341,PAL0910,66,Chinstrap penguin (Pygoscelis antarctica),Anvers,Dream,"Adult, 1 Egg Stage",N99A2,No,2009-11-21,49.6,18.2,193.0,3775.0,MALE,9.46180,-24.70615,Nest never observed with full clutch.,Pygoscelis antarctica,Chinstrap,Chinstrap (Pygoscelis antarctica)
342,PAL0910,67,Chinstrap penguin (Pygoscelis antarctica),Anvers,Dream,"Adult, 1 Egg Stage",N100A1,Yes,2009-11-21,50.8,19.0,210.0,4100.0,MALE,9.98044,-24.68741,,Pygoscelis antarctica,Chinstrap,Chinstrap (Pygoscelis antarctica)


In [None]:
## map on a Series obj same as applymap on a DataFrame

data['Individual ID'].apply(lambda x: len(str(x)))

# Pt. 2: Penguins by the Sea

Now that we have an idea of how to do some data grouping for targeted statistics, let's get some visuals going using seaborn!

In [None]:
ax = sns.scatterplot(x=data['Flipper Length (mm)'], y=data['Body Mass (g)'], hue=data["species_simplified"], style=data["species_simplified"])
legend = ax.legend()
legend.texts[0].set_text("Species")
ax.set_title("Penguin size, Palmer Station LTER")


In [None]:
dist = sns.catplot(x="species_simplified", y="Culmen Depth (mm)", hue="Species", kind="swarm", data=data)
dist.set_xticklabels(rotation=90)
dist.set(xlabel='Species')

In [None]:
bins = np.linspace(min(data["Culmen Depth (mm)"]), max(data["Culmen Depth (mm)"]), num=15)

ax1 = sns.distplot(data["Culmen Depth (mm)"][data['species_simplified']== "Adelie"], bins=bins, 
                   kde=False, hist_kws=dict(edgecolor='k', lw=1))
ax2 = sns.distplot(data["Culmen Depth (mm)"][data['species_simplified']== "Gentoo"], bins=bins, 
                   color='Red', kde=False, hist_kws=dict(edgecolor='k', lw=1))
ax3 = sns.distplot(data["Culmen Depth (mm)"][data['species_simplified']== "Chinstrap"], bins=bins, 
                   color='Yellow', kde=False, hist_kws=dict(edgecolor='k', lw=1))