Our goal today is to introduce you to some details about Python that we will need. We will do it with a series of questions. You should open up a Jupyter notebook either in the Jupyter Hub (use the scipy one) or on your local computer.

In [1]:
# import our standard set of packages

import urllib
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt

In [21]:
# Let's continue exploring the penguins data set (our first Case Study that we will work on together).

# Read the CSV file found at penguins_url into a pandas dataframe.
penguins_url = 'https://drive.google.com/uc?export=download&id=1-SiGKvihMs9sP2I2FZd-sVRm-VnZFihi'
penguins_data = pd.read_csv(penguins_url)
# Ending a cell with a bare expression displays its value below the cell.
penguins_data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [22]:
# We can sort a dataframe by the values of one of the features.
# The ascending parameter identifies whether to sort from smallest to largest (True) or vice-versa (False).
# The result is a sorted copy: penguins_data itself keeps its original row order.
# Rows with a missing value in the sort column (such as row 3) are placed at the end.

penguins_data.sort_values('bill_length_mm', ascending=True)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
142,Adelie,Dream,32.1,15.5,188.0,3050.0,female,2009
98,Adelie,Dream,33.1,16.1,178.0,2900.0,female,2008
70,Adelie,Torgersen,33.5,19.0,190.0,3600.0,female,2008
92,Adelie,Dream,34.0,17.1,185.0,3400.0,female,2008
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,2007
...,...,...,...,...,...,...,...,...
253,Gentoo,Biscoe,55.9,17.0,228.0,5600.0,male,2009
293,Chinstrap,Dream,58.0,17.8,181.0,3700.0,female,2007
185,Gentoo,Biscoe,59.6,17.0,230.0,6050.0,male,2007
3,Adelie,Torgersen,,,,,,2007


In [23]:
# We can drop a feature (a column).

# Note that for some commands we need to identify whether we wish to perform the command on the
# vertical or horizontal axis: axis=0 refers to rows; axis=1 refers to columns.
# The result is not assigned back here, so penguins_data still contains the 'sex' column afterwards.

penguins_data.drop('sex', axis=1)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,2007
3,Adelie,Torgersen,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,2007
...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,2009


In [24]:
# Note above that two of our penguins are missing almost all of their measurements.
# Let's drop them from the dataset.

# Index labels of the two rows to remove.
rows_missing_measurements = [3, 271]
penguins_data = penguins_data.drop(rows_missing_measurements, axis=0)

# Sort again by bill length to check that the rows with missing values are gone.
penguins_data.sort_values('bill_length_mm', ascending=True)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
142,Adelie,Dream,32.1,15.5,188.0,3050.0,female,2009
98,Adelie,Dream,33.1,16.1,178.0,2900.0,female,2008
70,Adelie,Torgersen,33.5,19.0,190.0,3600.0,female,2008
92,Adelie,Dream,34.0,17.1,185.0,3400.0,female,2008
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,2007
...,...,...,...,...,...,...,...,...
267,Gentoo,Biscoe,55.1,16.0,230.0,5850.0,male,2009
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
253,Gentoo,Biscoe,55.9,17.0,228.0,5600.0,male,2009
293,Chinstrap,Dream,58.0,17.8,181.0,3700.0,female,2007


Note that if you rerun the previous cell, you will get an error (a KeyError), because the rows with index 3 and 271 have already been removed.

In [26]:
# You can refer to individual parts of a dataframe using .loc[] or .iloc[].
# Here .loc[] selects the row whose index label is 339; the ":" selects all of its columns.

penguins_data.loc[339, :]

species              Chinstrap
island                   Dream
bill_length_mm            55.8
bill_depth_mm             19.8
flipper_length_mm          207
body_mass_g               4000
sex                       male
year                      2009
Name: 339, dtype: object

In [25]:
# We can produce a dataframe that is only the penguins from Biscoe Island.

# Boolean mask: True for each row whose 'island' value is 'Biscoe'.
on_biscoe = penguins_data.loc[:, 'island'] == 'Biscoe'

# Keep only the rows where the mask is True, together with all of their columns.
biscoe_data = penguins_data.loc[on_biscoe, :]
biscoe_data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
20,Adelie,Biscoe,37.8,18.3,174.0,3400.0,female,2007
21,Adelie,Biscoe,37.7,18.7,180.0,3600.0,male,2007
22,Adelie,Biscoe,35.9,19.2,189.0,3800.0,female,2007
23,Adelie,Biscoe,38.2,18.1,185.0,3950.0,male,2007
24,Adelie,Biscoe,38.8,17.2,180.0,3800.0,male,2007
...,...,...,...,...,...,...,...,...
270,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,female,2009
272,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,female,2009
273,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,male,2009
274,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,female,2009
