## Lesson 11 - Pandas Part I

In this lesson we will cover some basic features of [Pandas](http://pandas.pydata.org):

* Series
* DataFrame
* index, columns
* dtypes, info, describe
* read_csv
* head, tail
* Indexing with bracket/dot notation, loc, iloc
* transpose
* to_csv, to_excel, read_excel
* to_datetime

In [1]:
import pandas as pd
import numpy as np

### Series

In [2]:
# a plain Python list of strings, used below as the data for a Series
my_list = ['cubs', 'pirates', 'giants', 'yankees', 'donkeys']
my_list

['cubs', 'pirates', 'giants', 'yankees', 'donkeys']

In [3]:
# pandas Series from list
# no index is passed, so the entries are labeled with the integers 0 to 4
series_from_list = pd.Series(my_list)
series_from_list

0       cubs
1    pirates
2     giants
3    yankees
4    donkeys
dtype: object

In [4]:
# indexing a Series is similar to lists and arrays
# (with the default integer index shown above, the label 3 is also the fourth position)
series_from_list[3]

'yankees'

In [5]:
# a numpy array of 5 random floats drawn uniformly from [0, 1)
# NOTE: no random seed is set above, so these values change every time the cell is run
my_array = np.random.rand(5)
my_array

array([0.30524267, 0.60331929, 0.78050443, 0.34128171, 0.42774023])

In [6]:
# pandas Series from array
# the Series keeps the array's float64 dtype and gets the default integer index
series_from_array = pd.Series(my_array)
series_from_array

0    0.305243
1    0.603319
2    0.780504
3    0.341282
4    0.427740
dtype: float64

In [7]:
# indexing supports lists
# a list of labels returns a Series containing just those entries
series_from_array[[1, 3]]

1    0.603319
3    0.341282
dtype: float64

In [8]:
# indexing supports slices
# 3: selects from the fourth entry through the end of the Series
series_from_array[3:]

3    0.341282
4    0.427740
dtype: float64

### DataFrame

#### 2D array to DataFrame

In [9]:
# create a 2D numpy array: 5 rows x 5 columns of samples from the standard normal distribution
# NOTE: no random seed is set above, so these values change every time the cell is run
my_2d_array = np.random.randn(5,5)
my_2d_array

array([[ 1.61580232, -0.20043629,  1.61469283, -0.90024258, -0.24544603],
       [-1.46648569,  1.73823183, -0.92414027, -0.81294815,  0.2382035 ],
       [-1.15223821,  0.01237087,  0.43611211, -0.02951594,  0.13019147],
       [ 0.34051167, -0.52155966, -0.28445121,  0.61381083,  1.10246569],
       [ 0.30339172, -0.30838146,  0.40072294,  0.38563107, -1.50539051]])

In [10]:
# make a DataFrame from the 2D numpy array
# no labels are given, so both rows and columns are numbered 0 to 4
pd.DataFrame(my_2d_array)

Unnamed: 0,0,1,2,3,4
0,1.615802,-0.200436,1.614693,-0.900243,-0.245446
1,-1.466486,1.738232,-0.92414,-0.812948,0.238204
2,-1.152238,0.012371,0.436112,-0.029516,0.130191
3,0.340512,-0.52156,-0.284451,0.613811,1.102466
4,0.303392,-0.308381,0.400723,0.385631,-1.505391


In [11]:
# row and column labels can be supplied at construction time
# through the index= and columns= keywords of pd.DataFrame
row_labels = ['row1', 'row2', 'row3', 'row4', 'row5']
col_labels = ['col1', 'col2', 'col3', 'col4', 'col5']
df_from_2d_array = pd.DataFrame(my_2d_array, index=row_labels, columns=col_labels)
df_from_2d_array

Unnamed: 0,col1,col2,col3,col4,col5
row1,1.615802,-0.200436,1.614693,-0.900243,-0.245446
row2,-1.466486,1.738232,-0.92414,-0.812948,0.238204
row3,-1.152238,0.012371,0.436112,-0.029516,0.130191
row4,0.340512,-0.52156,-0.284451,0.613811,1.102466
row5,0.303392,-0.308381,0.400723,0.385631,-1.505391


#### Converting multiple Series to a DataFrame

In [12]:
# method 1: getting data as a list of series will orient them as rows
# (each Series becomes one row, so the result has 2 rows and 5 columns)
x = pd.DataFrame(data=[series_from_list, series_from_array])
x

Unnamed: 0,0,1,2,3,4
0,cubs,pirates,giants,yankees,donkeys
1,0.305243,0.603319,0.780504,0.341282,0.42774


In [13]:
# in this example, we need to transpose the table (we'll see this again later in the lesson)
# transpose() swaps rows and columns, so each Series ends up as a column
# NOTE: x is reassigned from itself, so running this cell a second time flips the table back
x = x.transpose()
x

Unnamed: 0,0,1
0,cubs,0.305243
1,pirates,0.603319
2,giants,0.780504
3,yankees,0.341282
4,donkeys,0.42774


In [14]:
# method 2: pass list/Series as value of dictionary
# the dictionary keys ('a' and 'b') become the column names
y = pd.DataFrame({'a': series_from_list, 'b': series_from_array})
y

Unnamed: 0,a,b
0,cubs,0.305243
1,pirates,0.603319
2,giants,0.780504
3,yankees,0.341282
4,donkeys,0.42774


In [15]:
# method 3: use pd.concat to combine series in column orientation
# axis=1 places the Series side by side; the resulting columns are labeled 0 and 1
df = pd.concat([series_from_list, series_from_array], axis=1)
df

Unnamed: 0,0,1
0,cubs,0.305243
1,pirates,0.603319
2,giants,0.780504
3,yankees,0.341282
4,donkeys,0.42774


### index, columns

In [16]:
# the row labels of the DataFrame (still the default RangeIndex at this point)
df.index

RangeIndex(start=0, stop=5, step=1)

In [17]:
# the column labels of the DataFrame (also a default RangeIndex here)
df.columns

RangeIndex(start=0, stop=2, step=1)

In [18]:
# set the index and column names on an existing DataFrame by assigning new labels
# (one label per row for index, one label per column for columns)
df.index = ['a', 'b', 'c', 'd', 'e']
df.columns = ['team', 'random']
df

Unnamed: 0,team,random
a,cubs,0.305243
b,pirates,0.603319
c,giants,0.780504
d,yankees,0.341282
e,donkeys,0.42774


In [19]:
# add a new column to the DataFrame by assigning to a column name that does not exist yet
# (the list supplies one value per row)
df['integers'] = [2, 3, 5, 8, 13]
df

Unnamed: 0,team,random,integers
a,cubs,0.305243,2
b,pirates,0.603319,3
c,giants,0.780504,5
d,yankees,0.341282,8
e,donkeys,0.42774,13


### dtypes, info, describe

In [20]:
# gives the datatype of each column (the text column is reported as object)
df.dtypes

team         object
random      float64
integers      int64
dtype: object

In [21]:
# prints a summary: index range, non-null count and dtype per column, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 3 columns):
team        5 non-null object
random      5 non-null float64
integers    5 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 160.0+ bytes


In [22]:
# summary statistics; only the numeric columns are included, so 'team' is left out
df.describe()

Unnamed: 0,random,integers
count,5.0,5.0
mean,0.491618,6.2
std,0.198337,4.438468
min,0.305243,2.0
25%,0.341282,3.0
50%,0.42774,5.0
75%,0.603319,8.0
max,0.780504,13.0


### read_csv

In [23]:
# by default column headers are the first row and row indexes are integers starting from zero
# the file name is a relative path, so the CSV must be in the notebook's working directory
df_sio = pd.read_csv('scripps_pier_20151110.csv')

In [24]:
# preview the first five rows
df_sio.head()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,11/10/15 1:42,22.307,3.712,33.199,19.95
1,11/10/15 1:35,22.311,3.588,33.201,19.94
2,11/10/15 1:29,22.305,3.541,33.2,19.95
3,11/10/15 1:23,22.323,3.463,33.2,19.95
4,11/10/15 1:17,22.316,3.471,33.199,19.95


In [25]:
# by default, read_csv will infer the object types
# (the measurements become float64; Date stays object, i.e. plain strings)
df_sio.dtypes

Date            object
chl (ug/L)     float64
pres (dbar)    float64
sal (PSU)      float64
temp (C)       float64
dtype: object

In [26]:
# row count plus non-null count and dtype per column
df_sio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 5 columns):
Date           66 non-null object
chl (ug/L)     66 non-null float64
pres (dbar)    66 non-null float64
sal (PSU)      66 non-null float64
temp (C)       66 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.7+ KB


In [27]:
# summary statistics for the four numeric columns (Date is not numeric, so it is left out)
df_sio.describe()

Unnamed: 0,chl (ug/L),pres (dbar),sal (PSU),temp (C)
count,66.0,66.0,66.0,66.0
mean,22.349576,3.041818,33.199318,20.06697
std,0.038988,0.254295,0.004959,0.0685
min,22.305,2.714,33.184,19.94
25%,22.319,2.81325,33.197,20.04
50%,22.3335,2.997,33.199,20.07
75%,22.385,3.2155,33.203,20.105
max,22.426,3.712,33.206,20.19


In [28]:
# we can also specify the dtype: dtype=object keeps every column as read from the file
# (index_col=None and header=0 only restate the default behavior, to make it explicit)
# sometimes it's better to specify the dtype as object and convert to int, float, etc. later
df_sio = pd.read_csv('scripps_pier_20151110.csv', dtype=object, index_col=None, header=0)

In [29]:
# the preview looks the same as before; the difference only shows up in the dtypes
df_sio.head()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,11/10/15 1:42,22.307,3.712,33.199,19.95
1,11/10/15 1:35,22.311,3.588,33.201,19.94
2,11/10/15 1:29,22.305,3.541,33.2,19.95
3,11/10/15 1:23,22.323,3.463,33.2,19.95
4,11/10/15 1:17,22.316,3.471,33.199,19.95


In [30]:
# every column is now object instead of float64
df_sio.dtypes

Date           object
chl (ug/L)     object
pres (dbar)    object
sal (PSU)      object
temp (C)       object
dtype: object

In [31]:
# for object columns, describe reports count, unique, top (most frequent value) and freq
df_sio.describe()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
count,66,66.0,66.0,66.0,66.0
unique,66,44.0,64.0,18.0,21.0
top,11/9/15 19:41,22.311,2.875,33.199,20.05
freq,1,4.0,2.0,9.0,11.0


#### Changing dtype of columns

In [32]:
# method 1: list comprehension (one column)
# float() is called on each value and the column is replaced by the resulting list
df_sio['chl (ug/L)'] = [float(x) for x in df_sio['chl (ug/L)']]

In [33]:
# method 2: pd.to_numeric (one column)
# converts the whole column in a single call
df_sio['pres (dbar)'] = pd.to_numeric(df_sio['pres (dbar)'])

In [34]:
# method 3: apply(pd.to_numeric) (multiple columns)
# name the columns once, then convert them together; apply runs pd.to_numeric on each column
numeric_cols = ['sal (PSU)', 'temp (C)']
df_sio[numeric_cols] = df_sio[numeric_cols].apply(pd.to_numeric)

In [35]:
# check the result: the four measurement columns are float64 again, Date is still object
df_sio.dtypes

Date            object
chl (ug/L)     float64
pres (dbar)    float64
sal (PSU)      float64
temp (C)       float64
dtype: object

### head, tail

In [36]:
# add a number to change the number of rows printed (without it, head shows 5 rows)
df_sio.head(7)

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,11/10/15 1:42,22.307,3.712,33.199,19.95
1,11/10/15 1:35,22.311,3.588,33.201,19.94
2,11/10/15 1:29,22.305,3.541,33.2,19.95
3,11/10/15 1:23,22.323,3.463,33.2,19.95
4,11/10/15 1:17,22.316,3.471,33.199,19.95
5,11/10/15 1:11,22.315,3.476,33.198,19.95
6,11/10/15 1:05,22.31,3.448,33.199,19.96


In [37]:
# tail works the same way, but returns rows from the end of the DataFrame
df_sio.tail(3)

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
63,11/9/15 19:22,22.418,3.316,33.202,19.96
64,11/9/15 19:16,22.41,3.209,33.2,19.96
65,11/9/15 19:10,22.426,3.328,33.203,19.95


### Indexing with bracket/dot notation, loc, iloc

Pandas has three indexing methods:

* `[ ]` and `.` select columns by their labels
* `.loc` selects by label (row index labels, and optionally column labels)
* `.iloc` selects by integer position (rows, and optionally columns), so it only takes integers

In [38]:
# display the small example DataFrame that the indexing examples below operate on
df

Unnamed: 0,team,random,integers
a,cubs,0.305243,2
b,pirates,0.603319,3
c,giants,0.780504,5
d,yankees,0.341282,8
e,donkeys,0.42774,13


#### brackets only -- column by header

In [39]:
# to get a column (Series), use the column header (don't need .loc or .iloc)
# (the older .ix indexer was removed in pandas 1.0 and is not available in current versions)
df['team']

a       cubs
b    pirates
c     giants
d    yankees
e    donkeys
Name: team, dtype: object

In [40]:
# for multiple columns, put a list inside the brackets (so two sets of brackets)
# the result is a DataFrame rather than a Series
df[['team', 'random']]

Unnamed: 0,team,random
a,cubs,0.305243
b,pirates,0.603319
c,giants,0.780504
d,yankees,0.341282
e,donkeys,0.42774


#### dot-notation

In [41]:
# if the column name is a valid Python identifier (letters, digits and underscores, not starting
# with a digit), we can use a dot instead of brackets and quotes
# caution: this does not work when the name clashes with an existing DataFrame attribute or method
df.team

a       cubs
b    pirates
c     giants
d    yankees
e    donkeys
Name: team, dtype: object

#### loc -- row by index

In [42]:
# to get a row by name, use .loc with the row index
# a single label returns the row as a Series indexed by the column names
df.loc['a']

team            cubs
random      0.305243
integers           2
Name: a, dtype: object

In [43]:
# for multiple rows, put a list inside the brackets (so two sets of brackets)
# a list of labels returns a DataFrame
df.loc[['a', 'd']]

Unnamed: 0,team,random,integers
a,cubs,0.305243,2
d,yankees,0.341282,8


#### iloc -- row (or column) by position

In [44]:
# to get a row by position, use .iloc with the row number (positions start at 0)
df.iloc[0]

team            cubs
random      0.305243
integers           2
Name: a, dtype: object

In [45]:
# for multiple rows, put a list of positions inside the brackets (so two sets of brackets)
df.iloc[[0, 3]]

Unnamed: 0,team,random,integers
a,cubs,0.305243,2
d,yankees,0.341282,8


In [46]:
# or pass a slice: 2: selects from the third row through the last row
df.iloc[2:]

Unnamed: 0,team,random,integers
c,giants,0.780504,5
d,yankees,0.341282,8
e,donkeys,0.42774,13


In [47]:
# iloc also works with columns: the first entry selects rows (: means all rows)
# and the second entry selects columns by position
df.iloc[:, [0, 2]]

Unnamed: 0,team,integers
a,cubs,2
b,pirates,3
c,giants,5
d,yankees,8
e,donkeys,13


### transpose

In [48]:
# swap rows and columns; the result is only displayed here, df itself is not reassigned
df.transpose()

Unnamed: 0,a,b,c,d,e
team,cubs,pirates,giants,yankees,donkeys
random,0.305243,0.603319,0.780504,0.341282,0.42774
integers,2,3,5,8,13


In [49]:
# .T is shorthand for .transpose()
df.T

Unnamed: 0,a,b,c,d,e
team,cubs,pirates,giants,yankees,donkeys
random,0.305243,0.603319,0.780504,0.341282,0.42774
integers,2,3,5,8,13


### to_csv, to_excel

In [50]:
# write the DataFrame to a comma-separated file; the row index is saved as the first column
df.to_csv('teams.csv')

In [51]:
# use the sep option if the separator is not a comma (here a tab, giving a .tsv file)
df.to_csv('teams.tsv', sep='\t')

In [52]:
# write the DataFrame to an Excel workbook
# to_excel requires the openpyxl package
df.to_excel('teams.xlsx')

### read_csv (again), read_excel

In [53]:
# without index_col, the saved index comes back as an ordinary column named 'Unnamed: 0'
pd.read_csv('teams.csv')

Unnamed: 0.1,Unnamed: 0,team,random,integers
0,a,cubs,0.305243,2
1,b,pirates,0.603319,3
2,c,giants,0.780504,5
3,d,yankees,0.341282,8
4,e,donkeys,0.42774,13


In [54]:
# index_col=0 uses the first column of the file as the row index
df1 = pd.read_csv('teams.csv', index_col=0)
df1

Unnamed: 0,team,random,integers
a,cubs,0.305243,2
b,pirates,0.603319,3
c,giants,0.780504,5
d,yankees,0.341282,8
e,donkeys,0.42774,13


In [55]:
# default datatypes (inferred by read_csv from the file contents)
df1.dtypes

team         object
random      float64
integers      int64
dtype: object

In [56]:
# we can specify the dtypes when we read_csv
# dtype=object applies one dtype to every column
df2 = pd.read_csv('teams.csv', index_col=0, dtype=object)
# a dict sets the dtype per column; the builtin float and int are used here because the
# np.float / np.int aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24
df3 = pd.read_csv('teams.csv', index_col=0, dtype={'team': object, 'random': float, 'integers': int})

In [57]:
# specify datatypes: all object (result of passing dtype=object)
df2.dtypes

team        object
random      object
integers    object
dtype: object

In [58]:
# specify datatypes: per column (result of passing a dict of column name -> dtype)
df3.dtypes

team         object
random      float64
integers      int64
dtype: object

In [59]:
# use the sep option if the separator is not a comma
# (sep='\t' matches the tab separator used when teams.tsv was written)
df4 = pd.read_csv('teams.tsv', index_col=0, sep='\t')
df4

Unnamed: 0,team,random,integers
a,cubs,0.305243,2
b,pirates,0.603319,3
c,giants,0.780504,5
d,yankees,0.341282,8
e,donkeys,0.42774,13


In [60]:
# read_excel needs an Excel engine package: current pandas reads .xlsx files with openpyxl
# (xlrd is only used for legacy .xls files; older pandas versions used xlrd for .xlsx as well)
df5 = pd.read_excel('teams.xlsx', index_col=0)
df5

Unnamed: 0,team,random,integers
a,cubs,0.305243,2
b,pirates,0.603319,3
c,giants,0.780504,5
d,yankees,0.341282,8
e,donkeys,0.42774,13


### to_datetime

We will cover time series in greater detail in a future lesson.

In [61]:
# the Date column still holds strings such as '11/10/15 1:42'
df_sio.head()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,11/10/15 1:42,22.307,3.712,33.199,19.95
1,11/10/15 1:35,22.311,3.588,33.201,19.94
2,11/10/15 1:29,22.305,3.541,33.2,19.95
3,11/10/15 1:23,22.323,3.463,33.2,19.95
4,11/10/15 1:17,22.316,3.471,33.199,19.95


In [62]:
# parse the Date strings into datetime64 values; this returns a new Series and leaves df_sio as is
# NOTE(review): no format= is given, so pandas infers the layout of the strings
# (month/day/year, judging by the output) -- consider passing an explicit format
time = pd.to_datetime(df_sio['Date'])
time.head()

0   2015-11-10 01:42:00
1   2015-11-10 01:35:00
2   2015-11-10 01:29:00
3   2015-11-10 01:23:00
4   2015-11-10 01:17:00
Name: Date, dtype: datetime64[ns]

In [63]:
# replace the string column with the parsed datetimes
df_sio['Date'] = time

In [64]:
# the Date column now displays as full timestamps
df_sio.head()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,2015-11-10 01:42:00,22.307,3.712,33.199,19.95
1,2015-11-10 01:35:00,22.311,3.588,33.201,19.94
2,2015-11-10 01:29:00,22.305,3.541,33.2,19.95
3,2015-11-10 01:23:00,22.323,3.463,33.2,19.95
4,2015-11-10 01:17:00,22.316,3.471,33.199,19.95


In [65]:
# to do this in a single step, we can use read_csv's parse_dates keyword
# parse_dates=['Date'] converts the named column to datetimes while the file is read
# (the result is only displayed here; df_sio is not reassigned)
pd.read_csv('scripps_pier_20151110.csv', index_col=None, parse_dates=['Date']).head()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,2015-11-10 01:42:00,22.307,3.712,33.199,19.95
1,2015-11-10 01:35:00,22.311,3.588,33.201,19.94
2,2015-11-10 01:29:00,22.305,3.541,33.2,19.95
3,2015-11-10 01:23:00,22.323,3.463,33.2,19.95
4,2015-11-10 01:17:00,22.316,3.471,33.199,19.95
