In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from pandas import Series, DataFrame
import pandas as pd

# Chapter 5 - Getting Started with pandas

## Series
A one-dimensional, array-like object containing an array of data together with an associated array of labels, called its index.

In [5]:
# No index is given, so the Series gets the default integer index 0..3.
obj = Series([4, 7, -5, 3])

In [6]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
# Supply explicit labels via `index`; they pair positionally with the values.
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
# Indexing with a list of labels returns those entries in the requested order.
obj2[['a', 'd']]

a   -5
d    4
dtype: int64

In [11]:
# Boolean-mask filtering: keeps the entries where the condition is True,
# along with their original index labels.
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [13]:
# NumPy functions apply element-wise; the result is a float Series that
# keeps the same index labels.
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [15]:
# Dict-like membership test: `in` checks the index labels.
'e' in obj2

False

In [18]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [19]:
# A Series' index can be replaced in place by assigning a new list of labels
# (one label per existing entry).
obj2.index = ['bob', 'steve', 'ben', 'sam']

In [20]:
obj2

bob      4
steve    7
ben     -5
sam      3
dtype: int64

## DataFrame
A spreadsheet-like data structure: an ordered collection of columns, each of which can hold a different value type. It has both a row index and a column index.

A common way to construct a DataFrame is from a dict of equal-length lists or NumPy arrays:

In [21]:
# Each dict key becomes a column; every list holds one value per row,
# so all lists must have the same length.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 2,
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)

In [22]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [23]:
# The `columns` argument fixes the order in which the columns appear
# in the resulting DataFrame.
DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [26]:
# Requesting a column name that is not a key of `data` ('debt') still creates
# the column; it is filled with missing values (NaN).
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'])

In [27]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [28]:
# The column labels are stored in an Index object.
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [30]:
# Two equivalent ways to retrieve a column as a Series. Only the value of the
# last expression in a cell is echoed, so a single Series appears below.
# Attribute access only works when the column name is a valid Python identifier
# that does not clash with an existing DataFrame attribute or method -- e.g.
# `frame2.pop` is the DataFrame.pop method, not the 'pop' column.
frame2['state']  # dict-like notation
frame2.state     # attribute access

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [31]:
# Retrieve a single row (returned as a Series) by its index label.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
# label-based replacement (use `.iloc` for purely positional access).
frame2.loc[2]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

In [34]:
# Assigning to a new key creates the column: a boolean flag that is True
# for the rows whose state is 'Ohio'.
frame2['newCol'] = (frame2['state'] == 'Ohio')

In [35]:
frame2

Unnamed: 0,year,state,pop,debt,newCol
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,,False


In [36]:
# Dict-style deletion removes the column from frame2 in place.
del frame2['newCol']

In [38]:
# Transpose: the column labels become the row index and vice versa.
frame2.T

Unnamed: 0,0,1,2,3,4
year,2000,2001,2002,2001,2002
state,Ohio,Ohio,Ohio,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9
debt,,,,,


In [39]:
# The frame's data as a 2-D NumPy array; because the columns hold different
# types, the array falls back to dtype=object.
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan]], dtype=object)

In [44]:
# Both the row index and the column index can carry a name; the names are
# shown in the header when the frame is displayed.
frame2.index.name = 'blah'
frame2.columns.name = 'colblah'

In [45]:
frame2

colblah,year,state,pop,debt
blah,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [46]:
# Clear the columns' name by assigning None. In current pandas `Index.name`
# is a property without a deleter, so `del frame2.columns.name` raises
# AttributeError; assignment works on old and new versions alike.
frame2.columns.name = None
frame2

Unnamed: 0_level_0,year,state,pop,debt
blah,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,


In [47]:
# Index objects support membership tests on their labels.
'year' in frame2.columns

True