# Getting Started with pandas
- pandas is designed to work with tabular/heterogeneous data, while numpy is designed to work with homogeneous numerical data

In [1]:
import pandas as pd
import numpy as np

### Series

In [2]:
# A Series is a 1D array-like object that pairs every element with a label (its index).
obj = pd.Series(data=[2, 4, 6, 8])
obj

0    2
1    4
2    6
3    8
dtype: int64

In [3]:
obj.values # the values attribute exposes the underlying data as an array

array([2, 4, 6, 8])

In [4]:
obj.index # the index attribute holds the labels; a default RangeIndex when none were given

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Passing `index` attaches custom labels to the values instead of the default 0..n-1.
obj2 = pd.Series(
    data=[1, 2, 3, 4],
    index=['one', 'two', 'three', 'four'],
)
obj2

one      1
two      2
three    3
four     4
dtype: int64

In [6]:
obj2['two'] # with a labelled index, elements can be looked up by their label

2

In [7]:
obj2[obj2 > 2] # a boolean mask can be used as an index, just as in numpy; labels are kept

three    3
four     4
dtype: int64

In [8]:
obj2 ** 2 # arithmetic is applied element-wise, as in numpy, and the labels are preserved

one       1
two       4
three     9
four     16
dtype: int64

In [9]:
'two' in obj2 # membership tests look at the index labels, much like the keys of a dict

True

In [10]:
# A dict converts directly into a Series: keys become the labels, values the data.
adict = dict(one=1, two=2, three=3, four=4)
nowaseries = pd.Series(data=adict)
# In the output recorded below the labels come out in sorted order.
# NOTE(review): recent pandas releases keep the dict's insertion order instead;
# confirm against the installed version.
nowaseries

four     4
one      1
three    3
two      2
dtype: int64

In [11]:
obj2['five'] = 5 # adds a new label to obj2 that nowaseries does not have
obj3 = obj2 - nowaseries # arithmetic aligns the two Series on their labels; a label missing from either side gives NaN (labels appear sorted in the output below)
obj3

five     NaN
four     0.0
one      0.0
three    0.0
two      0.0
dtype: float64

In [12]:
obj3.isnull() # isnull() / notnull() flag missing (NaN) values element-wise

five      True
four     False
one      False
three    False
two      False
dtype: bool

In [13]:
obj2.name = 'int' # both the Series itself and its index can be given a name
obj2.index.name = 'str' # the index name is shown above the labels in the output
obj2

str
one      1
two      2
three    3
four     4
five     5
Name: int, dtype: int64

### DataFrame
- basically a rectangular table of data with an ordered collection of columns (which can have different types)
- think of it as a dict of Series
- internally, data is stored as 2D blocks
- it is only 2D, but we can represent higher dimensional data using hierarchical indexing

In [14]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
# (A DataFrame can also be created from numpy arrays.)
dict_of_eq_len_lists = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=dict_of_eq_len_lists)
frame  # Jupyter renders the result as an HTML table

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [15]:
frame.head() # head() shows only the first 5 rows of the frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [16]:
# The `columns` argument fixes the order in which the columns are laid out.
pd.DataFrame(data=dict_of_eq_len_lists, columns=['year', 'pop', 'state'])

Unnamed: 0,year,pop,state
0,2000,1.5,Ohio
1,2001,1.7,Ohio
2,2002,3.6,Ohio
3,2001,2.4,Nevada
4,2002,2.9,Nevada
5,2003,3.2,Nevada


In [17]:
frame['year'] # a column can be retrieved as a Series with dict-style indexing (or as an attribute, e.g. frame.year)

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [18]:
frame.loc[1] # loc selects a row by its index label and returns it as a Series

pop       1.7
state    Ohio
year     2001
Name: 1, dtype: object

In [19]:
frame['year'] = np.arange(6) # an existing column can be overwritten by assignment; the array length must match the number of rows
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
3,2.4,Nevada,3
4,2.9,Nevada,4
5,3.2,Nevada,5


In [20]:
# Assigning a Series to a column aligns it on the frame's index;
# row labels that the Series does not contain are filled with NaN.
debts = pd.Series(data=[-1.2, -1.5, -1.7], index=[1, 3, 5])
frame['debt'] = debts
frame

Unnamed: 0,pop,state,year,debt
0,1.5,Ohio,0,
1,1.7,Ohio,1,-1.2
2,3.6,Ohio,2,
3,2.4,Nevada,3,-1.5
4,2.9,Nevada,4,
5,3.2,Nevada,5,-1.7


In [21]:
del frame['debt'] # columns are deleted with del, just like keys of a python dict
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
3,2.4,Nevada,3
4,2.9,Nevada,4
5,3.2,Nevada,5


- a column retrieved by indexing is a view on the DataFrame's underlying data, not a copy

In [22]:
# A dict of dicts also converts to a DataFrame:
# the outer keys become the columns and the inner keys become the row labels.
dict_of_dicts = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame2 = pd.DataFrame(data=dict_of_dicts)
frame2

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


- the same method works for dicts of Series

In [23]:
frame2.T # transpose (swap rows and columns) with .T, the same way as for numpy arrays

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [24]:
frame2.values # the values attribute returns the table's data as a 2D array

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

### Index Objects
- dataframe indices are actually their own object and are immutable

In [25]:
# An Index can be built directly from a list of labels.
indices = pd.Index(data=['a', 'b', 'c'])
indices

Index([u'a', u'b', u'c'], dtype='object')

In [26]:
'a' in indices # an Index supports membership tests like a fixed-size set; NOTE: unlike a set, an Index may contain duplicate labels

True

In [27]:
# Index objects expose descriptive properties alongside their set-like methods.
# `is_monotonic_increasing` is True when every label is >= the one before it.
# It replaces the old `Index.is_monotonic` alias, which was deprecated in
# pandas 1.5 and removed in pandas 2.0 (where it raises AttributeError).
indices.is_monotonic_increasing

True

### Reindexing
- `reindex` creates a new object with the data conformed to a new index

In [28]:
frame # the frame whose rows and columns are reindexed in the following cells

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
3,2.4,Nevada,3
4,2.9,Nevada,4
5,3.2,Nevada,5


In [29]:
# reindex returns the rows rearranged to follow the labels passed in (here: reversed order).
frame.reindex(index=list(reversed(frame.index)))

Unnamed: 0,pop,state,year
5,3.2,Nevada,5
4,2.9,Nevada,4
3,2.4,Nevada,3
2,3.6,Ohio,2
1,1.7,Ohio,1
0,1.5,Ohio,0


In [30]:
# The `columns` keyword rearranges the columns in the same way (here: reversed order).
frame.reindex(columns=frame.columns[::-1])

Unnamed: 0,year,state,pop
0,0,Ohio,1.5
1,1,Ohio,1.7
2,2,Ohio,3.6
3,3,Nevada,2.4
4,4,Nevada,2.9
5,5,Nevada,3.2


### Drop Entries from an Axis
- there are builtins for deleting data elements from an axis

In [31]:
frame.drop(3) # drop returns a new object without the given row label (frame itself is unchanged); a list of labels can be passed as well

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
4,2.9,Nevada,4
5,3.2,Nevada,5


In [32]:
# axis='columns' (the same as axis=1) targets columns rather than rows;
# inplace=True mutates `frame` itself instead of returning a new object.
frame.drop(labels='year', axis='columns', inplace=True)

In [33]:
# the 'year' column is gone: the in-place drop modified frame itself
frame

Unnamed: 0,pop,state
0,1.5,Ohio
1,1.7,Ohio
2,3.6,Ohio
3,2.4,Nevada
4,2.9,Nevada
5,3.2,Nevada


### Indexing, Selection, and Filtering
- we can index into series similar to how we do with numpy arrays

In [55]:
frame['pop'][[1,3]] # select the 'pop' column, then pick the entries whose index labels are 1 and 3

1    1.7
3    2.4
Name: pop, dtype: float64

In [57]:
# A fresh 4x4 example frame with labelled rows and columns.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [63]:
data[data['three'] > 5] # a boolean Series selects the rows where the condition is True

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [64]:
data[:2] # slicing with [] selects rows (here the first two)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [65]:
data > 6 # a scalar comparison produces a boolean DataFrame, convenient as a mask for indexing

Unnamed: 0,one,two,three,four
Ohio,False,False,False,False
Colorado,False,False,False,True
Utah,True,True,True,True
New York,True,True,True,True
