# Getting Started with pandas
- pandas is designed to work with tabular/heterogeneous data, while numpy is designed to work with homogeneous numerical data

In [55]:
import pandas as pd
import numpy as np

### Series

In [4]:
# A Series is a 1D array-like object that carries a label (index) for each element
obj = pd.Series([2, 4, 6, 8])
obj

0    2
1    4
2    6
3    8
dtype: int64

In [5]:
obj.values # the values attribute returns the underlying data as a numpy array

array([2, 4, 6, 8])

In [6]:
obj.index # the index attribute holds the labels; none were given, so it is a default integer range

RangeIndex(start=0, stop=4, step=1)

In [9]:
# Pass index= to supply custom labels, one per value
obj2 = pd.Series(
    [1, 2, 3, 4],
    index=['one', 'two', 'three', 'four'],
)
obj2

one      1
two      2
three    3
four     4
dtype: int64

In [12]:
obj2['two'] # with custom labels in place, a data element can be accessed by its label

2

In [14]:
obj2[obj2 > 2] # a boolean Series can be used to filter, just like boolean-array indexing in numpy

three    3
four     4
dtype: int64

In [15]:
obj2 ** 2 # arithmetic is applied element-wise, as in numpy, and the index labels are kept

one       1
two       4
three     9
four     16
dtype: int64

In [17]:
'two' in obj2 # a Series behaves much like a dict: `in` tests membership against the index labels

True

In [19]:
adict = {'one': 1, 'two': 2, 'three': 3, 'four': 4}
# A dict converts directly to a Series, with its keys becoming the index.
# Older pandas sorted the keys during conversion; pandas >= 0.23 on
# Python >= 3.6 keeps the dict's insertion order instead, so sort explicitly
# to get the same alphabetical index on every version.
nowaseries = pd.Series(adict).sort_index()
nowaseries

four     4
one      1
three    3
two      2
dtype: int64

In [26]:
obj2['five'] = 5 # add a label that nowaseries does not have
# arithmetic between Series aligns on index labels: the result index is the sorted
# union of both indexes, and a label missing from either side yields NaN
obj3 = obj2 - nowaseries
obj3

five     NaN
four     0.0
one      0.0
three    0.0
two      0.0
dtype: float64

In [27]:
obj3.isnull() # use .isnull() and .notnull() to check for missing data values

five      True
four     False
one      False
three    False
two      False
dtype: bool

In [35]:
obj2.name = 'int' # a Series can be given a name ...
obj2.index.name = 'str' # ... and so can its index; both appear when the Series is displayed
obj2

str
one      1
two      2
three    3
four     4
five     5
Name: int, dtype: int64

### DataFrame
- basically a rectangular table of data with an ordered collection of columns (which can have different types)
- think of it as a dict of Series
- internally, data is stored as 2D blocks
- it is only 2D, but we can represent higher dimensional data using hierarchical indexing

In [39]:
dict_of_eq_len_lists = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
# A dict of equal-length lists becomes a DataFrame with one column per key
# (we can also create it from numpy arrays). The column order taken from a
# dict depends on the pandas version (keys were sorted before 0.23, insertion
# order is kept afterwards), so pin it with columns= to keep the layout
# reproducible.
frame = pd.DataFrame(dict_of_eq_len_lists, columns=['pop', 'state', 'year'])
frame # results displayed in a nice HTML table (thanks Jupyter, very cool)

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [40]:
frame.head() # head() shows only the first 5 rows by default

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [41]:
# Pass columns= to choose the order in which the columns are arranged
pd.DataFrame(
    dict_of_eq_len_lists,
    columns=['year', 'pop', 'state'],
)

Unnamed: 0,year,pop,state
0,2000,1.5,Ohio
1,2001,1.7,Ohio
2,2002,3.6,Ohio
3,2001,2.4,Nevada
4,2002,2.9,Nevada
5,2003,3.2,Nevada


In [44]:
frame['year'] # a column can be retrieved as a Series with dict-like notation (as here) or as an attribute (frame.year)

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [49]:
frame.loc[1] # use loc to get a row by its index label; the row comes back as a Series

pop       1.7
state    Ohio
year     2001
Name: 1, dtype: object

In [57]:
# Values in a column can be assigned to as needed. An assigned array must
# have one value per row, so size it from the frame instead of hard-coding
# the row count.
frame['year'] = np.arange(len(frame))
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
3,2.4,Nevada,3
4,2.9,Nevada,4
5,3.2,Nevada,5


In [62]:
# A Series labelled with only some of frame's row labels
debts = pd.Series([-1.2, -1.5, -1.7], index=[1, 3, 5])
# Assigning a Series to a column matches values to rows by index label;
# rows whose label is not in the Series are filled with NaN
frame['debt'] = debts
frame

Unnamed: 0,pop,state,year,debt
0,1.5,Ohio,0,
1,1.7,Ohio,1,-1.2
2,3.6,Ohio,2,
3,2.4,Nevada,3,-1.5
4,2.9,Nevada,4,
5,3.2,Nevada,5,-1.7
