In [1]:
import numpy as np
print("numpy version: {}".format(np.__version__))
import pandas as pd 
print("pandas version: {}".format(pd.__version__))
import matplotlib
import matplotlib.pyplot as plt
print("matplotlib version: {}".format(matplotlib.__version__))
import scipy as sp
print("scipy version: {}".format(sp.__version__))
import sklearn as sl
print("scikit-learn: {}".format(sl.__version__))
import seaborn as sns
print("seaborn: {}".format(sns.__version__))
import statsmodels as sm
print("statsmodels: {}".format(sm.__version__))

numpy version: 1.17.4
pandas version: 0.25.3
matplotlib version: 3.1.2
scipy version: 1.3.3
scikit-learn: 0.21.3
seaborn: 0.9.0
statsmodels: 0.10.2


In [2]:
%matplotlib inline

pandas adopts significant
parts of NumPy’s idiomatic style of array-based computing, especially array-based
functions and a preference for data processing without for loops.

While pandas adopts many coding idioms from NumPy, the biggest difference is that
pandas is designed for working with tabular or heterogeneous data. NumPy, by con‐
trast, is best suited for working with homogeneous numerical array data.

### Series

A **Series** is a one-dimensional array-like object containing a sequence of values (of
similar types to NumPy types) and an associated array of data labels, called its index.
The simplest Series is formed from only an array of data:

In [3]:
obj = pd.Series([4, 7, -5, 3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.values

array([ 4,  7, -5,  3])

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [8]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [10]:
obj2['a']

-5

In [11]:
obj2['d'] = 6

In [12]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [13]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [14]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [15]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [16]:
'b' in obj2

True

In [17]:
'e' in obj2

False

In [18]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [19]:
obj3 = pd.Series(sdata)

In [20]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

When you are only passing a dict, the index in the resulting Series will have the dict’s
keys in sorted order. You can override this by passing the dict keys in the order you
want them to appear in the resulting Series:

In [21]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [22]:
obj4 = pd.Series(sdata, index=states)

In [23]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [24]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [26]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

A useful Series feature for many applications is that it automatically aligns by index
label in arithmetic operations:

In [27]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [28]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

similar to a join operation

In [29]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [30]:
obj4.name = 'population'

In [31]:
obj4.index.name = 'state'

In [32]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

A Series’s index can be altered in-place by assignment:

In [33]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [34]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [35]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

A **DataFrame** represents a rectangular table of data and contains an ordered collection of columns. 
The DataFrame has both a row and column index; it can be thought of as a dict of Series all sharing the same index.
Under the hood, the data is stored a s one or more two-dimensional blocks rather than a list, dict, or some other collection of one-dimensional arrays.

While a DataFrame is physically two-dimensional, you can use it to
represent higher dimensional data in a tabular format using hierarchical indexing

There are many ways to construct a DataFrame, though one of the most common is
from a dict of equal-length lists or NumPy arrays:

In [36]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [37]:
frame = pd.DataFrame(data)

In [38]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [39]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [40]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [41]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four', 'five', 'six'])

In [42]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [43]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [44]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [45]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

```frame2[column]``` works for any column name, but ```frame2.column``` only works when the column name is a valid Python variable name.

In [46]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [47]:
frame2['debt'] = 16.5

In [48]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [49]:
frame2['debt'] = np.arange(6.)

In [50]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


When you are assigning lists or arrays to a column, the value’s length must match the
length of the DataFrame. If you assign a Series, its labels will be realigned exactly to
the DataFrame’s index, inserting missing values in any holes:

In [51]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [52]:
frame2['debt'] = val

In [53]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [54]:
frame2['eastern'] = frame2.state == 'Ohio'

New columns cannot be created with the ```frame2.eastern``` syntax.

In [55]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [56]:
del frame2['eastern']

In [57]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

Another common form of data is a nested dict of dicts:

In [58]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [59]:
frame3 = pd.DataFrame(pop)

In [60]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [61]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [62]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [63]:
pdata = {'Ohio': frame3['Ohio'][:-1], 'Nevada': frame3['Nevada'][:2]}

In [64]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [65]:
frame3.index.name = 'year'; frame3.columns.name = 'state'

In [66]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [67]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [68]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

### Index Objects

In [69]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [70]:
index = obj.index

In [71]:
index

Index(['a', 'b', 'c'], dtype='object')

In [72]:
index[1:]

Index(['b', 'c'], dtype='object')

Index objects are immutable and thus can’t be modified by the user:

In [73]:
labels = pd.Index(np.arange(3))

In [74]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [75]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)

In [76]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [77]:
obj2.index is labels

True

In [78]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [82]:
frame3.columns, frame3.index

(Index(['Nevada', 'Ohio'], dtype='object', name='state'),
 Int64Index([2001, 2002, 2000], dtype='int64', name='year'))

In [80]:
'Ohio' in frame3.columns

True

In [81]:
2003 in frame3.index

False

Unlike Python sets, a pandas Index can contain duplicate labels:

In [83]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])

In [84]:
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

#### Reindexing

In [85]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [86]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

Calling reindex on this Series rearranges the data according to the new index, introducing missing values if any index values were not already present:

In [87]:
obj2 = obj.reindex([chr(x + 97) for x in range(5)])

In [88]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [89]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [90]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [91]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [92]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    index=['a', 'c', 'd'],
                    columns=['Ohio', 'Texas', 'California'])

In [93]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [94]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [95]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [96]:
states = ['Texas', 'Utah', 'California']

In [97]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [98]:
frame.loc[[chr(x + 97) for x in range(4)], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [99]:
obj = pd.Series(np.arange(5.), index=[chr(x + 97) for x in range(5)])

In [100]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [101]:
new_obj = obj.drop('c')

In [102]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [103]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [104]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   index=['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns=['one', 'two', 'three', 'four'])

In [105]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [106]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [107]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [108]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


Many functions, like drop , which modify the size or shape of a Series or DataFrame,
can manipulate an object in-place without returning a new object:

In [109]:
obj.drop('c', inplace=True)

In [110]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [111]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [112]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [113]:
obj['b']

1.0

In [114]:
obj[1]

1.0

In [115]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [116]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [117]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [118]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive:

In [119]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

Setting using these methods modifies the corresponding section of the Series:

In [120]:
obj['b':'c'] = 5

In [121]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [122]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [123]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [124]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [125]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [126]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [127]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [128]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [129]:
data[data < 5] = 0

In [130]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


This makes DataFrame syntactically more like a two-dimensional NumPy array in
this particular case.

As a preliminary example, let’s select a single row and multiple columns by label:

In [131]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [133]:
data.loc[['Colorado', 'New York'], ['one', 'three']]

Unnamed: 0,one,three
Colorado,0,6
New York,12,14


We’ll then perform some similar selections with integers using ```iloc``` :

In [132]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [134]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [135]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


Both indexing functions work with slices in addition to single labels or lists of labels:

In [136]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [137]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [138]:
ser = pd.Series(np.arange(3.))

In [139]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [140]:
ser[-1]

KeyError: -1

In [141]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

On the other hand, with a non-integer index, there is no potential for ambiguity:

In [142]:
ser2[-1]

2.0

In [143]:
ser[:1]

0    0.0
dtype: float64

In [144]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

if you have an axis index containing integers, data selection
will always be label-oriented. For more precise handling, use ```loc``` _(for labels)_or ```iloc``` _(for integers)_:

In [145]:
ser.iloc[:1]

0    0.0
dtype: float64

In [146]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])

In [147]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [148]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [149]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [150]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

The internal data alignment introduces missing values in the label locations that don’t
overlap. Missing values will then propagate in further arithmetic computations.

In [151]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), 
                   columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])

In [153]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                  columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [154]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [155]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [156]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [157]:
df1 = pd.DataFrame({'A':[1, 2]})

In [158]:
df2 = pd.DataFrame({'B':[3, 4]})

In [159]:
df1

Unnamed: 0,A
0,1
1,2


In [160]:
df2

Unnamed: 0,B
0,3
1,4


In [161]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


In [162]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                  columns=list('abcd'))

In [163]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                  columns=list('abcde'))

In [164]:
df2.loc[1, 'b'] = np.nan

In [165]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [166]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [167]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [168]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [169]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [170]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [171]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


As with NumPy arrays of different dimensions, arithmetic between DataFrame and
Series is also defined.

In [172]:
arr = np.arange(12.).reshape((3, 4))

In [173]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [174]:
arr[0]

array([0., 1., 2., 3.])

In [175]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

When we subtract ```arr[0]``` from ```arr``` , the subtraction is performed once for each row. This is referred to as __broadcasting__

In [176]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                    columns=list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [177]:
series = frame.iloc[0]

In [178]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [179]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

By default, arithmetic between DataFrame and Series matches the index of the Series
on the DataFrame’s columns, broadcasting down the rows:

In [180]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [181]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])

In [182]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [183]:
series3 = frame['d']

In [184]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [185]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [186]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [189]:
frame.T.sub(series3, axis='columns')

Unnamed: 0,Utah,Ohio,Texas,Oregon
b,-1.0,-1.0,-1.0,-1.0
d,0.0,0.0,0.0,0.0
e,1.0,1.0,1.0,1.0


NumPy ```ufuncs``` (element-wise array methods) also work with pandas objects:

In [190]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [191]:
frame

Unnamed: 0,b,d,e
Utah,-0.691049,0.947598,-0.342048
Ohio,-0.373918,0.363045,-0.517966
Texas,-0.807594,0.778633,0.225389
Oregon,-0.978249,-1.698034,0.781131


In [192]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.691049,0.947598,0.342048
Ohio,0.373918,0.363045,0.517966
Texas,0.807594,0.778633,0.225389
Oregon,0.978249,1.698034,0.781131


In [193]:
f = lambda x: x.max() - x.min()

In [194]:
frame.apply(f)

b    0.604331
d    2.645632
e    1.299097
dtype: float64

In [195]:
frame.apply(f, axis='columns')

Utah      1.638647
Ohio      0.881010
Texas     1.586227
Oregon    2.479165
dtype: float64

In [196]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [197]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.978249,-1.698034,-0.517966
max,-0.373918,0.947598,0.781131


In [198]:
format = lambda x: '%.2f' % x

In [199]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.69,0.95,-0.34
Ohio,-0.37,0.36,-0.52
Texas,-0.81,0.78,0.23
Oregon,-0.98,-1.7,0.78


The reason for the name ```applymap``` is that Series has a map method for applying an
element-wise function:

In [200]:
frame['e'].map(format)

Utah      -0.34
Ohio      -0.52
Texas      0.23
Oregon     0.78
Name: e, dtype: object

In [201]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [202]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [203]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                    index=['three', 'one'], 
                    columns=['d', 'a', 'b', 'c'])

In [204]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [205]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [206]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [207]:
obj = pd.Series([4, 7, -3, 2])

In [208]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [209]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [210]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [211]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [212]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [213]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [214]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


Ranking assigns ranks from one through the number of valid data points in an array.
The ```rank``` methods for Series and DataFrame are the place to look; by default rank
breaks ties by assigning each group the mean rank:

In [215]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [216]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [217]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [218]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [219]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                    'c': [-2, 5, 8, -2.5]})

In [220]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [221]:
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### Axis Indexes with Duplicate Labels

In [222]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [223]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [224]:
obj.index.is_unique

False

In [225]:
obj['a']

a    0
a    1
dtype: int64

In [226]:
obj['c']

4

In [227]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [228]:
df

Unnamed: 0,0,1,2
a,1.77523,0.306398,-0.449674
a,1.496554,-0.303794,-0.271245
b,-0.010128,1.138445,-0.271718
b,0.017589,0.475006,-1.082125


In [229]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.010128,1.138445,-0.271718
b,0.017589,0.475006,-1.082125


In [230]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
                index=['a', 'b', 'c', 'd'],
                columns=['one', 'two'])

In [231]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [232]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [233]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [234]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [235]:
df.mean(axis='columns')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [236]:
df.idxmax()

one    b
two    d
dtype: object

In [237]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [238]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [239]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)

In [240]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [241]:
import pandas_datareader.data as web

In [242]:
all_data = {ticker: web.get_data_yahoo(ticker) 
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [243]:
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})

In [244]:
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [245]:
returns = price.pct_change()

In [246]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-10,0.002261,-0.000366,-0.004627,0.006973
2020-01-13,0.021364,-0.000658,0.012024,0.006645
2020-01-14,-0.013503,-0.00571,-0.007043,-0.005802
2020-01-15,-0.004286,0.00589,0.006476,0.005815
2020-01-16,0.006568,0.009735,0.011031,0.005013


The corr method of Series computes the correlation of the overlapping, non-NA,
aligned-by-index values in two Series. Relatedly, cov computes the covariance:

In [248]:
returns['MSFT'].corr(returns['IBM'])

0.4834619604580879

In [249]:
returns['MSFT'].cov(returns['IBM'])

9.154155985565628e-05

In [250]:
returns.MSFT.corr(returns.IBM)

0.4834619604580879

DataFrame’s ```corr``` and ```cov``` methods, on the other hand, return a full correlation or covariance matrix as a DataFrame, respectively:

In [251]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.398543,0.574342,0.525647
IBM,0.398543,1.0,0.483462,0.409534
MSFT,0.574342,0.483462,1.0,0.660267
GOOG,0.525647,0.409534,0.660267,1.0


In [252]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000243,8e-05,0.000131,0.000124
IBM,8e-05,0.000167,9.2e-05,8e-05
MSFT,0.000131,9.2e-05,0.000215,0.000146
GOOG,0.000124,8e-05,0.000146,0.000229


Using DataFrame’s ```corrwith``` method, you can compute pairwise correlations
between a DataFrame’s columns or rows with another Series or DataFrame. Passing a
Series returns a Series with the correlation value computed for each column:

In [253]:
returns.corrwith(returns.IBM)

AAPL    0.398543
IBM     1.000000
MSFT    0.483462
GOOG    0.409534
dtype: float64

In [254]:
returns.corrwith(volume)

AAPL   -0.120264
IBM    -0.133356
MSFT   -0.088348
GOOG   -0.003031
dtype: float64

In [255]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [256]:
uniques = obj.unique()

In [257]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [258]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [259]:
type(obj.value_counts())

pandas.core.series.Series

In [260]:
pd.value_counts(obj.values, sort=False)

b    2
d    1
c    3
a    3
dtype: int64

In [261]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [262]:
mask = obj.isin(['b', 'c'])

In [263]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [264]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [266]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])

In [267]:
unique_vals = pd.Series(['c', 'b', 'a'])

In [268]:
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [270]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                        'Qu2': [2, 3, 1, 2, 3],
                        'Qu3': [1, 5, 2, 4, 4]})

In [271]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [272]:
result = data.apply(pd.value_counts).fillna(0)

In [273]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
