In [1]:
import numpy as np
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
obj = pd.Series([1, 2, 3, -4])
obj

0    1
1    2
2    3
3   -4
dtype: int64

In [4]:
obj.array

<PandasArray>
[1, 2, 3, -4]
Length: 4, dtype: int64

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Series with explicit string labels instead of the default 0..n-1 positions.
obj2 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))

In [7]:
obj2

a    1
b    2
c    3
d    4
dtype: int64

In [8]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
obj2["a"]

1

In [10]:
obj2["c"]

3

In [11]:
obj2[["a", "d"]]

a    1
d    4
dtype: int64

In [12]:
obj2[obj2> 2]

c    3
d    4
dtype: int64

In [13]:
obj2 * 2

a    2
b    4
c    6
d    8
dtype: int64

In [14]:
obj2

a    1
b    2
c    3
d    4
dtype: int64

In [15]:
np.exp(obj2)

a     2.718282
b     7.389056
c    20.085537
d    54.598150
dtype: float64

In [16]:
"c" in obj2

True

In [17]:
'f' in obj2

False

In [18]:
# Plain dict keyed by state name; its keys become the Series labels
# when it is passed to pd.Series.
sdata = {
    "Ohio": 1900,
    "Oregon": 9300,
    "Texas": 9584,
    "California": 12000,
}
sdata

{'Ohio': 1900, 'Oregon': 9300, 'Texas': 9584, 'California': 12000}

In [19]:
obj3 = pd.Series(sdata)

In [20]:
obj3

Ohio           1900
Oregon         9300
Texas          9584
California    12000
dtype: int64

In [21]:
obj3.to_dict()

{'Ohio': 1900, 'Oregon': 9300, 'Texas': 9584, 'California': 12000}

In [22]:
obj3.array, obj3.index

(<PandasArray>
 [1900, 9300, 9584, 12000]
 Length: 4, dtype: int64,
 Index(['Ohio', 'Oregon', 'Texas', 'California'], dtype='object'))

In [23]:
states = ["California", "Oregon", "Texas", "Utah"]

In [24]:
obj4 = pd.Series(sdata, index=states)
obj4

California    12000.0
Oregon         9300.0
Texas          9584.0
Utah              NaN
dtype: float64

In [25]:
obj4.isna()

California    False
Oregon        False
Texas         False
Utah           True
dtype: bool

In [26]:
pd.isna(obj4)

California    False
Oregon        False
Texas         False
Utah           True
dtype: bool

In [27]:
pd.notna(obj4)

California     True
Oregon         True
Texas          True
Utah          False
dtype: bool

In [28]:
obj4

California    12000.0
Oregon         9300.0
Texas          9584.0
Utah              NaN
dtype: float64

In [29]:
obj3 + obj4

California    24000.0
Ohio              NaN
Oregon        18600.0
Texas         19168.0
Utah              NaN
dtype: float64

In [30]:
obj4.name = 'Population'
obj4.index.name = "State"

In [31]:
obj4

State
California    12000.0
Oregon         9300.0
Texas          9584.0
Utah              NaN
Name: Population, dtype: float64

In [32]:
obj

0    1
1    2
2    3
3   -4
dtype: int64

In [33]:
# Column-oriented input: every key maps to an equal-length list of six values.
data = {
    "State": ["Ohio"] * 3 + ["Nevada"] * 3,
    "Year": [2001, 2002, 2003] * 2,
    "Population": [1.2, 1.3, 1.4, 1.4, 2.1, 2.3],
}

In [34]:
frame = pd.DataFrame(data)

In [35]:
frame

Unnamed: 0,State,Year,Population
0,Ohio,2001,1.2
1,Ohio,2002,1.3
2,Ohio,2003,1.4
3,Nevada,2001,1.4
4,Nevada,2002,2.1
5,Nevada,2003,2.3


In [36]:
frame.head()

Unnamed: 0,State,Year,Population
0,Ohio,2001,1.2
1,Ohio,2002,1.3
2,Ohio,2003,1.4
3,Nevada,2001,1.4
4,Nevada,2002,2.1


In [37]:
frame.tail()

Unnamed: 0,State,Year,Population
1,Ohio,2002,1.3
2,Ohio,2003,1.4
3,Nevada,2001,1.4
4,Nevada,2002,2.1
5,Nevada,2003,2.3


In [38]:
frame1 = pd.DataFrame(data, columns=["State", "Population", "Year", "debt"])

In [39]:
frame1

Unnamed: 0,State,Population,Year,debt
0,Ohio,1.2,2001,
1,Ohio,1.3,2002,
2,Ohio,1.4,2003,
3,Nevada,1.4,2001,
4,Nevada,2.1,2002,
5,Nevada,2.3,2003,


In [40]:
frame1.columns

Index(['State', 'Population', 'Year', 'debt'], dtype='object')

In [41]:
frame1["State"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: State, dtype: object

In [42]:
frame1.Year

0    2001
1    2002
2    2003
3    2001
4    2002
5    2003
Name: Year, dtype: int64

In [43]:
frame1.loc[2]

State         Ohio
Population     1.4
Year          2003
debt           NaN
Name: 2, dtype: object

In [44]:
frame1.iloc[2]

State         Ohio
Population     1.4
Year          2003
debt           NaN
Name: 2, dtype: object

In [45]:
frame1["debt"] = 16.5

In [46]:
frame1

Unnamed: 0,State,Population,Year,debt
0,Ohio,1.2,2001,16.5
1,Ohio,1.3,2002,16.5
2,Ohio,1.4,2003,16.5
3,Nevada,1.4,2001,16.5
4,Nevada,2.1,2002,16.5
5,Nevada,2.3,2003,16.5


In [47]:
# Assign through bracket indexing: attribute-style assignment only reaches a
# column that already exists and otherwise sets a plain Python attribute.
frame1["debt"] = np.arange(6.)

In [48]:
frame1

Unnamed: 0,State,Population,Year,debt
0,Ohio,1.2,2001,0.0
1,Ohio,1.3,2002,1.0
2,Ohio,1.4,2003,2.0
3,Nevada,1.4,2001,3.0
4,Nevada,2.1,2002,4.0
5,Nevada,2.3,2003,5.0


In [49]:
val = pd.Series([-1.2, 1.3, 2.1], index=[2, 4, 5])

In [50]:
# Bracket assignment aligns `val` on the row index; labels missing from `val`
# become NaN (see the frame displayed below).
frame1["debt"] = val

In [51]:
frame1

Unnamed: 0,State,Population,Year,debt
0,Ohio,1.2,2001,
1,Ohio,1.3,2002,
2,Ohio,1.4,2003,-1.2
3,Nevada,1.4,2001,
4,Nevada,2.1,2002,1.3
5,Nevada,2.3,2003,2.1


In [52]:
frame1['Eastern'] = frame1['State'] == 'Ohio'

In [53]:
frame1

Unnamed: 0,State,Population,Year,debt,Eastern
0,Ohio,1.2,2001,,True
1,Ohio,1.3,2002,,True
2,Ohio,1.4,2003,-1.2,True
3,Nevada,1.4,2001,,False
4,Nevada,2.1,2002,1.3,False
5,Nevada,2.3,2003,2.1,False


In [55]:
del frame1['Eastern']

In [56]:
frame1

Unnamed: 0,State,Population,Year,debt
0,Ohio,1.2,2001,
1,Ohio,1.3,2002,
2,Ohio,1.4,2003,-1.2
3,Nevada,1.4,2001,
4,Nevada,2.1,2002,1.3
5,Nevada,2.3,2003,2.1


In [57]:
frame1.columns

Index(['State', 'Population', 'Year', 'debt'], dtype='object')

If a nested dictionary is passed to the DataFrame constructor, pandas interprets the outer dictionary keys as the columns and the inner keys as the row indices:

In [58]:
populations = {"Ohio": {2001: 1.2, 2002: 1.5, 2003: 2.1},
              "Nevada": {2002: 3.0, 2003: 4.1}}

In [59]:
frame2 = pd.DataFrame(populations)

In [60]:
frame2

Unnamed: 0,Ohio,Nevada
2001,1.2,
2002,1.5,3.0
2003,2.1,4.1


In [61]:
frame2.T

Unnamed: 0,2001,2002,2003
Ohio,1.2,1.5,2.1
Nevada,,3.0,4.1


In [62]:
frame2.dtypes

Ohio      float64
Nevada    float64
dtype: object

In [63]:
frame2.T.dtypes

2001    float64
2002    float64
2003    float64
dtype: object

In [64]:
populations 

{'Ohio': {2001: 1.2, 2002: 1.5, 2003: 2.1}, 'Nevada': {2002: 3.0, 2003: 4.1}}

In [65]:
pd.DataFrame(populations,  index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.2,
2002,1.5,3.0
2003,2.1,4.1


In [66]:
frame2["Ohio"][:-1]

2001    1.2
2002    1.5
Name: Ohio, dtype: float64

In [67]:
frame2["Nevada"][:2]

2001    NaN
2002    3.0
Name: Nevada, dtype: float64

In [68]:
pdata = {"Ohio": frame2["Ohio"][:-1],
        "Nevada": frame2["Nevada"][:2]}

In [69]:
pdata

{'Ohio': 2001    1.2
 2002    1.5
 Name: Ohio, dtype: float64,
 'Nevada': 2001    NaN
 2002    3.0
 Name: Nevada, dtype: float64}

In [70]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.2,
2002,1.5,3.0


In [71]:
frame2.index.name = "Year"
frame2.columns.name = "State"

In [72]:
frame2

State,Ohio,Nevada
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,1.2,
2002,1.5,3.0
2003,2.1,4.1


In [73]:
frame2.to_numpy()

array([[1.2, nan],
       [1.5, 3. ],
       [2.1, 4.1]])

In [74]:

frame1

Unnamed: 0,State,Population,Year,debt
0,Ohio,1.2,2001,
1,Ohio,1.3,2002,
2,Ohio,1.4,2003,-1.2
3,Nevada,1.4,2001,
4,Nevada,2.1,2002,1.3
5,Nevada,2.3,2003,2.1


In [75]:
frame1.to_numpy()

array([['Ohio', 1.2, 2001, nan],
       ['Ohio', 1.3, 2002, nan],
       ['Ohio', 1.4, 2003, -1.2],
       ['Nevada', 1.4, 2001, nan],
       ['Nevada', 2.1, 2002, 1.3],
       ['Nevada', 2.3, 2003, 2.1]], dtype=object)

## Index Object

In [76]:
obj = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])

In [77]:
obj

a    0
b    1
c    2
d    3
dtype: int32

In [78]:
index =obj.index

In [79]:
index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [80]:
index[1:]

Index(['b', 'c', 'd'], dtype='object')

In [81]:
# Index objects are immutable, so item assignment raises TypeError.
# Catch it and print the message so the demonstration does not abort "Run All".
try:
    index[1] = 'e'
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: Index does not support mutable operations

In [82]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [83]:
obj2 = pd.Series([1.2, 4.5, 0.9], index=labels)
obj2

0    1.2
1    4.5
2    0.9
dtype: float64

In [84]:
obj2.index is labels

True

In [85]:
frame2

State,Ohio,Nevada
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,1.2,
2002,1.5,3.0
2003,2.1,4.1


In [86]:
frame2.columns

Index(['Ohio', 'Nevada'], dtype='object', name='State')

In [87]:
frame2.index

Int64Index([2001, 2002, 2003], dtype='int64', name='Year')

In [88]:
"Ohio" in frame2.columns

True

In [89]:
2003 in frame2.index

True

In [90]:
pd.Index(["foo", "foo", "bar", 'bar'])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## Essential Functionality

### Reindexing 

In [91]:
# Draw the four values from a seeded generator so this cell produces the same
# numbers on every Restart & Run All (re-run to refresh the outputs below).
obj = pd.Series(np.random.default_rng(seed=12345).standard_normal(4),
                index=['e', 'd', 'a', 'b'])

In [92]:
obj

e    1.685619
d   -0.005803
a   -1.104184
b    0.017081
dtype: float64

In [93]:
obj.reindex(['a', 'b', 'c', 'd', 'e'])

a   -1.104184
b    0.017081
c         NaN
d   -0.005803
e    1.685619
dtype: float64

In [94]:
obj3 = pd.Series(['Blue', 'Yellow', 'Purple'], index=[0, 2, 4])
obj3

0      Blue
2    Yellow
4    Purple
dtype: object

In [95]:
obj3.reindex(np.arange(9), method='ffill')

0      Blue
1      Blue
2    Yellow
3    Yellow
4    Purple
5    Purple
6    Purple
7    Purple
8    Purple
dtype: object

In [96]:
obj3.reindex(np.arange(8), method='bfill')

0      Blue
1    Yellow
2    Yellow
3    Purple
4    Purple
5       NaN
6       NaN
7       NaN
dtype: object

In [97]:
# 3x3 integer grid (0..8) with letter row labels and state-name columns.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('abc'),
    columns=['Ohio', 'California', 'Utah'],
)

In [98]:
frame

Unnamed: 0,Ohio,California,Utah
a,0,1,2
b,3,4,5
c,6,7,8


In [99]:
frame.reindex(['a', 'b','c', 'd'])

Unnamed: 0,Ohio,California,Utah
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [100]:
state = ['Ohio', 'Texas', 'Utah']
frame.reindex(columns=state)

Unnamed: 0,Ohio,Texas,Utah
a,0,,2
b,3,,5
c,6,,8


In [101]:
# NOTE(review): this passes `states` (the list defined in In [23]), not the
# `state` list created in the previous cell — confirm which one was intended.
frame.reindex(states, axis='columns')

Unnamed: 0,California,Oregon,Texas,Utah
a,1,,,2
b,4,,,5
c,7,,,8


In [102]:
frame

Unnamed: 0,Ohio,California,Utah
a,0,1,2
b,3,4,5
c,6,7,8


In [103]:
frame.loc[['a', 'b'] , ['Ohio', 'Utah']]

Unnamed: 0,Ohio,Utah
a,0,2
b,3,5


## Dropping Entries from an Axis

In [104]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [105]:
new_obj = obj.drop('c')

In [106]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [107]:
obj.drop(['c', 'd'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [108]:
# 4x4 integer grid (0..15): one row per state, one column per ordinal name.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['ohio', 'texas', 'utah', 'california'],
    columns=['one', 'two', 'three', 'four'],
)

In [109]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
texas,4,5,6,7
utah,8,9,10,11
california,12,13,14,15


In [110]:
data.drop(index =['ohio', 'utah'])

Unnamed: 0,one,two,three,four
texas,4,5,6,7
california,12,13,14,15


In [111]:
data.drop(columns=['one', 'four'])

Unnamed: 0,two,three
ohio,1,2
texas,5,6
utah,9,10
california,13,14


In [112]:
data.drop('two', axis='columns')

Unnamed: 0,one,three,four
ohio,0,2,3
texas,4,6,7
utah,8,10,11
california,12,14,15


In [113]:
data.drop('ohio', axis=0)

Unnamed: 0,one,two,three,four
texas,4,5,6,7
utah,8,9,10,11
california,12,13,14,15


## Indexing, Selection and Filtering

In [114]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [115]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [116]:
obj['b']

1.0

In [117]:
# Select by position explicitly with .iloc; plain obj[0] on a string-labelled
# Series depends on an integer-as-position fallback of the [] operator.
obj.iloc[0]

0.0

In [118]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [119]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [120]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [121]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [122]:
obj.loc[['b', 'c']]

b    1.0
c    2.0
dtype: float64

In [123]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [124]:
obj.loc['a']

0.0

In [125]:
# obj.loc[[0, 1, 2]]
# Left commented out: per the original note this raises an error, because .loc
# looks up labels and this index ('a'..'d') contains no integers.

In [126]:
obj.iloc[[0, 1, 2]]

a    0.0
b    1.0
c    2.0
dtype: float64

In [127]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
texas,4,5,6,7
utah,8,9,10,11
california,12,13,14,15


In [128]:
data[:2]

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
texas,4,5,6,7


In [129]:
data[['two', 'three']]

Unnamed: 0,two,three
ohio,1,2
texas,5,6
utah,9,10
california,13,14


In [130]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
texas,4,5,6,7
utah,8,9,10,11
california,12,13,14,15


In [131]:
data < 5

Unnamed: 0,one,two,three,four
ohio,True,True,True,True
texas,True,False,False,False
utah,False,False,False,False
california,False,False,False,False


In [132]:
data[data < 5] = 0

In [133]:
data

Unnamed: 0,one,two,three,four
ohio,0,0,0,0
texas,0,5,6,7
utah,8,9,10,11
california,12,13,14,15


In [134]:
data.loc[['texas', 'utah']]

Unnamed: 0,one,two,three,four
texas,0,5,6,7
utah,8,9,10,11


In [135]:
data.loc['texas', ['three', 'two']]

three    6
two      5
Name: texas, dtype: int32

In [136]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: utah, dtype: int32

In [137]:
data.iloc[2, [3, 1]]

four    11
two      9
Name: utah, dtype: int32

In [138]:
data.loc[:"utah", ['two', 'three']]

Unnamed: 0,two,three
ohio,0,0
texas,5,6
utah,9,10


In [139]:
data.iloc[:, :3]

Unnamed: 0,one,two,three
ohio,0,0,0
texas,0,5,6
utah,8,9,10
california,12,13,14


In [140]:
data.iloc[:, :3][data.three > 6]

Unnamed: 0,one,two,three
utah,8,9,10
california,12,13,14


In [141]:
data[data.three >= 2]

Unnamed: 0,one,two,three,four
texas,0,5,6,7
utah,8,9,10,11
california,12,13,14,15


In [142]:
data

Unnamed: 0,one,two,three,four
ohio,0,0,0,0
texas,0,5,6,7
utah,8,9,10,11
california,12,13,14,15


In [143]:
data.loc[:, 'one'] = 1

In [144]:
data


Unnamed: 0,one,two,three,four
ohio,1,0,0,0
texas,1,5,6,7
utah,1,9,10,11
california,1,13,14,15


In [145]:
data.iloc[2] = 2

In [146]:
data

Unnamed: 0,one,two,three,four
ohio,1,0,0,0
texas,1,5,6,7
utah,2,2,2,2
california,1,13,14,15


In [147]:
data.loc[data.four > 5] = 3

In [148]:
data

Unnamed: 0,one,two,three,four
ohio,1,0,0,0
texas,3,3,3,3
utah,2,2,2,2
california,3,3,3,3


In [149]:
# Chained indexing (`.loc[...]` followed by `['three']`), presumably kept on
# purpose as a demo: it triggers the "set on a copy of a slice" warning shown
# below. The form suggested by that warning is:
#     data.loc[data.three == 3, 'three'] = 6
data.loc[data.three == 3]['three'] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.three == 3]['three'] = 6


In [150]:
data.loc[['texas', 'california'], 'three'] = 7

In [151]:
data

Unnamed: 0,one,two,three,four
ohio,1,0,0,0
texas,3,3,7,3
utah,2,2,2,2
california,3,3,7,3


## Arithmetic and Data Alignment

In [152]:
# Two Series whose labels only partly overlap ("a", "c" and "e" are shared).
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list("acde"))
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list("acefg"))

In [153]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [154]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [155]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [156]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list('bce'),
                   index=['ohio', 'texas', 'utah'])

In [157]:
df1

Unnamed: 0,b,c,e
ohio,0.0,1.0,2.0
texas,3.0,4.0,5.0
utah,6.0,7.0,8.0


In [158]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bcd'), index=['ohio', 'texas', 'utah', 'california'])
df2

Unnamed: 0,b,c,d
ohio,0.0,1.0,2.0
texas,3.0,4.0,5.0
utah,6.0,7.0,8.0
california,9.0,10.0,11.0


In [159]:
df1 + df2

Unnamed: 0,b,c,d,e
california,,,,
ohio,0.0,2.0,,
texas,6.0,8.0,,
utah,12.0,14.0,,


In [160]:
df1 = pd.DataFrame({'A': [ 1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})

In [161]:
df1

Unnamed: 0,A
0,1
1,2


In [162]:
df2

Unnamed: 0,B
0,3
1,4


In [163]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,


## Arithmetic Methods with Fill Values

In [164]:
# Float frames of different shapes: df2 has one extra row and an extra column "e".
df1 = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(20, dtype=float).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [165]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [166]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [167]:
# Write the cell with one indexer call. The chained form
# `df2.iloc[1]['b'] = ...` may assign into a temporary row object instead of df2.
df2.iloc[1, df2.columns.get_loc('b')] = np.nan

In [168]:
df2.loc[1, 'b'] = np.nan

In [169]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [170]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [171]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [172]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [173]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [174]:
df1.reindex(columns=df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


## Operations between DataFrames and Series

In [175]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [176]:
arr[0]

array([0., 1., 2., 3.])

In [177]:
arr[1]

array([4., 5., 6., 7.])

In [178]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [179]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), 
                     index=['utah', 'ohio', 'texas','oregon'])

In [180]:
frame

Unnamed: 0,b,d,e
utah,0.0,1.0,2.0
ohio,3.0,4.0,5.0
texas,6.0,7.0,8.0
oregon,9.0,10.0,11.0


In [181]:
series = frame.iloc[0]

In [182]:
series

b    0.0
d    1.0
e    2.0
Name: utah, dtype: float64

In [183]:
frame - series

Unnamed: 0,b,d,e
utah,0.0,0.0,0.0
ohio,3.0,3.0,3.0
texas,6.0,6.0,6.0
oregon,9.0,9.0,9.0


In [184]:
series2 = pd.Series(np.arange(3), index=list('bef'))
series2

b    0
e    1
f    2
dtype: int32

In [185]:
frame + series2

Unnamed: 0,b,d,e,f
utah,0.0,,3.0,
ohio,3.0,,6.0,
texas,6.0,,9.0,
oregon,9.0,,12.0,


In [186]:
series3 = frame['d']

In [187]:
frame

Unnamed: 0,b,d,e
utah,0.0,1.0,2.0
ohio,3.0,4.0,5.0
texas,6.0,7.0,8.0
oregon,9.0,10.0,11.0


In [188]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
utah,-1.0,0.0,1.0
ohio,-1.0,0.0,1.0
texas,-1.0,0.0,1.0
oregon,-1.0,0.0,1.0


## Function Application and Mapping

In [189]:
# Sample from a seeded generator so the frame (and every result derived from it
# below) is identical on each Restart & Run All; re-run to refresh the outputs.
frame = pd.DataFrame(np.random.default_rng(seed=12345).standard_normal((4, 3)),
                     columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])

In [190]:
frame

Unnamed: 0,b,d,e
Utah,-0.43087,1.891817,-0.207946
Ohio,-0.168862,0.071783,-0.230916
Texas,0.84155,0.577067,-1.090972
Oregon,-0.337719,-1.192884,1.149436


In [191]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.43087,1.891817,0.207946
Ohio,0.168862,0.071783,0.230916
Texas,0.84155,0.577067,1.090972
Oregon,0.337719,1.192884,1.149436


In [192]:
def f1(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    highest, lowest = x.max(), x.min()
    return highest - lowest

In [193]:
frame.apply(f1)

b    1.272420
d    3.084701
e    2.240408
dtype: float64

In [194]:
frame['b'].max() - frame['b'].min()

1.272420161110281

In [195]:
frame.apply(f1, axis='columns')

Utah      2.322687
Ohio      0.302699
Texas     1.932522
Oregon    2.342320
dtype: float64

In [196]:
def f2(x):
    """Return a two-element Series holding the maximum and minimum of ``x``.

    The result is labelled 'max' and 'min', in that order.
    """
    labels = ['max', 'min']
    extremes = [x.max(), x.min()]
    return pd.Series(extremes, index=labels)

In [197]:
frame.apply(f2)

Unnamed: 0,b,d,e
max,0.84155,1.891817,1.149436
min,-0.43087,-1.192884,-1.090972


In [198]:
def my_format(x):
    """Render ``x`` as a string with exactly two digits after the decimal point."""
    return format(x, ".2f")

In [199]:
my_format(3.5545)

'3.55'

In [200]:
frame.applymap(my_format)

Unnamed: 0,b,d,e
Utah,-0.43,1.89,-0.21
Ohio,-0.17,0.07,-0.23
Texas,0.84,0.58,-1.09
Oregon,-0.34,-1.19,1.15


In [201]:
frame['b'].map(my_format)

Utah      -0.43
Ohio      -0.17
Texas      0.84
Oregon    -0.34
Name: b, dtype: object

## Sorting and Ranking

In [202]:
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])
obj

d    0
a    1
b    2
c    3
dtype: int32

In [203]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [204]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=["three", "one"],
                    columns=["d", "a", "b", "c"])

In [205]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [206]:
frame.sort_index(axis='columns')

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [207]:
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [208]:
obj = pd.Series([4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [209]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [210]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [211]:
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [212]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [213]:
obj.sort_values(na_position="first")

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [214]:
# Small two-column integer frame used for the sort_values examples below.
frame = pd.DataFrame({
    'b': [-1, 3, 0, 4],
    'c': [-2, -4, 0, 9],
})

In [215]:
frame

Unnamed: 0,b,c
0,-1,-2
1,3,-4
2,0,0
3,4,9


In [216]:
frame.sort_values(by='c')

Unnamed: 0,b,c
1,3,-4
0,-1,-2
2,0,0
3,4,9


In [217]:
frame.sort_values(['b', 'c'])

Unnamed: 0,b,c
0,-1,-2
2,0,0
1,3,-4
3,4,9


In [218]:
frame.sort_values(['c', 'b'])

Unnamed: 0,b,c
1,3,-4
0,-1,-2
2,0,0
3,4,9


In [219]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [220]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [221]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

## Axis Indexes with Duplicate Labels

In [222]:
obj = pd.Series(np.arange(5.), index=list('aabbc'))

In [223]:
obj

a    0.0
a    1.0
b    2.0
b    3.0
c    4.0
dtype: float64

In [225]:
obj.index.is_unique

False

In [226]:
obj['a']

a    0.0
a    1.0
dtype: float64

In [227]:
obj.loc['a']

a    0.0
a    1.0
dtype: float64

## Summarizing and Computing Descriptive Statistics

In [228]:
# Four rows ('a'..'d') by two columns, with missing values in rows 'a' and 'c'.
df = pd.DataFrame(
    [
        [1.40, np.nan],
        [7.10, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)

In [229]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [230]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [231]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [235]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [236]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [237]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [238]:
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [239]:
df.sum(axis=0, skipna=False)

one   NaN
two   NaN
dtype: float64

In [241]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [242]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [243]:
df.mean(axis=1)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [245]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [254]:
df.idxmax()

one    b
two    d
dtype: object

In [255]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [260]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [261]:
obj = pd.Series(['a', 'a', 'b', 'c'] *4)

In [267]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## Correlation and Covariance

In [268]:
# Load the price and volume frames from pickle files under examples/.
# NOTE(review): unpickling can execute arbitrary code — only load these files
# if they come from a trusted source.
price = pd.read_pickle("examples/yahoo_price.pkl")
volume = pd.read_pickle("examples/yahoo_volume.pkl")

In [270]:
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571


In [272]:
returns = price.pct_change()

In [273]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [280]:
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [282]:
returns['MSFT'].corr(returns['IBM'])

0.49976361144151155

In [283]:
returns['MSFT'].cov(returns['IBM'])

8.870655479703549e-05

In [284]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [285]:
returns.corrwith(returns['IBM'])

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [287]:
volume.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400


In [289]:
volume.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.57603,0.383942,0.490353
GOOG,0.57603,1.0,0.438424,0.490446
IBM,0.383942,0.438424,1.0,0.425892
MSFT,0.490353,0.490446,0.425892,1.0


In [290]:
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

## Unique Values, Value Counts, and Membership

In [291]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

In [293]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [294]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [296]:
obj.to_numpy()

array(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'], dtype=object)

In [297]:
# Count occurrences through Series.value_counts; the top-level
# pd.value_counts helper is deprecated in newer pandas releases.
pd.Series(obj.to_numpy()).value_counts(sort=False)

b    2
c    3
d    1
a    3
dtype: int64

In [298]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [300]:
mask = obj.isin(['b', 'c'])

In [301]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [303]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [304]:
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])

In [305]:
to_match

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [310]:
unique_values = pd.Series(['c', 'b', 'a'])

In [311]:
pd.Index(unique_values).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [312]:
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4],
                     "Qu2": [2, 3, 1, 2, 3],
                     "Qu3": [1, 5, 2, 4, 4]})

In [313]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [315]:
data['Qu1'].value_counts()

3    2
4    2
1    1
Name: Qu1, dtype: int64

In [316]:
data['Qu1'].value_counts().sort_index()

1    1
3    2
4    2
Name: Qu1, dtype: int64

In [317]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [319]:
# Per-column value counts. Pass the Series method rather than the deprecated
# top-level pd.value_counts helper; row labels are the distinct values found.
data.apply(pd.Series.value_counts)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [323]:
# Same per-column counts, with "value absent from this column" (NaN) shown as 0.
# Uses the Series method instead of the deprecated top-level pd.value_counts.
data.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
