In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from pandas import Series, DataFrame

## Series
A Series is a **one-dimensional array-like** object containing a sequence of values (of similar types to NumPy types) and an associated **array of data labels, called its index**. The simplest Series is formed from only an array of data:

In [4]:
obj = pd.Series([4, 7, -5, 3])

In [5]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj.values

array([ 4,  7, -5,  3])

In [8]:
obj2 = pd.Series([4,7,-5,3],index=['d','b','a','c'])

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2['a']

-5

In [12]:
obj2['d']

4

In [13]:
obj2['d'] = 6

In [14]:
obj2[['c','a','d']]

c    3
a   -5
d    6
dtype: int64

In [15]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [16]:
import numpy as np

In [17]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [18]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [20]:
obj2[['d','c','c','b','a','a']]

d    6
c    3
c    3
b    7
a   -5
a   -5
dtype: int64

In [21]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [22]:
obj * 2

0     8
1    14
2   -10
3     6
dtype: int64

In [23]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [24]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [25]:
'b' in obj2

True

In [26]:
'e' in obj2

False

In [27]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [28]:
obj3 = pd.Series(sdata)

In [29]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [30]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [31]:
obj4 = pd.Series(sdata, index=states)

In [32]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [33]:
obj4.dropna

<bound method Series.dropna of California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64>

In [35]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [36]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [37]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [38]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [39]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [40]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [41]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [42]:
obj4.name = 'population'

In [43]:
obj4.index.name = 'state'

In [44]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [45]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [46]:
obj.index = ['Bob','Stave','Jeff','Ryan']

In [47]:
obj

Bob      4
Stave    7
Jeff    -5
Ryan     3
dtype: int64

## DataFrame
A DataFrame represents a **rectangular table of data** and contains an ordered collection of columns, each of which can be a **different value type** (numeric, string, boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dict of Series all sharing the same index. Under the hood, the data is stored as one or more two-dimensional blocks rather than a list, dict, or some other collection of one-dimensional arrays. The exact details of DataFrame’s internals are outside the scope of this book.

In [48]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [49]:
frame = pd.DataFrame(data)

In [50]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [51]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [53]:
pd.DataFrame(data, columns=['year','state'])

Unnamed: 0,year,state
0,2000,Ohio
1,2001,Ohio
2,2002,Ohio
3,2001,Nevada
4,2002,Nevada
5,2003,Nevada


In [57]:
frame[['year','state']]

Unnamed: 0,year,state
0,2000,Ohio
1,2001,Ohio
2,2002,Ohio
3,2001,Nevada
4,2002,Nevada
5,2003,Nevada


In [58]:
frame = pd.DataFrame(data, columns=['year','state','pop','debt'],
                    index=['one','two','three','four','five','six'])

In [59]:
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [60]:
frame.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [62]:
frame['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [63]:
frame.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [64]:
frame.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [66]:
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [67]:
frame.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [68]:
frame['debt'] = 16.5

In [69]:
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [70]:
frame['debt'] = np.arange(6.)

In [71]:
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [72]:
val = pd.Series([-1.2,-1.5,-1.7], index=['two','four','five'])

In [73]:
frame['debt'] = val

In [74]:
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [75]:
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [76]:
frame['eastern'] = frame.state == 'Ohio'

In [77]:
frame

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [78]:
del frame['eastern']

In [79]:
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [80]:
frame['eastern'] = frame.state == 'Ohio'

In [81]:
frame

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [82]:
pop = {'Nevada':{2001:2.4,2002:2.9},
      'Ohio':{2000:1.5,2001:1.7,2002:3.6}}

In [83]:
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [84]:
frame = pd.DataFrame(pop)

In [85]:
frame

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [86]:
frame.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [89]:
frame = pd.DataFrame(pop, index=[2001, 2002, 2003])

In [90]:
frame

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [91]:
frame.T

Unnamed: 0,2001,2002,2003
Nevada,2.4,2.9,
Ohio,1.7,3.6,


In [92]:
frame3 = pd.DataFrame(pop)

In [93]:
pdata = {'Ohio':frame3['Ohio'][:-1],
         'Nevada':frame3['Nevada'][:2]}

In [94]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [95]:
frame3.index.name = 'year';frame3.columns.name = 'state'

In [96]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [97]:
frame3.T

year,2000,2001,2002
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [98]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

## Possible data inputs to DataFrame constructor
|Type|Notes|
|---|---|
|2D ndarray|A matrix of data, passing optional row and column labels|
|dict of arrays, lists, or tuples|Each sequence becomes a column in the DataFrame; all sequences must be the same length|
|NumPy structured/record array|Treated as the “dict of arrays” case|
|dict of Series|Each value becomes a column; indexes from each Series are unioned together to form the result’s row index if no explicit index is passed|
|dict of dicts|Each inner dict becomes a column; keys are unioned to form the row index as in the “dict of Series” case|
|List of dicts or Series|Each item becomes a row in the DataFrame; union of dict keys or Series indexes become the DataFrame’s column labels|
|List of lists or tuples|Treated as the “2D ndarray” case|
|Another DataFrame|The DataFrame’s indexes are used unless different ones are passed|
|NumPy MaskedArray|Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result|

In [99]:
obj = pd.Series(range(3), index=['a','b','c'])

In [100]:
index = obj.index

In [101]:
index

Index(['a', 'b', 'c'], dtype='object')

In [102]:
obj

a    0
b    1
c    2
dtype: int64

In [103]:
index[1:]

Index(['b', 'c'], dtype='object')

In [105]:
index[1]

'b'

In [106]:
obj

a    0
b    1
c    2
dtype: int64

In [108]:
index[2]

'c'

In [109]:
obj.a

0

In [110]:
pd.DataFrame(obj)

Unnamed: 0,0
a,0
b,1
c,2


In [111]:
obj.index.name = 'alpha'

In [112]:
pd.DataFrame(obj)

Unnamed: 0_level_0,0
alpha,Unnamed: 1_level_1
a,0
b,1
c,2


In [114]:
obj.name = 'num'

In [115]:
pd.DataFrame(obj)

Unnamed: 0_level_0,num
alpha,Unnamed: 1_level_1
a,0
b,1
c,2


In [116]:
x = pd.DataFrame(obj)

In [117]:
x

Unnamed: 0_level_0,num
alpha,Unnamed: 1_level_1
a,0
b,1
c,2


In [118]:
x.columns

Index(['num'], dtype='object')

In [119]:
x['num']

alpha
a    0
b    1
c    2
Name: num, dtype: int64

In [121]:
x.num

alpha
a    0
b    1
c    2
Name: num, dtype: int64

In [125]:
'num' in x.columns

True

In [126]:
x.index

Index(['a', 'b', 'c'], dtype='object', name='alpha')

In [127]:
x.index.name

'alpha'

In [129]:
'al' in x.index

False

In [130]:
'al' in x.columns

False

In [131]:
dup_labels = pd.Index(['foo','foo','bar','bar'])

In [132]:
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## Some Index methods and properties
|Method|Description|
|---|---|
|append|Concatenate with additional Index objects, producing a new Index|
|difference|Compute set difference as an Index|
|intersection|Compute set intersection|
|union|Compute set union|
|isin|Compute boolean array indicating whether each value is contained in the passed collection|
|delete|Compute new Index with element at index i deleted|
|drop|Compute new Index by deleting passed values|
|insert|Compute new Index by inserting element at index i|
|is_monotonic|Returns True if each element is greater than or equal to the previous element|
|is_unique|Returns True if the Index has no duplicate values|
|unique|Compute the array of unique values in the Index|

In [133]:
obj = pd.Series([4.5,dw7.2,-5.3,3.6], index=['d','b','a','c'])

In [134]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [137]:
obj2 = obj.reindex(['a','b','c','d','e'])

In [138]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [141]:
obj2 = obj[['a','b','c','d']]

In [142]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
dtype: float64

In [144]:
obj2 = obj.reindex(['a','c','d','x'])

In [145]:
obj2

a   -5.3
c    3.6
d    4.5
x    NaN
dtype: float64

In [146]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [147]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [148]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [149]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),
                    index=['a','c','d'],
                    columns=['Ohio','Texas','California'])

In [150]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [151]:
frame2 = frame.reindex(['a','b','c','d','e'])

In [152]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0
e,,,


In [155]:
states = ['Texas','Utah','California']

In [156]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [158]:
frame.reindex(['a','b','c','d','e'],columns=['Utah','Texas','California'])

Unnamed: 0,Utah,Texas,California
a,,1.0,2.0
b,,,
c,,4.0,5.0
d,,7.0,8.0
e,,,


## reindex function arguments
|Argument|Description|
|---|---|
|index|New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying.|
|method|Interpolation (fill) method; 'ffill' fills forward, while 'bfill' fills backward.|
|fill_value|Substitute value to use when introducing missing data by reindexing.|
|limit|When forward- or backfilling, maximum size gap (in number of elements) to fill.|
|tolerance|When forward- or backfilling, maximum size gap (in absolute numeric distance) to fill for inexact matches.|
|level|Match simple Index on level of MultiIndex; otherwise select subset of.|
|copy|If True, always copy underlying data even if new index is equivalent to old index; if False, do not copy the data when the indexes are equivalent.|

## Dropping Entries from an Axis
Dropping one or more entries from an axis is easy if you already have an index array or list without those entries. As that can require a bit of munging and set logic, the drop method will return a new object with the indicated value or values deleted from an axis:

In [159]:
obj = pd.Series(np.arange(5.), index=['a','b','c','d','e'])

In [160]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [161]:
new_obj = obj.drop('c')

In [162]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [163]:
obj.drop(['a','b'])

c    2.0
d    3.0
e    4.0
dtype: float64

In [164]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['Ohio','Colorado','Utah','New York'],
                   columns=['one','two','three','four'])

In [165]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [166]:
data.drop(['Colorado','Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [167]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [168]:
data.drop('two',axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [169]:
data.drop(['one','three'],axis=1)

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [170]:
obj.drop('c',inplace=True)

In [172]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [173]:
data.drop('two',axis=1,inplace=True)

In [174]:
data

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


## Indexing, Selection, and Filtering
Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers. Here are some examples of this:

In [175]:
obj = pd.Series(np.arange(4.), index=['a','b','c','d'])

In [176]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [177]:
obj['b']

1.0

In [178]:
obj[1]

1.0

In [179]:
obj[0]

0.0

In [180]:
obj[0] = 13

In [181]:
obj

a    13.0
b     1.0
c     2.0
d     3.0
dtype: float64

In [182]:
obj['a'] = -1

In [183]:
obj

a   -1.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [184]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [185]:
obj

a   -1.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [187]:
obj[['b','a','d','a']]

b    1.0
a   -1.0
d    3.0
a   -1.0
dtype: float64

In [188]:
obj.reindex(['a','a','a','b','e','b'])

a   -1.0
a   -1.0
a   -1.0
b    1.0
e    NaN
b    1.0
dtype: float64

In [189]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [190]:
obj

a   -1.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [192]:
obj[[0,0,0,3]]

a   -1.0
a   -1.0
a   -1.0
d    3.0
dtype: float64

In [193]:
obj

a   -1.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [194]:
obj[obj < 0]

a   -1.0
dtype: float64

Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive:

In [195]:
obj['a':'c']

a   -1.0
b    1.0
c    2.0
dtype: float64

In [196]:
obj['a':'c'] = -2

In [197]:
obj

a   -2.0
b   -2.0
c   -2.0
d    3.0
dtype: float64

In [198]:
data

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [199]:
data['two'] = np.arange(4)

In [200]:
data

Unnamed: 0,one,three,four,two
Ohio,0,2,3,0
Colorado,4,6,7,1
Utah,8,10,11,2
New York,12,14,15,3


In [204]:
data = data.reindex(['one','two','three','four'],axis=1)

In [205]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,2,3
Colorado,4,1,6,7
Utah,8,2,10,11
New York,12,3,14,15


In [206]:
data['two']

Ohio        0
Colorado    1
Utah        2
New York    3
Name: two, dtype: int64

In [207]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [208]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,0,2,3
Colorado,4,1,6,7


In [209]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,1,6,7
Utah,8,2,10,11
New York,12,3,14,15


In [210]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,2,3
Colorado,4,1,6,7
Utah,8,2,10,11
New York,12,3,14,15


In [211]:
data < 3

Unnamed: 0,one,two,three,four
Ohio,True,True,True,False
Colorado,False,True,False,False
Utah,False,True,False,False
New York,False,False,False,False


In [212]:
data[data < 5] = 0

In [213]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,6,7
Utah,8,0,10,11
New York,12,0,14,15


## SELECTION WITH LOC AND ILOC
For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc. They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation using either axis labels (loc) or integers (iloc).

As a preliminary example, let’s select a single row and multiple columns by label:

In [214]:
data.loc['Colorado',['two','three']]

two      0
three    6
Name: Colorado, dtype: int64

In [215]:
data.loc['Colorado']

one      0
two      0
three    6
four     7
Name: Colorado, dtype: int64

In [217]:
data.loc['Colorado',['three','four']]

three    6
four     7
Name: Colorado, dtype: int64

下面的代码的意思是先选取第二行，之后后面的列表里面会选择列数
如下面选取到第二行，之后选取第二行也就是Utah这一行的数据，之后选择第三列，第零列，第一列，也就是four，one，two

In [218]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      0
Name: Utah, dtype: int64

In [219]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,6,7
Utah,8,0,10,11
New York,12,0,14,15


In [220]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,0
Utah,11,8,0


In [222]:
data.loc[['Colorado','Utah'],['four','one','two']]

Unnamed: 0,four,one,two
Colorado,7,0,0
Utah,11,8,0


In [223]:
data.loc[:'Utah','two']

Ohio        0
Colorado    0
Utah        0
Name: two, dtype: int64

In [224]:
data.iloc[:,:3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,0,6
Utah,8,0,10
New York,12,0,14


In [225]:
data.iloc[:,:3]

Unnamed: 0,one,two,three
Ohio,0,0,0
Colorado,0,0,6
Utah,8,0,10
New York,12,0,14


In [226]:
data.three > 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [233]:
data.loc['Ohio']

one      0
two      0
three    0
four     0
Name: Ohio, dtype: int64

In [234]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,0
Utah,11,8,0


In [235]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,6,7
Utah,8,0,10,11
New York,12,0,14,15


In [236]:
data.iloc[[1,2]]

Unnamed: 0,one,two,three,four
Colorado,0,0,6,7
Utah,8,0,10,11


好吧，上面的还不如理解为跟numpy里面的多维silce一样的结构，在iloc，loc中第一个是行，第二个是列，这样就很好理解

In [237]:
data.iloc[[1,2],[3,0]]

Unnamed: 0,four,one
Colorado,7,0
Utah,11,8


In [240]:
data.loc[:'Utah','two']

Ohio        0
Colorado    0
Utah        0
Name: two, dtype: int64

In [242]:
data.iloc[:,:3]

Unnamed: 0,one,two,three
Ohio,0,0,0
Colorado,0,0,6
Utah,8,0,10
New York,12,0,14


In [7]:
data = DataFrame(np.arange(16).reshape((4,4)),
                index=['Ohio','Colorado','Utah','New York'],
                columns=['one','two','three','four'])

In [8]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [13]:
data[['one','four']]

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


In [17]:
data.loc[:'Utah',['one','four']]

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11


In [19]:
ser = pd.Series(np.arange(3.))

In [20]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [23]:
ser2 = pd.Series(np.arange(3.),index=['a','b','c'])

In [24]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [25]:
ser2[-1]

2.0

In [26]:
ser[:1]

0    0.0
dtype: float64

In [27]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [28]:
ser.iloc[:1]

0    0.0
dtype: float64

In [29]:
s1 = pd.Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])

In [30]:
s2 = pd.Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])

In [31]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [32]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [33]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [34]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio','Texas','Colorado'])

In [35]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah','Ohio','Texas','Oragon'])

In [36]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [37]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oragon,9.0,10.0,11.0


In [38]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oragon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [39]:
df1 = pd.DataFrame({'A':[1,2]})

In [40]:
df1

Unnamed: 0,A
0,1
1,2


In [42]:
df2 = pd.DataFrame({'B':[3,4]})

In [43]:
df2

Unnamed: 0,B
0,3
1,4


In [45]:
%time df1 - df2

CPU times: user 2.8 ms, sys: 382 µs, total: 3.18 ms
Wall time: 3.74 ms


Unnamed: 0,A,B
0,,
1,,


In [46]:
df1 = DataFrame(np.arange(12.).reshape((3,4)),
               columns=list('abcd'))

In [47]:
df2 = DataFrame(np.arange(20.).reshape((4,5)),
               columns=list('abcde'))

In [49]:
df2.loc[1,'b'] = np.nan

In [50]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [51]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [52]:
%time df1 + df2

CPU times: user 4.98 ms, sys: 638 µs, total: 5.62 ms
Wall time: 12.4 ms


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


Using the add method on df1, I pass df2 and an argument to fill_value:

下面的代码显现了add函数不一样的地方，add函数和加法不一样，add函数中会有一个fill_value的值，这个值表示了替换Nan值为何种方式。

In [56]:
%time df1.add(df2, fill_value=5)

CPU times: user 3.87 ms, sys: 0 ns, total: 3.87 ms
Wall time: 3.34 ms


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,9.0
1,9.0,10.0,13.0,15.0,14.0
2,18.0,20.0,22.0,24.0,19.0
3,20.0,21.0,22.0,23.0,24.0


In [54]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [55]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [57]:
%time df1.add(df2, fill_value=-5)

CPU times: user 4.29 ms, sys: 22 µs, total: 4.31 ms
Wall time: 4.79 ms


Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,-1.0
1,9.0,0.0,13.0,15.0,4.0
2,18.0,20.0,22.0,24.0,9.0
3,10.0,11.0,12.0,13.0,14.0


## OPERATIONS BETWEEN DATAFRAME AND SERIES
As with NumPy arrays of different dimensions, arithmetic between DataFrame and Series is also defined. First, as a motivating example, consider the difference between a two-dimensional array and one of its rows:

In [58]:
arr = np.arange(12.).reshape((3,4))

In [59]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [60]:
arr[0]

array([0., 1., 2., 3.])

In [61]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [63]:
frame = DataFrame(np.arange(12.).reshape((4,3)),
                 columns=list('bde'),
                 index=['Utah','Ohio','Texas','Oregon'])

In [64]:
series = frame.iloc[0]

In [65]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [66]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame’s columns, broadcasting down the rows:

In [67]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [68]:
series2 = pd.Series(range(3),index=list('bef'))

In [69]:
series2

b    0
e    1
f    2
dtype: int64

In [70]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [71]:
series3 = frame['d']

In [72]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [73]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [74]:
frame.sub(series3, axis=0)

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [75]:
frame = pd.DataFrame(np.random.randn(4,3),columns=list('bde'),
                    index=['Utah','Ohio','Texas','Oregon'])

In [76]:
frame

Unnamed: 0,b,d,e
Utah,1.415518,-1.38339,1.658965
Ohio,0.176986,0.739549,1.348618
Texas,-0.012936,0.584257,-1.953487
Oregon,0.364887,0.550307,0.802439


In [77]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.415518,1.38339,1.658965
Ohio,0.176986,0.739549,1.348618
Texas,0.012936,0.584257,1.953487
Oregon,0.364887,0.550307,0.802439


In [78]:
f = lambda x: x.max() - x.min()

In [79]:
frame.apply(f)

b    1.428454
d    2.122939
e    3.612453
dtype: float64

In [80]:
frame.apply(f,axis=1)

Utah      3.042355
Ohio      1.171632
Texas     2.537744
Oregon    0.437551
dtype: float64

In [81]:
def f(x):
    return Series([x.min(),x.max()],index=['min','max'])

In [83]:
frame.apply(f,axis=1)

Unnamed: 0,min,max
Utah,-1.38339,1.658965
Ohio,0.176986,1.348618
Texas,-1.953487,0.584257
Oregon,0.364887,0.802439


In [84]:
frame

Unnamed: 0,b,d,e
Utah,1.415518,-1.38339,1.658965
Ohio,0.176986,0.739549,1.348618
Texas,-0.012936,0.584257,-1.953487
Oregon,0.364887,0.550307,0.802439


In [86]:
f = lambda x : Series([x.min(),x.max()],index=['min','max'])

In [87]:
frame.apply(f,axis=1)

Unnamed: 0,min,max
Utah,-1.38339,1.658965
Ohio,0.176986,1.348618
Texas,-1.953487,0.584257
Oregon,0.364887,0.802439


In [88]:
format = lambda x: '%.2f' % x

In [89]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,1.42,-1.38,1.66
Ohio,0.18,0.74,1.35
Texas,-0.01,0.58,-1.95
Oregon,0.36,0.55,0.8


## Sorting and Ranking
Sorting a dataset by some criterion is another important built-in operation. To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object:

In [90]:
obj = pd.Series(range(4),index=list('dabc'))

In [91]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [92]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [93]:
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int64

In [94]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)),
                    index=['three','one'],
                    columns=list('dabc'))

In [95]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [96]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [97]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


ascending是上升的意思，设置为True则为升序，False为降序

In [99]:
frame.sort_index(axis=1,ascending=True)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [100]:
obj = pd.Series([4,7,-3,2])

In [101]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [102]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [103]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [104]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [110]:
frame = frame.sort_index(axis=1)

In [111]:
frame.sort_values(by='b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [112]:
frame.sort_values(by=['a','b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [113]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [114]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [116]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [117]:
obj.rank(ascending=False,method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [119]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c':[-2,5,8,-2.5]})

In [120]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [121]:
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [122]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [123]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [125]:
obj.index.is_unique

False

In [126]:
obj['c']

4

In [127]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [128]:
df

Unnamed: 0,0,1,2
a,0.56996,0.907664,1.187843
a,-1.28184,-1.209316,0.693918
b,-0.878296,-0.058204,-0.626542
b,-0.037534,0.388497,0.722498


In [129]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.878296,-0.058204,-0.626542
b,-0.037534,0.388497,0.722498


In [130]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                 [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                 columns=['one', 'two'])

In [131]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [132]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [133]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [136]:
df.mean(axis=1,skipna=True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [138]:
import pandas_datareader.data as web

In [139]:
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [140]:
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})

In [141]:
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [142]:
returns = price.pct_change()

In [143]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-02-05,0.017109,0.002663,0.013997,0.011644
2019-02-06,0.000345,0.005681,-0.011099,-0.026841
2019-02-07,-0.018939,-0.011577,-0.007168,-0.014813
2019-02-08,0.001175,0.003904,0.0038,-0.003322
2019-02-11,-0.005751,0.002094,-0.003975,-4.6e-05


In [144]:
returns['MSFT'].corr(returns['IBM'])

0.4864783504659754

In [145]:
returns['MSFT'].cov(returns['IBM'])

8.760712549855643e-05

In [146]:
returns.MSFT.corr(returns.IBM)

0.4864783504659754

In [147]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [148]:
uniques = obj.unique()

In [149]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [150]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [151]:
pd.value_counts(obj.values, sort=False)

d    1
c    3
a    3
b    2
dtype: int64

In [152]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [153]:
mask = obj.isin(['b','c'])

In [154]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [155]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [156]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])

In [157]:
to_match.count()

6

In [158]:
to_match.value_counts()

b    2
a    2
c    2
dtype: int64

In [159]:
unique_vals = pd.Series(['c','b','a'])

In [160]:
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])