In [1]:
## Reindexing
import pandas as pd
import numpy as np
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [2]:
# 重新配置新的index
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [3]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [4]:
frame.reindex(['a', 'b', 'c', 'd', 'e'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0
e,,,


In [5]:
frame.loc[['a', 'b']] #numpy有提過，要取得兩個以上索引資料，要使用[[]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,


In [70]:
'''

Other methods.
Table 5-3. reindex function arguments


'''

'\n\nOther methods.\nTable 5-3. reindex function arguments\n\n\n'

In [76]:
## Dropping Entries from an Axis
#drop不會影響原資料，del才會
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [77]:
# 刪掉了Ｃ
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [78]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [79]:
del obj['c'] #del永久刪除原資料
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [89]:
## Selection with loc and iloc
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [85]:
data.iloc[1, [1, 2]]

two      5
three    6
Name: Colorado, dtype: int64

In [86]:
data.iloc[[1, 2], [1, 2, 3]]

Unnamed: 0,two,three,four
Colorado,5,6,7
Utah,9,10,11


In [49]:
# loc的差別 列切片
ser[:1]

0    0.0
dtype: float64

In [50]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [51]:
ser.loc[:1]   

0    0.0
1    1.0
dtype: float64

In [52]:
ser.iloc[:1]   

0    0.0
dtype: float64

In [None]:
'''

'loc' gets rows (or columns) with particular labels from the index.
'iloc' gets rows (or columns) at particular positions in the index (so it only takes integers).
'ix' usually tries to behave like loc but falls back to behaving like iloc if a label is not present in the index.

'''

"\n\n'loc' gets rows (or columns) with particular labels from the index.\n'iloc' gets rows (or columns) at particular positions in the index (so it only takes integers).\n'ix' usually tries to behave like loc but falls back to behaving like iloc if a label is not present in the index.\n\n"

In [None]:
## Arithmetic and Data Alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [None]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [None]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [None]:
# 一定要兩個都有才能加，不然就是NaN
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [None]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [None]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [None]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [None]:
'''

將這兩個dataframe合在一起，他們的值會直接相加總和

'''

'\n\n將這兩個dataframe合在一起，他們的值會直接相加總和\n\n'

In [15]:
'''

If you add DataFrame objects with no column or row labels in common, the result will contain all nulls
完全不一樣就會都是NaN

'''
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'A': [3, 4]})
df1

Unnamed: 0,A
0,1
1,2


In [16]:
df2

Unnamed: 0,A
0,3
1,4


In [17]:
df1+df2

Unnamed: 0,A
0,4
1,6


In [15]:
## Arithmetic methods with fill values
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df2.loc[1, 'b'] = np.nan    # 將[1,'b']指定為NaN

df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [16]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [17]:
# 多的東西將會呈現NaN狀態
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [None]:
df1.loc[1, 'b'] = np.nan 

In [None]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [None]:
# 重新配置index，並使他的columns = df2.columns.
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [None]:
'''

Other methods.
Table 5-5. Flexible arithmetic methods

'''

'\n\nOther methods.\nTable 5-5. Flexible arithmetic methods\n\n'

In [59]:
## Operations between DataFrame and Series
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.loc['Utah']

frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [60]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [None]:
# 每一個都會減到series -> 廣播機制（後面會講）
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [39]:
## Sorting 
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [40]:
# 用index來排列大小的話
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [41]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),index=['three', 'one'],columns=['d', 'a', 'b', 'c'])
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [42]:
# ascending設定False可以倒著輸出出來
frame.sort_index(axis=1, ascending=False)   

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [None]:
# 用value來排列大小
obj = pd.Series([4, 7, -3, 2, np.nan, 4, np.nan])
obj

0    4.0
1    7.0
2   -3.0
3    2.0
4    NaN
5    4.0
6    NaN
dtype: float64

In [None]:
# 任何的NaN都會被顯示於最末端
obj.sort_values()

2   -3.0
3    2.0
0    4.0
5    4.0
1    7.0
4    NaN
6    NaN
dtype: float64

In [None]:
# 你也可以依照你想要的那一行來做排序
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')   # 也可以多個不同的值做排序

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [None]:
## Axis Indexes with Duplicate Labels
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [None]:
# 查詢這個index是不是有唯一性的
obj.index.is_unique

False