# Pandas & Numpy

* 運算速度快
* 消耗資源少，採用矩陣運算
* 函式庫中有很多方便好用的資料分析工具

pip install

# numpy

* 裝載相同類型資料的多維度陣列，通常會稱一維陣列為向量(vector)，二維陣列為矩陣(matrix)
* 常使用於機器學習(深度學習、神經網路)
* 資料科學分析數據
* 參考：https://medium.com/python4u/hello-numpy-b5ebe67a1ada

In [1]:
import numpy as np

In [11]:
a = np.array([1, 2, 3, 4]) # 建立一維陣列，list、tuple皆可
print(a)
print('-----')
b = np.array([(2.5, 1, 3, 4.5), (5, 6, 7, 8)], dtype = float)  # 二維陣列, type可加可不加
print(b)
print('-----')
c = np.array([[(2.5, 1, 3, 4.5), (5, 6, 7, 8)], [(2.5, 1, 3, 4.5), (5, 6, 7, 8)]])  #三維陣列建立
print(c)

[1 2 3 4]
-----
[[2.5 1.  3.  4.5]
 [5.  6.  7.  8. ]]
-----
[[[2.5 1.  3.  4.5]
  [5.  6.  7.  8. ]]

 [[2.5 1.  3.  4.5]
  [5.  6.  7.  8. ]]]


In [16]:
a = np.zeros((2, 3)) # 2x3全為0的陣列
print(a, '\n-----')
b = np.ones((2, 3, 4)) # 2x3x4全為1的陣列
print(b, '\n-----')
c = np.full((3,2), 8) # 3x2全為8的陣列
print(c, '\n-----')
d = np.eye(2)  # 2x2的單位矩陣
print(d, '\n-----')
e = np.random.random((2,3))  # 2x3隨機值(0~1)的矩陣
print(e)

[[0. 0. 0.]
 [0. 0. 0.]] 
-----
[[[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]] 
-----
[[8 8]
 [8 8]
 [8 8]] 
-----
[[1. 0.]
 [0. 1.]] 
-----
[[0.12564791 0.43289935 0.8941641 ]
 [0.69969783 0.13211744 0.5187089 ]]


In [25]:
a = np.ones((2, 3, 4))
print(a)
print(a.shape) # 陣列維度
print(len(a)) # 陣列長度(x)
print(a.ndim) # 陣列維度大小(number of dimensions)
print(a.size) # 陣列元素數量

a = a.astype(str) # 更換元素類別
print(a)
print('=======')
b = a.reshape(4, 6)
print(b)

[[[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]]
(2, 3, 4)
2
3
24
[[['1.0' '1.0' '1.0' '1.0']
  ['1.0' '1.0' '1.0' '1.0']
  ['1.0' '1.0' '1.0' '1.0']]

 [['1.0' '1.0' '1.0' '1.0']
  ['1.0' '1.0' '1.0' '1.0']
  ['1.0' '1.0' '1.0' '1.0']]]
[['1.0' '1.0' '1.0' '1.0' '1.0' '1.0']
 ['1.0' '1.0' '1.0' '1.0' '1.0' '1.0']
 ['1.0' '1.0' '1.0' '1.0' '1.0' '1.0']
 ['1.0' '1.0' '1.0' '1.0' '1.0' '1.0']]


In [40]:
# index
a = np.array([1, 2, 3, 4])
print(a[0])
print(a[0:2])
print('-------')
b = np.array([(2.5, 1, 3, 4.5), (5, 6, 7, 8)])
print(b[1])
print(b[1, 3])
print(b[0:, :2])

1
[1 2]
[1]
-------
[5. 6. 7. 8.]
8.0
[[2.5 1. ]
 [5.  6. ]]


In [50]:
a = np.array([[1, 2], [1, 2]])
b = np.array([[2, 4], [2, 4]])
print(np.add(a, b))     #等同a+b
print(a+b)
print('---')
print(np.subtract(a, b)) #等同a-b
print('---')
print(np.multiply(a, b)) #等同a*b
print(a.dot(b)) #矩陣乘法
print('---')
print(np.divide(a, b)) #等同a/b

[[3 6]
 [3 6]]
[[3 6]
 [3 6]]
---
[[-1 -2]
 [-1 -2]]
---
[[2 8]
 [2 8]]
[[ 6 12]
 [ 6 12]]
---
[[0.5 0.5]
 [0.5 0.5]]


In [54]:
a = np.array([(2.5, 1, 3, 4.5), (5, 6, 7, 8)]) # 多維必須思考軸的對應

print(a.sum(axis=0))
print(a.min(axis=1))
print(a.max(axis=0))
print('---')
print(np.median(a, axis=0))
print(np.mean(a, axis=1))
print(np.std(a, axis=0))

[ 7.5  7.  10.  12.5]
[1. 5.]
[5. 6. 7. 8.]
---
[3.75 3.5  5.   6.25]
[2.75 6.5 ]
[1.25 2.5  2.   1.75]


# Pandas
* 等同於表格(ex:excel)，有column、row
* pandas為基於numpy產生的套件，使用大量資料運算時較能表現佳
* 參考：https://oranwind.org/python-pandas-ji-chu-jiao-xue/

In [55]:
import pandas as pd

In [56]:
# Series
# 索引標籤(index)和實際值的陣列組合
series_1 = pd.Series([2, 1, 7, 3])

print(series_1)

0    2
1    1
2    7
3    3
dtype: int64


In [57]:
# DataFrame
# 類似試算表和關聯式資料庫資料表（table）欄列結構

dataframe_1 = pd.read_csv('pandas_sample.csv')
print(dataframe_1)

  student  id  math  english  science description class
0    Jack   1    80     20.0     61.0    positive    2A
1     Ash   2    70     91.0     80.0       smart    2B
2    Alex   3    89      NaN     49.0         sad    2A
3   Tommy   4    16     19.0     21.0         mad    2C
4   Merry   5    49     17.0     46.0         mad    2C
5   Apple   6    86     54.0     87.0       smart    2C
6     NaN   7    64     29.0     19.0    negative    2A
7  Queens   8    43     34.0     46.0         sad    2B
8   Apolo   9    58     64.0      NaN    positive    2B
9   Densy  10    90     58.0     83.0    positive    2B


In [59]:
# 創立dataframe
# from dict
groups = ["Movies", "Sports", "Coding", "Fishing", "Dancing", "cooking"]  
num = [46, 8, 12, 12, 6, 58]

dict = {"groups": groups,  
        "num": num}

df = pd.DataFrame(dict)
print(df)

    groups  num
0   Movies   46
1   Sports    8
2   Coding   12
3  Fishing   12
4  Dancing    6
5  cooking   58


In [60]:
# from list
df = pd.DataFrame(columns = ['groups', 'num'])
df['groups'] = groups
df['num'] = num
print(df)

    groups  num
0   Movies   46
1   Sports    8
2   Coding   12
3  Fishing   12
4  Dancing    6
5  cooking   58


In [61]:
print(select_df.shape) # 回傳列數與欄數  
print('---')
print(select_df.head(3)) # 回傳前三筆值  
print('---')  
print(select_df.tail(3)) # 回傳後三筆值  
print('---')  
print(select_df.columns) # 回傳欄位名稱  
print('---')  
print(select_df.index) # 回傳 index  
print('---')  
print(select_df.info) # 回傳資料內容

(6, 2)
---
   groups  num
0  Movies   46
1  Sports    8
2  Coding   12
---
    groups  num
3  Fishing   12
4  Dancing    6
5  cooking   58
---
Index(['groups', 'num'], dtype='object')
---
RangeIndex(start=0, stop=6, step=1)
---
<bound method DataFrame.info of     groups  num
0   Movies   46
1   Sports    8
2   Coding   12
3  Fishing   12
4  Dancing    6
5  cooking   58>


In [62]:
# iloc