# 一、Numpy
* NumPy（Numerical Python extensions）是一个第三方的Python包，用于科学计算，前身是1995年就开始开发的一个用于数组运算的库。
* 极大地简化了向量和矩阵的操作处理，是一些主力软件包（如scikit-learn、scipy、pandas和tensorflow）架构的基础部分。
* Quickstart tutorial：https://numpy.org/doc/stable/user/quickstart.html
* A Visual Intro to NumPy and Data Representation：http://jalammar.github.io/visual-numpy/

In [None]:
import numpy as np

In [None]:
np.array([2, 3, 6, 7])

In [None]:
a=np.array([0,0,0])
a

In [None]:
np.array([2, 3, 6, 7.])

In [None]:
np.array([2, 3, 6, 7+1j])

## 等差数列的数组

In [None]:
np.arange(5)

In [None]:
np.arange(10, 100, 20, dtype=float)

In [None]:
np.linspace(0., 2.5, 5)

In [None]:
x = np.linspace(0, 2*np.pi, 10)
print(x)

In [None]:
print(x.shape)

In [None]:
print(x.ndim)

In [None]:
f = np.sin(x)
f

## 二维数组

In [None]:
a = np.array([[1, 2, 3], [4, 5, 6]])
a

In [None]:
a.shape

In [None]:
a.ndim

In [None]:
a.size

## 改变数组的形状

In [None]:
a = np.arange(20)      # 1-D array holding the integers 0..19
a

In [None]:
b = a.reshape((4, 5))
b

In [None]:
c = a.reshape((20, 1))
c

In [None]:
d = a.reshape((-1, 4))
d

In [None]:
print(a)

In [None]:
a.shape = (4, 5)
print(a)

## 形状(N, ), (N, 1)和(1, N)不同

+ 形状(N, )：数组是一维的
+ 形状(N, 1)：数组是二维的，N行一列
+ 形状(1, N)：数组是二维的，一行N列

In [None]:
a = np.array([1, 2, 3, 4, 5])    # 一维数组
b = a.copy()

In [None]:
# Transposing a 1-D array has no effect, so both lines compute the same
# inner product of a and b.
c1 = np.dot(a.T, b)
print(c1)
c2 = np.dot(a, b.T)    # b.T is shorthand for np.transpose(b)
print(c2)

In [None]:
ax = a.reshape((5, 1))    # 2-D column: 5 rows, 1 column
bx = b.reshape((1, 5))    # 2-D row: 1 row, 5 columns
print(ax)
print(bx)

c = np.dot(ax, bx)        # (5, 1) times (1, 5) -> (5, 5) matrix
print(c)

## 填充数组

In [None]:
np.zeros(3)

In [None]:
np.zeros((2, 2), complex)

In [None]:
np.ones((2, 3))

In [None]:
np.full((2, 2), 5)

In [None]:
# rand: 0到1之间[0, 1)均匀分布的随机数
np.random.rand(2, 4)

In [None]:
# randn：服从均值为0，方差为1的标准正态（高斯）分布的随机数
np.random.randn(2, 4)

## 索引与切片

In [None]:
a = np.array([0, 1, 2, 3, 4])
a[1:3]

In [None]:
a[:3]

In [None]:
a[1:]

In [None]:
a[1:-1]

In [None]:
a[:]

In [None]:
a[::2]

In [None]:
a[1:4:2]

In [None]:
a[::-1]

In [None]:
# Integers 0..11 arranged as 3 rows by 4 columns.  reshape() is used rather
# than assigning to a.shape, which NumPy discourages, and the statements are
# no longer chained on one line with semicolons.
a = np.arange(12).reshape(3, 4)
a

In [None]:
a[1, 2]

In [None]:
a[1, -1]

In [None]:
a[:, 1]

In [None]:
a[2, :]

In [None]:
a[1][2]

In [None]:
a[2]

In [None]:
a[0, 1:3]

In [None]:
a[1:, 2:]

In [None]:
a[::2, 1::2]

二维数组a如下所示：

![image.png](attachment:image.png)

In [None]:
# 6x6 example grid: the entry in row i, column j equals 10*i + j.
a = np.array([
    [ 0,  1,  2,  3,  4,  5],
    [10, 11, 12, 13, 14, 15],
    [20, 21, 22, 23, 24, 25],
    [30, 31, 32, 33, 34, 35],
    [40, 41, 42, 43, 44, 45],
    [50, 51, 52, 53, 54, 55],
])

In [None]:
a[0, 3:5]

In [None]:
a[4:, 4:]

In [None]:
a[:, 2]

In [None]:
a[2::2, ::2]

## 拷贝与视图

In [None]:
a = np.arange(5)    # integers 0..4
a

In [None]:
b = a[2:].copy()          # .copy()
b

In [None]:
# b was built with .copy(), so this write is expected to leave a unchanged;
# print both arrays to compare.
b[0] = 100
print(b)
print(a)

## 数组运算

In [None]:
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

In [None]:
print(x + y)              # 加法运算
print(np.add(x, y))

In [None]:
print(x - y)              # 减法运算
print(np.subtract(x, y))

In [None]:
print(x * y)              # 乘法运算
print(np.multiply(x, y))

In [None]:
print(x / y)              # 除法运算
print(np.divide(x, y))

In [None]:
print(np.sqrt(x))         # 平方根运算

### 广播机制（broadcasting）
https://www.runoob.com/numpy/numpy-broadcast.html

In [None]:
# Broadcasting: the length-3 vector b is added to each of the four rows of a.
a = np.array([
    [ 0,  0,  0],
    [10, 10, 10],
    [20, 20, 20],
    [30, 30, 30],
])
b = np.array([1, 2, 3])
print(a + b)

### 矩阵乘法

In [None]:
A = np.array([[1, 2],
              [3, 4]])
print(A.dot(A))    # matrix product
print(A * A)       # element-wise product

In [None]:
x = np.array([10, 20])
np.dot(A, x)            #等价于A.dot(x)

In [None]:
np.dot(x, A)            #等价于x.dot(A)

## 更高效的数学函数
https://numpy.org/doc/stable/reference/routines.math.html

In [None]:
x = np.array([[1,2],[3,4]])
x

In [None]:
# Sum of all elements (axis=None), then of each column (axis=0),
# then of each row (axis=1).
for axis in (None, 0, 1):
    print(np.sum(x, axis=axis))

# 二、Matplotlib
* Matplotlib是Python中最常用的可视化工具之一，可以非常方便地创建海量类型的2D图表和一些基本的3D图表
* 因为在函数的设计上参考了MATLAB，所以叫做Matplotlib
* Pyplot tutorial：https://matplotlib.org/stable/tutorials/introductory/pyplot.html

In [None]:
import matplotlib.pyplot as plt

plt.plot([1,2,3,4], [1,4,9,16], 'r--')
plt.axis([0, 6, 0, 20])
plt.show()

In [None]:
%matplotlib inline

## 一张图中多条曲线

In [None]:
t = np.arange(0., 5., 0.2)
plt.plot(t, t, 'r--', t, t**2, 'bs', t, t**3, 'g^')

In [None]:
# Compute the x and y coordinates for points on sine and cosine curves
x = np.arange(0, 3*np.pi, 0.1)
y_sin = np.sin(x)
y_cos = np.cos(x)

# Plot the points using matplotlib
plt.plot(x, y_sin)
plt.plot(x, y_cos)

plt.xlabel('x axis label')
plt.ylabel('y axis label')
plt.title('Sine and Cosine')
plt.legend(['Sine', 'Cosine'])

## 多张子图

In [None]:
def f(t):
    """Return the damped cosine exp(-t) * cos(2*pi*t), computed with NumPy."""
    decay = np.exp(-t)
    oscillation = np.cos(2*np.pi*t)
    return decay * oscillation

t1 = np.arange(0.0, 5.0, 0.1)     # coarse grid, plotted with the 'bo' format
t2 = np.arange(0.0, 5.0, 0.02)    # fine grid, plotted with the 'k' format

plt.figure()

# Upper panel of a 2-row, 1-column layout: f on both grids.
plt.subplot(2, 1, 1)
plt.plot(t1, f(t1), 'bo', t2, f(t2), 'k')

# Lower panel: plain cosine on the fine grid, drawn with the 'r--' format.
plt.subplot(2, 1, 2)
plt.plot(t2, np.cos(2*np.pi*t2), 'r--')

In [None]:
# Working with plt.figure
x = np.linspace(-1, 1, 50)
y1 = 2 * x + 1
y2 = x**2

# First figure, explicitly numbered 1: the straight line y1.
plt.figure(1)
plt.plot(x, y1)

# Second figure, created without an explicit number: the parabola y2.
plt.figure()
plt.plot(x, y2)

# Third figure: explicit figure number (num=5) and figure size; y2 is drawn
# with an explicit colour, line width and line style.
plt.figure(num=5, figsize=(4, 4))
plt.plot(x, y1)
plt.plot(x, y2, color='red', linewidth=1.0, linestyle='--')

## 分类变量的图

In [None]:
names = ['group_a', 'group_b', 'group_c']
values = [1, 10, 100]

# One wide figure with three side-by-side panels showing the same data.
plt.figure(1, figsize=(9, 3))

plt.subplot(1, 3, 1)    # left panel: bar chart
plt.bar(names, values)

plt.subplot(1, 3, 2)    # middle panel: scatter plot
plt.scatter(names, values)

plt.subplot(1, 3, 3)    # right panel: line plot
plt.plot(names, values)

plt.suptitle('Categorical Plotting')

## 添加文本

In [None]:
mu, sigma = 100, 15
# 10000 standard-normal samples, scaled by sigma and shifted by mu.
x = mu + sigma * np.random.randn(10000)

# Histogram of the data with 50 bins.  `density` is a boolean flag, so pass
# True rather than the integer 1; it normalises the bars to a density.
n, bins, patches = plt.hist(x, 50, density=True, facecolor='g', alpha=0.75)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of IQ')
# Raw string so the backslashes reach the math-text renderer unchanged.
plt.text(60, .025, r'$\mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)

## 添加注释

In [None]:
ax = plt.subplot(111)

t = np.arange(0.0, 5.0, 0.01)
s = np.cos(2*np.pi*t)

line, = plt.plot(t, s, lw=2)

# Annotate the point (2, 1) with text placed at (3, 1.5) and an arrow.
plt.annotate(
    'local max',
    xy=(2, 1),
    xytext=(3, 1.5),
    arrowprops={'facecolor': 'black', 'shrink': 0.05},
)
plt.ylim(-2, 2)

## 图像显示

In [None]:
# Read the image file 'plot_types.jpg' and display it in a 20x20 figure.
# NOTE(review): the variable is named little_dog_img although the file read
# is 'plot_types.jpg' — confirm which image is intended.
plt.figure(figsize=(20, 20))
little_dog_img = plt.imread('plot_types.jpg')
plt.imshow(little_dog_img)

# 三、Pandas
* Pandas是Python的一个数据分析包。
* 由AQR Capital Management于2008年4月开发，并于2009年底开源出来
* 10 Minutes to pandas：https://pandas.pydata.org/docs/user_guide/10min.html

## 3.1 Series
* 一维**标记**数组，由一组数据以及一组与之相关的数据标签（即索引）组成。

### 传入列表创建Series

In [None]:
import pandas as pd

In [None]:
# 传入列表，使用缺省整数索引
obj = pd.Series([4, 7, -5, 3])
obj

In [None]:
obj.values

In [None]:
obj.index

In [None]:
# 传入列表，并给定索引
obj2 = pd.Series([4,7,-5,3], index=['d','b','a','c'])
obj2

In [None]:
obj2.index

In [None]:
# 修改索引
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

### 传入字典创建Series

In [None]:
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)    # the dict keys become the index labels
obj3

In [None]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

### 检测缺失数据

In [None]:
pd.isnull(obj4)

In [None]:
pd.notnull(obj4)

### 访问Series中的元素

In [None]:
# 通过索引访问Series中的元素
print(obj2['a'])
obj2['d'] = 6
obj2[['c','a','d']]

In [None]:
print('b' in obj2)
print('e' in obj2)
print(3 in obj2.values)

### 对Series的操作

In [None]:
# 用条件过滤数组
obj2[obj2 > 0]

In [None]:
# 标量乘法
obj2*2

In [None]:
# 数学函数
np.exp(obj2)

In [None]:
print(obj3)
print(obj4)

obj3 + obj4      # 自动对齐索引

In [None]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

## 3.2 DataFrame
* 二维表格型数据结构，含有一组有序的列，每列都有标签，可看成一个Series的字典，既有行索引又有列索引。

### 创建DataFrame，传入由等长列表或数组构成的字典

In [None]:
# Column-oriented input: each key names a column, each list holds its values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = pd.DataFrame(data)
frame

In [None]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

In [None]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five'])
frame2

### 创建DataFrame, 传入嵌套字典

In [None]:
# Nested dict: outer keys become the columns, inner keys the row index.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)
frame3

In [None]:
frame4 = pd.DataFrame(pop, index=[2001, 2002, 2003])
frame4

### 缺失数据处理

In [None]:
frame3.dropna(how='any')    # 删除有任何缺失值的行

In [None]:
frame4.dropna(how='all',inplace=True)

In [None]:
frame4

In [None]:
frame4.fillna(value=5)     # 填充缺失值

In [None]:
frame3.isnull()    # 判断哪些是缺失值

### 访问单列

In [None]:
frame2['state']          # 字典记法

In [None]:
frame2.state          # 属性记法

### 访问单行

In [None]:
frame2

In [None]:
frame2.loc['three']

In [None]:
frame2.iloc[2]

### 修改列

In [None]:
frame2['debt'] = 16.5
frame2

In [None]:
frame2['debt'] = np.arange(5)
frame2

In [None]:
val = pd.Series([-1.2, -1.5, -1.7], index=[ 'two', 'four', 'five'])
frame2['debt'] = val
frame2

### 增加列

In [None]:
frame2['eastern'] = (frame2.state == 'Ohio')
frame2

### 删除行和列

In [None]:
del frame2['eastern']
frame2

In [None]:
frame2.drop(['pop','debt'], axis=1)

In [None]:
frame2

In [None]:
frame2.drop(columns=['pop','debt'])

In [None]:
frame2.drop(['one', 'three', 'five'], axis=0)

In [None]:
frame2

In [None]:
frame2.drop(['pop','debt'], axis=1, inplace=True)
frame2