In [4]:
import pandas as pd
import matplotlib.pyplot as plt
# from IPython.display import Image, Video

# 介绍
Numpy 提供多维数组对象以及一系列操作数组的函数, 可以说其几乎是每一个Python科学计算软件的基础库.

In [5]:
import numpy as np

Numpy的核心数据结构是ndarray, 它用来存储具有相同数据类型的多维数组. 除了数据, ndarry也包含数组的shape, size, ndim, nbytes, dtype.

In [7]:
np.ndarray?

In [8]:
d0 = np.array([[1,2],[3,4]])

In [9]:
d0

array([[1, 2],
       [3, 4]])

In [10]:
type(d0), d0.shape, d0.size, d0.ndim, d0.dtype, d0.nbytes

(numpy.ndarray, (2, 2), 4, 2, dtype('int32'), 16)

为什么需要numpy? 速度! 简单! 粗略比较一下速度.

In [11]:
a0 = np.arange(10000)
t0 = %timeit -o [i**2 for i in a0]

1000 loops, best of 3: 1.93 ms per loop


In [12]:
a1 = np.arange(10000)
t1 = %timeit -o a1**2

The slowest run took 5.50 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 8.22 µs per loop


In [13]:
t0.best/t1.best

234.49457122985947

# 数据类型
Numpy Dtypes

详细参考: numpy datatypes.

In [14]:
x = np.array([1, 2]) # Let numpy choose the datatype
y = np.array([1.0, 2.0])  # Let numpy choose the datatype
z = np.array([1, 2], dtype=np.int64)

print(x.dtype, y.dtype, z.dtype)

int32 float64 int64


In [15]:
z.nbytes*8 # nbits

128

In [16]:
x1 = x + 0.3
print(x1, x1.dtype)

[1.3 2.3] float64


In [17]:
z[0] = 3.5
z

array([3, 2], dtype=int64)

如何使用类型: 一般指定int, float, complex 即可, 不需要细分int16, int32等

In [18]:
np.sqrt([-1, 2, 3])

  if __name__ == '__main__':


array([       nan, 1.41421356, 1.73205081])

In [19]:
np.sqrt([-1, 2, 3], dtype=np.complex)

array([0.        +1.j, 1.41421356+0.j, 1.73205081+0.j])

# 存储顺序
多维数组在内存中是连续储存的, 本质上可以看成是一维, 如何将内存中数据映射到多维数组中取决于数组是按行存储的还是按列存储的. 例如有四个整数1,2,3,4, 那么:

按行存储就是: [[1, 2], [3, 4]]
按列存储就是: [[1, 3], [2, 4]]
Fotran是按列存储的, C是按行存储的.

In [20]:
a = np.arange(6, dtype=np.int8)

In [21]:
a

array([0, 1, 2, 3, 4, 5], dtype=int8)

In [22]:
a1 = a.reshape(2,3, order="F")
a1

array([[0, 2, 4],
       [1, 3, 5]], dtype=int8)

In [23]:
a2 = a.reshape(2, 3)
a2

array([[0, 1, 2],
       [3, 4, 5]], dtype=int8)

什么时候需要考虑存储顺序?

跟其他语言交互的时候, 比如调用Fortran(Numpy, Scipy中很多数值 就是调用Fortran的, Anconda现在默认使用intel mkl也是Fortran的), 但是平常使用不需要关心顺序.

Numpy中使用ndarray.strides确定映射的顺序.

In [24]:
a1.strides

(1, 2)

In [25]:
a2.strides

(3, 1)

strides确定对应维度移动一个元素应内存中移动的字节数, 如对应a1, 有(1x1, 2x1), 对应a2, 有(3x1, 1x1).

某些操作, 如transpose, reshape, 只需要改变strides即可.

In [26]:
a = np.random.rand(10, 3)

In [27]:
a.strides

(24, 8)

In [28]:
b = a.transpose()

In [29]:
b.strides

(8, 24)

In [30]:
np.shares_memory(a, b)

True

In [31]:
c = a.reshape(3, 10)

In [32]:
np.shares_memory(a, c)

True

## 帮助

In [33]:
np.array?

In [34]:
np.con*?

In [35]:
>>> np.array([[1, 2], [3, 4]])

array([[1, 2],
       [3, 4]])

# 数组创建
# 从列表创建

In [36]:
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([1, 2, 3])

In [37]:
a

array([[1, 2, 3],
       [4, 5, 6]])

In [38]:
b

array([1, 2, 3])

In [41]:
print("a:",a.shape, a.size, type(a), np.ndim(a), a.size)
print("b:",b.shape, b.size, type(b), np.ndim(b), b.size)

a: (2, 3) 6 <class 'numpy.ndarray'> 2 6
b: (3,) 3 <class 'numpy.ndarray'> 1 3


In [43]:
len(a), len(b) # 返回第一个维度的长度

(2, 3)

In [44]:
np.array([i for i in range(10) if i % 2 == 0])

array([0, 2, 4, 6, 8])

In [45]:
a, a.shape[0], len(a)

(array([[1, 2, 3],
        [4, 5, 6]]), 2, 2)

## 使用Numpy函数创建

In [46]:
np.zeros(2, 3) # Create an array of all zeros

TypeError: data type not understood

In [47]:
np.zeros((2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [48]:
np.ones((5, 5))

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [49]:
np.full((2, 3), 7)

array([[7, 7, 7],
       [7, 7, 7]])

In [50]:
np.eye(2)

array([[1., 0.],
       [0., 1.]])

In [51]:
np.random.random((2, 3))

array([[0.45876478, 0.79251043, 0.0846241 ],
       [0.54418218, 0.0191857 , 0.38711422]])

In [52]:
np.arange(9).reshape(3,-1)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [53]:
np.linspace(0, 1.0, 10)

array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

In [56]:
np.tril(np.arange(9).reshape(3, -1)) # 返回下三角矩阵
# np.triu 返回上三角矩阵

array([[0, 0, 0],
       [3, 4, 0],
       [6, 7, 8]])

In [57]:
np.random.rand(3, 3)

array([[0.71473042, 0.95872367, 0.28969472],
       [0.32984731, 0.31908785, 0.49106201],
       [0.53514671, 0.30591923, 0.95675115]])

# 从文件读取

In [59]:
!head test.dat
!wc test.dat

'head' 不是内部或外部命令，也不是可运行的程序
或批处理文件。
'wc' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [61]:
# a = np.genfromtxt("test.dat", delimiter=",", comments="#")
a[:10], a.shape

In [65]:
# 保存数组到可读文件
np.savetxt("test.dat", np.random.random((1000, 5)), delimiter=",", header="show how to save array dat.\n a simple example")

# 保存二进制文件
np.savetxt("test.npy", np.random.random((1000, 5)))

In [66]:
# 读取大文件
def generate_txt_filel(length=1e6, ncols=20):
    data = np.random.random((int(length), int(ncols)))
    np.savetxt("large_text_file.csv", data, delimiter=',')
    
def iter_loadtxt(filename, delimiter=',', skiprows=0, dtype=float):
    def iter_func():
        with open(filename, 'r') as infile:
            for _ in range(skiprows):
                next(infile)
            for line in infile:
                line = line.rstrip().split(delimiter)
                for item in line:
                    yield dtype(item)
        iter_loadtxt.rowlength = len(line)
    
    data = np.fromiter(iter_func(), dtype=dtype)
    data = data.reshape((-1, iter_loadtxt.rowlength))
    return data

In [None]:
# generate_text_file() # 477M

In [None]:
!ls -lh large_text_file.csv

In [None]:
%time data = np.genfromtxt('large_text_file.csv', delimiter=",")
# CPU times: user 24.2 s, sys: 8.5 s, total: 32.7 s
# Wall time: 32.8 s

In [None]:
# %time data = iter_loadtxt('large_text_file.csv')

In [None]:
%time data = pd.read_csv('large_text_file.csv')
CPU times: user 5.48 s, sys: 1.27 s, total: 6.75 s
Wall time: 6.13 s