# python data analysis
## appendix
### 变量和按引用传递

In [1]:
a = [1, 2, 3]  
# 变量赋值的过程可以理解为，1.创建一个对象，2.创建一个变量（名），3.将这个变量和对象绑定，变量指向该对象
# 变量使用，以参数形式传递给函数，传入一个引用，不是将对象复制到函数中去

In [2]:
b = a
a.append(4)
print(b)
# a, b是变量，是指向同一个对象的两个引用

[1, 2, 3, 4]


### 类型

In [3]:
a = [1, 2, 3]
# a，一个变量，指向一个对象（list）
# [1, 2, 3]，一个（list）对象，包含该对象的类型信息等

In [4]:
a = 6
type(a)
a = 'abc'
type(a)
# 变量a不包含对象的类型信息，其指向的对象包含

int

str

In [5]:
c = 3.1415926
isinstance(c, (int, float)) # 判断变量（指向的对象）是否属于某个（些）类型（之一）

True

### 属性和方法

In [6]:
# attribute：存储在对象内部的其他python对象
# method：与对象有关的能够访问其内部数据的函数

In [7]:
a = 'manunited'

In [None]:
a.<Tab> # 返回a的所有方法

In [8]:
getattr(a, 'split') # returns the attribute named 'split' from the object (here a string method); use hasattr(a, 'split') to test whether it exists

<function str.split>

### 引入模块、函数

In [None]:
import moduleName
a = moduleName.funName(...)

import moduleName as defNM
a = defNM.funName(...)

from moduleName import funName
a = funName(...)

from moduleName import funName as defFN
a = defFN(...)

### 二元运算、比较运算

In [9]:
a = [1, 2, 3]
b = a
c = list(a)

In [10]:
b is a # is 判断两个引用是否指向同一对象，is，is not常用来判断变量是否为None
c is a # list函数会创建新的列表

True

False

In [11]:
d = 7
e = 11
e // d # 取整除法
e ** d # 幂运算
e & d # and
e | d # or
e ^ d # xor 

1

19487171

3

15

12

In [13]:
f = 1
g = 1
f & g
f | g
f ^ g

1

1

0

### 惰性

In [12]:
a = b = c = 5
d = a + b * c # python 急性子的语言，计算结果和表达式都是立即求值的，此处，先计算b * c的结果25，再加上a
# 利用iterator和generator等可以实现惰性/延迟运算，不会立即计算中间结果

### 可变、不可变对象

In [15]:
a_list = ['foo', 2, [4, 5]] # list可变
a_list[2] = (3, 4)
a_list

['foo', 2, (3, 4)]

In [16]:
# 不可变的immutable，是指不能修改内存块的数据。即便修改了，实际是创建了一个新对象，并将其引用赋值给原变量
a_tuple = (3, 5, (3, 4)) # tuple 是不可变对象
a_tuple[1] = 'four'

TypeError: 'tuple' object does not support item assignment

### 标量类型

|类型|说明|
|--|--|
|None|null值|
|str|字符串|
|float|浮点型|
|bool|布尔型|
|int|整型（带符号整数）|
|long|长整型（带符号整数，任意精度；仅存在于 Python 2，Python 3 中已并入 int）|


### 数值类型

In [17]:
ival = 123456789
ival ** 3

fval = 1.23456
fval1 = 1.23e-7

3 / 2
3 // 2

cval = 1 + 2j # j表示虚数
cval * (1 - 2j)

1881676371789154860897069

1.5

1

(5+0j)

### 字符串

In [18]:
a = 'one way of writing a string'
b = "another way"

c = '''
this is a long string that
sapans multiple lines
'''
d = """
to write a multiple string 
in another way 
"""

In [20]:
e = 'string is immutable' # string不可变对象
e[7] = 7 # error

TypeError: 'str' object does not support item assignment

In [21]:
f = e.replace('string', 'longer string') # replace方法是创建了新的对象
f

'longer string is immutable'

In [22]:
g = 3.7
h = str(g)

s = 'python'
list(s)
s[:3]

s = '12\\34'  # backslash \, escape character
print(s)


s = r'this\has\no\special\characters' # r''
s

a = 'this is the first half'
b = 'and this is the second half'
a + b

['p', 'y', 't', 'h', 'o', 'n']

'pyt'

12\34


'this\\has\\no\\special\\characters'

'this is the first halfand this is the second half'

In [23]:
template = '%.2f %s are worth $%d' # 字符串格式化输出
template % (4.567, 'Argentine Pesos', 1)

'4.57 Argentine Pesos are worth $1'

### Booleans 布尔值

In [24]:
True and True
False and True

True

False

In [25]:
a = [1, 2, 3]
if a:
    print('I found something!')

b = []
if not b:
    print('Empty!')

I found something!
Empty!


In [26]:
bool([]), bool([1, 2, 3])
bool('Hello World!'), bool('')
bool(0), bool(1)

(False, True)

(True, False)

(False, True)

### Type casting 类型转换

In [27]:
s = '3.14159'
fval = float(s)
type(fval)
int(fval)
bool(fval)

float

3

True

### None

In [28]:
# None is the single instance of NoneType; in Python 3 it is also a reserved keyword, so it cannot be reassigned
a = None
a is None

b = 1
b is not None

True

True

In [29]:
def add_and_maybe_multiple(a, b, c=None):
    """Return a + b, multiplied by c when c is supplied.

    None is used as the default so that "no multiplier given" can be told
    apart from a real multiplier such as 0.
    """
    total = a + b
    return total if c is None else total * c

In [30]:
add_and_maybe_multiple(1, 2, 3)

9

### Dates and Times

In [39]:
from datetime import datetime, date, time
dt = datetime(2018, 6, 22, 9, 45, 59)
dt.day
dt.minute
dt.date()
dt.time()

22

45

datetime.date(2018, 6, 22)

datetime.time(9, 45, 59)

In [40]:
dt.strftime('%m%d%Y %H:%M')
dt.replace(minute = 0, second = 0)

'06222018 09:45'

datetime.datetime(2018, 6, 22, 9, 0)

In [41]:
datetime.strptime('20180202', '%Y%m%d')

datetime.datetime(2018, 2, 2, 0, 0)

In [43]:
dt2 = datetime(2018, 5, 26)
delta = dt2 - dt
delta
type(delta)

datetime.timedelta(-28, 51241)

datetime.timedelta

In [44]:
dt + delta

datetime.datetime(2018, 5, 26, 0, 0)

## 控制流
### 条件判断
`if elif else`

In [46]:
def equal0(x):
    """Print whether x is below zero, exactly zero, or above zero."""
    # Pick the message first, then print once at the end.
    if x < 0:
        message = "It's negative"
    elif x == 0:
        message = 'equal to 0'
    else:
        message = 'positive'
    print(message)

In [47]:
equal0(7)

positive


In [48]:
a = 5
b = 7
c = 8
d = 4
if a < b or c > d:  # short-circuit evaluation: a < b is already True, so c > d is not evaluated
    print('made it')

made it


### 循环
`for`

In [49]:
seq = [1, 2, None, 4, None, 5]
total = 0
for value in seq:
	if value is None:
		continue
	total += value
total

12

In [50]:
seq = [1, 2, 0, 4, 6, 5, 2, 1]
total_til_5 = 0
for i in seq:
	if i == 5:
		break
	total_til_5 += i
total_til_5

13

`while`

In [51]:
x = 256
total = 0
while x > 0:
	if total > 500:
		break
	total += x
	x = x // 2
total

504

In [62]:
range(10) # returns a lazy range object that produces the integers on demand (an iterable, not a materialised list)
list(range(10))

range(0, 10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [63]:
range(0, 20, 2)
list(range(0, 20, 2))

range(0, 20, 2)

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [65]:
seq = [1, 2, 3, 4, 5]
for i in range(len(seq)):
    val = seq[i]
    print(val)

1
2
3
4
5


In [69]:
# Sum of every integer below 10000 that is divisible by 3 or by 5.
# The result gets its own name instead of `sum`: rebinding `sum` would hide
# the built-in sum() for every cell that runs afterwards in this kernel.
multiples_total = sum(i for i in range(10000) if i % 3 == 0 or i % 5 == 0)
multiples_total

23331668

### 空语句
`pass`

In [52]:
def equal0(x):
    """Print 'negative' or 'positive' for non-zero x; do nothing for zero."""
    if x < 0:
        print('negative')
        return
    if x == 0:
        return  # deliberate no-op for zero (the original used `pass` here)
    print('positive')
equal0(7)
equal0(0)
equal0(-7)

positive
negative


### 异常处理

In [53]:
float('3.1415')

3.1415

In [54]:
float('something') # ValueError

ValueError: could not convert string to float: 'something'

In [55]:
# 处理 ValueError
def attempt_float(x):
    """Return float(x), or x itself when float() raises ValueError.

    Only ValueError is handled; any other exception from float()
    (for example TypeError for a tuple) propagates to the caller.
    """
    try:
        converted = float(x)
    except ValueError:
        # The except branch runs only when the try body raised ValueError.
        return x
    return converted

In [56]:
attempt_float('3.1415')
attempt_float('something')

3.1415

'something'

In [57]:
float((1, 2)) #TypeError

TypeError: float() argument must be a string or a number, not 'tuple'

In [58]:
attempt_float((1, 2)) #TypeError

TypeError: float() argument must be a string or a number, not 'tuple'

In [59]:
# 处理 ValueError TypeError
def attempt_float1(x):
    """Return float(x), or x itself when float() raises ValueError or TypeError."""
    recoverable = (ValueError, TypeError)  # a tuple catches several exception types at once
    try:
        result = float(x)
    except recoverable:
        result = x
    return result

attempt_float1((1, 2))

(1, 2)

In [None]:
# Illustrative cell: `path` and `write_to_file` are not defined in the visible notebook.
f = open(path, 'w')
try:
	write_to_file(f)
finally: # the finally block runs whether or not the try block succeeded
	f.close()

In [None]:
# Illustrative cell: `path` and `write_to_file` are not defined in the visible notebook.
f = open(path, 'w')
try:
    write_to_file(f)
except Exception:
    # Catch ordinary errors only; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    print('Failed')
else:
    # The else block runs only when the try block raised nothing.
    print('Succeeded')
finally:
    # Always close the file, whichever branch ran above.
    f.close()

### 三元表达式

In [72]:
x = 5
# Conditional expression: collapses an if/else block into a single line.
# Zero is non-negative, so the test is `>=`; with `>` an x of 0 would be
# labelled 'Negative'.
sign_label = 'Non_negative' if x >= 0 else 'Negative'
sign_label

'Non_negative'

## 数据结构
### 元组 tuple

In [73]:
tup = (2, 3, 7, 11, 18)
nested_tup = ((2, 3, 7), (11, 18))
tuple([2, 3, 7])
a_tup = tuple('string')
a_tup[2]

(2, 3, 7)

'r'

In [2]:
tup = ('foo', [1, 2], True)

In [4]:
tup[2] = False # TypeError: 'tuple' object does not support item assignment


TypeError: 'tuple' object does not support item assignment

In [8]:
tup[1].append(3) # Q 怎么解释，A tup[1]指向了一个list，list作为tuple的一个元素不能变，list指向的元素可以变
tup

('foo', [1, 2, 3, 3], True)

In [11]:
(3, None, 'foo') + (6, 0) + ('bar',)  # ('bar') is a string, ('bar',) is a tuple

(3, None, 'foo', 6, 0, 'bar')

In [12]:
('foo', 'bar') * 3

('foo', 'bar', 'foo', 'bar', 'foo', 'bar')

#### unpack

In [13]:
tup = (3, 11, 18)
gary, ryan, paul = tup
ryan

11

In [14]:
tup = (3, 7, (11, 18))
gary, david, (ryan, paul) = tup
ryan

11

In [16]:
a, b = (1, 2)
a
b
b, a = a, b # 交换变量名c = a, a = b, b =c
a
b

1

2

2

1

In [17]:
seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
for a, b, c in seq: # unpacking is commonly used when iterating over a sequence of tuples or lists
    # Named row_total rather than `sum`, which would shadow the built-in sum()
    # for every cell that runs afterwards in this kernel.
    row_total = a + b + c
    print(row_total)

6
15
24


#### tuple method

In [18]:
# 由于tuple的大小和内存不能修改，其方法很少
a = (1, 2, 3, 2, 2, 5, 5, 7)
a.count(2)

3

### list

In [20]:
a_list = [2, 3, 7, 'go']
tup = (1, 3, 'hello')
b_list = list(tup)
a_list
b_list

[2, 3, 7, 'go']

[1, 3, 'hello']

In [25]:
b_list[1] = 'oligen'
b_list

[1, 'oligen', 'hello']

#### list method

In [47]:
b_list.append('world')
b_list

b_list.insert(1, 'ryan') #insert的计算量比append大
b_list

b_list.pop(2) # insert的逆运算
b_list

[1, 'ryan', 'world', 'world', 'hello', 'world']

[1, 'ryan', 'ryan', 'world', 'world', 'hello', 'world']

'ryan'

[1, 'ryan', 'world', 'world', 'hello', 'world']

['x', 'y', 'world', 'world', 'hello', 'world']

In [32]:
b_list.append('hello')
b_list.remove('hello') # 删除第一个hello
b_list

[1, 'ryan', 'world', 'world', 'hello']

In [33]:
'hello' in b_list # membership test; Python scans a list linearly, whereas dict and set membership uses a hash table and is much faster

True

#### list 合并 排序

In [34]:
[2, 'gary'] + [7, 'david']
a = [2, 'gary']
a.extend([7, 'david']) #extend将元素附加到现有列表，比两个列表相加合并（创建新列表合并原有两个列表）的效率高

[2, 'gary', 7, 'david']

In [35]:
a = [3, 7, 18, 11, 20]
a.sort()
a

[3, 7, 11, 18, 20]

In [36]:
b = ['gary', 'david', 'ryan', 'paul', 'ole']
b.sort(key = len) #按字符串长度排序
b

['ole', 'gary', 'ryan', 'paul', 'david']

#### 二分搜索

In [38]:
import bisect

In [43]:
c = [1, 2, 2, 3, 5, 5, 5, 7]
bisect.bisect(c, 2) #插入到的位置
bisect.bisect(c, 5)

3

7

In [44]:
bisect.insort(c, 6) # 插入相应的位置
c

[1, 2, 2, 3, 5, 5, 5, 6, 7]

#### 索引

In [56]:
seq = [3, 2, 1, 27, 2, 6, 8, 85]
seq[3:5] # start:stop， start包含在内，stop不包含，元素个数为stop - start
seq[3:4]
seq[1:3] = ['a', 'b', 'c']
# seq[1:4] = ['a', 'b', 'c'] which one is correct
seq[:5]
seq[2:]
seq[-5:]
seq[-3:-1]

[27, 2]

[27]

[3, 'a', 'b', 'c', 27]

['b', 'c', 27, 2, 6, 8, 85]

[27, 2, 6, 8, 85]

[6, 8]

In [48]:
seq[::2] # start stop step
seq[::-1]

[3, 'b', 27, 6, 85]

[85, 8, 6, 2, 27, 'c', 'b', 'a', 3]

### 内置的序列函数

In [49]:
some_list = ['foo', 'bar', 'zip']
mapping = dict((v, i) for i, v in enumerate(some_list))
# enumerate 逐个返回序列的(index, value)元组
mapping

{'bar': 1, 'foo': 0, 'zip': 2}

In [50]:
enumerate(some_list)

<enumerate at 0x6be85a0>

In [59]:
type(enumerate(some_list))

enumerate

In [61]:
list(enumerate(some_list))

[(0, 'foo'), (1, 'bar'), (2, 'zip')]

In [52]:
# sorted 排序
sorted([7, 1, 3, 9, 3, 6, 8])
sorted('horse race')
sorted(set('this is just some string'))

[1, 3, 3, 6, 7, 8, 9]

[' ', 'a', 'c', 'e', 'e', 'h', 'o', 'r', 'r', 's']

[' ', 'e', 'g', 'h', 'i', 'j', 'm', 'n', 'o', 'r', 's', 't', 'u']

In [62]:
seq1 = ['foo', 'bar', 'baz']
seq2 = ['one', 'two', 'three']
a = zip(seq1, seq2)
type(a)
list(a)

zip

[('foo', 'one'), ('bar', 'two'), ('baz', 'three')]

In [181]:
seq3 = [True, False]
list(zip(seq1, seq2, seq3))

[('foo', 'one', True), ('bar', 'two', False)]

In [58]:
for i, (a, b) in enumerate(zip(seq1, seq2)):
    print('%d: %s, %s' % (i, a, b))

0: foo, one
1: bar, two
2: baz, three


In [63]:
pitchers = [('ryan', 'giggs'), ('paul', 'scholes'), ('gary', 'nevil')]
firstName, lastName = zip(*pitchers)
firstName
lastName
# 将元组中的数unzip
# *的用法相当于zip(seq[0], seq[1], ..., seq[len(seq) - 1])

('ryan', 'paul', 'gary')

('giggs', 'scholes', 'nevil')

In [66]:
list(range(10))
list(reversed(range(10))) # reversed 按逆序迭代序列中的元素

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

### dict

In [67]:
# 哈希映射hash map/相联数组associative array，是一种可变大小的键值对集
emptyDict = {}
d1 = {'a' : 'something', 'b' : [1, 2, 3]}
d1
d1[7] = 'integer'
d1
d1['b']

{'a': 'something', 'b': [1, 2, 3]}

{'a': 'something', 'b': [1, 2, 3], 7: 'integer'}

[1, 2, 3]

In [69]:
'b' in d1
d1[5] = 'some value'
d1['dummy'] = 'another value'
d1
del d1[5]    #关键字del，删除k-v
d1
ret = d1.pop('dummy') #方法pop，删除k-v
ret
d1

True

{'a': 'something',
 'b': [1, 2, 3],
 7: 'integer',
 5: 'some value',
 'dummy': 'another value'}

{'a': 'something', 'b': [1, 2, 3], 7: 'integer', 'dummy': 'another value'}

'another value'

{'a': 'something', 'b': [1, 2, 3], 7: 'integer'}

In [70]:
d1.keys() # returns a dict_keys view of the keys (not a list); iteration follows insertion order on Python 3.7+
d1.values() # returns a dict_values view of the values

dict_keys(['a', 'b', 7])

dict_values(['something', [1, 2, 3], 'integer'])

In [72]:
d1.update({'b' : 'foo', 'c' : 12}) #方法update将两个dict合并
d1

{'a': 'something', 'b': 'foo', 7: 'integer', 'c': 12}

#### 元素两两配对，组成字典

In [73]:
# mapping = {}
# for key, value in zip(key_list, value_list):
#     mapping[key] = value
mapping = dict(zip(range(5), reversed(range(5))))
mapping

{0: 4, 1: 3, 2: 2, 3: 1, 4: 0}

In [None]:
# if key in some_dict:
#     value = some_dict[key]
# else:
#     value = default_value
value = some_dict.get(key, default_value) #dict的方法get/pop可以接受一个可供返回的默认值

In [74]:
words = ['apple', 'bat', 'bar', 'atom', 'book']
by_letter = {}
for word in words:
    letter = word[0]
    if letter not in by_letter: # key in a dict
        by_letter[letter] = [word]
    else:
        by_letter[letter].append(word)
by_letter

{'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

In [79]:
by_letter = {}
for word in words:
    letter = word[0]
    by_letter.setdefault(letter, []).append(word) # Q？
by_letter

{'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

In [80]:
# Q？
from collections import defaultdict
by_letter = defaultdict(list)
for word in words:
    by_letter[word[0]].append(word)
by_letter

defaultdict(list, {'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']})

### set集合

In [81]:
set([2, 2, 2, 3, 1, 3, 3])
{2, 2, 2, 1, 3, 3}

{1, 2, 3}

{1, 2, 3}

In [85]:
a = {1, 2, 3, 4, 5}
b = {3, 4, 5, 6, 7}
a | b
a.union(b)
a & b
a.intersection(b)
a - b
a.difference(b)
a ^ b #对称差，异或
a.symmetric_difference(b)

{1, 2, 3, 4, 5, 6, 7}

{1, 2, 3, 4, 5, 6, 7}

{3, 4, 5}

{3, 4, 5}

{1, 2}

{1, 2}

{1, 2, 6, 7}

{1, 2, 6, 7}

In [86]:
a.add(19)
a
a.remove(1)
a

{1, 2, 3, 4, 5, 19}

{2, 3, 4, 5, 19}

In [88]:
a_set = {1, 2, 3, 4, 5}
{1, 2, 3}.issubset(a_set)
a_set.issuperset({1, 2, 3})
{1, 2, 3} == {1, 2, 3}

a.isdisjoint(b) #a、b无公共元素，True

True

True

True

False

### 列表/字典/集合推导式

In [89]:
strings = ['a', 'as', 'bat', 'car', 'dove', 'python']

In [90]:
[x.upper() for x in strings if len(x) > 2]
# dict_comp = {key-expr : value-expr for value in collection if condition}
# set_comp = {expr for value in collection if condition}

['BAT', 'CAR', 'DOVE', 'PYTHON']

In [91]:
unique_lengths = {len(x) for x in strings}
unique_lengths

{1, 2, 3, 4, 6}

In [93]:
loc_mapping = {val : index for index, val in enumerate(strings)}
loc_mapping

{'a': 0, 'as': 1, 'bat': 2, 'car': 3, 'dove': 4, 'python': 5}

In [94]:
loc_mapping1 = dict((val, idx) for idx, val in enumerate(strings))
loc_mapping1

{'a': 0, 'as': 1, 'bat': 2, 'car': 3, 'dove': 4, 'python': 5}

#### 嵌套列表推导式

In [96]:
all_data = [['tom', 'billy', 'jefferson', 'andrew', 'wesley', 'steven', 'joe'], ['susie', 'casey', 'jill', 'ana', 'eva', 'jennifer', 'stephanie']]
names_of_interest = []
for names in all_data:
    enough_es = [name for name in names if name.count('e') >= 2]
    names_of_interest.extend(enough_es)
names_of_interest

['jefferson', 'wesley', 'steven', 'jennifer', 'stephanie']

In [97]:
result = [name for names in all_data for name in names if name.count('e') >= 2]
result

['jefferson', 'wesley', 'steven', 'jennifer', 'stephanie']

In [98]:
some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
flattened = [x for tup in some_tuples for x in tup]
flattened
# Equivalent to:
# flattened = []
# for tup in some_tuples:
#     for x in tup:
#         flattened.append(x)  # append, not extend: x is a single element, not an iterable

[1, 2, 3, 4, 5, 6, 7, 8, 9]

# Chap3 IPython交互式计算开发环境

## IPython 基础

In [99]:
a = 5
a

5

In [101]:
import numpy as np
data = {i : np.random.randn() for i in range(7)}
data

{0: 0.1973240137439397,
 1: -1.9162108830213502,
 2: 1.936102037701324,
 3: -0.6489930269989795,
 4: -0.056426089828586296,
 5: -0.5259936210192147,
 6: -2.079084229674212}

In [102]:
an_apple = 27
an_example = 35
# an<Tab>，按下<Tab>键，自动补全
b = [1, 2, 3]
# b.<Tab>，按下<Tab>键，查看对象所含的方法和属性
# 以下划线开头的方法和属性，包括magic method默认不显示

In [103]:
import datetime
# datetime.<Tab>，查看模块所含的函数等

# chap4 Numpy
## ndarray

In [104]:
# ndarray: an N-dimensional array object; a multidimensional container for homogeneous data, i.e. all elements must have the same type
import numpy as np
# np.random.seed(12345)
data = np.random.randn(2, 3)
data
data * 10
data + data

array([[-0.1504187 ,  0.31503415,  1.82145624],
       [-0.44506091, -1.87216732, -2.16603435]])

array([[ -1.50418705,   3.15034147,  18.21456239],
       [ -4.45060907, -18.72167319, -21.66034353]])

array([[-0.30083741,  0.63006829,  3.64291248],
       [-0.89012181, -3.74433464, -4.33206871]])

In [105]:
# shape dtype属性
data.shape
data.dtype

(2, 3)

dtype('float64')

### 创建ndarray

In [106]:
# numpy中的array函数
data1 = [1, 2, 3, 5, 7]
arr1 = np.array(data1)
arr1

array([1, 2, 3, 5, 7])

In [108]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.array(data2)
arr2
arr2.ndim
arr2.shape

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

2

(2, 4)

In [109]:
# np.array会自动为新建的数组推断一个较为合适的数据类型
arr1.dtype
arr2.dtype

dtype('int32')

dtype('int32')

In [112]:
# numpy中，创建特定类型的函数
np.zeros(10)
np.zeros((3, 6))
np.empty((3, 5, 2)) #以3×（5 × 2）显示；empty返回的不是0，而是垃圾值

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

array([[[  3.64913339e-316,   9.58487353e-322],
        [  0.00000000e+000,   0.00000000e+000],
        [  7.29541959e+175,   5.02034658e+175],
        [  1.03120190e-071,   3.05690378e-057],
        [  1.31662554e-071,   4.29260957e+020]],

       [[  1.47763641e+248,   1.16096346e-028],
        [  7.69165785e+218,   1.35617292e+248],
        [  1.10391252e-047,   5.48646888e-057],
        [  4.02330383e-057,   1.27683739e-066],
        [  4.27677405e-096,   6.32299154e+233]],

       [[  6.48224638e+170,   5.22411352e+257],
        [  5.74020278e+180,   8.37174974e-144],
        [  1.41529402e+161,   9.16651763e-072],
        [  5.38183344e+097,   3.51045579e-033],
        [  2.73954041e+126,   1.44017309e+246]]])

In [115]:
a_list = [1, 2, 3, 4, 5]
a_array = np.asarray(a_list)
a_array

array([1, 2, 3, 4, 5])

In [116]:
range(7)
np.arange(7) #类似于内置的range，但返回的是array，不是range

range(0, 7)

array([0, 1, 2, 3, 4, 5, 6])

In [117]:
type(range(3))

range

In [118]:
np.ones_like(arr2)
np.zeros_like(arr1)
np.eye(3)
np.identity(5)

array([[1, 1, 1, 1],
       [1, 1, 1, 1]])

array([0, 0, 0, 0, 0])

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])

### ndarray数据类型

In [119]:
### ndarray数据类型
arr1 = np.array([1, 2, 3], dtype = np.float64)
arr2 = np.array([1, 2, 3], dtype = np.int32)
arr1.dtype
arr2.dtype
# 当设计的程序涉及数据的存储读取速度时，再考虑数据类型的问题

dtype('float64')

dtype('int32')

In [120]:
arr = np.array([1, 2, 3, 4, 5])
arr.dtype
float_arr = arr.astype(np.float64) # astype转换类型，会创建一个新的数组
float_arr.dtype

dtype('int32')

dtype('float64')

In [125]:
# Byte-string array. np.bytes_ is the portable spelling of this dtype:
# np.string_ was an alias for it and was removed in NumPy 2.0.
numeric_strings = np.array(['1.23', '2.34', '3.45', '4.56', '5.67'], dtype = np.bytes_)
numeric_strings.dtype
# astype does not change the dtype of the original ndarray; it builds a new
# ndarray with the requested dtype. Strictly this is astype(np.float64):
# `float` is the Python type, which astype maps to the matching NumPy dtype.
numeric_strings.astype(float)
numeric_strings.dtype
numeric_float = numeric_strings.astype(float)
numeric_float.dtype

dtype('S4')

array([ 1.23,  2.34,  3.45,  4.56,  5.67])

dtype('S4')

dtype('float64')

In [128]:
int_array = np.arange(3)
calibers = np.array([.13, .15, .17], dtype = np.float64) 
int_array.astype(calibers.dtype) 
empty_unit32 = np.empty(8, dtype = 'u4') # 'u4' is the type code for uint32: an unsigned 32-bit (4-byte) integer
empty_unit32

array([ 0.,  1.,  2.])

array([1, 1, 1, 1, 1, 1, 1, 1], dtype=uint32)

In [130]:
np.empty((3, 2))

array([[ -1.50418705,   3.15034147],
       [ 18.21456239,  -4.45060907],
       [-18.72167319, -21.66034353]])

### 数组和标量的运算


In [131]:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr
arr * arr
arr - arr
1 / arr
arr ** 0.5

array([[ 1.,  2.,  3.],
       [ 4.,  5.,  6.]])

array([[  1.,   4.,   9.],
       [ 16.,  25.,  36.]])

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

array([[ 1.        ,  0.5       ,  0.33333333],
       [ 0.25      ,  0.2       ,  0.16666667]])

array([[ 1.        ,  1.41421356,  1.73205081],
       [ 2.        ,  2.23606798,  2.44948974]])

### 数组索引和切片（索引）

In [134]:
arr = np.arange(10)
arr
arr[5] # 索引
arr[5:8] # 切片（索引）
arr[5:8] = 12 
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

5

array([5, 6, 7])

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

In [135]:
arr_slice = arr[5:8]
arr_slice[1] = 12345
arr #数组的切片是原始的数组视图，数据不会被复制，任何修改都会直接反映到源数组上。numpy用于处理大量数据，切片作用于源数据不会因为复制而造成内存和性能的浪费

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,     9])

In [136]:
arr_slice[:] = 63
arr

array([ 0,  1,  2,  3,  4, 63, 63, 63,  8,  9])

In [137]:
arr_slice_copy = arr[5:8].copy() #得到切片的一个副本
arr_slice_copy[:] = 7
arr

array([ 0,  1,  2,  3,  4, 63, 63, 63,  8,  9])

In [138]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d[2] #多维数组切片，得到低维数组

array([7, 8, 9])

In [140]:
arr2d
arr2d[0][2] #递归索引
arr2d[0, 2] #多维索引
arr2d[:2]
arr2d[:2, 1:]
arr2d[1, :2]
arr2d[2, :1]
arr2d[:, :1] # : 表示选取整个轴
arr2d[:, 1] # 与前一个结果不同

array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

0

0

array([[1, 0, 0],
       [4, 0, 0]])

array([[0, 0],
       [0, 0]])

array([4, 0])

array([7])

array([[1],
       [4],
       [7]])

array([0, 0, 8])

In [141]:
arr2d[:2, 1:] = 0
arr2d

array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

In [142]:
arr3d = np.array([[[1, 2, 3], [4, 5, 6]],[[7, 8, 9], [10, 11, 12]]])
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [143]:
arr3d[0]
origin_value = arr3d[0].copy()

array([[1, 2, 3],
       [4, 5, 6]])

In [144]:
arr3d[0] = 42
arr3d

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [145]:
arr3d[0] = origin_value
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [146]:
arr3d[1, 0]

array([7, 8, 9])

### 布尔型索引

In [159]:
# 布尔型索引选取数组中的数据，总是创建数据的副本
names = np.array(['ryan', 'paul', 'david', 'gary', 'paul'])
names
data = np.random.randn(5, 7)
data

names == 'paul'
data[names == 'paul'] # 布尔型数组的长度需跟被索引的轴长度一致

array(['ryan', 'paul', 'david', 'gary', 'paul'],
      dtype='<U5')

array([[ 0.02866809, -0.46207668, -1.17261165,  1.11766411,  1.44309142,
         1.68484276, -0.3976931 ],
       [-0.38776476,  1.51668354, -0.82303682,  0.01441515, -1.0394747 ,
         1.25480689,  1.15948708],
       [-0.47728864, -0.3583492 ,  0.26907771, -0.90986271, -1.30751737,
         2.23101223, -0.64524346],
       [-0.3966697 , -1.64869826,  0.9314179 , -0.19030552,  0.62242807,
         1.09432604, -0.58882629],
       [-0.24689993, -1.33505714, -0.56244807,  0.84168244, -0.58066017,
         0.75477846, -0.29628647]])

array([False,  True, False, False,  True], dtype=bool)

array([[-0.38776476,  1.51668354, -0.82303682,  0.01441515, -1.0394747 ,
         1.25480689,  1.15948708],
       [-0.24689993, -1.33505714, -0.56244807,  0.84168244, -0.58066017,
         0.75477846, -0.29628647]])

In [150]:
data[names == 'paul', 2:]
data[names == 'paul', 3]
data[names == 'paul', 3:4] #有冒号表示选取轴

array([[ 0.20838446,  0.14529201, -1.324471  ,  1.28109948,  0.22945412],
       [-0.68948522, -1.46044443,  0.07384677,  0.34530531,  0.88989993]])

array([ 0.14529201, -1.46044443])

array([[ 0.14529201],
       [-1.46044443]])

In [156]:
names != 'paul'
data[~(names == 'paul')] # ~ negates the boolean mask element-wise, so this selects the same rows as data[names != 'paul']

array([ True, False,  True,  True, False], dtype=bool)

array([[ 0.38952659,  0.33979283, -0.66219317,  0.63237723, -0.14148579,
        -1.43513721,  1.95642071],
       [ 0.90069779, -0.54147461, -0.02270073, -1.47610297, -0.00201212,
         0.69019613,  0.68186577],
       [-0.13390582,  0.52910467, -1.92945273,  0.36105302,  0.48836733,
        -1.08840448,  0.89629287]])

In [157]:
mask = (names == 'paul') | (names == 'ryan')
mask
data[mask]

array([ True,  True, False, False,  True], dtype=bool)

array([[ 0.38952659,  0.33979283, -0.66219317,  0.63237723, -0.14148579,
        -1.43513721,  1.95642071],
       [-1.1999628 ,  0.03047865,  0.20838446,  0.14529201, -1.324471  ,
         1.28109948,  0.22945412],
       [ 0.9998045 ,  0.55346908, -0.68948522, -1.46044443,  0.07384677,
         0.34530531,  0.88989993]])

In [160]:
data[data < 0] = 0 #通过布尔型数组赋值
data[names == 'david'] = 7
data

array([[ 0.02866809,  0.        ,  0.        ,  1.11766411,  1.44309142,
         1.68484276,  0.        ],
       [ 0.        ,  1.51668354,  0.        ,  0.01441515,  0.        ,
         1.25480689,  1.15948708],
       [ 7.        ,  7.        ,  7.        ,  7.        ,  7.        ,
         7.        ,  7.        ],
       [ 0.        ,  0.        ,  0.9314179 ,  0.        ,  0.62242807,
         1.09432604,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.84168244,  0.        ,
         0.75477846,  0.        ]])

#### 其他索引方式

In [161]:
arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i
arr

array([[ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 2.,  2.,  2.,  2.],
       [ 3.,  3.,  3.,  3.],
       [ 4.,  4.,  4.,  4.],
       [ 5.,  5.,  5.,  5.],
       [ 6.,  6.,  6.,  6.],
       [ 7.,  7.,  7.,  7.]])

In [162]:
arr[[3, 7, -1, -5, 6]] # 按指定的顺序索引

array([[ 3.,  3.,  3.,  3.],
       [ 7.,  7.,  7.,  7.],
       [ 7.,  7.,  7.,  7.],
       [ 3.,  3.,  3.,  3.],
       [ 6.,  6.,  6.,  6.]])

In [163]:
arr = np.arange(32).reshape((8, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [164]:
arr[[1, 5, 7, 2], [0, 3, 1, 2]] # 得到一维数组，4个元素
arr[[1, 5, 7, 2]]
arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]] # 得到一个4*4的二维数组
arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])] # np.ix_函数将两个一维数组组成可以索引矩阵的索引器

array([ 4, 23, 29, 10])

array([[ 4,  5,  6,  7],
       [20, 21, 22, 23],
       [28, 29, 30, 31],
       [ 8,  9, 10, 11]])

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

#### 转置和轴对换

In [165]:
arr = np.arange(15).reshape((3, 5))
arr
arr.T

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [166]:
arr = np.random.randn(6, 3)
np.dot(arr.T, arr) #矩阵内积

array([[ 4.56571478,  2.26980874,  0.44497863],
       [ 2.26980874,  2.68229715, -1.10588546],
       [ 0.44497863, -1.10588546,  8.81697335]])

In [167]:
arr = np.arange(16).reshape(2, 2, 4)
arr
arr.transpose((1, 0, 2)) # 高维数组的转置需要一个由轴编号组成的元组进行轴对换
arr.transpose((1, 2, 0))

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

array([[[ 0,  8],
        [ 1,  9],
        [ 2, 10],
        [ 3, 11]],

       [[ 4, 12],
        [ 5, 13],
        [ 6, 14],
        [ 7, 15]]])

In [168]:
arr.swapaxes(1, 2) # swaps axes 1 and 2; the result is a view on the same data (no copy), and arr itself keeps its original shape

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

In [169]:
arr
arr.T

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

array([[[ 0,  8],
        [ 4, 12]],

       [[ 1,  9],
        [ 5, 13]],

       [[ 2, 10],
        [ 6, 14]],

       [[ 3, 11],
        [ 7, 15]]])

### 数组函数

In [170]:
arr = np.arange(10)
np.sqrt(arr)
np.exp(arr)

array([ 0.        ,  1.        ,  1.41421356,  1.73205081,  2.        ,
        2.23606798,  2.44948974,  2.64575131,  2.82842712,  3.        ])

array([  1.00000000e+00,   2.71828183e+00,   7.38905610e+00,
         2.00855369e+01,   5.45981500e+01,   1.48413159e+02,
         4.03428793e+02,   1.09663316e+03,   2.98095799e+03,
         8.10308393e+03])

In [174]:
x = np.random.randn(8)
y = np.random.randn(8)
np.maximum(x, y)
arr = np.random.randn(7) * 5
np.modf(arr) # 将小数的整数部分和小数部分分为两个数组

array([ 0.86861816,  0.42024606, -0.15678701,  0.09160955, -0.66875179,
        1.32705564, -0.08107582,  1.27667454])

(array([ 0.32641132,  0.27179365, -0.86532908, -0.09496987,  0.82167844,
        -0.82749273, -0.29944285]), array([ 1.,  3., -1., -0.,  3., -5., -0.]))

### 数组数据处理

In [178]:
point = np.arange(3)
x, y = np.meshgrid(point, point)
x
y
points = np.arange(-5, 5, 0.01)
xs, ys = np.meshgrid(points, points)
xs
ys

array([[0, 1, 2],
       [0, 1, 2],
       [0, 1, 2]])

array([[0, 0, 0],
       [1, 1, 1],
       [2, 2, 2]])

array([[-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       ..., 
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99]])

array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
       [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
       [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
       ..., 
       [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
       [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
       [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])

In [179]:
# 图如何显示
# Open question from the author: how to get the figure to display
# (e.g. plt.show() or an inline backend) - not resolved in this cell.
import matplotlib.pyplot as plt
z = np.sqrt(xs ** 2 + ys ** 2)
plt.imshow(z, cmap = plt.cm.gray)
plt.colorbar()
# Raw string: the backslash of the LaTeX \sqrt is passed through literally
# instead of forming the invalid escape sequence "\s".
plt.title(r"image plot of $\sqrt{x^2 + y^2}$ for a grid of values")

<matplotlib.image.AxesImage at 0x909a4a8>

<matplotlib.colorbar.Colorbar at 0x91224e0>

<matplotlib.text.Text at 0x907d128>

### 将条件逻辑表述为数组运算

In [182]:
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)] # if cond true xarr, else yarr
result
result = np.where(cond, xarr, yarr)
result

[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]

array([ 1.1,  2.2,  1.3,  1.4,  2.5])

In [183]:
arr = np.random.randn(4, 4)
arr
np.where(arr > 0, 1, -1)
np.where(arr > 0, 1, arr)

array([[-1.55507521,  0.89296203,  0.87535744, -0.10434899],
       [-0.7492847 , -0.54670952,  0.51391624, -1.13792981],
       [-0.7256057 , -0.46187819,  2.21871343, -0.31028063],
       [-0.85840564, -0.32100856,  0.85169241, -0.26187763]])

array([[-1,  1,  1, -1],
       [-1, -1,  1, -1],
       [-1, -1,  1, -1],
       [-1, -1,  1, -1]])

array([[-1.55507521,  1.        ,  1.        , -0.10434899],
       [-0.7492847 , -0.54670952,  1.        , -1.13792981],
       [-0.7256057 , -0.46187819,  1.        , -0.31028063],
       [-0.85840564, -0.32100856,  1.        , -0.26187763]])

In [184]:
arr = np.arange(25).reshape(5, 5)
arr[np.where(arr > 7)] # Q np.where

array([ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])

In [193]:
cond1 = np.array([True, False, True, False])
cond2 = np.array([False, True, True, False])
result = []
for i in range(4):
    if cond1[i] and cond2[i]:
        result.append(0)
    elif cond1[i]:
        result.append(1)
    elif cond2[i]:
        result.append(2)
    else:
        result.append(3)
result

result = np.where(cond1 & cond2, 0, np.where(cond1, 1, np.where(cond2, 2, 3)))
result
result = 1 * (cond1 & ~cond2) + 2 * (cond2 & ~cond1) + 3 * ~(cond1 | cond2)
result

[1, 2, 0, 3]

array([1, 2, 0, 3])

array([1, 2, 0, 3])

### 数学与统计方法

In [194]:
arr = np.random.randn(5, 4)
arr.mean()
np.mean(arr)
arr.sum()
np.sum(arr)
arr.mean(1) # axis=1: averages across the columns, giving one mean per row (5 values)
arr.sum(0) # axis=0: sums down the rows, giving one total per column (4 values)

-0.15236918191054766

-0.15236918191054766

-3.0473836382109534

-3.0473836382109534

array([-0.55410297, -0.52121799,  0.24108644,  0.01798545,  0.05440315])

array([-2.02696066,  2.92121787, -2.11080169, -1.83083916])

In [195]:
arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
arr.cumsum(0)
arr.cumprod(1)

array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]], dtype=int32)

array([[  0,   0,   0],
       [  3,  12,  60],
       [  6,  42, 336]], dtype=int32)

### 用于布尔型数组的方法

In [199]:
arr = np.random.randn(100)
(arr > 0).sum()

41

In [201]:
bools = np.array([False, False, True, False])
bools.any() # any 是否存在True
bools.all() # all 是否全为True

True

False

### 排序

In [202]:
arr = np.random.randn(8)
arr

array([ 2.7322599 , -0.51423039,  0.25286995,  0.55632671,  0.33672953,
        0.9169171 ,  0.40994964, -1.62901337])

In [203]:
arr.sort()
arr

array([-1.62901337, -0.51423039,  0.25286995,  0.33672953,  0.40994964,
        0.55632671,  0.9169171 ,  2.7322599 ])

In [206]:
arr = np.random.randn(5, 3)
arr
arr.sort(1)
arr

array([[ 0.42322053, -1.09362533, -0.28826774],
       [-1.15281857, -0.17289565,  0.91186976],
       [ 0.45557231, -0.38671044, -0.11328784],
       [ 0.87316771, -0.20460553,  1.71637578],
       [ 0.79052763, -0.22417876,  2.23584167]])

array([[-1.09362533, -0.28826774,  0.42322053],
       [-1.15281857, -0.17289565,  0.91186976],
       [-0.38671044, -0.11328784,  0.45557231],
       [-0.20460553,  0.87316771,  1.71637578],
       [-0.22417876,  0.79052763,  2.23584167]])

In [207]:
arr.sort(0)
arr

array([[-1.15281857, -0.28826774,  0.42322053],
       [-1.09362533, -0.17289565,  0.45557231],
       [-0.38671044, -0.11328784,  0.91186976],
       [-0.22417876,  0.79052763,  1.71637578],
       [-0.20460553,  0.87316771,  2.23584167]])

In [208]:
arr = np.random.randn(1000)
arr.sort()
arr[int(0.05 * len(arr))] # 5%分位数

-1.6446316118722486

### unique及集合运算

In [209]:
names = np.array(['bob', 'joe', 'will', 'bob', 'will', 'joe', 'joe'])
np.unique(names)
set(names)

array(['bob', 'joe', 'will'],
      dtype='<U4')

{'bob', 'joe', 'will'}

In [210]:
values = np.array([6, 0, 0, 3, 2, 5, 6])
np.in1d(values, [2, 3, 6])

array([ True, False, False,  True,  True, False,  True], dtype=bool)

| 方法 | 说明 |
| -- | -- |
| unique(x) | 唯一值 |
| intersect1d(x, y) | 交 |
| union1d(x, y) | 并 |
| in1d(x, y) | x的元素是否包含于y |
| setdiff1d(x, y) | x - y |
| setxor1d(x, y) | 对称差 x+y-xy |

## 数组（文件）的输入输出

In [211]:
arr = np.arange(10)
np.save('some_array', arr) # 默认文件后缀 .npy
np.load('some_array.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# Save several arrays into one .npz archive; each keyword (a, b) becomes the
# key under which that array is read back. np.savez stores the arrays
# uncompressed; np.savez_compressed is the compressed variant.
# The file name passed to load must match the one passed to savez.
np.savez('array_archive.npz', a = arr, b = arr)
arch = np.load('array_archive.npz')
arch['b']

## 线性代数

In [212]:
x = np.array([[1., 2., 3. ], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])
x.dot(y)
np.dot(x, np.ones(3))

array([[  28.,   64.],
       [  67.,  181.]])

array([  6.,  15.])

In [216]:
from numpy.linalg import inv, qr
x = np.random.randn(5, 5)
# x^T x is a symmetric 5x5 matrix.
mat = np.dot(x.T, x)
inv(mat)  # matrix inverse
np.dot(mat, inv(mat))  # should be close to the identity, up to round-off

array([[  3.13008607,  -5.95828754,   5.34495139,  -1.5707139 ,
         12.25270769],
       [ -5.95828754,  12.5556019 , -11.94509071,   3.2633864 ,
        -25.15266333],
       [  5.34495139, -11.94509071,  12.93090313,  -2.36425866,
         23.42979936],
       [ -1.5707139 ,   3.2633864 ,  -2.36425866,   1.49529709,
         -6.80636768],
       [ 12.25270769, -25.15266333,  23.42979936,  -6.80636768,
         51.28390371]])

array([[  1.00000000e+00,  -8.23708834e-15,  -8.06022133e-15,
         -1.31403283e-15,   1.17065755e-14],
       [ -5.96349076e-15,   1.00000000e+00,  -1.20127640e-14,
         -1.06010539e-15,   1.24900683e-14],
       [ -7.82278451e-16,   1.22199431e-15,   1.00000000e+00,
         -6.16601811e-16,   4.74146622e-15],
       [  6.79375921e-16,  -1.04436001e-14,   9.60648763e-15,
          1.00000000e+00,  -8.94151254e-15],
       [ -2.24619302e-15,   4.23241987e-15,   4.22110595e-15,
         -1.18150792e-15,   1.00000000e+00]])

In [217]:
# QR decomposition of `mat`: qr returns the two factors, unpacked as q and r.
q, r = qr(mat)
r 

array([[-11.46979104,  -6.20953085,  -5.31391884,   7.6875278 ,
          3.14777574],
       [  0.        ,  -9.75646824,  -2.77178276,   2.33372802,
         -3.21661864],
       [  0.        ,   0.        ,  -1.13527502,   2.53483222,
          0.86066679],
       [  0.        ,   0.        ,   0.        ,  -1.08415998,
         -0.13956218],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          0.01579539]])

|函数（diag、dot、trace 位于 numpy 顶层命名空间，其余位于 numpy.linalg）|说明|
|--|--|
|diag|对角线元素，或转化成对角矩阵|
|dot|矩阵乘法（两个一维数组时为内积）|
|trace|迹|
|det|行列式|
|eig|特征值和特征向量|
|inv|逆|
|pinv|Moore-Penrose逆|
|qr|qr分解|
|svd|奇异值分解|
|solve|解方程组|
|lstsq|Ax=b最小二乘解|

## 随机数

In [218]:
# 3x3 array of draws from the standard normal distribution (mean 0, std 1).
samples = np.random.normal(loc=0.0, scale=1.0, size=(3, 3))
samples

array([[-0.45004324,  1.38235754, -0.63640275],
       [ 2.37864865,  0.15701581,  1.17206913],
       [ 0.69056321,  0.02786089, -0.7781212 ]])

In [221]:
from random import normalvariate
# Number of samples drawn by each timed statement below.
N = 1000000
# Pure-Python approach: one normalvariate call per sample, collected in a list.
%timeit samples = [normalvariate(0, 1) for _ in range(N)]
# NumPy approach: a single call that produces all N samples at once.
%timeit np.random.normal(size=N)

972 ms ± 6.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
40.8 ms ± 2.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


|numpy.random函数|说明|
|--|--|
|seed|随机数生成器的种子|
|permutation|返回一个序列的随机排列（返回新数组，不修改原序列）|
|shuffle|对一个序列就地随机排列（直接修改原序列）|
|rand|[0, 1)上的均匀分布|
|randint|给定范围内的整数均匀分布|
|randn|标准正态分布（均值0，标准差1）|
|binomial|二项分布|
|normal|正态分布|
|beta|Beta分布|
|chisquare|卡方分布|
|gamma|gamma分布|
|uniform|均匀分布|

## random walk

In [226]:
import random

In [230]:
# Pure-Python random walk: start at 0 and move +1 or -1 with equal chance.
steps = 1000
position = 0
walk = [position]
for i in range(steps):
    # randint(0, 1) yields 0 or 1; treat 1 as a step up and 0 as a step down.
    if random.randint(0, 1):
        step = 1
    else:
        step = -1
    position = position + step
    walk.append(position)
# Q: plot the random walk as a figure

In [237]:
nsteps = 1000
# One fair coin flip per step: each draw is 0 or 1.
draws = np.random.randint(low=0, high=2, size=nsteps)
# Translate the flips into moves: 1 -> +1, 0 -> -1.
steps = np.where(draws > 0, 1, -1)
# The running total of the moves gives the position after each step.
walk = np.cumsum(steps)

In [238]:
# Index of the first position that is at least 10 away from the origin.
# NOTE: argmax reports the first True of the boolean array; if no entry is
# True it returns 0, so this is only meaningful when the walk reaches 10.
(np.abs(walk) >= 10).argmax()

145

In [248]:
nwalks = 5000
nsteps = 1000
# Simulate every walk at once: one row per walk, one column per step.
draws = np.random.randint(low=0, high=2, size=(nwalks, nsteps))
draws.shape
np.shape(draws)
steps = np.where(draws > 0, 1, -1)
# Accumulate along axis 1 so each row holds one walk's positions over time.
walks = np.cumsum(steps, axis=1)
walks
walks.shape
walks.max()
walks.min()

(5000, 1000)

(5000, 1000)

array([[  1,   2,   3, ..., -32, -31, -32],
       [ -1,   0,  -1, ..., -58, -59, -58],
       [ -1,  -2,  -1, ...,  -2,  -1,  -2],
       ..., 
       [  1,   0,  -1, ..., -14, -13, -12],
       [  1,   0,  -1, ...,   8,   7,   8],
       [ -1,   0,  -1, ...,  24,  25,  26]], dtype=int32)

(5000, 1000)

124

-129

In [253]:
# Flag, per walk (row), whether it is ever at least 30 away from the origin.
hist30 = np.any(np.abs(walks) >= 30, axis=1)
len(hist30)
hist30.sum()

5000

3451

In [252]:
# Keep only the walks that reach distance 30, then take, for each of them,
# the index of the first step at which that happens.
crossing_times = np.argmax(np.abs(walks[hist30]) >= 30, axis=1)
crossing_times.mean()

503.55751955954798