# 数据处理



## 多维数组
- 创建数组：形状，元素的数据类型，元素的值全是0或者随机数
- 访问元素：详见numpy的教程

In [1]:
import torch
import random
import numpy as np
import pandas as pd

In [2]:
"""张量表示一个数值组成的数组，这个数组可能又多个维度"""
x=torch.arange(12)
x

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [3]:
"""张量的形状"""
x.shape

torch.Size([12])

In [4]:
"""元素的总数"""
x.numel()

12

In [5]:
"""改变一个张量的形状而不改变元素数量和元素值，用reshape"""
X=x.reshape(3,4)
X

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [6]:
"""创建全为0的张量"""
torch.zeros((2,3,4))

tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

In [7]:
"""创建元素为全1的张量"""
torch.ones(2,3,4)

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])

In [8]:
"""通过提供python列表（或嵌套列表）来为张量中的元素赋值"""
torch.tensor([[1,2,3],[4,5,6],[7,8,9]])

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [9]:
"""tensor的运算"""
x=torch.tensor([1.0,2,4,8])
y=torch.tensor([2,2,2,2])
x+y,x-y,x*y,x/y,x**y #**运算符是求幂运算

(tensor([ 3.,  4.,  6., 10.]),
 tensor([-1.,  0.,  2.,  6.]),
 tensor([ 2.,  4.,  8., 16.]),
 tensor([0.5000, 1.0000, 2.0000, 4.0000]),
 tensor([ 1.,  4., 16., 64.]))

In [10]:
"""把多个张量连在一起"""
x=torch.arange(12,dtype=torch.float32).reshape(3,4)
y=torch.tensor([[1,2,3,4],[1,2,3,4],[1,2,3,4]])
torch.cat((x,y),dim=0),torch.cat((x,y),dim=1) #堆起来和连起来

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [ 1.,  2.,  3.,  4.],
         [ 1.,  2.,  3.,  4.],
         [ 1.,  2.,  3.,  4.]]),
 tensor([[ 0.,  1.,  2.,  3.,  1.,  2.,  3.,  4.],
         [ 4.,  5.,  6.,  7.,  1.,  2.,  3.,  4.],
         [ 8.,  9., 10., 11.,  1.,  2.,  3.,  4.]]))

In [11]:
"""通过逻辑运算符创建一个张量"""
x==y

tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

In [12]:
"""numpy中的广播机制"""
a=torch.arange(3).reshape(3,1)
b=torch.arange(2).reshape(1,2)
a,b

(tensor([[0],
         [1],
         [2]]),
 tensor([[0, 1]]))

In [13]:
a+b

tensor([[0, 1],
        [1, 2],
        [2, 3]])

In [14]:
"""元素的访问"""
X=torch.tensor([[1,2,3,4],[4,5,6,7],[0,9,8,7]])
print(X)
X[1:3],X[-1]

tensor([[1, 2, 3, 4],
        [4, 5, 6, 7],
        [0, 9, 8, 7]])


(tensor([[4, 5, 6, 7],
         [0, 9, 8, 7]]),
 tensor([0, 9, 8, 7]))

In [15]:
"""修改值"""
X[1,2]=9
X

tensor([[1, 2, 3, 4],
        [4, 5, 9, 7],
        [0, 9, 8, 7]])

In [16]:
X[0:2,:]=12
X

tensor([[12, 12, 12, 12],
        [12, 12, 12, 12],
        [ 0,  9,  8,  7]])

In [17]:
"""转化为numpy张量"""
A=X.numpy()
B=torch.tensor(A)
type(A),type(B)

(numpy.ndarray, torch.Tensor)

In [18]:
"""将维度为1的张量转化成python标量"""
a=torch.tensor([3.5])
a,a.item(),float(a),int(a)

(tensor([3.5000]), 3.5, 3.5, 3)

In [19]:
"""标量"""
x=torch.tensor([2.0])
y=torch.tensor([3.0])

x+y,x*y,x/y,x**y

(tensor([5.]), tensor([6.]), tensor([0.6667]), tensor([8.]))

In [20]:
x=torch.arange(4)
x

tensor([0, 1, 2, 3])

In [21]:
"""按某一维度求和"""
a=torch.ones((2,5,4))
a.shape

torch.Size([2, 5, 4])

In [22]:
a.sum(axis=1),a.sum(axis=1).shape

(tensor([[5., 5., 5., 5.],
         [5., 5., 5., 5.]]),
 torch.Size([2, 4]))

In [23]:
a.sum(axis=1,keepdims=True),a.sum(axis=1,keepdims=True).shape

(tensor([[[5., 5., 5., 5.]],
 
         [[5., 5., 5., 5.]]]),
 torch.Size([2, 1, 4]))

In [24]:
a.sum(axis=[0,2],keepdims=True),a.sum(axis=[0,2],keepdims=True).shape

(tensor([[[8.],
          [8.],
          [8.],
          [8.],
          [8.]]]),
 torch.Size([1, 5, 1]))

In [25]:
"""
矩阵运算，矩阵怎么求导数
梯度的理解，是函数变化最大的方向  
"""

'\n矩阵运算，矩阵怎么求导数\n梯度的理解，是函数变化最大的方向  \n'

In [26]:
"""自动求导"""
x=torch.arange(4.0)
x

tensor([0., 1., 2., 3.])

In [27]:
x.requires_grad_(True) #等价于x=torch.arange(4.0,requires_grad=True)
x.grad #默认值是None,可以在此访问grad

In [28]:
"""计算y"""
y=2*torch.dot(x,x) #y是x与x的点积
y

tensor(28., grad_fn=<MulBackward0>)

In [29]:
"""构造反向传播函数计算y关于x每个分量的导数"""
y.backward()
x.grad

tensor([ 0.,  4.,  8., 12.])

In [30]:
x.grad==4*x

tensor([True, True, True, True])

In [31]:
"""pytorch会默认累计梯度，所以进行下次求梯度的时候，要清空x.grad"""
x.grad.zero_()
y=x.sum()
y.backward()
x.grad

tensor([1., 1., 1., 1.])

In [32]:
"""
如果y不是标量函数，而是向量函数，该怎么求导？向量函数对向量求导的结果是一个矩阵，但这个矩阵不常用，
所以通常先对y求和得到一个标量，再对它反向传播；这等价于把求导得到的矩阵的各行相加（按行求和）
"""
x.grad.zero_()
y=x*x
# 等价于 y.backward(torch.ones(len(x)))
y.sum().backward()
x.grad

tensor([0., 2., 4., 6.])

In [33]:
"""将某些计算移动到记录的计算图之外"""
x.grad.zero_()
y=x*x
u=y.detach() #在对x的求导过程中，u当成是常数
z=u*x
z.sum().backward()
x.grad,x.grad==u

(tensor([0., 1., 4., 9.]), tensor([True, True, True, True]))

In [34]:
"""y还是x的函数，但是u不再是x的函数了，y仍可以对x求导"""
x.grad.zero_()
y.sum().backward()
x.grad

tensor([0., 2., 4., 6.])

In [35]:
"""normal函数"""
#normal表示生成满足高斯分布的随机数张量
a=torch.normal(0,1,(3,3)) #0是均值，1是标准差
a

tensor([[-0.9515, -0.8232, -0.5945],
        [ 0.1291,  0.7862,  0.9027],
        [ 0.0713,  1.1038,  0.2074]])

In [36]:
"""random.shuffle()函数什么用"""
# random.shuffle rearranges the elements of a Python list in place, in random
# order; it does not build a new list.
x = list(range(10))
random.shuffle(x)
x

[2, 1, 7, 3, 6, 0, 4, 8, 5, 9]

In [37]:
"""生成器和普通链表"""
def fun1():
    """Print 0 through 4 by iterating over a fully built list."""
    numbers = [n for n in range(5)]  # list comprehension: every item exists up front
    for number in numbers:
        print(number)


def fun2():
    """Print 0 through 4 by iterating over a generator expression."""
    lazy_numbers = (n for n in range(5))  # generator expression: items are produced on demand
    for number in lazy_numbers:
        print(number)

In [38]:
fun1(),fun2()

0
1
2
3
4
0
1
2
3
4


(None, None)

In [39]:
"""yield,如果一个函数中定义了yield，则这个函数就是个生成器，可以使用next函数使用这个生成器"""
def fun():
    """Print "111", "222" and "333" in turn, pausing at a bare ``yield`` after each.

    Every ``next()`` call resumes the body up to the following ``yield``;
    because the yields carry no value, ``next()`` returns None each time.
    """
    for message in ("111", "222", "333"):
        print(message)
        yield


# Calling a generator function does not run its body; it returns a generator object.
f = fun()
type(f)

generator

In [40]:
"""使用next函数调用这个生成器"""
next(f),next(f),next(f)

111
222
333


(None, None, None)

In [41]:
"""返回的那个None是因为yield后面没有返回的东西"""
def fun():
    """Print a label, then yield the value paired with it: 1, "222", 3.

    ``next()`` returns whatever follows ``yield``, so these calls return the
    yielded values instead of None.
    """
    steps = (("111", 1), ("222", "222"), ("333", 3))
    for label, value in steps:
        print(label)
        yield value

In [42]:
"""使用next函数调用这个生成器"""
f=fun()
next(f),next(f),next(f)

111
222
333


(1, '222', 3)

In [43]:
"""生成一系列偶数"""
def geno():
    """Yield the first ten non-negative even numbers: 0, 2, ..., 18."""
    yield from range(0, 20, 2)


f = geno()
# Pull the first five values by hand; the generator remembers where it stopped.
(next(f), next(f), next(f), next(f), next(f))

(0, 2, 4, 6, 8)

In [44]:
for i in geno():
    print(i)

0
2
4
6
8
10
12
14
16
18


In [45]:
"""with语句"""


'with语句'

In [46]:
"""zip函数"""
list1=[1,2,3]
list2=['a','b','c']
zipped=zip(list1,list2)
list(zipped)

[(1, 'a'), (2, 'b'), (3, 'c')]

In [47]:
list1=[1,2,3]
list2=['a','b','c']
list3=[10,20]
zipped=zip(list1,list2,list3)
list(zipped)

[(1, 'a', 10), (2, 'b', 20)]

In [48]:
"""用for循环"""
list1=[1,2,3]
list2=['a','b','c']
for i ,letter in zip(list1,list2):
    print("Number:{};Letters:{}".format(i,letter))

Number:1;Letters:a
Number:2;Letters:b
Number:3;Letters:c


In [49]:
"""
tensor的性质
tensor包含两种数据，data和grad
tensor w包含w（data）和dloss/dw（grad）
"""
# Training data for the model y = w * x (the samples satisfy y = 2 * x).
x_data = [1.0, 2.0, 3.0]
y_data = [2.0, 4.0, 6.0]
# requires_grad=True is only allowed on floating-point tensors, hence 1.0.
w = torch.tensor([1.0])
w.requires_grad = True


# forward() and loss() build the computation graph every time they are called.
def forward(x):
    """Return the model prediction ``x * w``.

    ``w`` is a tensor, so the overloaded ``*`` returns a tensor as well and
    the multiplication is recorded for autograd.
    """
    return x * w


def loss(x, y):
    """Return the squared error ``(y - forward(x)) ** 2`` for one sample."""
    y_pred = forward(x)
    return (y - y_pred) ** 2


print("before training:", 4, forward(4).item())
for epoch in range(100):
    for x, y in zip(x_data, y_data):
        l = loss(x, y)
        l.backward()  # stores d(loss)/dw in w.grad and frees the graph
        print('\tgrad:', x, y, w.grad.item())
        # The weight update is plain arithmetic and must not become part of a
        # computation graph. torch.no_grad() states that explicitly and
        # replaces the legacy ``.data`` access.
        with torch.no_grad():
            w -= 0.01 * w.grad
        # Gradients accumulate across backward() calls, so reset them.
        w.grad.zero_()
    print("progress:", epoch, l.item())
print("after training:", 4, forward(4).item())

before training: 4 4.0
	grad: 1.0 2.0 -2.0
	grad: 2.0 4.0 -7.840000152587891
	grad: 3.0 6.0 -16.228801727294922
progress: 0 7.315943717956543
	grad: 1.0 2.0 -1.478623867034912
	grad: 2.0 4.0 -5.796205520629883
	grad: 3.0 6.0 -11.998146057128906
progress: 1 3.9987640380859375
	grad: 1.0 2.0 -1.0931644439697266
	grad: 2.0 4.0 -4.285204887390137
	grad: 3.0 6.0 -8.870372772216797
progress: 2 2.1856532096862793
	grad: 1.0 2.0 -0.8081896305084229
	grad: 2.0 4.0 -3.1681032180786133
	grad: 3.0 6.0 -6.557973861694336
progress: 3 1.1946394443511963
	grad: 1.0 2.0 -0.5975041389465332
	grad: 2.0 4.0 -2.3422164916992188
	grad: 3.0 6.0 -4.848389625549316
progress: 4 0.6529689431190491
	grad: 1.0 2.0 -0.4417421817779541
	grad: 2.0 4.0 -1.7316293716430664
	grad: 3.0 6.0 -3.58447265625
progress: 5 0.35690122842788696
	grad: 1.0 2.0 -0.3265852928161621
	grad: 2.0 4.0 -1.2802143096923828
	grad: 3.0 6.0 -2.650045394897461
progress: 6 0.195076122879982
	grad: 1.0 2.0 -0.24144840240478516
	grad: 2.0 4.0 -0.

In [50]:
# A one-element float tensor that tracks gradients.
w = torch.tensor([1.0], requires_grad=True)
print(w)         # the tensor itself, including its requires_grad flag
print(w.data)    # the values only
print(w.item())  # the single value as a plain Python number
# NOTE: item() needs a tensor holding exactly one element; w here is 1-D
# with one element, so it does not have to be 0-dimensional.

tensor([1.], requires_grad=True)
tensor([1.])
1.0


In [51]:
# Putting parentheses after an object invokes its __call__ method, which makes
# the object callable.
class Foobar:
    """Minimal callable class: its instances can be invoked like functions."""

    def __init__(self):
        pass

    def __call__(self, *args, **kwargs):
        """Print "Hello" followed by the first positional argument."""
        first_arg = args[0]
        print("Hello" + str(first_arg))


foobar = Foobar()  # instantiate the class
foobar(1, 2, 3)  # dispatches to Foobar.__call__

Hello1


In [52]:
# What *args and **kwargs collect.
def func(*args, **kwargs):
    """Print the positional arguments (a tuple), then the keyword arguments (a dict)."""
    for collected in (args, kwargs):
        print(collected)


func(1, 2, 3, 4, x=1, y=2)

(1, 2, 3, 4)
{'x': 1, 'y': 2}


In [58]:
x=torch.arange(12).reshape(3,4)
print(x)
print(x[0,:])
print(x[0])
print(x[1,:])
print(x[1])

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])
tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3])
tensor([4, 5, 6, 7])
tensor([4, 5, 6, 7])


In [54]:
y=np.arange(12).reshape(3,4)
print(y)
print(y[:,-1])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[ 3  7 11]


In [68]:
X=np.array([1,2,3,4,5])
Y=np.array([0,1,33,4,5])
print(np.mean(X == Y))

0.4


In [3]:
# argmin returns the position of the smallest element.
a = np.array([1, 2, 3, 4, -3, -2, -1])
index_min1 = a.argmin()
print(index_min1)
b = np.array([
    [1, 2, 3],
    [-1, -2, -1],
    [2, 2, 2],
])
# With no axis given, the result is an index into the flattened array.
index_min2 = b.argmin()
print(index_min2)

4
4


In [87]:
data={
    'ID':[101,102,103,104,105],
    'Name':['Alice','Bob','Carl','Amy','Tom'],
    'Age':[24,18,25,22,19],
    'Salary':[50000,75000,45000,80000,67000],
    'Status':['Active','Inactive','Active','Inactive','Active']
}
df=pd.DataFrame(data)
print(df)

    ID   Name  Age  Salary    Status
0  101  Alice   24   50000    Active
1  102    Bob   18   75000  Inactive
2  103   Carl   25   45000    Active
3  104    Amy   22   80000  Inactive
4  105    Tom   19   67000    Active


In [75]:
df['Name']

0    Alice
1      Bob
2     Carl
3      Amy
4      Tom
Name: Name, dtype: object

In [76]:
df['Height']=[175,180,173,165,183]

In [77]:
df['Height']

0    175
1    180
2    173
3    165
4    183
Name: Height, dtype: int64

In [81]:
np.array(df.Height-3)

array([172, 177, 170, 162, 180], dtype=int64)

In [86]:
#选择某一行某一列
df.loc[1,'Name']

'Bob'

In [85]:
#条件选择,选择薪水大于50000的行
df[df['Salary']>50000]

Unnamed: 0,ID,Name,Age,Salary,Status,Height
1,102,Bob,18,75000,Inactive,180
3,104,Amy,22,80000,Inactive,165
4,105,Tom,19,67000,Active,183


In [96]:
# Using sample() in pandas to draw random rows.
# random_state fixes the seed so every run returns the same rows; without it
# each run gives a different sample.
extract1 = df['Age'].sample(n=3, random_state=1)
print(extract1)
# replace=True samples with replacement; frac=0.4 draws 40% of the rows.
extract2 = df.sample(replace=True, frac=0.4, random_state=1)
print(extract2)

2    25
1    18
4    19
Name: Age, dtype: int64
    ID Name  Age  Salary    Status
3  104  Amy   22   80000  Inactive
4  105  Tom   19   67000    Active


In [104]:
# Using DataFrame.sort_values.
data = {
    'Name': ['a', 'b', 'c'],
    'No': [2, 1, 3],
}
df1 = pd.DataFrame(data)
print(df1)
# With no `ascending` argument the rows come back in ascending order of 'No'
# (see the printed result); the original index labels are kept.
df1 = df1.sort_values(by='No')
print(df1)

  Name  No
0    a   2
1    b   1
2    c   3
  Name  No
1    b   1
0    a   2
2    c   3


In [3]:
# Copying a NumPy array: the copy owns its own data, so writing to the copy
# leaves the original untouched.
a = np.array([1, 2, 3, 4])
print(a)
b = a.copy()
b[0] = 99
print(a, b, sep="\n")

[1 2 3 4]
[1 2 3 4]
[99  2  3  4]


In [4]:
# Plain assignment makes no copy at all: `b` is just a second name for the
# same array object, so a change made through `a` shows up through `b` too.
a = np.array([1, 2, 3, 4])
b = a
print(a, b, sep="\n")
a[0] = 99
print(a, b, sep="\n")

[1 2 3 4]
[1 2 3 4]
[99  2  3  4]
[99  2  3  4]


In [15]:
a = np.array([[1, 2, 3], [4, 5, 6]])
# np.nditer visits the elements of the array one at a time.
for x in np.nditer(a):
    print(x, end=' ')
print('\n')
# Iterating over the transpose prints the same sequence here (see the output).
for y in np.nditer(a.T):
    print(y, end=' ')
print('\n')
# Modify the elements in place. Each element is read (2 * z) and then written,
# so the operand is flagged 'readwrite'; the `with` block closes the iterator
# so that the writes are guaranteed to land in `a`.
with np.nditer(a, op_flags=['readwrite']) as it:
    for z in it:
        z[...] = 2 * z
print(a)

1 2 3 4 5 6 

1 2 3 4 5 6 

[[ 2  4  6]
 [ 8 10 12]]


In [19]:
a=np.array([[1,2,3],[4,5,6]])
it=np.nditer(a)


<numpy.nditer object at 0x00000172163EEAD0>
<numpy.nditer object at 0x000001721F1AA530>


In [9]:
X=np.array([
    [1,2],
    [2,3],
    [3,4]])
X_mean1=np.mean(X,axis=0,dtype=float)
print(X_mean1)
print(X-X_mean1)
X_mean2=np.mean(X)
print(X_mean2)
print(X-X_mean2)

[2. 3.]
[[-1. -1.]
 [ 0.  0.]
 [ 1.  1.]]
2.5
[[-1.5 -0.5]
 [-0.5  0.5]
 [ 0.5  1.5]]


In [13]:
#lambda创建匿名函数
act={'relu':lambda x:np.maximum(0,x),'tanh':lambda x:np.tanh(x)}
x=np.array([[1,2,3],[4,5,6]])
relu_result=act['relu'](x)
tanh_result=act['tanh'](x)
print(relu_result)
print(tanh_result)

[[1 2 3]
 [4 5 6]]
[[0.76159416 0.96402758 0.99505475]
 [0.9993293  0.9999092  0.99998771]]


In [5]:
# Dictionaries
# Create an empty dictionary.
D = {}
print(D)
# Add key/value pairs.
D[1] = 10
D['nihao'] = 'nihao'
print(D)
# Walk over every key/value pair in the dictionary.
list_D = list(D.items())
print(list_D)
for key, value in D.items():
    print(key, value, sep="\n")
# keys() gives a view of the keys; list() turns that view into a list.
key_view = D.keys()
print(key_view)
print(list(key_view))

{}
{1: 10, 'nihao': 'nihao'}
[(1, 10), ('nihao', 'nihao')]
1
10
nihao
nihao
dict_keys([1, 'nihao'])
[1, 'nihao']


In [14]:
a=torch.tensor([[1,2,3],[3,4,5]])
print(a)
a=torch.max(a,dim=1)
print(a)
b=torch.tensor([2,2])
print((a.indices==b).sum())

tensor([[1, 2, 3],
        [3, 4, 5]])
torch.return_types.max(
values=tensor([3, 5]),
indices=tensor([2, 2]))
tensor(2)


In [3]:
# Demonstrates that item assignment cannot grow a list: an empty list has no
# index 0, so the assignment below raises IndexError (see the output), and the
# print is never reached. Use a.append(1) to add an element instead.
a=[]
a[0]=1
print(a)

IndexError: list assignment index out of range

In [2]:
# str.join: the string it is called on is the separator placed between items.
# The variables avoid the name `str` so the builtin stays usable afterwards.
# (1) Join strings with a separator.
separator = 'nihao'
joined = separator.join(['h', 'e', 'llo'])
print(joined)
# (2) Turn a list of characters into a single string.
letters = ['h', 'e', 'l', 'l', 'o']
word = ''.join(letters)
print(word)

hnihaoenihaollo
hello


In [6]:
#tensor中.data.numpy()的用法
a=torch.tensor([[1,2,3],[3,4,5]])
a=a.data.numpy()
print(a)
print(a[0])

[[1 2 3]
 [3 4 5]]
[1 2 3]
