# 认识numpy 和 pandas 

## 区别
- numpy 主要使用 ndarray（多维数组）类，它的矩阵计算十分强大；
- pandas 主要使用 DataFrame 类（其数据底层由 numpy 的 ndarray 存储，但 DataFrame 本身并不是 ndarray），这个类用来管理数据。

In [4]:
# Interpreter: /usr/bin/python 3.7.8 64-bit
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

# A one-dimensional array built from a Python list.
sample_array = np.array([1, 2, 3, 4, 5])
print(sample_array)

# Arithmetic on an array is applied to every element.
print(sample_array + 2)

# Mixing element types forces a single common type for the whole array;
# here the numbers end up stored as strings next to "Q".
print(np.array([1, 2, "Q"]))

# A two-dimensional array: a list of equally long rows.
sample_array_2 = np.array(
    [[1, 2, 3, 4, 5],
     [6, 7, 8, 9, 10]]
)
print(sample_array_2)

# Number of rows and columns. Only the last expression of a cell is echoed
# automatically, which is why the intermediate results above use print().
sample_array_2.shape


(2, 5)

- 等差数列

In [5]:
import pandas as pd
import numpy as np

# Arithmetic sequence: start at 1, stop before 6, advance by 0.2 each time.
# np.arange differs quite a bit from the built-in range(): for one, it
# accepts a fractional step and gives back an array.
np.arange(1, 6, 0.2)

array([1. , 1.2, 1.4, 1.6, 1.8, 2. , 2.2, 2.4, 2.6, 2.8, 3. , 3.2, 3.4,
       3.6, 3.8, 4. , 4.2, 4.4, 4.6, 4.8, 5. , 5.2, 5.4, 5.6, 5.8])

- 多种生成数组的方式

In [6]:
# np.tile(value, reps) repeats a value `reps` times.
print(np.tile("A", 5))
print(np.tile(0, 4))

# Arrays containing only zeros: a length gives 1-D, [rows, cols] gives 2-D.
print(np.zeros(4))
print(np.zeros([2, 3]))

# Array containing only ones. As the cell's last expression it is echoed
# automatically; the earlier results need print() to become visible.
np.ones(3)

array([1., 1., 1.])

-   切片(slice)

In [7]:
import pandas as pd
import numpy as np
# One-dimensional slicing: the start index is included, the stop index is
# not, so 1:3 selects the elements at positions 1 and 2.
d1_array = np.array([1, 2, 3, 4, 5])
print(d1_array)
print(d1_array[1:3])

# Two-dimensional indexing is written [row, column].
d2_array = np.array(
    [[1, 2, 3, 4, 5],
     [6, 7, 8, 9, 10]]
)
print(d2_array)
print(d2_array[0, 3])  # single element at row 0, column 3

# Row 1, columns 2 and 3. Only this last expression is echoed by the
# notebook, hence the explicit print() calls above.
d2_array[1, 2:4]

array([8, 9])

### 数据帧(DataFrame)
-   列操作
-   行操作

In [None]:
import pandas as pd
import numpy as np
# Build a DataFrame from a dict mapping column name -> column values.
# Relies on `sample_array` created in the first code cell of the notebook.
# NOTE(review): 'clo3' looks like a typo for 'col3'. It is kept because every
# query below uses it and later cells may as well - confirm before renaming.
sample_df = pd.DataFrame({
    'col1': sample_array,
    'col2': sample_array * 2,
    'clo3': ["A", "B", "C", "D", "E"]
})
# A bare `sample_df` in the middle of a cell displays nothing, so print it
# like every other result in this cell.
print(sample_df)

# --- Column operations ---
# A single column can be read with the dot operator (sample_df.col2) or with
# square brackets (sample_df["col2"]); both refer to the same column.
# A list of names inside the brackets selects several columns at once.
print(sample_df[["col2", "clo3"]])

# Drop the named column; sample_df itself is left unchanged
# (col1 is still queried further down).
print(sample_df.drop(columns="col1"))

# --- Row operations ---
# First n rows.
print(sample_df.head(n=5))

# query() extracts rows more flexibly; the expression string works much like
# a filter condition in a database query.
print(sample_df.query('col2 == 8'))
print(sample_df.query('clo3 == "A" | clo3 == "E"'))
print(sample_df.query('clo3 == "A" & col1 ==1'))

# Note the double square brackets: a list of column names applied to the
# result of query() picks columns from the filtered rows.
print(sample_df.query('clo3 == "A" ')[['col1','col2','clo3']])

### 连接数据帧

In [None]:
import pandas as pd
import numpy as np
# Two small frames sharing the same column names.
df_1 = pd.DataFrame(
    {
        'col1': np.array([1, 2, 3]),
        'col2': np.array(["A", "B", "C"]),
    }
)
df_2 = pd.DataFrame(
    {
        'col1': np.array([4, 5, 6]),
        'col2': np.array(["D", "E", "F"]),
    }
)

# Vertical concatenation (axis=0 is the default): the rows of df_2 are
# placed below the rows of df_1.
print(pd.concat(objs=[df_1, df_2], axis=0))

# Horizontal concatenation: the columns of df_2 are placed to the right of
# the columns of df_1.
temp = pd.concat(objs=[df_1, df_2], axis="columns")

print(temp)

### 序列(Series)

In [40]:
# The table as a whole is a DataFrame.
print(type(sample_df))

# A single column taken out of it is a Series; sample_df.col1 and
# sample_df['col1'] are two spellings of the same access. This result is
# printed explicitly because a bare expression in the middle of a cell is
# not displayed.
print(type(sample_df['col1']))

# .values hands back the column's data as a NumPy array.
type(sample_df.col1.values)

<class 'pandas.core.frame.DataFrame'>


numpy.ndarray

## 函数文档（类似说明书）

In [42]:
# Show the built-in documentation for DataFrame.query. How you look things up
# is a matter of personal preference; searching online works just as well.
help(sample_df.query)

Help on method query in module pandas.core.frame:

query(expr, inplace=False, **kwargs) method of pandas.core.frame.DataFrame instance
    Query the columns of a DataFrame with a boolean expression.
    
    Parameters
    ----------
    expr : str
        The query string to evaluate.
    
        You can refer to variables
        in the environment by prefixing them with an '@' character like
        ``@a + b``.
    
        You can refer to column names that are not valid Python variable names
        by surrounding them in backticks. Thus, column names containing spaces
        or punctuations (besides underscores) or starting with digits must be
        surrounded by backticks. (For example, a column named "Area (cm^2) would
        be referenced as `Area (cm^2)`). Column names which are Python keywords
        (like "list", "for", "import", etc) cannot be used.
    
        For example, if one of your columns is called ``a a`` and you want
        to sum it with ``b``, your quer