# Python数据分析（二）：基本数据结构


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## numpy.array

### 创建ndarray

In [None]:
# Build a 2-D array from a nested list (each inner list becomes one row)
mylist = [[1,2,3],[4,5,6],[7,8,9]]
myarray = np.array(mylist)
myarray

In [None]:
# Special arrays
# 4x5 array filled with zeros
np.zeros([4,5])

In [None]:
# 2x4x3 array filled with ones
np.ones([2,4,3])

In [None]:
np.empty(5)  # only allocates space; the values are not initialized

In [None]:
# 5x5 identity matrix
np.eye(5)

In [None]:
# Integers 0 through 13
np.arange(14)

### ndarray数据类型

In [None]:
# dtype of the whole array
myarray.dtype

In [None]:
# A single element is a NumPy scalar and exposes a dtype as well
myarray[0,0].dtype

In [None]:
type(myarray[0,0])

In [None]:
# Request the element type explicitly at construction time
myarray2 = np.array(mylist, dtype=np.float64)

In [None]:
myarray2.dtype

In [None]:
# dtype=object stores arbitrary Python objects. The `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, so use the builtin `object`.
myarray3 = np.array(mylist, dtype=object)

In [None]:
# dtype of the object array
myarray3.dtype

In [None]:
# Inspect the type of one element of the object array
type(myarray3[0,0])

### 索引和切片

In [None]:
# Indexing: a single index selects a row
myarray[0]

In [None]:
# Row 0, column 1 in one indexing operation
myarray[0,1]

In [None]:
# Same element via two chained lookups (row first, then the item within it)
myarray[0][1]

In [None]:
# Slicing: rows 1 up to (not including) 2; the result is still 2-D
myarray[1:2]

In [None]:
# Slice rows and columns at the same time
myarray[1:2, 1:2]

In [None]:
# All rows, column 1 kept as a 2-D column
myarray[:,1:2]

In [None]:
# Boolean indexing: keep only the elements greater than 5
myarray[myarray>5]

In [None]:
# Fancy indexing: pick columns by a list of positions; repeats are allowed
myarray[:,[0,0,1,0,2]]

### 基本运算和通用函数

In [None]:
# Array with scalar: the operation is applied to every element
myarray*3

In [None]:
myarray/3

In [None]:
3/myarray

In [None]:
myarray+3

In [None]:
myarray-3

In [None]:
myarray**2

In [None]:
# Array with array: operations are element-wise (`*` is not a matrix product)
myarray+myarray

In [None]:
myarray*myarray

In [None]:
myarray**myarray

In [None]:
# General-purpose functions: mean over all elements
np.mean(myarray)

In [None]:
# Equivalent instance method
myarray.mean()

常用的通用函数：
- abs
- sqrt
- exp
- log
- sign

### 排序

In [None]:
# Small unsorted array used for the sorting demo
myarray_new = np.array([[4,8],[7,3]])
myarray_new

In [None]:
# ndarray.sort() sorts in place along the last axis (each row here)
myarray_new.sort()
myarray_new

### 随机数

In [None]:
# 4x5 array of draws from the standard normal distribution; no seed is set,
# so the values differ on every run
np.random.normal(size=(4,5))

## pandas.Series

### 创建Series

In [None]:
# From a list (or array), with explicit index labels
series = pd.Series([4, 5, 2, 5, 5], index=list('abcde'))
series

In [None]:
# From a scalar: the value is repeated for every index label
series = pd.Series(4, index=list('abcde'))
series

In [None]:
# From a dict: keys become the index labels, values become the data
series = pd.Series(dict(a=4, b=5, c=2, d=5, e=5), name='name')
series

### 基本属性

In [None]:
# index
series.index

In [None]:
# Change the index by assignment (this modifies `series` itself)
series.index = range(5)
series

In [None]:
# Another way to change the index: rename() with a function.
# The result is only displayed, not assigned back to `series`.
series.rename(lambda x:x*2)

In [None]:
# index name
series.index.name = 'index'
series

In [None]:
# name
series.name = 'value'
series

In [None]:
# Another way to change the name: rename() with a scalar.
# Again the result is only displayed, not assigned.
series.rename('myvalue')

### 索引

In [None]:
# A Series behaves like a special 1-D ndarray: fetch the element at position 2.
# `.iloc` is explicitly positional; plain `series[2]` is a label lookup and
# only works while the index holds integers (it was set to range(5) earlier).
series.iloc[2]

In [None]:
# Like an ordered dict: rebuild the Series with string labels, then look up by label
series = pd.Series({'a':4,'b':5,'c':2,'d':5,'e':5}, name='name')
series['c']

### 基本运算

In [None]:
# Summary statistics
series.describe()

In [None]:
# Same as for a 1-D ndarray
series.sum()

In [None]:
# NumPy functions can be applied to a Series directly
np.exp(series)

### 基本函数

In [None]:
# Sorting
# Ascending
series.sort_values()

In [None]:
# Descending
series.sort_values(ascending=False)

In [None]:
# Head: first 2 entries
series.head(2)

In [None]:
# Tail: last 2 entries
series.tail(2)

In [None]:
# sort + head: the 2 largest values
series.nlargest(2)

In [None]:
# sort + tail: the 3 smallest values
series.nsmallest(3)

In [None]:
# apply(): call the function on every value
series.apply(lambda x:x+4)

## pandas.DataFrame

### 创建DataFrame

In [None]:
# dict of dict: outer keys are the columns, inner keys are the row labels
data = dict(
    one=dict(a=1, b=3),
    two=dict(a=2, b=4),
)
pd.DataFrame(data)

In [None]:
# dict of list: each list is one column; row labels are passed separately
data = dict(one=[1, 3], two=[2, 4])
pd.DataFrame(data, index=list('ab'))

In [None]:
# record list/array: each inner list is one row
data = [
    [1, 2],
    [3, 4],
]
pd.DataFrame(data, columns=['one', 'two'], index=list('ab'))

In [None]:
# list of dict: each dict is one row, its keys are the column names
data = [dict(one=1, two=2), dict(one=3, two=4)]
pd.DataFrame(data, index=list('ab'))

小结：dict在外，外层是列，里层每个单位是一列数据；list/array在外，外层是行，里层每个单位是一行数据

### 基本属性

In [None]:
# Load the sample data; the path is relative to the working directory
df = pd.read_csv('data/sample1.csv')

In [None]:
# index
df.index

In [None]:
# Set the index: use the 'goodsID' column as row labels.
# NOTE: re-running this cell without reloading df fails, because 'goodsID'
# is no longer a column after the first run.
df = df.set_index('goodsID')
df.head()

In [None]:
# reset_index() turns the index back into a regular column (result not assigned)
df.reset_index().head()

In [None]:
# columns
df.columns

In [None]:
# Change the columns by assignment: relabel them 0..n-1
df2 = pd.read_csv('data/sample2.csv')
df2.columns = range(len(df2.columns))
df2.head()

In [None]:
# rename() relabels 'goodsName' to 'name'; the result is only displayed, df is not reassigned
df.rename(columns = {'goodsName':'name'}).head()

### 索引

In [None]:
# Column selection: a single label
df['price'].head()

In [None]:
# Column selection: a list of labels
df[['price','monthly_sales']].head()

In [None]:
# Row selection by integer position
df.iloc[1]

In [None]:
# Row selection by index label
df.loc[44466463444]

In [None]:
# Single element: row label plus column label
df.loc[44466463444,'price']

In [None]:
# Boolean indexing: rows whose price is greater than 25
df[df['price']>25]

### 基本运算

In [None]:
# Summary statistics
df.describe()

In [None]:
# Column means. Since pandas 2.0 non-numeric columns are no longer dropped
# silently, so a bare df.mean() raises TypeError if any column holds text
# (presumably 'goodsName' does - confirm against the CSV).
df.mean(numeric_only=True)

In [None]:
# NumPy functions also accept a DataFrame. Pass axis=0 explicitly: np.mean
# forwards its default axis=None to DataFrame.mean, which since pandas 2.0
# averages over both axes and returns one scalar instead of per-column means.
np.mean(df[['price','monthly_sales']], axis=0)

### 基本函数

In [None]:
# Sorting
# Ascending: by price, then by monthly_sales
df.sort_values(['price','monthly_sales']).head()

In [None]:
# Descending
df.sort_values(['price','monthly_sales'], ascending=False).head()

In [None]:
# Head: first 3 rows
df.head(3)

In [None]:
# Tail: last 3 rows
df.tail(3)

In [None]:
# sort + head: the 5 rows with the largest price
df.nlargest(5,'price')

In [None]:
# sort + tail: the 4 rows with the smallest 'comments' value
df.nsmallest(4,'comments')

In [None]:
# apply()
# Column-wise (default axis): the function receives each column
df[['price','monthly_sales','comments']].apply(lambda x: sum(x))

In [None]:
# Row-wise apply (axis=1): the function receives each row
df[['price','monthly_sales','comments']].apply(lambda x: sum(x), axis=1).head()

## 作业

### 要求
1. 用numpy的随机数模块随机生成一个26*4的数组，数据范围是（0,1），且服从卡方分布；选取其中大于0.5的全部数据；
2. 把该数组转换成DataFrame，行索引为26个字母，列索引为'one','two','three','four'；取出按two排序前5位的行；
3. 取出one列，并计算该列的和、中值和方差。

### 提示
1. 部分需要的函数不在Note里，请查询官方文档寻找合适的函数；
2. 作业提交到GitHub上，并附上仓库链接。
