<font size=17>Pandas 学习</font> 

# 数据与结构 Series

## Series
一个数列有<br>
name<br>index<br>value


In [1]:
# 引入pandas
import pandas as pd
import numpy as np

### 构建Series

#### 不同构建Series的方法

In [2]:
# Build from a list-like sequence (here a range of the integers 10 through 19)
pd.Series(range(10,20))

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [3]:
# np.array 构建
pd.Series(np.random.rand(5))

0    0.339980
1    0.192890
2    0.330426
3    0.055550
4    0.336929
dtype: float64

In [4]:
# Build from a dict: the keys become the index labels, the values become the data
d = dict(a=3, b=8, c=7)
pd.Series(d)

a    3
b    8
c    7
dtype: int64

#### 构建时候指定索引

In [5]:
pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

a    0.459097
b    0.529160
c    0.194011
d    0.271567
e    0.489065
dtype: float64

### 数据预览

In [6]:
# 创建一个object
ser_obj = pd.Series(np.random.rand(100))

#### head()
head(sth)<br>
tail(sth)<br>
sth = 需要浏览的行数（省略时默认为 5）

In [7]:
ser_obj.head(10)

0    0.690233
1    0.010928
2    0.222797
3    0.566556
4    0.558298
5    0.442110
6    0.981741
7    0.005693
8    0.488966
9    0.533852
dtype: float64

In [8]:
ser_obj.tail(5)

95    0.089047
96    0.513969
97    0.098939
98    0.589067
99    0.979225
dtype: float64

#### 获取索引

In [9]:
ser_obj.index

RangeIndex(start=0, stop=100, step=1)

In [10]:
ser_obj.values

array([0.6902329 , 0.01092829, 0.22279653, 0.56655626, 0.55829764,
       0.44210988, 0.98174119, 0.00569266, 0.48896578, 0.53385172,
       0.07950156, 0.59355725, 0.05972231, 0.26518649, 0.0347982 ,
       0.58183578, 0.77891203, 0.70110775, 0.40603905, 0.25853229,
       0.45051121, 0.55457968, 0.996899  , 0.04092633, 0.065435  ,
       0.23549315, 0.64205401, 0.76078499, 0.98072064, 0.07448388,
       0.89372814, 0.37163698, 0.48542141, 0.81718525, 0.99191129,
       0.04010345, 0.40480186, 0.22849647, 0.52075559, 0.79131468,
       0.02044446, 0.12268074, 0.1609067 , 0.45122207, 0.66450157,
       0.72598415, 0.70118537, 0.63653124, 0.73011183, 0.213973  ,
       0.30099398, 0.16558514, 0.44656959, 0.93219874, 0.54230661,
       0.24620901, 0.1391322 , 0.76078781, 0.63776276, 0.55089475,
       0.11673195, 0.54692916, 0.15531503, 0.72806569, 0.66137679,
       0.47560769, 0.06579034, 0.83691973, 0.87276615, 0.27515305,
       0.7698667 , 0.73972888, 0.51012932, 0.00969739, 0.82855

#### name 属性

In [11]:
# 构建一个Series 有名字
ser_obj = pd.Series(np.random.rand(100),name='rand_num')

In [12]:
ser_obj.head()

0    0.471710
1    0.535374
2    0.426856
3    0.204975
4    0.423917
Name: rand_num, dtype: float64

In [13]:
# Give the index a name
ser_obj.index.name = 'index'

In [14]:
ser_obj.head()

index
0    0.471710
1    0.535374
2    0.426856
3    0.204975
4    0.423917
Name: rand_num, dtype: float64

### 通过索引获得数据

#### 通过索引获得对应的value

In [15]:
# 创建一个Series
ser_obj2 = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

In [16]:
# 通过索引获得对应的value
ser_obj2['b']

0.7873745954849546

In [17]:
ser_obj2.loc['b']

0.7873745954849546

#### Series 可以被看成定长有序的字典， 通过 in 来判断索引（键）是否存在

In [18]:
'a' in ser_obj2

True

#### iloc
**通过位置获得数据而不是索引**

In [19]:
# Positional access: use .iloc explicitly. ser_obj2 has string labels ('a'..'e'),
# so plain ser_obj2[0] relies on a positional fallback that newer pandas deprecates.
ser_obj2.iloc[0]

0.8442261848908726

In [20]:
ser_obj.iloc[0]

0.47171034256295696

### 处理缺失的数据

In [21]:
countries = ['China','US','Japan',None]
pd.Series(countries)

0    China
1       US
2    Japan
3     None
dtype: object

In [22]:
numbers = [4,5,6,None]
pd.Series(numbers)

0    4.0
1    5.0
2    6.0
3    NaN
dtype: float64

# 数据与结构 DataFrame

类似于多维数组<br>
每列数据可以是不同的类型<br>
索引包含**行索引**和**列索引**


## DataFrame

In [23]:
# 引入pandas 和 numpy 包
import pandas as pd
import numpy as np

### 构建DataFrame
(1)通过numpy.array构建<br>
(2)通过python dictionary 构建

In [24]:
array = np.random.randn(5,4)

In [25]:
df_obj = pd.DataFrame(array)
df_obj

Unnamed: 0,0,1,2,3
0,-1.079952,0.875558,-0.686495,-0.126443
1,-0.754023,-0.006586,1.378875,-0.460231
2,-0.353218,0.88296,0.383345,-0.059912
3,-0.846384,0.274403,2.009182,-1.344112
4,-0.508657,0.561667,0.075048,-0.258652


In [26]:
# Source dict for a DataFrame: scalars, a Series, an ndarray and a list as column values
dict_data = dict(
    a=1,
    b=pd.Timestamp('20190101'),
    c=pd.Series(1, index=list(range(4)), dtype='float32'),
    d=np.array([3] * 4, dtype='int32'),  # the value 3 repeated four times
    e=['Python', 'Java', 'C++', 'C#'],
    f='something',
)

In [27]:
dict_data

{'a': 1,
 'b': Timestamp('2019-01-01 00:00:00'),
 'c': 0    1.0
 1    1.0
 2    1.0
 3    1.0
 dtype: float32,
 'd': array([3, 3, 3, 3]),
 'e': ['Python', 'Java', 'C++', 'C#'],
 'f': 'something'}

In [28]:
df_obj2 = pd.DataFrame(dict_data)
df_obj2

Unnamed: 0,a,b,c,d,e,f
0,1,2019-01-01,1.0,3,Python,something
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something
3,1,2019-01-01,1.0,3,C#,something


**数据大小不一样，也可以生成DataFrame**<br>
单个值（标量）会被扩展（广播）到与最长的列相同的长度；长度不同的列表或数组则不能混用<br>
**DataFrame 的数据类型，是按照列来形成的**

### 获得DataFrame的属性

#### 获得列名
与Series一样

In [29]:
df_obj2.columns

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

#### 获得索引

In [30]:
df_obj2.index

Int64Index([0, 1, 2, 3], dtype='int64')

#### 获得值

##### 获得所有的值

In [31]:
df_obj2.values

array([[1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'Python',
        'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'Java', 'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'C++', 'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'C#', 'something']],
      dtype=object)

##### 获得某一列的值

In [32]:
df_obj2['e']

0    Python
1      Java
2       C++
3        C#
Name: e, dtype: object

In [33]:
df_obj2.e # attribute access fails if the column name contains spaces or other characters that are not valid in a Python identifier

0    Python
1      Java
2       C++
3        C#
Name: e, dtype: object

In [34]:
type(df_obj2.e)

pandas.core.series.Series

#### head & tail

In [35]:
df_obj2.head(3)

Unnamed: 0,a,b,c,d,e,f
0,1,2019-01-01,1.0,3,Python,something
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something


In [36]:
df_obj2.tail(3)

Unnamed: 0,a,b,c,d,e,f
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something
3,1,2019-01-01,1.0,3,C#,something


### 修改数据

#### 增加数据列

In [37]:
df_obj2['g'] = range(4)

In [38]:
df_obj2

Unnamed: 0,a,b,c,d,e,f,g
0,1,2019-01-01,1.0,3,Python,something,0
1,1,2019-01-01,1.0,3,Java,something,1
2,1,2019-01-01,1.0,3,C++,something,2
3,1,2019-01-01,1.0,3,C#,something,3


#### 删除数据


##### drop 返回值是操作结果，原数据不会改变

In [39]:
df_obj2.drop(columns=['b','c']) 

Unnamed: 0,a,d,e,f,g
0,1,3,Python,something,0
1,1,3,Java,something,1
2,1,3,C++,something,2
3,1,3,C#,something,3


In [40]:
df_obj2

Unnamed: 0,a,b,c,d,e,f,g
0,1,2019-01-01,1.0,3,Python,something,0
1,1,2019-01-01,1.0,3,Java,something,1
2,1,2019-01-01,1.0,3,C++,something,2
3,1,2019-01-01,1.0,3,C#,something,3


##### del 对原数据进行修改

In [41]:
del df_obj2['a']
df_obj2

Unnamed: 0,b,c,d,e,f,g
0,2019-01-01,1.0,3,Python,something,0
1,2019-01-01,1.0,3,Java,something,1
2,2019-01-01,1.0,3,C++,something,2
3,2019-01-01,1.0,3,C#,something,3


# 数据与结构 Index

索引对象本身**不可被改变**（不能单独修改其中某个元素，但可以整体替换）<br>
<br>
***索引的种类***<br>(1)Index<br>(2)Int64Index 以整型作为index<br>(3)MultiIndex 层级索引，例如年份与月份<br>(4)DatetimeIndex 时间序列<br>

In [42]:
# 构建Series
ser_obj = pd.Series(range(10,20,2),index = ['a','b','c','d','e'])
ser_obj

a    10
b    12
c    14
d    16
e    18
dtype: int64

#### 查看index

In [43]:
ser_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [44]:
ser_obj.index[2] # reads the label at position 2; an Index cannot be modified element by element

'c'

#### 重置索引
reset_index()

In [45]:
ser_obj.reset_index(drop = True)
# Returns a new object with a fresh 0..n-1 index; ser_obj itself is not modified

0    10
1    12
2    14
3    16
4    18
dtype: int64

In [46]:
ser_obj.reset_index(drop = False)
# drop = False keeps the old index as a regular column named 'index'

Unnamed: 0,index,0
0,a,10
1,b,12
2,c,14
3,d,16
4,e,18


In [47]:
ser_obj

a    10
b    12
c    14
d    16
e    18
dtype: int64

In [48]:
ser_obj = pd.Series([2,4,5,9,1,6])

ser_obj.index = ['a','b','c','d','e','f']

In [49]:
ser_obj[ser_obj > 3]

b    4
c    5
d    9
f    6
dtype: int64

In [50]:
ser_obj['a':'c']

a    2
b    4
c    5
dtype: int64

## DataFrame索引操作

In [51]:
# Build a DataFrame from three Series that share the same field labels
country_fields = ['Name', 'Language', 'Area', 'Happiness Rank']
country1 = pd.Series(['China', 'Chinese', '9.597M km2', 79], index=country_fields)
country2 = pd.Series(['U.S', 'English', '9.834M km2', 14], index=country_fields)
country3 = pd.Series(['Australia', 'English', '7.692M km2', 9], index=country_fields)

# Each Series becomes one row; the row labels are the country codes
df = pd.DataFrame([country1, country2, country3], index=['CH', 'US', 'AU'])

In [52]:
df

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,China,Chinese,9.597M km2,79
US,U.S,English,9.834M km2,14
AU,Australia,English,7.692M km2,9


### 列索引
df_obj['label'] <br>
**不连续列索引** <br>
df_obj[['label1','label2']] <br>
**连续索引** <br>
df_obj.loc[:, 'label1':'label3'] # 按标签切片需要使用 loc，且首尾两端的列都会被包含

In [53]:
df['Name']

CH        China
US          U.S
AU    Australia
Name: Name, dtype: object

In [54]:
df[['Area','Name']]

Unnamed: 0,Area,Name
CH,9.597M km2,China
US,9.834M km2,U.S
AU,7.692M km2,Australia


### 行索引

df_obj.loc[&emsp;] &emsp; # 按标签（行索引的名称）选取<br> 
df_obj.iloc[&emsp;] &emsp; # 按位置（从 0 开始的整数）选取

In [55]:
df

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,China,Chinese,9.597M km2,79
US,U.S,English,9.834M km2,14
AU,Australia,English,7.692M km2,9


In [56]:
df.loc['CH'] 

Name                   China
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Name: CH, dtype: object

In [57]:
df.iloc[0]

Name                   China
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Name: CH, dtype: object

### 先行后列

In [58]:
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])

9.597M km2
9.597M km2


### 先列后行

In [59]:
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])

9.597M km2
9.597M km2
9.597M km2


### inplace 参数
默认 inplace = False, 表示将操作后的结果进行返回，对原始数据**不会产生影响** <br>
inplace = True, 没有返回值，在原始数据上进行操作，对原数据**会产生影响**

In [60]:
#复制一个 Data Frame
df2 = df.copy()

In [61]:
df.drop('Area',axis = 1)

Unnamed: 0,Name,Language,Happiness Rank
CH,China,Chinese,79
US,U.S,English,14
AU,Australia,English,9


In [62]:
df

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,China,Chinese,9.597M km2,79
US,U.S,English,9.834M km2,14
AU,Australia,English,7.692M km2,9


**上面对于df 没有修改，只是返回了一个新的DF，需要一个新的变量名字去接收**

In [63]:
df.drop('Area',axis = 1,inplace = True)

In [64]:
df

Unnamed: 0,Name,Language,Happiness Rank
CH,China,Chinese,79
US,U.S,English,14
AU,Australia,English,9


**inplace = True, 没有返回值，在原始数据上进行操作，对原数据会产生影响** <br>
就算用新的变量去接收，得到的也只是 None，原数据仍然会被修改

## 布尔遮罩
条件索引 <br>
设置布尔值，再放入df[]里面

### 条件索引（值）

In [65]:
df[df['Happiness Rank'] <= 20]

Unnamed: 0,Name,Language,Happiness Rank
US,U.S,English,14
AU,Australia,English,9


### 条件索引（字符包含）

In [66]:
# Build the boolean mask: True for the rows whose Language contains 'English'.
# The mask is computed once and stored; the next cell uses it to filter df.
condition = df['Language'].str.contains('English')

In [67]:
#返回值
df[condition]

Unnamed: 0,Name,Language,Happiness Rank
US,U.S,English,14
AU,Australia,English,9


In [68]:
df_obj = pd.DataFrame([[1,2,3],[2,1,1],[4,3,1],[5,1,2]],columns=['a','b','c'])
df_obj

Unnamed: 0,a,b,c
0,1,2,3
1,2,1,1
2,4,3,1
3,5,1,2


In [69]:
df_obj.loc[df_obj.loc[:,"c"]>1,:]

Unnamed: 0,a,b,c
0,1,2,3
3,5,1,2


# 运算与对齐

## Series 运算
pandas 可以对不同索引的对象进行算术计算 <br>
索引与数据的对应关系仍保持在数组运算的结果中 <br>
如果没对齐的位置补NaN

In [70]:
ser1 = pd.Series(range(10))
ser2 = pd.Series(range(5))
ser1

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [71]:
ser1 + ser2

0    0.0
1    2.0
2    4.0
3    6.0
4    8.0
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: float64

## DataFrame 运算
相同位置的数据进行计算**重合的地方计算**

### DataFrame间计算

In [72]:
df1 = pd.DataFrame(np.ones((3,3)),columns=['a','b','c'])
df1

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0


In [73]:
df2 = pd.DataFrame(np.ones((2,2)),columns=['a','b'])
df2

Unnamed: 0,a,b
0,1.0,1.0
1,1.0,1.0


In [74]:
df1 + df2

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


### Series 和 DataFrame 运算
Series 和 DataFrame 操作时， Series 被看作行数据（Index 被看作列）， 和 DataFrame中的每行数据进行计算

In [75]:
ser2 + df1
# ser2's index labels (0-4) and df1's column labels (a, b, c) do not overlap,
# so nothing is added: the result has the union of the labels and every cell is NaN

Unnamed: 0,a,b,c,0,1,2,3,4
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,


In [76]:
ser3 = pd.Series([1,2,3],index = ['a','b','c'])
ser3

a    1
b    2
c    3
dtype: int64

In [77]:
df1

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0


In [78]:
ser3 + df1

Unnamed: 0,a,b,c
0,2.0,3.0,4.0
1,2.0,3.0,4.0
2,2.0,3.0,4.0


In [79]:
ser1 = pd.Series([10,20,30,40,50,60])

df1=pd.DataFrame([[1,1,1]]*3)

In [80]:
df = ser1+df1
df

Unnamed: 0,0,1,2,3,4,5
0,11,21,31,,,
1,11,21,31,,,
2,11,21,31,,,


In [81]:
df = df1.add(ser1)
df

Unnamed: 0,0,1,2,3,4,5
0,11,21,31,,,
1,11,21,31,,,
2,11,21,31,,,


### 填补缺失值

#### fill_value 

In [82]:
ser1 = pd.Series([10,20,30,40,50,60])
ser2 = pd.Series([1,2,3])
ser1.add(ser2,fill_value=4)

0    11.0
1    22.0
2    33.0
3    44.0
4    54.0
5    64.0
dtype: float64

#### DataFrame.fillna 

In [83]:
df1=pd.DataFrame([ [1,1,1] ]*3)
df2=pd.DataFrame([ [2,2] ]*2)
df3 = df1 + df2
df3

Unnamed: 0,0,1,2
0,3.0,3.0,
1,3.0,3.0,
2,,,


In [84]:
df3 = df3.fillna(4)
df3

Unnamed: 0,0,1,2
0,3.0,3.0,4.0
1,3.0,3.0,4.0
2,4.0,4.0,4.0


# 函数应用

## map（）
将函数作用于一个**Series**的每一个元素<br>
类似于Python的高阶函数map()<br>

### Python 的高阶函数map()

In [85]:
lis= list(range(10))
lis

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [86]:
import math
result = map(math.sqrt,lis)
print(result)

<map object at 0x0000026D9BF9B148>


**返回一个object，操作未进行**，直到取出结果，才会执行操作

In [87]:
list(result)

[0.0,
 1.0,
 1.4142135623730951,
 1.7320508075688772,
 2.0,
 2.23606797749979,
 2.449489742783178,
 2.6457513110645907,
 2.8284271247461903,
 3.0]

### Pandas 中的map()函数
**只运用于Series**

In [88]:
ser = pd.Series(lis)
ser

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [89]:
ser.map(np.sqrt)

0    0.000000
1    1.000000
2    1.414214
3    1.732051
4    2.000000
5    2.236068
6    2.449490
7    2.645751
8    2.828427
9    3.000000
dtype: float64

### 自定义函数

#### 简短lambda

In [90]:
ser.map(lambda x:x **2 + 1)

0     1
1     2
2     5
3    10
4    17
5    26
6    37
7    50
8    65
9    82
dtype: int64

#### 复杂传入新的function name

In [91]:
# Define a named function to pass to Series.map
def func(x):
    """Return twice the cube of ``x``, plus one."""
    return (x ** 3) * 2 + 1

In [92]:
ser.map(func)

0       1
1       3
2      17
3      55
4     129
5     251
6     433
7     687
8    1025
9    1459
dtype: int64

#### 拼接

In [94]:
ser = pd.Series(list("abcde"))
ser

0    a
1    b
2    c
3    d
4    e
dtype: object

##### 拼接方法一

In [95]:
ser.map(lambda x: ".".join([x,"com"]))

0    a.com
1    b.com
2    c.com
3    d.com
4    e.com
dtype: object

##### 拼接方法二

In [98]:
result = map(lambda x: x + ".com", ser)
list(result)

['a.com', 'b.com', 'c.com', 'd.com', 'e.com']

## apply() & applymap()
可以运用到DataFrame

apply <br>
axis = 0 每一列求，返回一行 <br>
axis = 1 每一行求，返回一列 <br>
applymap() <br>
对每一个数据处理

### apply()

In [103]:
df = pd.DataFrame(np.arange(10).reshape(5,2),columns = ['col_1','col_2'])
df

Unnamed: 0,col_1,col_2
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9


In [106]:
df.apply(np.sum)

col_1    20
col_2    25
dtype: int64

In [105]:
df.apply(np.sum,axis = 1)

0     1
1     5
2     9
3    13
4    17
dtype: int64

### applymap()

In [114]:
# Apply the lambda element-wise to every cell of the DataFrame.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map — confirm the pandas version in use before switching.
df.applymap(lambda x : x**2)

Unnamed: 0,col_1,col_2
0,0,1
1,4,9
2,16,25
3,36,49
4,64,81


In [125]:
staff_df = pd.DataFrame([{'姓名': '张三', '部门': '研发部'},
                         {'姓名': '李四', '部门': '财务部'},
                         {'姓名': '赵六', '部门': '市场部'}])
staff_df

Unnamed: 0,姓名,部门
0,张三,研发部
1,李四,财务部
2,赵六,市场部


In [126]:
staff_df['姓名'].apply(lambda x:x[0])

0    张
1    李
2    赵
Name: 姓名, dtype: object

In [128]:
staff_df["姓"] = staff_df['姓名'].apply(lambda x:x[0])
staff_df

Unnamed: 0,姓名,部门,姓
0,张三,研发部,张
1,李四,财务部,李
2,赵六,市场部,赵


In [130]:
df = pd.DataFrame(np.arange(15).reshape(5,3), columns = ['a','b','c'])
df

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14


In [133]:
df.loc[2,:].apply(lambda x: x**2)

a    36
b    49
c    64
Name: 2, dtype: int64