In [2]:
### 为什么要学习pandas -> 因为pandas含有使得数据分析工作变得更简单更快的高级数据结构和操作工具
### pandas 是基于 Numpy 来进行构建的，以让Numpy为中心的应用变得更加的简单

In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
### Series类型说明
### 其实Series是一种类似于一维数组的对象，它是由一组数据以及一组与之相关的数据标签组成(索引)。仅由一组数据即可产生最简单的Series

In [3]:
obj = Series([1,2,3,4,5])
print(obj)
print(obj.values)
print(obj.index)

0    1
1    2
2    3
3    4
4    5
dtype: int64
[1 2 3 4 5]
RangeIndex(start=0, stop=5, step=1)


In [4]:
### 自定义索引
obj2 = Series(['a', 'b', 'c', 'd', 'e'], index=[11,22,33,44,55])
print(obj2)

11    a
22    b
33    c
44    d
55    e
dtype: object


In [5]:
### 我们也可以把Series当作字典来使用
data = {'a':10000, 'b':20000, 'c':30000}
obj3 = Series(data)
print(obj)
keys = ['a', 'c']
obj4 = Series(data, index=keys)
print(obj4)

0    1
1    2
2    3
3    4
4    5
dtype: int64
a    10000
c    30000
dtype: int64


In [6]:
### pandas对缺失数据的处理
data2 = {'a':None, 'b':20000, 'c':30000}
obj5 = Series(data2)
print(obj5)

a        NaN
b    20000.0
c    30000.0
dtype: float64


In [7]:
pd.isnull(obj5)

a     True
b    False
c    False
dtype: bool

In [8]:
pd.notnull(obj5)

a    False
b     True
c     True
dtype: bool

In [9]:
data3 = {'Lilei':None, 'HanMeiMei':25, 'Tony':None, 'Jack':50}
obj6 = Series(data3)
obj6.name = 'NameAndAge'
obj6.index.name = 'xingming'
print(obj6)

xingming
Lilei         NaN
HanMeiMei    25.0
Tony          NaN
Jack         50.0
Name: NameAndAge, dtype: float64


In [10]:
### DataFrame类型
### DataFrame是一个表格型的数据结构 - 就是个Table，它含有一组有序的列。每列可以是不同值的类型，数值，字符串，布尔值都可以
### DataFrame本身有行有索引，也有列索引
### DataFrame也可以理解成是由Series组成的一个字典

In [11]:
# 构建一个DataFrame
data = {
    '60年代':['狗子', '嘎子', '秀儿'],
    '70年代':['卫国', '建国', '爱国'],
    '80年代':['李雷', '韩梅梅', '张伟']
}
frame_data = DataFrame(data)
print(frame_data)
print(frame_data['70年代'])

  60年代 70年代 80年代
0   狗子   卫国   李雷
1   嘎子   建国  韩梅梅
2   秀儿   爱国   张伟
0    卫国
1    建国
2    爱国
Name: 70年代, dtype: object


In [12]:
import numpy as np
dates = pd.date_range('20190301', periods=6)
print(dates)

DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')


In [13]:
df = pd.DataFrame(np.random.rand(6,4), index = dates, columns = list('ABCD'))
print(df)

                   A         B         C         D
2019-03-01  0.911092  0.119234  0.264810  0.625520
2019-03-02  0.751258  0.780732  0.894685  0.344767
2019-03-03  0.272230  0.362655  0.854664  0.360249
2019-03-04  0.530003  0.219863  0.769741  0.406375
2019-03-05  0.987469  0.457463  0.583420  0.911124
2019-03-06  0.846215  0.795985  0.097667  0.438068


In [14]:
df.T

Unnamed: 0,2019-03-01,2019-03-02,2019-03-03,2019-03-04,2019-03-05,2019-03-06
A,0.911092,0.751258,0.27223,0.530003,0.987469,0.846215
B,0.119234,0.780732,0.362655,0.219863,0.457463,0.795985
C,0.26481,0.894685,0.854664,0.769741,0.58342,0.097667
D,0.62552,0.344767,0.360249,0.406375,0.911124,0.438068


In [15]:
df['20190301':'20190303']

Unnamed: 0,A,B,C,D
2019-03-01,0.911092,0.119234,0.26481,0.62552
2019-03-02,0.751258,0.780732,0.894685,0.344767
2019-03-03,0.27223,0.362655,0.854664,0.360249


In [16]:
df.loc['20190301':'20190303', ['A', "B"]] ### 对行和列同时进行筛选

Unnamed: 0,A,B
2019-03-01,0.911092,0.119234
2019-03-02,0.751258,0.780732
2019-03-03,0.27223,0.362655


In [17]:
df.at[dates[0], 'A']

0.9110922290452289

In [18]:
df.head(2)

Unnamed: 0,A,B,C,D
2019-03-01,0.911092,0.119234,0.26481,0.62552
2019-03-02,0.751258,0.780732,0.894685,0.344767


In [19]:
df.tail(3)

Unnamed: 0,A,B,C,D
2019-03-04,0.530003,0.219863,0.769741,0.406375
2019-03-05,0.987469,0.457463,0.58342,0.911124
2019-03-06,0.846215,0.795985,0.097667,0.438068


In [20]:
### DataFrame 构造函数能够接收哪些数据类型呢
1.二维numpy, array
2.由数组，列表或者元组 组成的字典
3.由series组成的字典
4.由字典组成的字典
5.字典或Series的列表
6.由列表或者元组组成的列表
7.另一个DataFeam
8.其他

SyntaxError: invalid syntax (4014698354.py, line 2)

In [21]:
### Pandas 重新索引 reindex

In [22]:
objA = Series([4.5,9.8,-1.2], index=['a','b','c'])
print(objA)
jobA = objA.reindex(['a','b','c','e','f'])
print(jobA)

a    4.5
b    9.8
c   -1.2
dtype: float64
a    4.5
b    9.8
c   -1.2
e    NaN
f    NaN
dtype: float64


In [23]:
obj.reindex(['a','b','c','e','f'], fill_value=0)

a    0
b    0
c    0
e    0
f    0
dtype: int64

In [24]:
objB = Series([4.5,9.8,-1.2], index=[0, 2, 4])
o = objB.reindex(range(6), method='ffill')
print(objB)
print(o)

0    4.5
2    9.8
4   -1.2
dtype: float64
0    4.5
1    4.5
2    9.8
3    9.8
4   -1.2
5   -1.2
dtype: float64


In [25]:
### 算数运算和数据对齐
### pandas的一个重要功能，就是可以对不同索引的对象进行算数运算，在将对象相加时，如果存在不同的索引对，则结果的索引就是该索引的并集

In [26]:
d1 = Series([1.3, 1.5, 1.8, 3.5], index=['a', 'b', 'c', 'd'])
d2 = Series([-1.3, -1.5, -1.8, -3.5, -8.0], index=['a', 'b', 'c', 'd', 'e'])
d1+d2

a    0.0
b    0.0
c    0.0
d    0.0
e    NaN
dtype: float64

In [27]:
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('abc'), index=[1,2,3])
print(df1)
df2 = DataFrame(np.arange(16).reshape((4,4)), columns=list('cdef'), index=[1,2,3,4])
df1+df2

   a  b  c
1  0  1  2
2  3  4  5
3  6  7  8


Unnamed: 0,a,b,c,d,e,f
1,,,2.0,,,
2,,,9.0,,,
3,,,16.0,,,
4,,,,,,


In [28]:
df1.add(df2, fill_value=0) # 用0来填充不重叠的值 如果本身为空，那么还是空

Unnamed: 0,a,b,c,d,e,f
1,0.0,1.0,2.0,1.0,2.0,3.0
2,3.0,4.0,9.0,5.0,6.0,7.0
3,6.0,7.0,16.0,9.0,10.0,11.0
4,,,12.0,13.0,14.0,15.0


In [29]:
### DataFrame和Series之间的运算

In [30]:
frame = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=[1,2,3,4])
series = frame.loc[1] #选取frame中索引为1的一行数据
print(frame)
print(series)


   b   d   e
1  0   1   2
2  3   4   5
3  6   7   8
4  9  10  11
b    0
d    1
e    2
Name: 1, dtype: int64


In [31]:
frame-series # 一直向下广播相减

Unnamed: 0,b,d,e
1,0,0,0
2,3,3,3
3,6,6,6
4,9,9,9


In [32]:
seriesB = Series(range(3), index=list('bef'))
frame+seriesB # 相加时，没有就合并

Unnamed: 0,b,d,e,f
1,0.0,,3.0,
2,3.0,,6.0,
3,6.0,,9.0,
4,9.0,,12.0,


In [33]:
### 排序
### 根据条件对数据集进行排序

In [34]:
obj = Series(range(4), index=['d', 'e', 'a', 'b'])
print(obj)

d    0
e    1
a    2
b    3
dtype: int64


In [35]:
obj.sort_index()

a    2
b    3
d    0
e    1
dtype: int64

In [36]:
obj.sort_values()

d    0
e    1
a    2
b    3
dtype: int64

In [37]:
### 针对DataFrame, 根据任意一个轴上的索引进行排序
frame = DataFrame(np.arange(8).reshape((2,4)), index=['two', 'one'], columns=['c', 'd', 'a', 'b'])
frame

Unnamed: 0,c,d,a,b
two,0,1,2,3
one,4,5,6,7


In [38]:
frame.sort_index()

Unnamed: 0,c,d,a,b
one,4,5,6,7
two,0,1,2,3


In [39]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
two,2,3,0,1
one,6,7,4,5


In [40]:
frame = DataFrame({'b':[4,7,2,-1], 'a':[0,4,2,0]})
frame

Unnamed: 0,b,a
0,4,0
1,7,4
2,2,2
3,-1,0


In [41]:
frame.sort_values(by='b')

Unnamed: 0,b,a
3,-1,0
2,2,2
0,4,0
1,7,4


In [42]:
### 层次化索引
### 层次化索引是pandas的一项比较重要的功能。它能够让你在一个轴上拥有多个索引级别，另一种说法是它能以低维度的形式处理高维度的数据

In [43]:
data = Series(np.random.randn(10), index=[['a','a','a','b','b','b','c','c','d','d'], [1,2,3,4,5,6,7,8,1,2]])
data

a  1    0.226037
   2   -1.455741
   3    0.948430
b  4   -0.960568
   5    0.844041
   6   -0.969707
c  7   -0.126668
   8    2.462585
d  1    0.701303
   2    0.135230
dtype: float64

In [44]:
data.index # 索引的等级展示

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 4),
            ('b', 5),
            ('b', 6),
            ('c', 7),
            ('c', 8),
            ('d', 1),
            ('d', 2)],
           )

In [45]:
### 选取子集的操作
data['b']
data['b':'c']

b  4   -0.960568
   5    0.844041
   6   -0.969707
c  7   -0.126668
   8    2.462585
dtype: float64

In [46]:
### 内层子集索引选取
data[:,2]   # 第一层选择所有的元素，第二层索引中选择索引为2的元素

a   -1.455741
d    0.135230
dtype: float64

In [47]:
data.unstack()  # 从series生成一个新的DataFrame

Unnamed: 0,1,2,3,4,5,6,7,8
a,0.226037,-1.455741,0.94843,,,,,
b,,,,-0.960568,0.844041,-0.969707,,
c,,,,,,,-0.126668,2.462585
d,0.701303,0.13523,,,,,,


In [48]:
data.unstack().stack() #从之前的DataFrame又变形回来

a  1    0.226037
   2   -1.455741
   3    0.948430
b  4   -0.960568
   5    0.844041
   6   -0.969707
c  7   -0.126668
   8    2.462585
d  1    0.701303
   2    0.135230
dtype: float64

In [49]:
### 对于DataFrame, 每条轴(row or column)都可以有分层索引，各层也是都可以有名字的

In [50]:
frame_data = DataFrame(np.arange(12).reshape((4,3)), index=[['a','a','b','b'], [1,2,1,2]], columns=[['Black','Yellow','Blue'], ['Green','Red','Green']])
frame_data.index.names=['color1','color2']
frame_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Black,Yellow,Blue
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
color1,color2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [51]:
### DataFrame 类型层次化索引的操作
frame_data = DataFrame(np.arange(12).reshape((4,3)), index=[['a','a','b','b'], [1,2,1,2]], columns=[['Black','Yellow','Blue'], ['Green','Red','Green']])
frame_data


Unnamed: 0_level_0,Unnamed: 1_level_0,Black,Yellow,Blue
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [52]:
frame_data['Black']   # 对列进行筛选

Unnamed: 0,Unnamed: 1,Green
a,1,0
a,2,3
b,1,6
b,2,9


In [53]:
### pandas 文本格式处理
### read_csv: 从文件，url, 文件型对象中加载带分隔符的数据，默认分隔符号为逗号
### read_table: 从文件，url, 文件型对象中加载带分隔符的数据，默认分隔符号为制表符'\t'
### read_fwf: 读取固定宽列格式数据
### read_clipboard: 读取剪切板中的数据，可以看做是read_table的剪切板。可以用在将网页中的数据转换为表格中的数据时用到

In [None]:
### pandas读取Excel文件

In [None]:
excel = pd.read_excel('data_excel.xlsx')
pd.read_excel('data_excel.xlsx', sheet_name='work_sheet_2')
pl = excel.plot(kind='scatter', x='age', y='place').get_figure()
pl.savefig('1.jpg')

In [65]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

dates = pd.date_range('20200101', periods=6)
df = pd.DataFrame(np.random.rand(6,4), index=dates, columns=list('ABCD'))
print(df)

                   A         B         C         D
2020-01-01  0.120580  0.590954  0.044551  0.690010
2020-01-02  0.863991  0.636671  0.963686  0.101109
2020-01-03  0.511690  0.715066  0.627801  0.629779
2020-01-04  0.894260  0.818051  0.829116  0.789043
2020-01-05  0.786161  0.869860  0.765354  0.379541
2020-01-06  0.826243  0.254829  0.056221  0.379835


In [66]:
pl = df.plot(kind='scatter', x='A', y='B').get_figure()
pl.savefig('2.png')

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.