# 2.1---Pandas数据结构-Series：基本概念及创建
## “一维数组”Series

In [1]:
# Series数据结构
# Series是带有标签的一维数组，可以保存任何数据类型（整数、字符串、浮点数、Python对象等），轴标签统称为索引

import numpy as np
import pandas as pd
# 导入Numpy，Pandas模块

s = pd.Series(np.random.rand(5))        # Series的索引未必一定是数字，也可以是字母等等
print(s)
print(type(s))
# 查看数据，数据类型

print(s.index,type(s.index))     # index类似于一个生成器，要想输出可以用list()
print(s.values,type(s.values))
# .index查看series索引，类型为rangeindex
# .values查看series值，类型是ndarray

# 核心：series相比于ndarray，是一个自带索引index的数组 → 一维数组 + 对应索引
# 所以当只看series的值的时候，就是一个ndarray
# series和ndarray较相似，索引切片功能差别不大
# series和dict相比，series更像一个有顺序的字典（dict本身不存在顺序），其索引原理与字典相似（一个用key，一个用index）

0    0.544708
1    0.634912
2    0.480914
3    0.801161
4    0.913004
dtype: float64
<class 'pandas.core.series.Series'>
RangeIndex(start=0, stop=5, step=1) <class 'pandas.core.indexes.range.RangeIndex'>
[0.54470823 0.63491222 0.48091356 0.80116073 0.91300362] <class 'numpy.ndarray'>


In [2]:
# Series创建方法一：由字典创建，字典的key就是index，values就是values

dic = {'a':1, 'b':2, 'c':3, '4':4, '5': 5}
s =pd.Series(dic)
print(s,'\n',type(s))
# 注意key肯定是字符串，假如values类型不止一个会怎么样？---> dic = {'a':1, 'b':'hello', 'c':3, '4':4, '5':5}

4    4
5    5
a    1
b    2
c    3
dtype: int64 
 <class 'pandas.core.series.Series'>


In [3]:
# Series创建方法二：由数组创建（一维数组）

arr =np.random.rand(5)
s = pd.Series(arr)
print(arr,'\t',type(arr))
print(s,'\t',type(s))
# 默认index是从0开始，步长为1的数字

s = pd.Series(arr,index = list('abcde'),dtype = np.object)
print(s,'\n',type(s))
# index参数：设置index，长度保持一致
# dtype参数：设置数值类型

[0.39636219 0.80357757 0.53991881 0.38515742 0.04750525] 	 <class 'numpy.ndarray'>
0    0.396362
1    0.803578
2    0.539919
3    0.385157
4    0.047505
dtype: float64 	 <class 'pandas.core.series.Series'>
a     0.396362
b     0.803578
c     0.539919
d     0.385157
e    0.0475053
dtype: object 
 <class 'pandas.core.series.Series'>


In [4]:
# Series创建方法三：由标量创建

s = pd.Series(10, index = range(4))
print(s)
# 如果data是标量值，则必须提供索引。该值会重复，来匹配索引的长度。

0    10
1    10
2    10
3    10
dtype: int64


In [5]:
# Series名称属性：name

s1 = pd.Series(np.random.randn(5))
print(s1)
print("------------------------------------------")
s2 = pd.Series(np.random.randn(5), name = 'test')
print(s2)
print(s1.name, s2.name, type(s2.name))
# name为Series的一个参数，创建一个数组的名称
# .name方法：输出数组的名称，输出格式为str，如果没有定义输出名称，输出为None

s3 = s2.rename("hehehe")
print(s3)
print(s3.name,s2.name)
# .rename()方法：重命名一个数组的名称，并且新指向一个数组，原数组不变

0   -0.188485
1    1.076107
2   -1.228941
3   -0.161078
4   -0.907111
dtype: float64
------------------------------------------
0   -2.030903
1   -0.640124
2    0.953726
3    1.077311
4   -0.409585
Name: test, dtype: float64
None test <class 'str'>
0   -2.030903
1   -0.640124
2    0.953726
3    1.077311
4   -0.409585
Name: hehehe, dtype: float64
hehehe test


# 2.2---Pandas数据结构-Series：索引
## 位置下标 / 标签索引 / 切片索引 / 布尔型索引

In [6]:
# 位置下标，类似序列

s = pd.Series(np.random.rand(5))
print(s)
print(s[0], type(s[0]), s[0].dtype)
print(float(s[0]),type(float(s[0])))
print(s[-1])         # 事实证明会报错！！！                 
# 位置下标从0开始
# 输出结果为numpy.float格式
# 可以通过float()函数转换为Python float 格式
# numpy.float与float占用字节不同
# s[-1]结果如何？

0    0.690311
1    0.475359
2    0.485334
3    0.365693
4    0.901511
dtype: float64
0.6903105273643406 <class 'numpy.float64'> float64
0.6903105273643406 <class 'float'>


KeyError: -1

In [7]:
# 标签索引

s = pd.Series(np.random.rand(5),index = list('abcde'))
print(s)
print(s['a'], '\t', type(s['a']), '\t', s['a'].dtype)
# 方法类似下标索引，用[]表示，里面协商相应的index，注意，index是字符串

sci = s[list('abe')]
print(sci, '\t', type(sci), '\t', sci.dtype)
# 如果需要选择多个标签的值，用[[]]来表示（相当于[]中包含一个列表）
# 多标签索引结果是新的数组  注意：标签的顺序可以改变！！！

a    0.249218
b    0.168707
c    0.377836
d    0.457591
e    0.514422
dtype: float64
0.24921796798069018 	 <class 'numpy.float64'> 	 float64
a    0.249218
b    0.168707
e    0.514422
dtype: float64 	 <class 'pandas.core.series.Series'> 	 float64


In [8]:
# 切片索引

s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index = list('abcde'))
print(s1[1:4], '\t', s1[4])       # 下标
print(s2['a':'c'],'\t',s2['c']) #  标签
print(s2[0:3],'\t',s2[3])
print("-------------------------------------------------")
# 注意：用index做切片是末端包含

print(s2[:-1])
print(s2[::2])
# 下标索引做切片，和list写法一样

1    0.837221
2    0.539634
3    0.875737
dtype: float64 	 0.4989435351902991
a    0.705824
b    0.455990
c    0.750291
dtype: float64 	 0.7502914417297876
a    0.705824
b    0.455990
c    0.750291
dtype: float64 	 0.08594684319837054
-------------------------------------------------
a    0.705824
b    0.455990
c    0.750291
d    0.085947
dtype: float64
a    0.705824
c    0.750291
e    0.198375
dtype: float64


In [9]:
# 布尔型索引

s = pd.Series(np.random.rand(3)*100)
s[4] = None      # 添加一个空值，将空值写成np.NaN也是可以的
print(s)
bs1 = s > 50
bs2 = s.isnull()
bs3 = s.notnull()
print(bs1,type(bs1),bs1.dtype)
print(bs2,type(bs2),bs2.dtype)
print(bs3,type(bs3),bs3.dtype)
print("-----------------------------------------")
# 数组作出判断之后，返回的是有一个布尔值组成的新的数组
# .isnull() / .notnull() 方法判断是否为空值（None代表空值，NaN代表有问题的数值，两个都会识别为空值）

print(s[s > 50])
print(s[bs3])
# 布尔型索引方法：用[判断条件]来进行表示，其中判断条件可以是一条语句，或者也可以是一个布尔型数组！

0    17.8675
1    54.9716
2    52.1081
4       None
dtype: object
0    False
1     True
2     True
4    False
dtype: bool <class 'pandas.core.series.Series'> bool
0    False
1    False
2    False
4     True
dtype: bool <class 'pandas.core.series.Series'> bool
0     True
1     True
2     True
4    False
dtype: bool <class 'pandas.core.series.Series'> bool
-----------------------------------------
1    54.9716
2    52.1081
dtype: object
0    17.8675
1    54.9716
2    52.1081
dtype: object


# 2.3---Pandas数据结构-Series：基本技巧
## 数据查看 / 重新索引 / 对齐 / 添加、修改、删除值

In [10]:
# 数据查看

s = pd.Series(np.random.rand(50))
print(s.head(10))
print(s.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条

0    0.339713
1    0.841277
2    0.829044
3    0.053316
4    0.945122
5    0.297388
6    0.593136
7    0.887259
8    0.712797
9    0.402734
dtype: float64
45    0.097023
46    0.172263
47    0.751410
48    0.201015
49    0.417799
dtype: float64


In [11]:
# 重新索引reindex
# .reindex将会根据索引重新排序，如果当前索引不存在，则会引入缺失值

s = pd.Series(np.random.rand(3),index = list('abc'))
print(s)
s1 = s.reindex(list('cbad'))
print(s1)
# .reindex()中也是写列表
# 注意：这里的'd'索引不存在，所以值为NaN

s2 = s.reindex(list('cbad'),fill_value = 0)
print(s2)
# fill_value参数：填充缺失值的值

a    0.641504
b    0.238772
c    0.719023
dtype: float64
c    0.719023
b    0.238772
a    0.641504
d         NaN
dtype: float64
c    0.719023
b    0.238772
a    0.641504
d    0.000000
dtype: float64


In [12]:
# Series对齐

s1 = pd.Series(np.random.rand(3),index = ['Jack','Marry','Tom'])
s2 = pd.Series(np.random.rand(3),index = ['Wang','Jack','Marry'])
print(s1)
print(s2)
print(s1+s2)
# Series 和 ndarray 之间的主要区别是，Series 上的操作会根据标签自动对齐
# index顺序不会影响数值计算，以标签来计算
# 空值和任何值计算结果扔为空值

Jack     0.734995
Marry    0.933204
Tom      0.127558
dtype: float64
Wang     0.112632
Jack     0.011946
Marry    0.812446
dtype: float64
Jack     0.746941
Marry    1.745649
Tom           NaN
Wang          NaN
dtype: float64


In [13]:
# 删除：.drop

s = pd.Series(np.random.rand(5), index = list('ngjur'))
print(s)
s1 = s.drop('n')
s2 = s.drop(list('gj'))
print(s1)
print(s2)
print(s)
# drop 删除元素之后返回副本（inplace = False）

n    0.366744
g    0.937997
j    0.064086
u    0.115443
r    0.811429
dtype: float64
g    0.937997
j    0.064086
u    0.115443
r    0.811429
dtype: float64
n    0.366744
u    0.115443
r    0.811429
dtype: float64
n    0.366744
g    0.937997
j    0.064086
u    0.115443
r    0.811429
dtype: float64


In [14]:
# 添加

s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index = list('ngjur'))
print(s1)
print(s2)
s1[5] = 100
s2['a'] = 100
print(s1)
print(s2)
print("-------------------------------------------------")
# 直接通过下标索引/标签index添加值

s3 = s1.append(s2)
print(s3)
print(s1)
# 通过.append()方法，直接添加一个数组
# .append() 方法生成一个新的数组，不改变之前的数组

0    0.036616
1    0.748492
2    0.801879
3    0.157206
4    0.921338
dtype: float64
n    0.022153
g    0.732041
j    0.887380
u    0.700776
r    0.639713
dtype: float64
0      0.036616
1      0.748492
2      0.801879
3      0.157206
4      0.921338
5    100.000000
dtype: float64
n      0.022153
g      0.732041
j      0.887380
u      0.700776
r      0.639713
a    100.000000
dtype: float64
-------------------------------------------------
0      0.036616
1      0.748492
2      0.801879
3      0.157206
4      0.921338
5    100.000000
n      0.022153
g      0.732041
j      0.887380
u      0.700776
r      0.639713
a    100.000000
dtype: float64
0      0.036616
1      0.748492
2      0.801879
3      0.157206
4      0.921338
5    100.000000
dtype: float64


In [15]:
# 修改

s = pd.Series(np.random.rand(3),index = list('abc'))
print(s)
s['a'] = 100
s[list('bc')] = 200
print(s)
# 通过索引直接修改，类似序列

a    0.169388
b    0.025191
c    0.789319
dtype: float64
a    100.0
b    200.0
c    200.0
dtype: float64


# 2.4---Pandas数据结构DataFrame：基本概念及创建
## “二维数组”Dataframe：是一个表格型的数据结构，包含一组有序的列，其列的值类型可以是数值，字符串，布尔值等等
## Dataframe中的数据以一个或者多个二维块存放，不是列表，字典或者一维数组结构

In [16]:
# Dataframe数据结构
# Dataframe是一个表格型的数据结构，“带有标签的二维数组”
# Dataframe带有index（行标签）和columns（列标签）

data = {'name':['Jack','Tom','Mary'],
        'age':[18,19,20],
        'gender':['m','m','w']
       }
frame = pd.DataFrame(data)
print(frame)
print(type(frame))
print(frame.index, '\n该数据类型为：',type(frame.index))
print(frame.columns, '\n该数据类型为：',type(frame.columns))
print(frame.values, '\n该数据类型为：',type(frame.values))
# 查看数据，数据类型为Dataframe
# .index查看行标签
# .columns查看列标签
# .values查看值，数据类型为ndarray

   age gender  name
0   18      m  Jack
1   19      m   Tom
2   20      w  Mary
<class 'pandas.core.frame.DataFrame'>
RangeIndex(start=0, stop=3, step=1) 
该数据类型为： <class 'pandas.core.indexes.range.RangeIndex'>
Index(['age', 'gender', 'name'], dtype='object') 
该数据类型为： <class 'pandas.core.indexes.base.Index'>
[[18 'm' 'Jack']
 [19 'm' 'Tom']
 [20 'w' 'Mary']] 
该数据类型为： <class 'numpy.ndarray'>


In [17]:
# Dataframe 创建方法一：由数组/list组成的字典
# 创建方法：pd.DataFrame()

data1 = {
    'a':[1,2,3],
    'b':[3,4,5],
    'c':[5,6,7]
        }
data2 = {
    'one':np.random.rand(3),
    'two':np.random.rand(3)
}                           # 这里如果尝试 'two': np.random.rand(4)会怎么样？
print(data1)
print(data2)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print(df2)
# 注意：由数组/list构成的字典，创建Dataframe，cloumns为key，index为默认数字标签
# ！！！字典的长度必须保持一致！！！

df1 = pd.DataFrame(data1, columns = list('bcad'))
print(df1)
df1 = pd.DataFrame(data1, columns = list('bc'))
print(df1)
# columns参数：可以重新指定列的顺序，格式为list，如果现在数据中没有该列（比如'd'），则会产生NaN值
# 如果columns重新指定时候，列的数量可以小于原数据

df2 = pd.DataFrame(data2, index = ['f1','f2','f3'])  # 这里如果尝试  index = ['f1','f2','f3','f4'] 会怎么样？
print(df2)
# index参数：重新定义index，格式为list，长度必须保持一致

{'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}
{'one': array([0.28037544, 0.14699078, 0.80693857]), 'two': array([0.546477  , 0.72889235, 0.34602245])}
   a  b  c
0  1  3  5
1  2  4  6
2  3  5  7
        one       two
0  0.280375  0.546477
1  0.146991  0.728892
2  0.806939  0.346022
   b  c  a    d
0  3  5  1  NaN
1  4  6  2  NaN
2  5  7  3  NaN
   b  c
0  3  5
1  4  6
2  5  7
         one       two
f1  0.280375  0.546477
f2  0.146991  0.728892
f3  0.806939  0.346022


In [18]:
# Dataframe 创建方法二：由Series组成的字典

data1 = {
    'one':pd.Series(np.random.rand(2)),
    'two':pd.Series(np.random.rand(3))
}                                    # 没有设置index的Series
data2 = {
    'one':pd.Series(np.random.rand(2),index = list('ab')),
    'two':pd.Series(np.random.rand(3),index = list('abc'))
}
print(data1)
print(data2)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)             # 若在这里更改index，并不会更改index，而是会指向新的标签！！！务必注意！！！
print(df1)
print(df2)
# 由Seris组成的字典 创建Dataframe，columns为字典key，index为Series的标签（如果Series没有指定标签，则是默认数字标签）
# Series可以长度不一样，生成的Dataframe会出现NaN值

{'one': 0    0.491075
1    0.422089
dtype: float64, 'two': 0    0.756614
1    0.831410
2    0.871945
dtype: float64}
{'one': a    0.397379
b    0.397216
dtype: float64, 'two': a    0.501213
b    0.518878
c    0.211261
dtype: float64}
        one       two
0  0.491075  0.756614
1  0.422089  0.831410
2       NaN  0.871945
        one       two
a  0.397379  0.501213
b  0.397216  0.518878
c       NaN  0.211261


In [19]:
# Dataframe 创建方法三：通过二维数组直接创建

ar = np.random.rand(9).reshape(3,3)
print(ar)
df1 = pd.DataFrame(ar)
df2 = pd.DataFrame(ar, index = list('abc'), columns = ['one','two','three'])         # 可以尝试一下index或columns长度不等于已有数组的情况
print(df1)
print(df2)
# 通过二维数组直接创建Dataframe，得到一样形状的结果数据，如果不指定index和columns，两者均返回默认数字格式
# index和colunms指定长度与原数组保持一致

[[0.52176652 0.59638681 0.38100582]
 [0.38380165 0.38048098 0.75275102]
 [0.33906418 0.58788342 0.33444369]]
          0         1         2
0  0.521767  0.596387  0.381006
1  0.383802  0.380481  0.752751
2  0.339064  0.587883  0.334444
        one       two     three
a  0.521767  0.596387  0.381006
b  0.383802  0.380481  0.752751
c  0.339064  0.587883  0.334444


In [20]:
# Dataframe 创建方法四：由字典组成的列表

data = [{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
print(data)
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index = ['a','b'])
df3 = pd.DataFrame(data, columns = ['one','two'])
print(df1)
print(df2)
print(df3)
# 由字典组成的列表创建Dataframe，columns为字典的key，index不做指定则为默认数组标签
# colunms和index参数分别重新指定相应列及行标签

[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
   one  three  two
0    1    NaN    2
1    5   20.0   10
   one  three  two
a    1    NaN    2
b    5   20.0   10
   one  two
0    1    2
1    5   10


In [21]:
# Dataframe 创建方法五：由字典组成的字典

data = {'Jack':{'math':90,'english':89,'art':78},
       'Marry':{'math':82,'english':95,'art':92},
       'Tom':{'math':78,'english':67}}
df1 = pd.DataFrame(data)
print(df1)
# 由字典组成的字典创建Dataframe，columns为字典的key，index为子字典的key

df2 = pd.DataFrame(data, columns = ['Jack','Tom','Bob'])
df3 = pd.DataFrame(data, index = ['a','b','c'])
print(df2)
print(df3)
# columns参数可以增加和减少现有列，如出现新的列，值为NaN
# index在这里和之前不同，并不能改变原有index，如果指向新的标签，值为NaN （非常重要！）

         Jack  Marry   Tom
art        78     92   NaN
english    89     95  67.0
math       90     82  78.0
         Jack   Tom  Bob
art        78   NaN  NaN
english    89  67.0  NaN
math       90  78.0  NaN
   Jack  Marry  Tom
a   NaN    NaN  NaN
b   NaN    NaN  NaN
c   NaN    NaN  NaN


# 2.5---Pandas数据结构Dataframe：索引
## Dataframe既有行索引也有列索引，可以被看做由Series组成的字典（共用一个索引）
## 选择列 / 选择行 / 切片 / 布尔判断

In [22]:
# 选择行与列

df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                 index = ['one','two','three'],
                 columns = ['a','b','c','d']
                 )
print(df)
data1 = df['a']
data2 = df[['a','c']]
print(data1,type(data1))
print(data2,type(data2))
print("---------------------------------------------------")
# 按照列名选择列，只选择一列输出Series，选择多列输出Dataframe

data3 = df.loc['one']
data4 = df.loc[['one','two']]
print(data3,type(data3))
print(data4,type(data4))
# 按照index选择行，只选择一行输出Series，选择多行输出Dataframe

               a          b          c          d
one    83.779371  56.285376  86.873265  93.089983
two    16.055004  23.748647  31.772143  15.622964
three  24.665789  18.859390  38.126920   6.847767
one      83.779371
two      16.055004
three    24.665789
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
               a          c
one    83.779371  86.873265
two    16.055004  31.772143
three  24.665789  38.126920 <class 'pandas.core.frame.DataFrame'>
---------------------------------------------------
a    83.779371
b    56.285376
c    86.873265
d    93.089983
Name: one, dtype: float64 <class 'pandas.core.series.Series'>
             a          b          c          d
one  83.779371  56.285376  86.873265  93.089983
two  16.055004  23.748647  31.772143  15.622964 <class 'pandas.core.frame.DataFrame'>


In [23]:
# df[]--- 选择列
# 一般用于选择列，也可以选择行

df = pd.DataFrame(np.random.rand(3,4).reshape(3,4),
                 index = ['one','two','three'],
                 columns = list("abcd")
                 )
print(df)
print("--------------------------------------------------")
data1 = df['a']
data2 = df[['b','c']]                          # # 尝试输入 data2 = df[['b','c','e']]
print(data1,'\t',type(data1))
print(data2,'\t',type(data2))
# df[]默认选择列，[]中写列名（所以一般数据colunms都会单独制定，不会用默认数字列名，以免和index冲突）
# 单选列为Series，print结果为Series格式
# 多选列为Dataframe，print结果为Dataframe格式

data3 = df[:1]
# data3 = df[0]
# data3 = df['one']
print(data3,type(data3))
# df[]中为数字时，默认选择行，且只能进行切片的选择，不能单独选择（df[0]）  此处务必要注意了！！！
# 输出结果为Dataframe，即便只选择一行
# df[]不能通过索引标签名来选择行(df['one'])

# 核心笔记：df[col]一般用于选择列，[]中写列名

              a         b         c         d
one    0.474373  0.467418  0.953463  0.109177
two    0.610307  0.226797  0.788088  0.752459
three  0.654571  0.095461  0.944970  0.885476
--------------------------------------------------
one      0.474373
two      0.610307
three    0.654571
Name: a, dtype: float64 	 <class 'pandas.core.series.Series'>
              b         c
one    0.467418  0.953463
two    0.226797  0.788088
three  0.095461  0.944970 	 <class 'pandas.core.frame.DataFrame'>
            a         b         c         d
one  0.474373  0.467418  0.953463  0.109177 <class 'pandas.core.frame.DataFrame'>


In [26]:
# df.loc[]---按照index选择行

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = ['one','two','three','four'],
                  columns = list("abcd")
                  )
df2 = pd.DataFrame(np.random.rand(4,4).reshape(4,4)*100,
                  columns = list("abcd")
                  )
print(df1)
print(df2)
print("--------------------------------------------------")

data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n------------------------------------------------------------------')
# 单个标签索引，返回Series

data3 = df1.loc[['two','three','five']]
data4 = df2.loc[[3,2,1]]
print(data3)
print(data4)
print('多标签索引\n-----------------------------------------------------------------')
# 多个标签索引，如果标签不存在，则返回NaN
# 顺序可变

data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
print(data5)
print(data6)
str = '切片索引'
print("{:-^20}".format(str))
# 可以做切片对象
# 末端包含

# 核心笔记：df.loc[label]主要针对index选择行，同时支持指定index，及默认数字index

               a          b          c          d
one    39.198392  24.962830  14.029617  33.007227
two    96.797550  48.136346  31.453764  34.781173
three  92.094622  79.169546  29.074811  18.559852
four   89.539980  92.726186   7.467540  81.289885
           a          b          c          d
0  67.709251   1.485328  70.090358  86.362853
1  60.418051  42.069528   1.930579  25.235333
2  51.598369  10.377433  64.684472   3.397851
3  27.294191  13.877274  10.602317  49.192797
--------------------------------------------------
a    39.198392
b    24.962830
c    14.029617
d    33.007227
Name: one, dtype: float64
a    60.418051
b    42.069528
c     1.930579
d    25.235333
Name: 1, dtype: float64
单标签索引
------------------------------------------------------------------
               a          b          c          d
two    96.797550  48.136346  31.453764  34.781173
three  92.094622  79.169546  29.074811  18.559852
five         NaN        NaN        NaN        NaN
           a          b   

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [27]:
# df.iloc[]---按照整数位置（从轴的0到length-1）来选择行
# 类似list的索引，其顺序就是Dataframe的整数位置，从0开始计数

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                 index = ['one','two','three','four'],
                 columns = list('abcd')
                 )
print(df)
print("----------------------------------------------------------")

print(df.iloc[0])
print(df.iloc[-1])
# print(df.iloc[4])
print('单位置索引\n------------------------------------------------------------------')
# 单位置索引
# 和loc索引不同，不能索引超出数据行数的整数位置

print(df.iloc[[0,2]])
print(df.iloc[[3,2,1]])
print('多位置索引\n-----------------------------------------------------------------')
# 多位置索引
# 顺序可变

print(df.iloc[1:3])
print(df.iloc[::2])
print('切片索引')
# 切片索引
# 末端不包含   Note: 此处需要注意和.loc()方法的区别

               a          b          c          d
one    67.008044  31.665447  42.100905  76.993404
two     0.900278  28.284916  55.616162   4.003830
three  28.757140  34.254015  16.757307  42.946138
four   85.097424  64.501242  84.786248  87.951237
----------------------------------------------------------
a    67.008044
b    31.665447
c    42.100905
d    76.993404
Name: one, dtype: float64
a    85.097424
b    64.501242
c    84.786248
d    87.951237
Name: four, dtype: float64
单位置索引
------------------------------------------------------------------
               a          b          c          d
one    67.008044  31.665447  42.100905  76.993404
three  28.757140  34.254015  16.757307  42.946138
               a          b          c          d
four   85.097424  64.501242  84.786248  87.951237
three  28.757140  34.254015  16.757307  42.946138
two     0.900278  28.284916  55.616162   4.003830
多位置索引
-----------------------------------------------------------------
               a       

In [28]:
# 布尔型索引
# 和Series原理相同

df= pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                index = ['one','two','three','four'],
                columns = list("abcd")
                )
print(df)
print("-----------------------------------------------------------")

b1 = df < 20
print(b1,type(b1))
print(df[b1])                      # 也可以书写为 df[df < 20]
print("----------------------------------------------------------")
# 不做索引则会对数据的每个值进行判断
# 索引结果保留所有数据：True返回原数据，False返回值为NaN

b2 = df['a'] > 50
print(b2,type(b2))
print(df[b2])                    # 也可以书写为 df[df['a'] > 50]
print("---------------------------------------------------------")
# 单列做判断
# 索引结果保留，单列判断为True的行数据，包括其他列

b3 = df[['a','b']] > 50
print(b3,type(b3))
print(df[b3])                # 也可以书写为 df[df[['a','b']] > 50]
print("---------------------------------------------------------")
# 多列做判断
# 索引结果保留 所有数据：True返回原数据，False返回值为NaN

b4 = df.loc[['one','three']] < 50
print(b4,type(b4))
print(df[b4])              # 也可以书写为 df[df.loc[['one','three']] < 50]
print("---------------------------------------------------------")
# 多行做判断
# 索引结果保留 所有数据：True返回原数据，False返回值为NaN

               a          b          c          d
one    25.163165  83.283309  49.134892  95.751979
two    99.854598  65.429702  54.861576  97.490645
three  54.456918   0.900533  71.109485   8.645579
four   16.386811   4.927631  34.203624  71.825235
-----------------------------------------------------------
           a      b      c      d
one    False  False  False  False
two    False  False  False  False
three  False   True  False   True
four    True   True  False  False <class 'pandas.core.frame.DataFrame'>
               a         b   c         d
one          NaN       NaN NaN       NaN
two          NaN       NaN NaN       NaN
three        NaN  0.900533 NaN  8.645579
four   16.386811  4.927631 NaN       NaN
----------------------------------------------------------
one      False
two       True
three     True
four     False
Name: a, dtype: bool <class 'pandas.core.series.Series'>
               a          b          c          d
two    99.854598  65.429702  54.861576  97.490645
t

In [29]:
# 多重索引：比如同时索引行和列
# 注意：先选择列再选择行---相当于对于一个数据，先筛选字段，再选择数据量

df= pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                index = ['one','two','three','four'],
                columns = list('abcd')
                )
print(df)
print("--------------------------------------------------------------------")

print(df['a'].loc[['one','three']])                            # 选择a列的one，three行
print(df[['b','c','d']].iloc[::2])                            # 选择b，c，d列的one，three行
print(df[df['a'] < 50].iloc[:2])                             # 选择满足判断索引的前两行数据  Note:如果选择列，可以：df[df['a]'<50][['a','b']]

               a          b          c          d
one    23.048762  59.687004  10.267036  20.490032
two    14.282800  72.669201  38.758915  40.448633
three  39.150984  93.643248  47.786153  15.020371
four   46.214493  62.561075  39.322810  20.039815
--------------------------------------------------------------------
one      23.048762
three    39.150984
Name: a, dtype: float64
               b          c          d
one    59.687004  10.267036  20.490032
three  93.643248  47.786153  15.020371
             a          b          c          d
one  23.048762  59.687004  10.267036  20.490032
two  14.282800  72.669201  38.758915  40.448633


# 2.6---Pandas数据结构Dataframe：基本技巧
## 数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序

In [30]:
# 数据查看、转置

df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,
                 columns = ['a','b']
                 )
print(df.head(2))
print(df.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条

print(df.T)
# .T 转置

           a          b
0  45.784533  67.697824
1  11.838376  91.240721
           a          b
3  10.645220  58.668732
4  12.078548  46.163054
5  35.639371  26.747595
6  62.481266  42.904762
7  76.167404  76.338807
           0          1          2          3          4          5  \
a  45.784533  11.838376  97.965373  10.645220  12.078548  35.639371   
b  67.697824  91.240721  85.899184  58.668732  46.163054  26.747595   

           6          7  
a  62.481266  76.167404  
b  42.904762  76.338807  


In [31]:
# 添加与修改

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                 columns = list('abcd')
                 )
print(df)

df['e'] = 10
df.loc[4] = 20
print(df)
# 新增列/行并且赋值

df['e'] = 20
df[['a','c']] = 100
print(df)
# 索引后直接修改值

           a          b          c          d
0  32.631359  40.256404  40.576781  45.902410
1  55.286121  33.676971  11.949312  79.405463
2  89.640929  55.832964  58.947374  36.704955
3  81.916838  92.923595  31.053016  15.945423
           a          b          c          d   e
0  32.631359  40.256404  40.576781  45.902410  10
1  55.286121  33.676971  11.949312  79.405463  10
2  89.640929  55.832964  58.947374  36.704955  10
3  81.916838  92.923595  31.053016  15.945423  10
4  20.000000  20.000000  20.000000  20.000000  20
     a          b    c          d   e
0  100  40.256404  100  45.902410  20
1  100  33.676971  100  79.405463  20
2  100  55.832964  100  36.704955  20
3  100  92.923595  100  15.945423  20
4  100  20.000000  100  20.000000  20


In [32]:
# 删除 del() / drop()

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                 columns = list("abcd")
                 )
print(df)

del df['a']
print(df)
print("-----------------------------------------------------")
# del语句---删除列

print(df.drop(0))
print(df.drop([1,2]))
print(df)
print("-----------------------------------------------------")
# drop()删除行，inplace = False--->删除后生成新的数据，不改变原数据

print(df.drop(['d'],axis = 1))
print(df)
# drop()删除列，需要加上axis = 1，inplace=False → 删除后生成新的数据，不改变原数据

           a          b          c          d
0  24.808262  59.304360  57.549104  96.493986
1  54.631284  38.374155  60.849887  45.405321
2  43.132412   6.096231   1.180559  73.018923
3  52.330936  77.563498  30.454008  92.882297
           b          c          d
0  59.304360  57.549104  96.493986
1  38.374155  60.849887  45.405321
2   6.096231   1.180559  73.018923
3  77.563498  30.454008  92.882297
-----------------------------------------------------
           b          c          d
1  38.374155  60.849887  45.405321
2   6.096231   1.180559  73.018923
3  77.563498  30.454008  92.882297
           b          c          d
0  59.304360  57.549104  96.493986
3  77.563498  30.454008  92.882297
           b          c          d
0  59.304360  57.549104  96.493986
1  38.374155  60.849887  45.405321
2   6.096231   1.180559  73.018923
3  77.563498  30.454008  92.882297
-----------------------------------------------------
           b          c
0  59.304360  57.549104
1  38.374155  60.84

In [33]:
# 对齐

df1 = pd.DataFrame(np.random.randn(10,4),columns = list('ABCD'))
df2 = pd.DataFrame(np.random.randn(7,3),columns = list('ABC'))
print(df1 + df2)
# DataFrame对象之间的数据自动按照列和索引（行标签）对齐

          A         B         C   D
0  0.265819  0.770987  1.300981 NaN
1  0.675162  1.054662  2.324128 NaN
2  3.182175  0.268563 -1.417100 NaN
3 -0.125626  1.414944  0.429493 NaN
4  1.074991  0.308236 -0.320261 NaN
5 -1.000264 -0.747469  1.388564 NaN
6 -0.334008 -0.219700 -0.216004 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN


In [35]:
# 排序一：按值排序 .sort_values
# 同样适用于Series

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  columns = list('abcd')
                  )
print(df1)
print(df1.sort_values(['a'],ascending = True))                 # 按照升序排序
print(df1.sort_values(['a'],ascending = False))                # 按照降序排序
print("--------------------------------------------------------------------------")
# ascending参数：设置升序降序，默认为升序
# 单列排序

df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2],
                  'b':list(range(8)),
                  'c':list(range(8,0,-1))
                   })
print(df2)
print(df2.sort_values(['a','c']))
# 多列排序，按照列顺序排序   Note: 此处务必要注意！！！

           a          b          c          d
0  91.055481  45.090422  13.354916  20.727048
1  86.867987  30.581833  52.916170  72.929333
2  97.058462  48.299240  83.554918  61.990196
3  80.279834  26.735496  74.633226  68.268991
           a          b          c          d
3  80.279834  26.735496  74.633226  68.268991
1  86.867987  30.581833  52.916170  72.929333
0  91.055481  45.090422  13.354916  20.727048
2  97.058462  48.299240  83.554918  61.990196
           a          b          c          d
2  97.058462  48.299240  83.554918  61.990196
0  91.055481  45.090422  13.354916  20.727048
1  86.867987  30.581833  52.916170  72.929333
3  80.279834  26.735496  74.633226  68.268991
--------------------------------------------------------------------------
   a  b  c
0  1  0  8
1  1  1  7
2  1  2  6
3  1  3  5
4  2  4  4
5  2  5  3
6  2  6  2
7  2  7  1
   a  b  c
3  1  3  5
2  1  2  6
1  1  1  7
0  1  0  8
7  2  7  1
6  2  6  2
5  2  5  3
4  2  4  4


In [36]:
# 排序二：索引排序：.sort_index

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = [5,4,3,2],
                  columns = list('abcd')
                  )
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = ['h','s','x','g'],
                  columns = list('abcd')
                  )
print(df1)
print(df1.sort_index())
print(df2)
print(df2.sort_index())
# 按照index排序
# 默认 ascending=True, inplace=False

           a          b          c          d
5  54.363034  28.464971  32.001933  21.609576
4   8.664021  85.783805   3.670999  16.035448
3  11.976784  77.968659  40.742603  86.147391
2  54.331812  57.211603  38.410442  82.736831
           a          b          c          d
2  54.331812  57.211603  38.410442  82.736831
3  11.976784  77.968659  40.742603  86.147391
4   8.664021  85.783805   3.670999  16.035448
5  54.363034  28.464971  32.001933  21.609576
           a          b          c          d
h  15.851084  29.929131  75.178962  78.714376
s  36.305924  28.709192  18.253719  60.419313
x  74.253887  51.978034  49.692612  59.671422
g  17.858464  39.236763  83.080844  61.641289
           a          b          c          d
g  17.858464  39.236763  83.080844  61.641289
h  15.851084  29.929131  75.178962  78.714376
s  36.305924  28.709192  18.253719  60.419313
x  74.253887  51.978034  49.692612  59.671422
