In [2]:
import pandas as pd
import numpy as np

<a id=menu><center><h1>目录</h1></center></a>

1. [数据的读写](#I/O)
        1.1 csv的读写      
        1.2 excel的读写    
        1.3 HDF5的读写     
        1.4 数据库的读写  
    
2. [缺失值处理](#na)
        2.1 过滤缺失值
        2.2 补全缺失值
        2.3 找出缺失值
    
3. [数据转换](#preparation)
        3.1 删除重复值
        3.2 使用函数或映射进行数据转换
        3.3 替代值
        3.4 重命名轴索引
        3.5 离散化和分箱
        3.6 检测和过滤异常值
        
4. [字符串操作](#str)
        4.1 pandas中的向量化字符串函数
        

<a id="I/O"></a>
# 1. 数据的读写

[返回目录](#menu)
## 1.1 `csv`的读写
CSV（Comma-Separated Values），逗号分隔值，有时也称为字符分隔值，因为分隔字符也可以不是逗号
### 1.1.1 `csv`文件的读取

In [10]:
# csv文件的读取
pd.read_csv?

In [20]:
# ==常用参数(重点)
# =filepath_or_buffer，数据输入路径
# 可以是文件路径，也可以是 URL
df = pd.read_csv('../datas/sh000001_daily.csv')
df.head()
# pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')

Unnamed: 0.1,Unnamed: 0,date,open,close,high,low,amount,pre_close
0,0,1990-12-19,113.1,113.1,113.1,113.1,2650.0,
1,1,1990-12-20,113.1,113.5,113.5,112.85,1990.0,113.1
2,2,1990-12-21,113.5,113.5,113.5,113.4,1190.0,113.5
3,3,1990-12-24,113.5,114.0,114.0,113.3,8070.0,113.5
4,4,1990-12-25,114.0,114.1,114.2,114.0,2780.0,114.0


In [7]:
# =sep,数据文件的分隔符
# 默认为逗号
# g = pd.read_csv('girl.csv')  # girl.csv分割符为 \t，如果使用默认的逗号分隔符，读入后的数据混为一体。
# g
pd.read_csv('girl.csv', sep='\t')

Unnamed: 0,name,age,gender
0,小红,18,女
1,小花,17,女
2,小白,16,男
3,#,19,男


In [9]:
# =delimiter
# 分隔符的另一个名字，与sep功能相似。

# =delim_whitespace
# 默认为 False，设置为 True 时，表示分割符为空白字符，可以是空格、\t 等。
pd.read_csv('girl.csv', delim_whitespace=True)

Unnamed: 0,name,age,gender
0,小红,18,女
1,小花,17,女
2,小白,16,男
3,#,19,男


In [12]:
# =header, 设置导入DataFrame的表头，即列名称
# 默认为'infer'，可通过整数或整数列表指定列名
# 还需注意它与下面介绍的names参数的微妙关系。
pd.read_csv('girl.csv', sep='\t', header=1)

Unnamed: 0,小红,18,女
0,小花,17,女
1,小白,16,男
2,#,19,男


In [14]:
# =names
# 当names没被赋值时，header会变成0，即选取数据文件的第一行作为列名(此处指第一行数据，而非文件的第一行)。
# 当names被赋值，header没被赋值时，那么header会变成None。如果都赋值，就会实现两个参数的组合功能。
# pd.read_csv('girl.csv', sep='\t', names=['name', 'age', 'gender'])  # 默认header为None，即不从DataFrame中设置列名
pd.read_csv('girl.csv', sep='\t', 
            names=['姓名', '年龄', '性别'], header=0)  # 设置第一行为列名，且指定列名名称，第一行下面当做数据

Unnamed: 0,姓名,年龄,性别
0,小红,18,女
1,小花,17,女
2,小白,16,男
3,#,19,男


In [18]:
# =skiprows，过滤行
# 可指定跳过几行(int)，或指定跳过哪些行(list-like, callable)
# 此处是先过滤，再确定表头
pd.read_csv('girl.csv', sep='\t', skiprows=1) # 跳过前1行
pd.read_csv('girl.csv', sep='\t', skiprows=[1]) # 跳过第2行

Unnamed: 0,Unnamed: 1,我的数据库
小红,18,女
小花,17,女
小白,16,男
#,19,男


In [19]:
# =nrows, 设置一次性读入的文件行数
# 读入大文件时十分有用
df = pd.read_csv('../datas/sh000001_daily.csv', index_col=0, nrows=100)
df

Unnamed: 0,date,open,close,high,low,amount,pre_close
0,1990-12-19,113.1,113.10,113.1,113.10,2650.0,
1,1990-12-20,113.1,113.50,113.5,112.85,1990.0,113.1
2,1990-12-21,113.5,113.50,113.5,113.40,1190.0,113.5
3,1990-12-24,113.5,114.00,114.0,113.30,8070.0,113.5
4,1990-12-25,114.0,114.10,114.2,114.00,2780.0,114.0
...,...,...,...,...,...,...,...
95,1991-05-08,126.0,126.00,126.0,126.00,2090.0,126.0
96,1991-05-09,126.1,126.10,126.1,126.10,70.0,126.0
97,1991-05-10,126.7,126.70,126.7,126.70,300.0,126.1
98,1991-05-13,126.0,126.00,126.0,126.00,120.0,126.7


In [None]:
# =index_col，设置索引
# 默认为None，即生成默认索引0 1 2 3
# 可指定某列为索引
# df = pd.read_csv('../datas/sh000001_daily.csv')
df = pd.read_csv('../datas/sh000001_daily.csv', index_col=0)
df

In [21]:
# =usecols，使用指定列
# 当我们不想要全部列时，可使用这个参数
pd.read_csv('girl.csv', sep='\t', usecols=['name', 'age'])

Unnamed: 0,name,age
0,小红,18
1,小花,17
2,小白,16
3,#,19


In [23]:
# ==不常见参数(了解)
# =mangle_dupe_cols，处理重名列
# 默认为True，当遇到复杂数据时，有时会遇到重名列，重名的列导入后会多一个.1
# 若设为False，则会抛出异常

# =prefix，添加前缀
# 当导入数据没有header时，设置此参数会自动加一个前缀
# pd.read_csv('girl.csv', sep='\t', header=None)
pd.read_csv('girl.csv', sep='\t', header=None, prefix='col')

Unnamed: 0,col0,col1,col2
0,name,age,gender
1,小红,18,女
2,小花,17,女
3,小白,16,男
4,#,19,男


In [26]:
# ==解析参数
# =dtype，解析某列的类型(掌握)
# 应用场景：将形似整数的列设置为字符串。
# 如，整理员工id编号00001时，若直接读取，则会被解析成整数，把开头的0丢失。
g = pd.read_csv('girl.csv', delim_whitespace=True, dtype={"age": str})
g['age']

0    18
1    17
2    16
3    19
Name: age, dtype: object

In [27]:
df['date']

0       1990-12-19
1       1990-12-20
2       1990-12-21
3       1990-12-24
4       1990-12-25
           ...    
6954    2020-02-18
6955    2020-02-19
6956    2020-02-20
6957    2020-02-21
6958    2020-02-24
Name: date, Length: 6959, dtype: object

In [28]:
# =parse_dates，指定某些列为时间类型（掌握）
df = pd.read_csv('../datas/sh000001_daily.csv', index_col=0, 
                parse_dates=['date'])
df['date']

0      1990-12-19
1      1990-12-20
2      1990-12-21
3      1990-12-24
4      1990-12-25
          ...    
6954   2020-02-18
6955   2020-02-19
6956   2020-02-20
6957   2020-02-21
6958   2020-02-24
Name: date, Length: 6959, dtype: datetime64[ns]

In [None]:
# =date_parser, 定制某种时间类型(了解)
# 有些日期格式可以直接转化，如20200710、2020-07-10；
# 但有些日期格式不可直接转换，如：2020年7月10日，这是需要手动定制解析规则
# 同时指定需解析的列，和解析方法，如：解析date列，日期格式为2020年7月10日
# parse_dates=["date"], date_parser=lambda x: pd.datetime.strptime(x, "%Y年%m月%d日")

In [None]:
# =encoding，解析字符编码(掌握)
# 通常指定为'utf-8'，或'gbk'(中文常用)

In [31]:
# =converters(了解)
# 在读取的时候对列数据进行变换
# 如：对age列的数据加10
pd.read_csv('girl.csv', sep="\t", 
            converters={"age": lambda x: int(x) + 10}) # 注意int(x)，此处解析器默认所有列的类型为 str，所以需要显式类型转换。

Unnamed: 0,name,age,gender
0,小红,28,女
1,小花,27,女
2,小白,16,男


In [29]:
# =true_values和false_values(了解)
# 指定哪些值应该被清洗为True，哪些值被清洗为False。
# 需要成对出现，只出现一个没有效果。
pd.read_csv('girl.csv', sep="\t", true_values=["女"], false_values=["男"])

Unnamed: 0,name,age,gender
0,小红,18,True
1,小花,17,True
2,小白,16,False
3,#,19,False


In [30]:
# ==空值处理
# =na_values，配置需要处理成na值的值(掌握)
# 可指定一个或多个值作为na值处理, 可以只对指定的列进行替换
# df = pd.read_csv('girl.csv', sep='\t', na_values=["#"])
df = pd.read_csv('girl.csv',  sep='\t', 
                 na_values={"name": ["#"], "gender": ["女"]})
df

Unnamed: 0,name,age,gender
0,小红,18,
1,小花,17,
2,小白,16,男
3,,19,男


In [None]:
# =keep_default_na(了解)
# 和na_values搭配使用的，如果前者为True，则na_values被解析为Na/NaN的字符除了用户设置外，还包括默认值。默认为True

In [31]:
# ==分块读入(处理大文件时很有用)
# =chunksize，设置文件块大小
# 默认为None
chunker = pd.read_csv('../datas/sh000001_daily.csv', index_col=0, chunksize=1000)
for c in chunker:
    print(c)
    print('-'*10)

           date    open   close    high     low     amount  pre_close
0    1990-12-19  113.10  113.10  113.10  113.10     2650.0        NaN
1    1990-12-20  113.10  113.50  113.50  112.85     1990.0     113.10
2    1990-12-21  113.50  113.50  113.50  113.40     1190.0     113.50
3    1990-12-24  113.50  114.00  114.00  113.30     8070.0     113.50
4    1990-12-25  114.00  114.10  114.20  114.00     2780.0     114.00
..          ...     ...     ...     ...     ...        ...        ...
995  1995-07-26  703.29  691.12  713.04  690.36  3705400.0     703.58
996  1995-07-27  686.90  697.72  697.72  686.76  2246990.0     691.12
997  1995-07-28  699.85  695.42  702.97  692.88  3230919.0     697.72
998  1995-07-31  699.61  695.55  702.44  692.48  2324119.0     695.42
999  1995-08-01  697.22  715.40  718.24  688.65  3194200.0     695.55

[1000 rows x 7 columns]
----------
            date     open    close     high      low     amount  pre_close
1000  1995-08-02   721.15   699.99   733.86   699

### 1.1.2 `csv`文件的存储

In [32]:
# csv文件的存储
df.to_csv?

In [33]:
df = pd.read_csv('../datas/sh000001_daily.csv', index_col=0)

df = df.loc[pd.to_datetime(df['date']) > pd.to_datetime('20070101')]
df.loc[3764, 'amount'] = np.nan
df.head()

Unnamed: 0,date,open,close,high,low,amount,pre_close
3764,2007-01-04,2728.19,2715.72,2847.61,2684.82,,2675.47
3765,2007-01-05,2668.58,2641.33,2685.8,2617.02,106156000.0,2715.72
3766,2007-01-08,2621.07,2707.2,2708.44,2620.62,106813000.0,2641.33
3767,2007-01-09,2711.05,2807.8,2809.39,2691.36,110751000.0,2707.2
3768,2007-01-10,2838.11,2825.58,2841.74,2770.99,111769000.0,2807.8


In [35]:
# =sep，分隔符
# 默认为逗号

# =na_rep，缺失值处理
# 若不指定，则默认处理为空值

# =float_format，浮点数保留的格式
df.to_csv('data.csv', sep=';', na_rep='NA', float_format='%.1f')

In [40]:
# =columns，选择存储的列
# =header，设置表头
# =index，设置索引名
df.to_csv(
    'data.csv',
    columns=['date', 'open', 'close', 'pre_close'], 
    header=list('abcd'),
    index=None
)

In [41]:
# =encoding，设置编码
df = pd.read_csv('girl.csv', sep="\t")
df.to_csv('girl2.csv', encoding='gbk')  # 设置成gbk编码

In [51]:
# =compression，压缩文件
compression_opts = dict(method='zip',  # 设置压缩格式为zip格式，支持 ‘gzip’, ‘bz2’, ‘zip’, ‘xz’
                        archive_name='out.csv')  # 指定解压缩后的文件名称 
df.to_csv('out.zip', index=False,
          compression=compression_opts)

In [43]:
# =mode，写入模式(可参考01-2中对mode的介绍)
df1 = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]}, columns=['a', 'b'])
df2 = pd.DataFrame({'a':[7,8,9], 'b':[6,5,4]}, columns=['a', 'b'])
print(df1)
print(df2)
# df1.to_csv?
df1.to_csv('a.csv',index=False)
df2.to_csv('a.csv', mode='a', header=False, index=False)

   a  b
0  1  4
1  2  5
2  3  6
   a  b
0  7  6
1  8  5
2  9  4


## 1.2 `excel`的读写

In [44]:
pd.read_excel?

In [46]:
df1.to_excel?

## 1.3 HDF5读写

In [48]:
pd.read_hdf?

## 1.4 数据库读写
sqlalchemey

In [3]:
pd.read_sql?

<a id="na"></a>
# 2. 缺失值处理

[返回目录](#menu)

缺失值处理思路：
1. 不作处理，保留缺失值。（缺失值少，不影响后续分析）
2. 填补缺失值。（缺失值少，可以做填补，通常应用在时间序列分析过程中）
3. 剔除缺失值。（缺失值较多，或缺失值无法进行填补且会影响后续分析）

常用函数：
```
dropna
fillna
isnull
notnull
```
## 2.1 过滤缺失值

In [2]:
import pandas as pd
import numpy as np

In [5]:
# ==Series
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()  # data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# ==DataFrame
data = pd.DataFrame([[1, 6.5, 3.], [1., np.nan, np.nan],
                    [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
cleaned = data.dropna()  # 只保留非na值的行
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [4]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
data.dropna(how='all')  # 剔除全是na值的那一行

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [11]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [34]:
# 只想保留包含一定数量的观察值的行
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.313186,,
1,-0.693909,,
2,0.260167,,0.218943
3,-0.028439,,1.323377
4,0.535897,0.842857,0.091523
5,1.582487,-1.462451,0.541846
6,0.577495,-0.673343,0.603326


In [10]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.501516,0.422362,0.533241
5,1.142885,0.073374,-1.663074
6,0.538152,-0.479478,2.212984


In [13]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.16401,,1.376482
3,-0.008227,,-0.130976
4,0.113055,0.895142,2.100963
5,0.17284,-0.578541,1.003203
6,0.826785,0.99096,1.612439


In [35]:
df.dropna(subset=[1])

Unnamed: 0,0,1,2
4,0.535897,0.842857,0.091523
5,1.582487,-1.462451,0.541846
6,0.577495,-0.673343,0.603326


## 2.2 补全缺失值

In [15]:
# 统一填充某个值
df.fillna(0)  

Unnamed: 0,0,1,2
0,2.011961,0.0,0.0
1,1.55127,0.0,0.0
2,-1.16401,0.0,1.376482
3,-0.008227,0.0,-0.130976
4,0.113055,0.895142,2.100963
5,0.17284,-0.578541,1.003203
6,0.826785,0.99096,1.612439


In [16]:
# 指定列的填充值
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,2.011961,0.5,0.0
1,1.55127,0.5,0.0
2,-1.16401,0.5,1.376482
3,-0.008227,0.5,-0.130976
4,0.113055,0.895142,2.100963
5,0.17284,-0.578541,1.003203
6,0.826785,0.99096,1.612439


In [17]:
df.fillna(0, inplace=True)  # 修改已存在的对象，此时无返回值

In [18]:
df

Unnamed: 0,0,1,2
0,2.011961,0.0,0.0
1,1.55127,0.0,0.0
2,-1.16401,0.0,1.376482
3,-0.008227,0.0,-0.130976
4,0.113055,0.895142,2.100963
5,0.17284,-0.578541,1.003203
6,0.826785,0.99096,1.612439


In [19]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.966028,-1.417418,0.19985
1,-0.299857,0.064862,-0.112958
2,-1.499589,,-0.240728
3,0.766993,,-0.538543
4,-0.431372,,
5,0.413706,,


In [22]:
# 从前向后填充
# df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)  # 限制填充的个数

Unnamed: 0,0,1,2
0,-0.966028,-1.417418,0.19985
1,-0.299857,0.064862,-0.112958
2,-1.499589,0.064862,-0.240728
3,0.766993,0.064862,-0.538543
4,-0.431372,,-0.538543
5,0.413706,,-0.538543


In [23]:
# 用平均值填补
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [24]:
data.mean()

3.8333333333333335

## 2.3 找出缺失值

In [28]:
# =Series
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.isnull()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [31]:
# =DataFrame
data = pd.DataFrame([[1, 6.5, 3.], [1., np.nan, np.nan],
                    [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,True,True,True
3,True,False,False


<a id="preparation"></a>
# 3. 数据转换

[返回目录](#menu)

## 3.1 删除重复值

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [5]:
# duplicated返回每行是否存在重复的布尔序列
data.duplicated?

In [8]:
data.duplicated(subset=['k1'])

Unnamed: 0,k1,k2
5,two,4


In [11]:
data.drop_duplicates()  # 剔除重复的行

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [13]:
data['v1'] = range(7)
print(data)
data.drop_duplicates(subset=['k1'])  # 指定筛选某一列中存在重复的行

    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [24]:
data.drop_duplicates(subset=['k1', 'k2'], keep='last')  # 保留重复行中的后一个

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 3.2 使用函数或映射进行数据转换

In [16]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                             'Pastrami', 'corned beef', 'Bacon',
                             'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [17]:
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow', 
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [19]:
# 将food列统一成小写
lowercased = data['food'].str.lower()  # pd.Series.str方法是Pandas下对字符串序列处理的强力输出。
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [21]:
# 利用map将字典中的映射关系反应到Series中
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [26]:
# map还可以和函数进行搭配使用(同样的结合在apply系列中也经常遇到)
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [29]:
# 举个例子：客户名单
data = pd.DataFrame({'name': ['张三', '李四', '王五',
                             '小红', '小花', '小孙',
                             '小李', '小钱', '老陈'],
                     'gender': ['male', '男', '男', '女', 
                                '女', 'female', '女', 'f', 'm']})
data

Unnamed: 0,name,gender
0,张三,male
1,李四,男
2,王五,男
3,小红,女
4,小花,女
5,小孙,female
6,小李,女
7,小钱,f
8,老陈,m


In [32]:
gender_dict = {'male': '男',
              'female': '女',
              'm': '男',
              'f': '女',
              '男': '男',
              '女': '女',}
data['gender'] = data['gender'].map(gender_dict)

In [33]:
data

Unnamed: 0,name,gender
0,张三,男
1,李四,男
2,王五,男
3,小红,女
4,小花,女
5,小孙,女
6,小李,女
7,小钱,女
8,老陈,男


## 3.3 替代值

In [34]:
data = pd.Series([1., -999., 2., -999., -1000, 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [35]:
# replace函数
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [36]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [38]:
# data.replace([-999, -1000], [np.nan, 0])  
# 等价于字典传参
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## 3.4 重命名轴索引

In [40]:
np.arange(12).reshape((3, 4))

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [47]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                   index=['Ohio', 'Colorado', 'New York'],
                   columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [48]:
data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

In [49]:
# 与Series类似，轴索引也有一个map方法
transform = lambda x: x[:4].upper()  # 构造自定义函数
data.index = data.index.map(transform)
data.index

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [44]:
# 可以用rename方法，不修改原有数据集，创建新的数据集转换后的版本
data.rename(index=str.title, columns=str.upper, inplace=True)

In [45]:
data

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [50]:
# rename可以结合字典型对象使用
data.rename(index={'OHIO': 'INDIANA'}, 
            columns={'three': 'peekabo'})

Unnamed: 0,one,two,peekabo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [9]:
# rename中的inplace参数可以修改原有数据集
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

In [51]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## 3.5 离散化和分箱

In [52]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# 使用cut方法，将年龄分为18~25,26~35,36~60,61以上等若干组
cats = pd.cut(ages, bins)
cats  # 返回的是一个特殊的Categorical对象

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [53]:
print(cats.categories)  # 类别数组
print(cats.codes)  # 查看数据标签

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')
[0 0 0 1 0 0 2 1 3 2 2 1]


In [54]:
# 对cut结果中的箱数量的计数
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [55]:
# 通过right=False改变哪一边是封闭的
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [56]:
# 通过labels参数传递一个列表或数组，传入自定义的箱名
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [59]:
# 将数据等长区间的切成四份
data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # precison=2将十进制精度限制在两位

[(0.29, 0.51], (0.064, 0.29], (0.29, 0.51], (0.29, 0.51], (0.74, 0.96], ..., (0.74, 0.96], (0.51, 0.74], (0.064, 0.29], (0.064, 0.29], (0.74, 0.96]]
Length: 20
Categories (4, interval[float64]): [(0.064, 0.29] < (0.29, 0.51] < (0.51, 0.74] < (0.74, 0.96]]

In [61]:
data

array([0.41438164, 0.06800283, 0.40471417, 0.48474691, 0.86560733,
       0.43008756, 0.66986663, 0.8513468 , 0.26615467, 0.53887882,
       0.37844857, 0.96312956, 0.07023489, 0.42400365, 0.50541472,
       0.80359802, 0.63170567, 0.06471264, 0.08739276, 0.82534359])

In [62]:
# 获得等分位长度的箱
data = np.random.randn(1000)  # 正态分布
cats = pd.qcut(data, 4)
cats

[(-0.00468, 0.626], (0.626, 3.094], (-0.727, -0.00468], (0.626, 3.094], (-0.00468, 0.626], ..., (-0.727, -0.00468], (-0.727, -0.00468], (0.626, 3.094], (-3.057, -0.727], (-0.00468, 0.626]]
Length: 1000
Categories (4, interval[float64]): [(-3.057, -0.727] < (-0.727, -0.00468] < (-0.00468, 0.626] < (0.626, 3.094]]

In [63]:
pd.value_counts(cats)

(0.626, 3.094]        250
(-0.00468, 0.626]     250
(-0.727, -0.00468]    250
(-3.057, -0.727]      250
dtype: int64

In [64]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])  # 自定义分位数

[(-0.00468, 1.154], (-0.00468, 1.154], (-1.311, -0.00468], (1.154, 3.094], (-0.00468, 1.154], ..., (-1.311, -0.00468], (-1.311, -0.00468], (1.154, 3.094], (-1.311, -0.00468], (-0.00468, 1.154]]
Length: 1000
Categories (4, interval[float64]): [(-3.057, -1.311] < (-1.311, -0.00468] < (-0.00468, 1.154] < (1.154, 3.094]]

## 3.6 检测和过滤异常值
通常我们会将异常值替换成na值，然后再进行后续的处理。

In [68]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,-0.615693,-1.597289,-0.084282,-0.364144
1,0.484604,0.152369,1.384852,-0.652321
2,-0.279949,-0.210190,0.378417,-0.218574
3,0.859034,0.108441,-0.993252,-2.049848
4,-1.280568,-1.104683,1.036508,-0.276246
...,...,...,...,...
995,0.449217,0.800898,0.351917,-0.354972
996,-1.074574,1.280535,-0.184312,-2.313423
997,0.115441,-0.898692,0.376335,-0.743295
998,0.013404,0.964797,0.199798,-1.138056


In [69]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.010105,0.002653,0.007396,0.047147
std,0.9882,1.003093,1.005564,0.984815
min,-3.292183,-2.807049,-3.391049,-2.616292
25%,-0.644905,-0.666976,-0.623157,-0.619701
50%,-0.013156,-0.023687,0.004225,0.049192
75%,0.655123,0.678353,0.662925,0.70197
max,2.952883,3.517682,3.243769,3.579704


In [70]:
# 找出某一列中绝对值大于3的值
data.loc[abs(data[1])>3, 1]

761    3.116001
826    3.517682
Name: 1, dtype: float64

In [73]:
# 选出所有大于3或小于-3的行
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
482,0.501497,-1.046603,-3.128618,-0.895983
501,-0.526031,-1.404375,1.804009,3.579704
699,0.558223,-1.07083,-3.391049,-2.027114
761,1.528119,3.116001,1.167431,-0.146903
826,-0.103745,3.517682,0.219304,-0.381302
834,-3.292183,0.401905,0.216176,-0.272339
939,0.663016,1.970357,3.243769,-0.737953
942,-3.046764,-0.202334,1.976734,-0.27932
971,-0.69092,-0.123459,-3.002942,-0.048554


In [72]:
(np.abs(data)>3).any(axis=1)

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [75]:
# np.sign函数的内部逻辑：-1 if x < 0, 0 if x==0, 1 if x > 0
data[np.abs(data) > 3] = np.sign(data) * 3  # 将数值限制在-3和3之间
data.iloc[942, 0]

-3.0

In [76]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.009766,0.002019,0.007675,0.046568
std,0.987128,1.001055,1.003157,0.982902
min,-3.0,-2.807049,-3.0,-2.616292
25%,-0.644905,-0.666976,-0.623157,-0.619701
50%,-0.013156,-0.023687,0.004225,0.049192
75%,0.655123,0.678353,0.662925,0.70197
max,2.952883,3.0,3.0,3.0


In [77]:
np.sign(data)

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,-1.0,-1.0
1,1.0,1.0,1.0,-1.0
2,-1.0,-1.0,1.0,-1.0
3,1.0,1.0,-1.0,-1.0
4,-1.0,-1.0,1.0,-1.0
...,...,...,...,...
995,1.0,1.0,1.0,-1.0
996,-1.0,1.0,-1.0,-1.0
997,1.0,-1.0,1.0,-1.0
998,1.0,1.0,1.0,-1.0


<a id="str"></a>

# 4. 字符串操作

[返回目录](#menu)
## 4.1 pandas中的向量化字符串函数

In [78]:
data = {
    'Dave': 'dave@google.com', 
    'Steve': 'steve@gmail.com', 
    'Rob': 'rob@gmail.com', 
    'Wes': np.nan,
}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [79]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [80]:
# 通过对Series.str属性进行调用，进行正则表达式的应用
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [81]:
import re
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [82]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [83]:
# 访问嵌入式列表中的元素
data.str.get(1)

Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object

In [84]:
data.str[0]

Dave       d
Steve      s
Rob        r
Wes      NaN
dtype: object

In [85]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [86]:
data.str.split('@')

Dave     [dave, google.com]
Steve    [steve, gmail.com]
Rob        [rob, gmail.com]
Wes                     NaN
dtype: object