# 7、数据清洗、数据准备
## 7.1缺失数据处理

In [25]:
import pandas as pd
import numpy as np

In [3]:
data_na = pd.Series(['dd','bvs',np.nan,'ds'])
data_na

0     dd
1    bvs
2    NaN
3     ds
dtype: object

In [4]:
data_na.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
data_1 = data_na.dropna()
data_1

0     dd
1    bvs
3     ds
dtype: object

In [8]:
data_2 = data_na.fillna(100)
data_2

0     dd
1    bvs
2    100
3     ds
dtype: object

In [9]:
data_na.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [11]:
data_df = pd.DataFrame([[1,2,3],
                       [np.nan,np.nan,np.nan],
                       [np.nan,33,-0.56]])
data_df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,,
2,,33.0,-0.56


In [12]:
cleaned = data_df.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [13]:
cleaned = data_df.dropna(how = 'all')
cleaned

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
2,,33.0,-0.56


In [14]:
data_df[4] = np.nan
data_df

Unnamed: 0,0,1,2,4
0,1.0,2.0,3.0,
1,,,,
2,,33.0,-0.56,


In [15]:
cleaned = data_df.dropna(axis=1,how='all')
cleaned

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,,
2,,33.0,-0.56


In [20]:
filled=data_df.fillna({1:99,2:-99})
filled

Unnamed: 0,0,1,2,4
0,1.0,2.0,3.0,
1,,99.0,-99.0,
2,,33.0,-0.56,


## 7.2数据转换
过滤，清理，转换

### 7.2.1移除重复的数据

In [28]:

data = pd.DataFrame({'k1':['one','two','four'] *2+['four'],
                    'k2':[1,1,2,3,3,4,4,]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,four,2
3,one,3
4,two,3
5,four,4
6,four,4


In [29]:
#data[0,0] = 'two'
#print(data)
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [31]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,four,2
3,one,3
4,two,3
5,four,4


In [37]:
data['v1'] = range(len(data))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,four,2,2
3,one,3,3
4,two,3,4
5,four,4,5
6,four,4,6


In [39]:
data1 = data['k1'].str.upper()
data1

0     ONE
1     TWO
2    FOUR
3     ONE
4     TWO
5    FOUR
6    FOUR
Name: k1, dtype: object

In [41]:
print(data)
data2 = data.replace([0,2],np.nan)
data2

     k1  k2  v1
0   one   1   0
1   two   1   1
2  four   2   2
3   one   3   3
4   two   3   4
5  four   4   5
6  four   4   6


Unnamed: 0,k1,k2,v1
0,one,1.0,
1,two,1.0,1.0
2,four,,
3,one,3.0,3.0
4,two,3.0,4.0
5,four,4.0,5.0
6,four,4.0,6.0


In [42]:
data4 = data.k1.str.replace('one','yi')
data4

0      yi
1     two
2    four
3      yi
4     two
5    four
6    four
Name: k1, dtype: object

In [52]:
#离散化和面元划分 Categorical对象，pandas.cut
ages = [20,22,25,27,26,21,37,31,61,45,41,32]
#将上述ages中的数据，划分为18~25,26~35,35~60，和60以上几个年龄段
bins = [18,25,35,60,100]

cats = pd.cut(ages,bins,right = True)     #  面元划分
cats                         #   Categorical对象

[(18, 25], (18, 25], (18, 25], (25, 35], (25, 35], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [46]:
cats.codes

array([0, 0, 0, 1, 1, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [47]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [48]:
pd.value_counts(cats)   #统计各个年龄段的数量
    #pd.value_counts(cats)是pandas.cut结果的面元计数

(25, 35]     4
(18, 25]     4
(35, 60]     3
(60, 100]    1
dtype: int64

In [53]:
#传递一个列表和数组到labels，以设置自己的面元名称
group_names = ['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels = group_names)

[Youth, Youth, Youth, YoungAdult, YoungAdult, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [4]:
#检测和异常值
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.002116,0.010625,-0.004759,-0.034907
std,0.960266,1.017079,0.986821,0.989071
min,-2.839504,-3.618282,-2.78411,-3.078533
25%,-0.650463,-0.678566,-0.62538,-0.693974
50%,-0.014426,-0.028634,0.007199,-0.083839
75%,0.639644,0.71734,0.674325,0.623301
max,3.184682,2.931759,2.855564,3.228716


In [7]:
data[3][np.abs(data[3] > 3)]

941    3.228716
Name: 3, dtype: float64

In [11]:
data[(np.abs(data)>2.9).any(1)]

Unnamed: 0,0,1,2,3
190,-0.780063,2.931759,-0.466594,-1.045298
247,-0.45258,-2.906398,-1.654729,1.56485
287,0.0479,-3.618282,-0.639,0.021821
418,-0.241971,2.923613,0.294141,0.946991
495,1.168066,-2.941583,-1.338274,-0.299848
600,-0.138347,-0.314478,-0.228487,2.958455
634,0.371528,0.376497,-0.097332,-3.078533
785,3.184682,0.860759,-0.947748,-0.665905
941,0.984894,-0.36323,-0.210915,3.228716
989,2.900547,1.414733,-1.307869,0.619419


In [14]:
data[np.abs(data)>2.9] = np.sign(data) * 3
data[(np.abs(data)>2.9).any(1)]

Unnamed: 0,0,1,2,3
190,-0.780063,3.0,-0.466594,-1.045298
247,-0.45258,-3.0,-1.654729,1.56485
287,0.0479,-3.0,-0.639,0.021821
418,-0.241971,3.0,0.294141,0.946991
495,1.168066,-3.0,-1.338274,-0.299848
600,-0.138347,-0.314478,-0.228487,3.0
634,0.371528,0.376497,-0.097332,-3.0
785,3.0,0.860759,-0.947748,-0.665905
941,0.984894,-0.36323,-0.210915,3.0
989,3.0,1.414733,-1.307869,0.619419


In [15]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.002031,0.011236,-0.004759,-0.035016
std,0.959977,1.015923,0.986821,0.988229
min,-2.839504,-3.0,-2.78411,-3.0
25%,-0.650463,-0.678566,-0.62538,-0.693974
50%,-0.014426,-0.028634,0.007199,-0.083839
75%,0.639644,0.71734,0.674325,0.623301
max,3.0,3.0,2.855564,3.0


In [20]:
#排列和随机采样。用numpy.random.perrmutation函数，
df = pd.DataFrame(np.arange(5*4).reshape(5,4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


### 7.2.7随机排序

In [21]:

sampler = np.random.permutation(5)
sampler

array([1, 0, 4, 3, 2])

In [22]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11


In [23]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11


In [24]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15


In [26]:
#随机采样
choice = pd.Series([5,2,7,1,9,23,-3])
choice

0     5
1     2
2     7
3     1
4     9
5    23
6    -3
dtype: int64

In [27]:
draws = choice.sample(n=10,replace=True)
draws

3     1
0     5
0     5
3     1
6    -3
6    -3
6    -3
5    23
6    -3
5    23
dtype: int64

###  7.2.8计算指标/哑变量

In [29]:

#用于统计建模或机器学习的转换方式：将分类变量转换为哑变量或指标矩阵
#DataFrame中某一列含有k个不同的值，则可以派生出一个k列矩阵或DataFrame（其值全为1和0）
#pandas的get_dummies函数可以实现这个功能。
df = pd.DataFrame({'key':['b','c','a','c','b','a'],
                  'data1':range(6)})
df



Unnamed: 0,key,data1
0,b,0
1,c,1
2,a,2
3,c,3
4,b,4
5,a,5


In [30]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,0,1,0
5,1,0,0


In [31]:
#给指标DataFrame的列加上一个前缀，以便于和其他数据进行合并。
#要实现这个功能，需要使用 get_dummies函数的prefix参数。
dummies = pd.get_dummies(df['key'],prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,0,1,0
5,1,0,0


In [34]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,0,1
2,2,1,0,0
3,3,0,0,1
4,4,0,1,0
5,5,1,0,0


## 7.3 字符串操作
pandas可以对整组的数据应用字符串表达式和正则表达式，并且能处理缺失数据。
### 7.3.1 字符串对象方法

In [1]:
#python的内置字符串方法
val = 'a,b,guido'
val.split(',')#拆分字符串

['a', 'b', 'guido']

In [6]:
val = ' ad dds '
val.strip()

'ad dds'

In [7]:
val = 'a,b,guido'
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [8]:
first,second,third = pieces
print(first)
print(second)
print(third)

a
b
guido


In [10]:
first+'::'+second+'::'+third

'a::b::guido'

In [11]:
'::'.join(pieces)

'a::b::guido'

In [12]:
#检测子字符串
'guido' in pieces

True

In [14]:
val.index(',')

1

In [15]:
val.count(',')

2

In [17]:
val.replace(',','::')

'a::b::guido'

### 7.3.2正则表达式（regex）：
灵活的在文本中搜索或匹配字符串模式的方法。

python的内置re模块，处理字符串的正则表达式。
re模块内的函数，包括三类：1、模式匹配，2、替换，3、拆分


In [18]:
import re
text = "foo   bar\t baz   \tqux"

In [19]:
re.split('\s+',text)  #一个或多个空格：\s+

['foo', 'bar', 'baz', 'qux']

In [32]:
#对许多字符串应用同一条正则表达式，则用re.compile创建regex对象。可以节省CPU时间
# match和search、findall功能类似。
#findall返回字符串中所有匹配的项
#search只返回第一个匹配项
#match只匹配字符串的首部
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@163.com
"""
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)


In [33]:
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

### 7.3.3pandas的矢量化字符串函数


In [26]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [27]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [28]:
pattern
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [dave@google.com]
Steve    [steve@gmail.com]
Rob        [rob@gmail.com]
Wes                    NaN
dtype: object

In [29]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [30]:
matches.str.get(1)
matches.str[0]

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [31]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object