In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 7.1 处理缺失值

In [2]:
string_data = pd.Series(['aada', 'artih', np.nan, 'avasda'])

In [3]:
string_data

0      aada
1     artih
2       NaN
3    avasda
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data.dropna()

0      aada
1     artih
3    avasda
dtype: object

In [6]:
string_data.fillna(value=1)

0      aada
1     artih
2         1
3    avasda
dtype: object

## 7.1.1 过滤缺失值

In [153]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [154]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [155]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [156]:
data = pd.DataFrame([
    [1, 6.5, 3], [1, np.nan, np.nan],
    [np.nan, np.nan, np.nan], [np.nan, 6.5, 3]
])

In [157]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [160]:
data.dropna?

In [14]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
data[4] = np.nan

In [16]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [19]:
df = pd.DataFrame(np.random.randn(7,3))

In [20]:
df

Unnamed: 0,0,1,2
0,1.698583,-0.096885,0.359764
1,-0.650751,-2.491068,-0.008763
2,1.05851,-1.275792,-1.871036
3,0.708676,-0.01426,-0.833429
4,0.019437,-1.267955,0.388735
5,-1.455162,-0.516465,1.309781
6,0.935741,-1.347466,0.287435


In [21]:
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

In [22]:
df

Unnamed: 0,0,1,2
0,1.698583,,
1,-0.650751,,
2,1.05851,,-1.871036
3,0.708676,,-0.833429
4,0.019437,-1.267955,0.388735
5,-1.455162,-0.516465,1.309781
6,0.935741,-1.347466,0.287435


In [23]:
df.dropna()

Unnamed: 0,0,1,2
4,0.019437,-1.267955,0.388735
5,-1.455162,-0.516465,1.309781
6,0.935741,-1.347466,0.287435


In [24]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.05851,,-1.871036
3,0.708676,,-0.833429
4,0.019437,-1.267955,0.388735
5,-1.455162,-0.516465,1.309781
6,0.935741,-1.347466,0.287435


## 7.1.2 补全缺失值

In [158]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [161]:
df.fillna?

In [162]:
df.add?

In [26]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.698583,0.0,0.0
1,-0.650751,0.0,0.0
2,1.05851,0.0,-1.871036
3,0.708676,0.0,-0.833429
4,0.019437,-1.267955,0.388735
5,-1.455162,-0.516465,1.309781
6,0.935741,-1.347466,0.287435


In [27]:
df.fillna({1:'333', 2:'!'})

Unnamed: 0,0,1,2
0,1.698583,333.0,!
1,-0.650751,333.0,!
2,1.05851,333.0,-1.87104
3,0.708676,333.0,-0.833429
4,0.019437,-1.26796,0.388735
5,-1.455162,-0.516465,1.30978
6,0.935741,-1.34747,0.287435


In [28]:
df[2].dtype

dtype('float64')

In [29]:
df.fillna(0, inplace = True)

In [30]:
df

Unnamed: 0,0,1,2
0,1.698583,0.0,0.0
1,-0.650751,0.0,0.0
2,1.05851,0.0,-1.871036
3,0.708676,0.0,-0.833429
4,0.019437,-1.267955,0.388735
5,-1.455162,-0.516465,1.309781
6,0.935741,-1.347466,0.287435


In [31]:
df = pd.DataFrame(np.random.randn(6,3))

In [32]:
df

Unnamed: 0,0,1,2
0,0.276193,-0.287007,-0.542397
1,0.138066,-1.035432,0.846572
2,-1.003888,0.097379,0.822144
3,1.641355,0.269786,-1.333923
4,-0.196128,-0.424793,1.489988
5,-0.198384,-1.690034,0.146624


In [33]:
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan

In [34]:
df

Unnamed: 0,0,1,2
0,0.276193,-0.287007,-0.542397
1,0.138066,-1.035432,0.846572
2,-1.003888,,0.822144
3,1.641355,,-1.333923
4,-0.196128,,
5,-0.198384,,


In [35]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.276193,-0.287007,-0.542397
1,0.138066,-1.035432,0.846572
2,-1.003888,-1.035432,0.822144
3,1.641355,-1.035432,-1.333923
4,-0.196128,-1.035432,-1.333923
5,-0.198384,-1.035432,-1.333923


In [36]:
df.fillna(method='ffill', limit=1)

Unnamed: 0,0,1,2
0,0.276193,-0.287007,-0.542397
1,0.138066,-1.035432,0.846572
2,-1.003888,-1.035432,0.822144
3,1.641355,,-1.333923
4,-0.196128,,-1.333923
5,-0.198384,,


In [37]:
df

Unnamed: 0,0,1,2
0,0.276193,-0.287007,-0.542397
1,0.138066,-1.035432,0.846572
2,-1.003888,,0.822144
3,1.641355,,-1.333923
4,-0.196128,,
5,-0.198384,,


In [38]:
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.276193,-0.287007,-0.542397
1,0.138066,-1.035432,0.846572
2,-1.003888,-0.661219,0.822144
3,1.641355,-0.661219,-1.333923
4,-0.196128,-0.661219,-0.051901
5,-0.198384,-0.661219,-0.051901


In [39]:
df[1].mean()

-0.6612193503691546

# 7.2 数据转换

## 7.2.1 删除重复值

In [163]:
data = pd.DataFrame({
    'k1': ['one', 'two']*3 + ['two'], 
    'k2': [1,1,2,3,3,4,4]
})

In [164]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [42]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [165]:
data.duplicated?

In [166]:
data.drop_duplicates?

In [45]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [46]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [47]:
data['v1'] = range(7)

In [48]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [49]:
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [50]:
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [51]:
data.drop_duplicates(subset=['k1'], keep='last')

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [52]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


## 7.2.2 使用函数或映射进行数据转换

In [53]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                               'pastrami', 'honey ham', 'nova lox'],
                      'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [54]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [55]:
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
    'corned beef': 'cow'
}

In [56]:
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'corned beef': 'cow'}

In [57]:
lowercased = data.food.str.lower()

In [58]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [59]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [60]:
data['animal'] = lowercased.map(meat_to_animal)

In [61]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [62]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [63]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


## 7.2.3 替代值

In [64]:
data = pd.Series([1, -999, 2, -999, -1000, 3.])

In [65]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [66]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [67]:
data.replace([-999, -1000], 0)

0    1.0
1    0.0
2    2.0
3    0.0
4    0.0
5    3.0
dtype: float64

In [68]:
data.replace([-999, -1000], [np.nan, 111])

0      1.0
1      NaN
2      2.0
3      NaN
4    111.0
5      3.0
dtype: float64

In [69]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [70]:
data.replace({-999:np.nan, -1000:222})

0      1.0
1      NaN
2      2.0
3      NaN
4    222.0
5      3.0
dtype: float64

## 7.2.4 重命名轴索引

In [71]:
data = pd.DataFrame(np.arange(12).reshape(3,4),
                   index = ['Ohio','Colorado', 'New York'],
                   columns='one two three dour'.split())

In [72]:
data

Unnamed: 0,one,two,three,dour
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [73]:
transform = lambda x: x[:4].upper()

In [74]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [75]:
data.index = data.index.map(transform)

In [76]:
data

Unnamed: 0,one,two,three,dour
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [77]:
data.rename?

In [78]:
data.rename(index = str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,DOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [79]:
data.rename(index = {'OHIO': 'INDIANA'},
           columns = {'three': 'peek'})

Unnamed: 0,one,two,peek,dour
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [80]:
data

Unnamed: 0,one,two,three,dour
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## 7.2.5 离散化和分箱

In [81]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [82]:
bins = [18, 25, 35, 60, 100]

In [83]:
cuts = pd.cut(ages, bins)

In [84]:
pd.cut?

In [85]:
cuts

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [86]:
cuts.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [87]:
cuts.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [88]:
pd.value_counts(cuts)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [89]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [90]:
cut2 = pd.cut(ages, bins, labels='Youth YoungAdult MiddleAged Senior'.split())

In [91]:
cut2

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [92]:
pd.cut(ages, 4)

[(19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], ..., (30.25, 40.5], (50.75, 61.0], (40.5, 50.75], (40.5, 50.75], (30.25, 40.5]]
Length: 12
Categories (4, interval[float64]): [(19.959, 30.25] < (30.25, 40.5] < (40.5, 50.75] < (50.75, 61.0]]

In [93]:
pd.cut(ages, 4).codes

array([0, 0, 0, 0, 0, 0, 1, 1, 3, 2, 2, 1], dtype=int8)

In [94]:
data = np.random.randn(20)

In [95]:
data_cut = pd.cut(data, 4, precision=2)

In [96]:
data

array([ 1.36531418, -0.47708487, -1.58103825, -0.01666498,  0.63223774,
       -0.41935197, -0.5634383 ,  0.16348248, -0.56997468, -1.27327144,
       -0.75079257,  1.59171324, -0.14876594, -1.42406317, -1.0414892 ,
       -0.16021371, -1.46227033,  1.00029722, -0.62544718, -0.93993658])

In [97]:
data_cut

[(0.8, 1.59], (-0.79, 0.0053], (-1.58, -0.79], (-0.79, 0.0053], (0.0053, 0.8], ..., (-0.79, 0.0053], (-1.58, -0.79], (0.8, 1.59], (-0.79, 0.0053], (-1.58, -0.79]]
Length: 20
Categories (4, interval[float64]): [(-1.58, -0.79] < (-0.79, 0.0053] < (0.0053, 0.8] < (0.8, 1.59]]

In [98]:
data_cut.codes

array([3, 1, 0, 1, 2, 1, 1, 2, 1, 0, 1, 3, 1, 0, 0, 1, 0, 3, 1, 0],
      dtype=int8)

In [168]:
data = np.arange(41)

In [172]:
data = np.random.randn(41)

In [173]:
data

array([-2.19527551,  3.0615092 , -1.82980664,  0.28441967,  0.63799675,
        0.26617178,  1.14697558, -0.18807102, -0.96114275,  0.13504275,
        2.61263393, -0.99038893,  0.78040446,  0.45784629, -0.36861784,
       -0.587866  ,  0.15595849, -1.66090807, -0.01544135, -0.09705853,
        0.31844102, -0.2187118 ,  0.29779837,  1.00146427,  0.13690257,
       -0.60301171, -0.00515908,  0.33245977,  0.04129348, -0.48380845,
       -0.7018444 ,  0.86443867, -0.05868518, -1.19557294, -0.25913044,
       -0.60276668, -0.36151823, -0.82256768,  1.61880694,  0.36475311,
       -1.70919588])

In [174]:
data_cut  = pd.qcut(data, 4)

In [175]:
data_cut

[(-2.1959999999999997, -0.603], (0.332, 3.062], (-2.1959999999999997, -0.603], (-0.0154, 0.332], (0.332, 3.062], ..., (-0.603, -0.0154], (-2.1959999999999997, -0.603], (0.332, 3.062], (0.332, 3.062], (-2.1959999999999997, -0.603]]
Length: 41
Categories (4, interval[float64]): [(-2.1959999999999997, -0.603] < (-0.603, -0.0154] < (-0.0154, 0.332] < (0.332, 3.062]]

In [176]:
pd.value_counts(data_cut)

(-2.1959999999999997, -0.603]    11
(0.332, 3.062]                   10
(-0.0154, 0.332]                 10
(-0.603, -0.0154]                10
dtype: int64

In [102]:
data

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40])

In [103]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-0.001, 4.0], (-0.001, 4.0], (-0.001, 4.0], (-0.001, 4.0], (-0.001, 4.0], ..., (20.0, 36.0], (36.0, 40.0], (36.0, 40.0], (36.0, 40.0], (36.0, 40.0]]
Length: 41
Categories (4, interval[float64]): [(-0.001, 4.0] < (4.0, 20.0] < (20.0, 36.0] < (36.0, 40.0]]

In [177]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).codes

array([0, 3, 0, 2, 2, 2, 3, 1, 1, 2, 3, 1, 2, 2, 1, 1, 2, 0, 1, 1, 2, 1,
       2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 0, 1, 1, 1, 1, 3, 2, 0],
      dtype=int8)

In [178]:
pd.value_counts(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]))

(-0.0154, 1.001]                 16
(-1.196, -0.0154]                16
(-2.1959999999999997, -1.196]     5
(1.001, 3.062]                    4
dtype: int64

## 7.2.6 检测和过滤异常值

In [106]:
data = pd.DataFrame(np.random.randn(1000,4))

In [107]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.021273,0.047176,0.028122,-0.006299
std,0.988716,0.979614,0.996211,0.971379
min,-3.140794,-2.874432,-2.53725,-3.886111
25%,-0.674228,-0.581949,-0.645557,-0.594414
50%,0.039931,0.075096,0.036867,0.016991
75%,0.730029,0.69538,0.690724,0.639306
max,2.817579,3.143494,3.273754,3.149884


In [167]:
data.any?

In [109]:
data[ (np.abs(data)>3 ).any(1)]

Unnamed: 0,0,1,2,3
4,-3.033231,0.55856,-0.435176,1.024407
80,-0.161321,-0.189689,-1.684609,-3.886111
223,-1.414401,3.019565,0.545418,-0.376207
286,0.245204,-1.823121,3.273754,1.770167
559,-0.121477,0.60023,3.124156,0.85914
571,-1.499241,0.630536,3.046618,0.255155
737,0.184835,1.582032,0.84174,3.063098
747,0.270538,3.143494,0.458519,-0.562011
800,-0.680144,0.680437,0.815487,3.149884
975,-3.140794,1.508627,1.110547,0.38144


In [110]:
(np.abs(data)>3).any(1)

0      False
1      False
2      False
3      False
4       True
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [111]:
data2 = pd.DataFrame(np.arange(6).reshape(2,3))

In [112]:
data2

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5


In [113]:
data2.iloc[1,1] = False

In [114]:
data2

Unnamed: 0,0,1,2
0,0,1,2
1,3,False,5


In [115]:
data2[(data2>1).any(0)]

  data2[(data2>1).any(0)]


Unnamed: 0,0,1,2
0,0,1,2


In [116]:
(data2>1)

Unnamed: 0,0,1,2
0,False,False,True
1,True,False,True


In [117]:
(data2>1).all(1)

0    False
1    False
dtype: bool

In [118]:
np.sign(data)*3

Unnamed: 0,0,1,2,3
0,-3.0,-3.0,-3.0,-3.0
1,3.0,-3.0,3.0,3.0
2,3.0,3.0,3.0,-3.0
3,-3.0,3.0,3.0,3.0
4,-3.0,3.0,-3.0,3.0
...,...,...,...,...
995,-3.0,-3.0,3.0,-3.0
996,-3.0,3.0,-3.0,3.0
997,-3.0,-3.0,-3.0,3.0
998,-3.0,3.0,3.0,-3.0


In [119]:
data

Unnamed: 0,0,1,2,3
0,-0.327193,-0.035712,-1.789307,-0.171729
1,1.087814,-0.265725,0.882855,0.374944
2,0.578446,0.213773,0.607258,-0.387193
3,-0.035099,0.150245,2.141172,1.340574
4,-3.033231,0.558560,-0.435176,1.024407
...,...,...,...,...
995,-0.427452,-0.490478,0.015854,-0.239893
996,-0.541636,0.220536,-0.131195,1.331339
997,-0.022820,-1.449398,-0.220609,0.873517
998,-0.935828,0.032023,0.037051,-0.732106


In [120]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.021273,0.047176,0.028122,-0.006299
std,0.988716,0.979614,0.996211,0.971379
min,-3.140794,-2.874432,-2.53725,-3.886111
25%,-0.674228,-0.581949,-0.645557,-0.594414
50%,0.039931,0.075096,0.036867,0.016991
75%,0.730029,0.69538,0.690724,0.639306
max,2.817579,3.143494,3.273754,3.149884


In [121]:
col = data[2]

In [122]:
col[np.abs(col>3)]

286    3.273754
559    3.124156
571    3.046618
Name: 2, dtype: float64

In [123]:
data[(np.abs(data)>3).all(1)]

Unnamed: 0,0,1,2,3


In [124]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
4,-3.033231,0.55856,-0.435176,1.024407
80,-0.161321,-0.189689,-1.684609,-3.886111
223,-1.414401,3.019565,0.545418,-0.376207
286,0.245204,-1.823121,3.273754,1.770167
559,-0.121477,0.60023,3.124156,0.85914
571,-1.499241,0.630536,3.046618,0.255155
737,0.184835,1.582032,0.84174,3.063098
747,0.270538,3.143494,0.458519,-0.562011
800,-0.680144,0.680437,0.815487,3.149884
975,-3.140794,1.508627,1.110547,0.38144


In [125]:
(np.abs(data)>3).any(0)

0    True
1    True
2    True
3    True
dtype: bool

In [126]:
data[(np.abs(data))>3] = np.sign(data)*3

In [127]:
data

Unnamed: 0,0,1,2,3
0,-0.327193,-0.035712,-1.789307,-0.171729
1,1.087814,-0.265725,0.882855,0.374944
2,0.578446,0.213773,0.607258,-0.387193
3,-0.035099,0.150245,2.141172,1.340574
4,-3.000000,0.558560,-0.435176,1.024407
...,...,...,...,...
995,-0.427452,-0.490478,0.015854,-0.239893
996,-0.541636,0.220536,-0.131195,1.331339
997,-0.022820,-1.449398,-0.220609,0.873517
998,-0.935828,0.032023,0.037051,-0.732106


In [128]:
data[(np.abs(data)>=3).any(1)]

Unnamed: 0,0,1,2,3
4,-3.0,0.55856,-0.435176,1.024407
80,-0.161321,-0.189689,-1.684609,-3.0
223,-1.414401,3.0,0.545418,-0.376207
286,0.245204,-1.823121,3.0,1.770167
559,-0.121477,0.60023,3.0,0.85914
571,-1.499241,0.630536,3.0,0.255155
737,0.184835,1.582032,0.84174,3.0
747,0.270538,3.0,0.458519,-0.562011
800,-0.680144,0.680437,0.815487,3.0
975,-3.0,1.508627,1.110547,0.38144


## 7.2.7 置换和随机取样

In [129]:
df = pd.DataFrame(np.arange(20).reshape(5,4))

In [130]:
sampler = np.random.permutation(5)

In [131]:
sampler

array([1, 2, 3, 0, 4])

In [132]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [133]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19


In [134]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19


In [135]:
df.sample?

In [136]:
df.sample(n=5)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
3,12,13,14,15


In [137]:
df.sample(n=9,replace=True)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
2,8,9,10,11
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19


## 7.2.8 计算指标/虚拟变量

In [138]:
df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})

In [151]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [140]:
lista = list()

In [141]:
lista

[]

In [142]:
x = 'a b c d e f g h'

In [143]:
lista.extend(x.split())

In [144]:
lista

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']

In [145]:
x.split()

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']

In [146]:
x

'a b c d e f g h'

In [147]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [148]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [149]:
dumies = pd.get_dummies(df['key'], prefix='key')

In [150]:
dumies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


# 7.3 字符串操作

## 7.3.1  字符串对象方法

In [8]:
val = 'a,b,  guido'

In [9]:
val.split(',')

['a', 'b', '  guido']

In [10]:
pieces = [x.strip() for x in val.split(',')]

In [11]:
pieces

['a', 'b', 'guido']

In [12]:
first, second, third = pieces

In [13]:
first

'a'

In [14]:
second

'b'

In [15]:
'::'.join(pieces)

'a::b::guido'

In [16]:
val

'a,b,  guido'

In [17]:
'guido' in val

True

In [18]:
val.find(',')

1

In [20]:
val.index(',')

1

In [21]:
val.find(':')

-1

In [23]:
val.count(',')

2

In [24]:
val.replace(',', '')

'ab  guido'

In [25]:
val

'a,b,  guido'

In [26]:
val.endswith('do')

True

In [27]:
val.startswith('ab')

False

## 7.3.2 正则表达式

In [28]:
import re 

In [29]:
text = 'foo   bar\t  baz  \tqux'

In [30]:
text

'foo   bar\t  baz  \tqux'

In [31]:
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [32]:
regex = re.compile('\s+')

In [34]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [35]:
regex.findall(text)

['   ', '\t  ', '  \t']

In [2]:
text = """
Dave dave@geogle.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [3]:
text

'\nDave dave@geogle.com\nRob rob@gmail.com\nRyan ryan@yahoo.com\n'

In [5]:
import re

In [4]:
pattern = r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}'

In [6]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [7]:
regex.findall(text)

['dave@geogle.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [8]:
m = regex.search(text)

In [9]:
m

<re.Match object; span=(6, 21), match='dave@geogle.com'>

In [10]:
m.group()

'dave@geogle.com'

In [15]:
print(regex.match(text))

None


In [17]:
print(regex.sub('asdasd', text))


Dave asdasd
Rob asdasd
Ryan asdasd



In [18]:
pattern = r'([a-z0-9._+%+-]+)@([a-z0-9.-]+)\.([a-z]{2,4})'

In [19]:
regex = re.compile(pattern, flags=re.I)

In [21]:
regex.findall(text)

[('dave', 'geogle', 'com'), ('rob', 'gmail', 'com'), ('ryan', 'yahoo', 'com')]

In [22]:
print(text)


Dave dave@geogle.com
Rob rob@gmail.com
Ryan ryan@yahoo.com



In [24]:
print(regex.sub(r'user_name: \1, Domain: \2, Suffix: \3', text))


Dave user_name: dave, Domain: geogle, Suffix: com
Rob user_name: rob, Domain: gmail, Suffix: com
Ryan user_name: ryan, Domain: yahoo, Suffix: com



## 7.3.3 pandas 中的向量化字符串函数

In [25]:
data = {
    'Dave': 'dave@geogle.com', 
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com', 
    'Wes': np.nan
}

In [26]:
data = pd.Series(data)

In [27]:
data

Dave     dave@geogle.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [29]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [30]:
pattern

'([a-z0-9._+%+-]+)@([a-z0-9.-]+)\\.([a-z]{2,4})'

In [32]:
matches = data.str.findall(pattern, flags=re.I)

In [39]:
matches.str.get(0)

Dave     (dave, geogle, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [40]:
matches

Dave     [(dave, geogle, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [41]:
data

Dave     dave@geogle.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [37]:
data.str.get(1)

Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object

In [42]:
data.str[1]

Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object

In [43]:
matches.str[0]

Dave     (dave, geogle, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [44]:
data.str[:3]

Dave     dav
Steve    ste
Rob      rob
Wes      NaN
dtype: object