# 数据处理工作内容：
1. 加载数据
2. 缺失值处理
3. 数据转换

# pandas
- pandas含有使数据清洗和分析工作变得更快更简单的数据结构和操作工具。pandas经常和其它工具一同使用，如数值计算工具NumPy和SciPy，分析库statsmodels和scikit-learn，和数据可视化库matplotlib。
- pandas是专门为处理表格和混杂数据设计的。
- pandas的数据结构 Series, DataFrame

# pandas数据结构：Series与DataFrame对象


## 1. Series
Series是一种类似于一维数组的对象，它由一组数据（各种NumPy数据类型）以及一组与之相关的数据标签（即索引）组成。  Series的字符串表现形式为：索引在左边，值在右边。  可以将Series看成是一个定长的有序字典，因为它是索引值到数据值的一个映射。

# 创建Series对象
常用的创建方式：`Series(data)`、`Series(data, index=index)`、`Series(dict)`、`Series(dict, index=keys)`

In [7]:
import pandas as pd
from pandas import Series, DataFrame
data = [i for i in range(20) if i % 5 == 0]
obj = pd.Series(data)
obj

0     0
1     5
2    10
3    15
dtype: int64

In [8]:
index = ['a', 'b', 'c', 'd']
obj1 = pd.Series(data, index=index)
obj1

a     0
b     5
c    10
d    15
dtype: int64

In [9]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj2 = pd.Series(sdata)
obj2

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [10]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj3 = pd.Series(sdata, index=states)
obj3

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [11]:
obj3.index
obj3.values
pd.isnull(obj3)
pd.isna(obj3)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [12]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# 创建DataFrame
常用的创建方式：`DataFrame({key: series})`、`DataFrame(table, columns=columns)`、`DataFrame(table, columns=columns, index=index)`

In [13]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [14]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [15]:
frame1 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
frame1

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [16]:
frame1.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

# 索引对象、方法、属性
- loc(标签)
- iloc(整数)

In [19]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [20]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [21]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [22]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

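In [42]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data.loc['Colorado', ['two', 'three']]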

two      5
three    6
Name: Colorado, dtype: int32

In [43]:
data.iloc[2, [3, 0, 2]]

four     11
one       8
three    10
Name: Utah, dtype: int32

# 缺失值处理

In [44]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'cvocado'])

In [45]:
string_data

0     aardvark
1    artichoke
2          NaN
3      cvocado
dtype: object

In [46]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [47]:
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [48]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [49]:
from numpy import nan as NA

In [50]:
data = pd.Series([1, NA, 5, NA, 8])
data.dropna()

0    1.0
2    5.0
4    8.0
dtype: float64

In [51]:
data[data.notnull()]

0    1.0
2    5.0
4    8.0
dtype: float64

In [52]:
data = pd.DataFrame([[1, 5, 6], [1, NA, NA], [NA, NA, NA], [NA, 5, 2]])
data

Unnamed: 0,0,1,2
0,1.0,5.0,6.0
1,1.0,,
2,,,
3,,5.0,2.0


In [53]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,5.0,6.0


In [54]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,5.0,6.0
1,1.0,,
3,,5.0,2.0


In [55]:
data[4] = NA

In [56]:
data

Unnamed: 0,0,1,2,4
0,1.0,5.0,6.0,
1,1.0,,,
2,,,,
3,,5.0,2.0,


In [57]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,5.0,6.0
1,1.0,,
2,,,
3,,5.0,2.0


In [59]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,1.205254,0.619788,-0.544577
1,-0.164286,-1.156731,-1.723789
2,1.154312,-1.12543,-0.228787
3,-1.0948,0.182234,-2.963736
4,-0.539134,-0.117883,0.209211
5,-0.643996,0.457459,-0.788414
6,-1.798608,1.836747,1.160875


In [60]:
df.iloc[:4, 1] = NA

In [61]:
df.iloc[:2, 2] = NA

In [62]:
df

Unnamed: 0,0,1,2
0,1.205254,,
1,-0.164286,,
2,1.154312,,-0.228787
3,-1.0948,,-2.963736
4,-0.539134,-0.117883,0.209211
5,-0.643996,0.457459,-0.788414
6,-1.798608,1.836747,1.160875


In [63]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.539134,-0.117883,0.209211
5,-0.643996,0.457459,-0.788414
6,-1.798608,1.836747,1.160875


In [64]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.154312,,-0.228787
3,-1.0948,,-2.963736
4,-0.539134,-0.117883,0.209211
5,-0.643996,0.457459,-0.788414
6,-1.798608,1.836747,1.160875


In [66]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.205254,0.0,0.0
1,-0.164286,0.0,0.0
2,1.154312,0.0,-0.228787
3,-1.0948,0.0,-2.963736
4,-0.539134,-0.117883,0.209211
5,-0.643996,0.457459,-0.788414
6,-1.798608,1.836747,1.160875


In [67]:
df.fillna({1:0.5, 2:0})

Unnamed: 0,0,1,2
0,1.205254,0.5,0.0
1,-0.164286,0.5,0.0
2,1.154312,0.5,-0.228787
3,-1.0948,0.5,-2.963736
4,-0.539134,-0.117883,0.209211
5,-0.643996,0.457459,-0.788414
6,-1.798608,1.836747,1.160875


In [68]:
df.fillna(0, inplace=True)

In [69]:
df

Unnamed: 0,0,1,2
0,1.205254,0.0,0.0
1,-0.164286,0.0,0.0
2,1.154312,0.0,-0.228787
3,-1.0948,0.0,-2.963736
4,-0.539134,-0.117883,0.209211
5,-0.643996,0.457459,-0.788414
6,-1.798608,1.836747,1.160875


In [70]:
df = pd.DataFrame({'k1':['one', 'two'] * 3 + ['two'],
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [71]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [72]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [73]:
df = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                            'Pastrami', 'corned beef', 'Bacon',
                            'pastrami', 'honey ham', 'nova lox'],
                   'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})


In [74]:
df

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [75]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])


In [76]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [77]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [78]:
age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
age

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [79]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(age, bins)

In [81]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [82]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [83]:
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [84]:
pd.cut(age, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [85]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.017372,-0.032124,-0.018907,-0.029156
std,1.003735,0.987042,0.981933,1.011484
min,-3.281215,-4.199659,-2.803232,-3.527656
25%,-0.683104,-0.703133,-0.669977,-0.692925
50%,-0.006132,-0.029653,0.000482,-0.030475
75%,0.747556,0.629519,0.650081,0.611962
max,3.474955,3.137596,3.473935,3.096573


In [89]:
col = data[3]
col[np.abs(col) > 3]

247    3.096573
271   -3.527656
347   -3.248131
557   -3.037978
Name: 3, dtype: float64

In [92]:
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
55,3.474955,-1.252484,-0.335394,-0.458383
65,0.237023,-4.199659,-0.464229,0.297923
247,0.216141,1.341801,-0.821292,3.096573
271,0.330552,1.116371,-0.054725,-3.527656
321,0.176774,-3.349604,-0.409875,-0.135464
347,-0.092992,0.566816,-1.090054,-3.248131
529,0.735714,1.717256,3.473935,-0.951519
557,-0.512901,0.777808,0.640933,-3.037978
592,-3.281215,-0.010456,-1.712856,1.672983
922,0.103437,3.137596,-1.151098,0.371219


In [93]:
df = pd.DataFrame(np.arange(20).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [94]:
sampler = np.random.permutation(5)

In [95]:
sampler

array([1, 2, 3, 4, 0])

In [96]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


In [97]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [98]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [99]:
import re

In [100]:
text = "foo   bar\t baz \tqux"
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [101]:
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [102]:
regex.findall(text)

['   ', '\t ', ' \t']

In [104]:
data = pd.DataFrame({'x0': [1, 2, 3, 4, 5],
                     'x1': [0.01, -0.01, 0.25, -4.1, 0.],
                     'y': [-1.5, 0., 3.6, 1.3, -2.]})
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [105]:
data.columns

Index(['x0', 'x1', 'y'], dtype='object')

In [108]:
data.values

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [109]:
df2 = pd.DataFrame(data.values, columns=['one', 'two', 'three'])

In [110]:
df2

Unnamed: 0,one,two,three
0,1.0,0.01,-1.5
1,2.0,-0.01,0.0
2,3.0,0.25,3.6
3,4.0,-4.1,1.3
4,5.0,0.0,-2.0


In [111]:
df3 = data.copy()

In [113]:
df3['strings'] = ['a', 'b', 'c', 'd', 'e']
df3

Unnamed: 0,x0,x1,y,strings
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,c
3,4,-4.1,1.3,d
4,5,0.0,-2.0,e


In [114]:
df3.values

array([[1, 0.01, -1.5, 'a'],
       [2, -0.01, 0.0, 'b'],
       [3, 0.25, 3.6, 'c'],
       [4, -4.1, 1.3, 'd'],
       [5, 0.0, -2.0, 'e']], dtype=object)

In [116]:
narr = df3.values

In [117]:
narr

array([[1, 0.01, -1.5, 'a'],
       [2, -0.01, 0.0, 'b'],
       [3, 0.25, 3.6, 'c'],
       [4, -4.1, 1.3, 'd'],
       [5, 0.0, -2.0, 'e']], dtype=object)

In [118]:
narr.shape

(5, 4)

In [121]:
narr.dtype

dtype('O')

In [122]:
model_cols = ['x0', 'x1']
data.loc[:, model_cols].values

array([[ 1.  ,  0.01],
       [ 2.  , -0.01],
       [ 3.  ,  0.25],
       [ 4.  , -4.1 ],
       [ 5.  ,  0.  ]])

In [125]:
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})

In [126]:
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


ModuleNotFoundError: No module named 'pasty'

注：上面的报错是由于模块名拼写错误造成的——正确的库名是 patsy（不是 pasty）。应使用 `import patsy`；如果尚未安装，请先执行 `pip install patsy`。