# 数据清洗和准备

-----处理缺失数据-----

In [11]:
import numpy as np
import pandas as pd


# 滤除缺失数据
from numpy import nan as NA
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
data.dropna()
data.dropna(how='all')     #how='all'将只丢弃全为NA的那些行或列

data[4]=NA
# data
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.309378,,-1.593937
3,1.471946,,0.707353
4,-0.768693,-0.072991,-0.24445
5,0.526145,-0.85145,-3.134424
6,-1.095725,0.800917,1.568644


In [13]:
# 填充缺失数据
df.fillna(0)

Unnamed: 0,0,1,2
0,2.362131,0.0,0.0
1,1.152228,0.0,0.0
2,-1.309378,0.0,-1.593937
3,1.471946,0.0,0.707353
4,-0.768693,-0.072991,-0.24445
5,0.526145,-0.85145,-3.134424
6,-1.095725,0.800917,1.568644


In [14]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-1.32029,1.133326,-0.396693
1,0.791776,-0.246003,-1.22702
2,1.975507,-0.246003,-0.94739
3,-0.891771,-0.246003,1.169528
4,-0.150664,-0.246003,1.169528
5,-0.591936,-0.246003,1.169528


In [15]:
data=pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

-----数据转换-----

In [18]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],'k2': [1, 1, 2, 3, 3, 4, 4]})
data.drop_duplicates(keep='last')     #保留最后一个

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


In [30]:
# 利用函数或映射进行数据转换
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon','pastrami', 'honey ham','nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3,5, 6]})
print(data)
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
lowercased=data['food'].str.lower()
lowercased
data['animal']=lowercased.map(meat_to_animal)
data
# data['food'].map(lambda x:meat_to_animal[x.lower()])

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [33]:
# 替换值
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace([-999,-1000],[NA,0])
# data.replace({-999:NA,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [59]:
# 重命名轴索引
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
print(data)
transform=lambda x : x[:4].upper()
data.index=data.index.map(transform)
data
data.rename(index=str.title, columns=str.upper,inplace=True)     #就地修改某个数据集
data

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11


Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [73]:
data=pd.DataFrame(np.random.randn(1000,4))
data.describe()
# 找出某列中绝对值大小超过
data[2][np.abs(col)>3]

137   -0.245261
205   -2.159858
904   -0.850841
Name: 2, dtype: float64

In [92]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})
print(df)
# DataFrame的某一列中含有k个不同的值，则可以派生出一个k列矩阵或DataFrame（其值全为1和0）
dummies=pd.get_dummies(df['key'],prefix='keys')
df[['data1']].join(dummies)
# print(df['data1'])
# print(df[['data1']])

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5


Unnamed: 0,data1,keys_a,keys_b,keys_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [130]:
# DataFrame中的某行同属于多个分类，例下genres
mnames=['movie_id','title','genres']
movies=pd.read_csv('E:\\dwg\\Fireflies\\useFiles\\ml-latest-small\\movies.csv',header=None,names=mnames)
movies[:10]
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))     #extend(seq)函数用于在列表末尾一次性追加另一个序列中的多个值（用新列表扩展原来的列表）
genres=pd.unique(all_genres)
genres
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)
# dummies
# gen = movies.genres[1]
# gen
# gen.split('|')
# dummies.columns.get_indexer(gen.split('|'))
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))     #根据dummies.columns获得每个切分字段（类型）的列索引
    dummies.iloc[i, indices] = 1
dummies
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Adventure,Genre_Animation,Genre_Children,Genre_Comedy,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Horror,Genre_Mystery,Genre_Sci-Fi,Genre_Documentary,Genre_IMAX,Genre_War,Genre_Musical,Genre_Western,Genre_Film-Noir,Genre_(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
np.random.seed(12345)     #每次生成的随机数都相同
values=np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
# values

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


-----字符串操作-----

In [205]:
# 正则表达式
import re
text = "foo bar\t baz \tqux"
# 如果打算对许多字符串应用同一条正则表达式，强烈建议通过re.compile创建regex对象。这样将可以节省大量的CPU时间
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [None]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'     #匹配mail
regex=re.compile(pattern, flags=re.IGNORECASE)     #不区分大小写

-----pandas的矢量化字符串函数-----