In [6]:
import numpy as np
import pandas as pd
%matplotlib notebook
'''
数据过滤:  fillna, list迭代过滤, isin, dropna, drop_duplicates
'''

'''
fillna: 填充缺失数据   
DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)
   Fill NA/NaN values using the specified method

method : {‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}, default None

'''
# Build a 7x3 frame of standard-normal draws, then blank out columns 1 and 2
# in the first four rows so there is missing data to fill below.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
df.loc[:3, [1, 2]] = np.nan  # label-based slice is inclusive: rows 0..3
df

Unnamed: 0,0,1,2
0,1.144615,,
1,0.751341,,
2,0.35279,,
3,-0.038378,,
4,-0.446342,-0.547458,-1.518009
5,-2.05114,-0.165499,1.24569
6,0.809246,1.685742,-0.647956


In [12]:
# df.fillna(0)   # fill every NA with 0
df.fillna({1: 0.5, 2: 0})  # per-column fill values: column 1 -> 0.5, column 2 -> 0

Unnamed: 0,0,1,2
0,1.144615,0.5,0.0
1,0.751341,0.5,0.0
2,0.35279,0.5,0.0
3,-0.038378,0.5,0.0
4,-0.446342,-0.547458,-1.518009
5,-2.05114,-0.165499,1.24569
6,0.809246,1.685742,-0.647956


In [23]:
# Replace the NaNs created above with 0, then blank columns 1-2 from row 4 on,
# leaving NaNs that have a valid value to their left in the same row.
df = df.fillna(0)
df.iloc[4:, 1:3] = np.nan
df

Unnamed: 0,0,1,2
0,1.144615,0.0,0.0
1,0.751341,0.0,0.0
2,0.35279,0.0,0.0
3,-0.038378,0.0,0.0
4,-0.446342,,
5,-2.05114,,
6,0.809246,,


In [26]:
# Forward-fill along each row (axis=1): a NaN takes the value from the column to
# its left, and limit=1 fills at most one consecutive NaN per gap.
# ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated
# in recent pandas releases.
df.ffill(axis=1, limit=1)

Unnamed: 0,0,1,2
0,1.144615,0.0,0.0
1,0.751341,0.0,0.0
2,0.35279,0.0,0.0
3,-0.038378,0.0,0.0
4,-0.446342,-0.446342,
5,-2.05114,-2.05114,
6,0.809246,0.809246,


In [28]:
'''
list迭代过滤:  迭代例子  写的非常精妙 目前只能看懂  写的话勉勉强强  

for x in xyz:
    if x not in a:
        print(x)

print([x for x in xyz if x in a])

gen = (x for x in xyz if x not in a)
for x in gen:
    print(x)
'''
import json

# Each line of the file is parsed as one JSON document. The `with` block closes
# the file handle as soon as reading finishes instead of leaving it open.
with open('datasets/bitly_usagov/example.txt') as lines:
    records = [json.loads(line) for line in lines]
# Keep the 'tz' value of every record that has a 'tz' key; others are skipped.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]      # first ten entries of the resulting list

['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']

In [29]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of ``count_dict`` as ``(count, key)`` pairs.

    Parameters
    ----------
    count_dict : dict
        Mapping of key -> count.
    n : int, default 10
        Maximum number of pairs to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of tuple
        ``(count, key)`` pairs sorted in ascending order, so the pair with the
        largest count comes last. Fewer than ``n`` pairs are returned when the
        mapping is smaller than ``n``.
    """
    if n <= 0:
        # ``pairs[-0:]`` is the same as ``pairs[0:]`` and would return every pair.
        return []
    # Put the count first so tuple comparison orders by count (ties fall back to the key).
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

In [31]:
'''
isin: DataFrame.isin(values)  
    Return boolean DataFrame showing whether each element in the DataFrame is contained in values.

'''
# Small mixed-type frame; isin() with a list tests every cell against the
# same set of candidate values, regardless of column.
df = pd.DataFrame({'A': [1, 2, 3], 'B': list('abf')})
df.isin(values=[1, 3, 12, 'a'])

Unnamed: 0,A,B
0,True,True
1,False,False
2,True,False


In [32]:
# With a dict, isin() checks each column only against the values listed
# under that column's key.
df = pd.DataFrame(dict(A=[1, 2, 3], B=[1, 4, 7]))
df.isin(dict(A=[1, 3], B=[4, 7, 12]))

Unnamed: 0,A,B
0,True,False
1,False,True
2,True,True


In [33]:
df[df.isin([1, 3, 12, 'a'])]  # filter via the boolean frame: matching cells are kept, the rest show as NaN in the output

Unnamed: 0,A,B
0,1.0,1.0
1,,
2,3.0,


In [35]:
'''
dropna: 
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
   Return object with labels on given axis omitted where alternately any or all of the data are missing
'''
# Random 7x3 frame with a staggered pattern of missing values:
# column 1 is NaN in rows 0..3, column 2 is NaN in rows 0..1.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
df.loc[:3, 1] = np.nan  # label slices are inclusive
df.loc[:1, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-1.162218,,
1,-0.854831,,
2,0.565986,,0.354174
3,-0.527496,,-0.250903
4,-0.958817,-0.331943,-1.076831
5,1.681882,-1.29802,0.753992
6,-0.451662,-1.106121,-1.364268


In [45]:
df.dropna(thresh=2)   # keep only rows that have at least 2 non-NaN values

Unnamed: 0,0,1,2
2,0.565986,,0.354174
3,-0.527496,,-0.250903
4,-0.958817,-0.331943,-1.076831
5,1.681882,-1.29802,0.753992
6,-0.451662,-1.106121,-1.364268


In [46]:
df.dropna(thresh=1)  # keep only rows that have at least 1 non-NaN value

Unnamed: 0,0,1,2
0,-1.162218,,
1,-0.854831,,
2,0.565986,,0.354174
3,-0.527496,,-0.250903
4,-0.958817,-0.331943,-1.076831
5,1.681882,-1.29802,0.753992
6,-0.451662,-1.106121,-1.364268


In [47]:
df.dropna(thresh=3) # keep only rows that have at least 3 non-NaN values

Unnamed: 0,0,1,2
4,-0.958817,-0.331943,-1.076831
5,1.681882,-1.29802,0.753992
6,-0.451662,-1.106121,-1.364268


In [48]:
'''
drop_duplicates:   DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)
    Return DataFrame with duplicate rows removed, optionally only considering certain columns

'''
# Seven rows in which the last row repeats the one before it ('two', 4),
# giving drop_duplicates something to remove.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [49]:
data.duplicated()  # duplicated vs drop_duplicates: this one returns a boolean Series marking rows that repeat an earlier row

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [50]:
data.drop_duplicates()  # whereas drop_duplicates returns the data with the duplicate rows removed

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [57]:
# Add a column v1 holding 0..6 (one value per row); this mutates `data` in place.
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [59]:
data.drop_duplicates(['k1'], keep='first')  # compare on column k1 only; keep the first row seen for each k1 value

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [60]:
data.drop_duplicates(['k1'], keep='last')  # compare on column k1 only; keep the last row seen for each k1 value

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6
