In [1]:
import pandas as pd
import numpy as np
# Demo frame for the missing-value examples, written column by column:
# A and B have some gaps, C is entirely NaN, and D is complete.
df = pd.DataFrame(
    {
        "A": [np.nan, 3, np.nan, np.nan],
        "B": [2, 4, np.nan, 3],
        "C": [np.nan, np.nan, np.nan, np.nan],
        "D": [0, 1, 4, 4],
    }
)
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,4
3,,3.0,,4


In [2]:
# Replace every NaN in the frame with the scalar 0; a new frame is returned
# and df itself is not modified.
df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,4
3,0.0,3.0,0.0,4


In [3]:
# Forward fill: propagate the last valid value downwards within each column.
# Leading NaNs (with no earlier value to copy) stay NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,4
3,3.0,3.0,,4


In [4]:
# Fill with a different replacement per column: the mapping's keys are column
# labels and its values are what NaNs in that column become.
values = dict(A=0, B=1, C=2, D=3)
df.fillna(values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,4
3,0.0,3.0,2.0,4


In [5]:
# Backward fill: propagate the next valid value upwards within each column.
# Trailing NaNs (with no later value to copy) stay NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,,3.0,,4
3,,3.0,,4


In [6]:
# Per-column fill values again, but capped with limit=1: in the output below
# only the first NaN of each column is replaced and the rest stay NaN.
values = dict(zip("ABCD", range(4)))
df.fillna(values, limit=1)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,,1
2,,1.0,,4
3,,3.0,,4


In [10]:
# 按照指定的值进行填充,限制数量,并替换原来的值
values = {"A":0,"B":1,"C":2,"D":3}
df.fillna(value=values,inplace=True)
df

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,4
3,0.0,3.0,2.0,4


In [12]:
## Detecting duplicate rows
## (DataFrame.duplicated)
# Load the CSV used by the duplicate-handling examples and preview the first rows.
data = pd.read_csv(filepath_or_buffer='data1.csv')
data.head(n=5)

Unnamed: 0,orderid,bikeid,userid,start_time,start_location_x,start_location_y,end_time,end_location_x,end_location_y,track
0,78387,158357,10080,2016-08-20 06:57,121.348,31.389,2016-08-20 07:04,121.357,31.388,"121.347,31.392#121.348,31.389#121.349,31.390#1..."
1,891333,92776,6605,2016-08-29 19:09,121.508,31.279,2016-08-29 19:31,121.489,31.271,"121.489,31.270#121.489,31.271#121.490,31.270#1..."
2,1106623,152045,8876,2016-08-13 16:17,121.383,31.254,2016-08-13 16:36,121.405,31.248,"121.381,31.251#121.382,31.251#121.382,31.252#1..."
3,1389484,196259,10648,2016-08-23 21:34,121.484,31.32,2016-08-23 21:43,121.471,31.325,"121.471,31.325#121.472,31.325#121.473,31.324#1..."
4,188537,78208,11735,2016-08-16 07:32,121.407,31.292,2016-08-16 07:41,121.418,31.288,"121.407,31.291#121.407,31.292#121.408,31.291#1..."


In [14]:
# `subset` defines what counts as a duplicate.
# By default a row is a duplicate only if every column matches an earlier row.
# Passing `subset` restricts the comparison to the listed columns instead.
# This call spells out the defaults: compare all columns, keep the first occurrence.
data.duplicated(subset=None, keep='first')

0         False
1         False
2         False
3         False
4         False
          ...  
102356    False
102357    False
102358    False
102359    False
102360    False
Length: 102361, dtype: bool

In [15]:
## Show the duplicated rows by using the boolean result as a row mask
data.loc[data.duplicated()]

Unnamed: 0,orderid,bikeid,userid,start_time,start_location_x,start_location_y,end_time,end_location_x,end_location_y,track


In [16]:
# Flag rows whose userid has already appeared in an earlier row;
# only the userid column is compared.
data.duplicated(subset='userid', keep='first')

0         False
1         False
2         False
3         False
4         False
          ...  
102356     True
102357     True
102358     True
102359     True
102360     True
Length: 102361, dtype: bool

In [17]:
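## The `keep` parameter decides which occurrence of a duplicate is retained: 'first' (the default) keeps the first occurrence, 'last' keeps the last one, and False keeps none of them (every duplicated row is flagged/dropped).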

In [18]:
## Remove duplicate rows
# Defaults spelled out: compare all columns and keep the first occurrence.
# A new frame is returned; `data` is not modified.
data.drop_duplicates(subset=None, keep='first')

Unnamed: 0,orderid,bikeid,userid,start_time,start_location_x,start_location_y,end_time,end_location_x,end_location_y,track
0,78387,158357,10080,2016-08-20 06:57,121.348,31.389,2016-08-20 07:04,121.357,31.388,"121.347,31.392#121.348,31.389#121.349,31.390#1..."
1,891333,92776,6605,2016-08-29 19:09,121.508,31.279,2016-08-29 19:31,121.489,31.271,"121.489,31.270#121.489,31.271#121.490,31.270#1..."
2,1106623,152045,8876,2016-08-13 16:17,121.383,31.254,2016-08-13 16:36,121.405,31.248,"121.381,31.251#121.382,31.251#121.382,31.252#1..."
3,1389484,196259,10648,2016-08-23 21:34,121.484,31.320,2016-08-23 21:43,121.471,31.325,"121.471,31.325#121.472,31.325#121.473,31.324#1..."
4,188537,78208,11735,2016-08-16 07:32,121.407,31.292,2016-08-16 07:41,121.418,31.288,"121.407,31.291#121.407,31.292#121.408,31.291#1..."
...,...,...,...,...,...,...,...,...,...,...
102356,1479550,214335,1423,2016-08-28 18:03,121.478,31.297,2016-08-28 18:09,121.481,31.304,"121.478,31.297#121.479,31.297#121.480,31.297#1..."
102357,1478273,160487,3067,2016-08-14 20:22,121.320,31.238,2016-08-14 20:28,121.312,31.235,"121.312,31.235#121.313,31.235#121.313,31.236#1..."
102358,367733,179530,12746,2016-08-27 09:54,121.391,31.307,2016-08-27 10:06,121.398,31.306,"121.390,31.307#121.391,31.307#121.392,31.307#1..."
102359,64915,167419,837,2016-08-20 06:04,121.515,31.269,2016-08-20 06:10,121.510,31.272,"121.510,31.272#121.510,31.273#121.511,31.273#1..."


In [19]:
# Keep only the first row seen for each userid; only that column is compared.
data.drop_duplicates(subset='userid', keep='first')

Unnamed: 0,orderid,bikeid,userid,start_time,start_location_x,start_location_y,end_time,end_location_x,end_location_y,track
0,78387,158357,10080,2016-08-20 06:57,121.348,31.389,2016-08-20 07:04,121.357,31.388,"121.347,31.392#121.348,31.389#121.349,31.390#1..."
1,891333,92776,6605,2016-08-29 19:09,121.508,31.279,2016-08-29 19:31,121.489,31.271,"121.489,31.270#121.489,31.271#121.490,31.270#1..."
2,1106623,152045,8876,2016-08-13 16:17,121.383,31.254,2016-08-13 16:36,121.405,31.248,"121.381,31.251#121.382,31.251#121.382,31.252#1..."
3,1389484,196259,10648,2016-08-23 21:34,121.484,31.320,2016-08-23 21:43,121.471,31.325,"121.471,31.325#121.472,31.325#121.473,31.324#1..."
4,188537,78208,11735,2016-08-16 07:32,121.407,31.292,2016-08-16 07:41,121.418,31.288,"121.407,31.291#121.407,31.292#121.408,31.291#1..."
...,...,...,...,...,...,...,...,...,...,...
101915,479197,138039,15623,2016-08-25 20:17,121.477,31.282,2016-08-25 20:23,121.470,31.283,"121.470,31.283#121.471,31.283#121.472,31.283#1..."
102041,1723586,50072,17481,2016-08-31 21:20,121.494,31.307,2016-08-31 21:26,121.495,31.315,"121.494,31.307#121.495,31.307#121.495,31.308#1..."
102096,1066441,295784,17753,2016-08-31 13:08,121.513,31.302,2016-08-31 13:15,121.507,31.304,"121.507,31.303#121.508,31.303#121.509,31.303#1..."
102175,590021,71898,13606,2016-08-21 17:16,121.482,31.268,2016-08-21 17:24,121.486,31.272,"121.482,31.268#121.483,31.268#121.484,31.268#1..."
