In [1]:
#导入np, plt, pd等包
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make the graphs a bit prettier, and bigger
plt.style.use('default')

# This is necessary to show lots of columns in pandas 0.12. 
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

plt.rcParams['figure.figsize'] = (15, 5)

**混乱数据的一个重要问题是：你怎么知道它是不是混乱的？**

**我们还是使用NYC311数据框架**

In [2]:
request = pd.read_csv('data/311-service-requests.csv')

  interactivity=interactivity, compiler=compiler, result=result)


# 7.1 如何知道是否混乱？

**我们先看几列。**

**一般情况下，想看一个列是否存在问题，我会使用.unique()来看它的所有值。如果它是数字列，我将画一张图来感觉下数值分布**

**当我们看‘Incident Zip’中的unique时，我们立马知道里面是混乱的**

**其中的一些问题**

**1. 有些是字符串，有些是浮点类型**

**2. 存在一些Nans**

**3. 一些邮编是29616-0759 或者 83**

**4. 有些N/A，pandas并不能识别，比如‘N/A’ 和“NO CLUE”**

**我们可以如下操作：**

**1. 将所有‘N/A’或者‘NO CLUE’换为正规的nan**

**2. 看看83发生了什么，在决定怎么办**

**3. 让所有值都成为字符串**

In [3]:
request['Incident Zip'].unique()

array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0,
       11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0,
       11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0,
       10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0,
       11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0,
       10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0,
       10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0,
       10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0,
       11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0,
       10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0,
       10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0,
       10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0,
       11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0,
       10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0,
       11209.0,

# 7.2 修复Nan值以及字符/浮点数冲突

**我们可以给pd.read_csv里面添加na_values选项来清理一部分，也可以将Incident Zip声明为字符串，而不是浮点数**

In [4]:
na_values = ['NO CLUE', 'N/A', '0']
request = pd.read_csv('data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})

In [5]:
request['Incident Zip'].unique()

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209',

# 7.3 那些横线是干嘛的？

In [6]:
rows_with_dash = request['Incident Zip'].str.contains('-').fillna(False)
len(request[rows_with_dash])

5

In [7]:
request[rows_with_dash]

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,School Name,School Number,School Region,School Code,School Phone Number,School Address,School City,School State,School Zip,School Not Found,School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
29136,26550551,10/24/2013 06:16:34 PM,,DCA,Department of Consumer Affairs,Consumer Complaint,False Advertising,,77092-2016,2700 EAST SELTICE WAY,EAST SELTICE WAY,,,,,,HOUSTON,,,Assigned,11/13/2013 11:15:20 AM,10/29/2013 11:16:16 AM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,
30939,26548831,10/24/2013 09:35:10 AM,,DCA,Department of Consumer Affairs,Consumer Complaint,Harassment,,55164-0737,P.O. BOX 64437,64437,,,,,,ST. PAUL,,,Assigned,11/13/2013 02:30:21 PM,10/29/2013 02:31:06 PM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,
70539,26488417,10/15/2013 03:40:33 PM,,TLC,Taxi and Limousine Commission,Taxi Complaint,Driver Complaint,Street,11549-3650,365 HOFSTRA UNIVERSITY,HOFSTRA UNIVERSITY,,,,,,HEMSTEAD,,,Assigned,11/30/2013 01:20:33 PM,10/16/2013 01:21:39 PM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,La Guardia Airport,,,,,,,,,,
85821,26468296,10/10/2013 12:36:43 PM,10/26/2013 01:07:07 AM,DCA,Department of Consumer Affairs,Consumer Complaint,Debt Not Owed,,29616-0759,PO BOX 25759,BOX 25759,,,,,,GREENVILLE,,,Closed,10/26/2013 09:20:28 AM,10/26/2013 01:07:07 AM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,
89304,26461137,10/09/2013 05:23:46 PM,10/25/2013 01:06:41 AM,DCA,Department of Consumer Affairs,Consumer Complaint,Harassment,,35209-3114,600 BEACON PKWY,BEACON PKWY,,,,,,BIRMINGHAM,,,Closed,10/25/2013 02:43:42 PM,10/25/2013 01:06:41 AM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,,,,,,,,,,,


**这些可能是缺失数据，所以刚开始我像下面这样删除了：**

**request['Incident Zip'][rows_with_dash] = np.nan**

**后来，我朋友Dave指出这种9个字符的邮编是正常的，我们看看字符超过5个的的邮编，确定没问题之后再清空**

In [9]:
long_zip_codes = request['Incident Zip'].str.len() > 5
request['Incident Zip'][long_zip_codes].unique()

array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759',
       '35209-3114'], dtype=object)

**这些看起来都可以清除**

In [10]:
request['Incident Zip'] = request['Incident Zip'].str.slice(0, 5)

**完成！**

**我们再看看一些特殊的邮编，比如‘00000’**

In [12]:
request[request['Incident Zip'] == '00000']

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,School Name,School Number,School Region,School Code,School Phone Number,School Address,School City,School State,School Zip,School Not Found,School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
42600,26529313,10/22/2013 02:51:06 PM,,TLC,Taxi and Limousine Commission,Taxi Complaint,Driver Complaint,,0,EWR EWR,EWR,,,,,,NEWARK,,,Assigned,12/07/2013 09:53:51 AM,10/23/2013 09:54:43 AM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,Other,,,,,,,,,,
60843,26507389,10/17/2013 05:48:44 PM,,TLC,Taxi and Limousine Commission,Taxi Complaint,Driver Complaint,Street,0,1 NEWARK AIRPORT,NEWARK AIRPORT,,,,,,NEWARK,,,Assigned,12/02/2013 11:59:46 AM,10/18/2013 12:01:08 PM,0 Unspecified,Unspecified,,,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,N,,,,Other,,,,,,,,,,


**这个看起来很奇怪，我们把它们设为nan**

In [13]:
zero_zips = request['Incident Zip'] == '00000'
request.loc[zero_zips, 'Incident Zip'] == np.nan

42600    False
60843    False
Name: Incident Zip, dtype: bool

**现在我们再看看**

In [21]:
unique_zips = request['Incident Zip'].unique()
unique_zips

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209',

In [22]:
len(unique_zips)

247

In [24]:
zips = request['Incident Zip']
# 我们先认为0或者1开头的邮编是正常的
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# 里面有很多Nan，但我们现在不关注这些数据
is_far = ~(is_close) & zips.notnull()

In [25]:
request[is_far][['Incident Zip', 'Descriptor', 'City']].sort_values('Incident Zip')

Unnamed: 0,Incident Zip,Descriptor,City
71834,23502,Harassment,NORFOLK
47048,23541,Harassment,NORFOLK
85821,29616,Debt Not Owed,GREENVILLE
89304,35209,Harassment,BIRMINGHAM
94201,41042,Harassment,FLORENCE
30939,55164,Harassment,ST. PAUL
80573,61702,Billing Dispute,BLOOMIGTON
13450,70711,Contract Dispute,CLIFTON
12102,77056,Debt Not Owed,HOUSTON
29136,77092,False Advertising,HOUSTON


**用邮编看太累了，我们把用城市名称来看一下**

In [26]:
request['City'].str.upper().value_counts()

BROOKLYN                  31662
NEW YORK                  22664
BRONX                     18438
STATEN ISLAND              4766
JAMAICA                    2246
FLUSHING                   1803
ASTORIA                    1568
RIDGEWOOD                  1073
CORONA                      707
OZONE PARK                  693
LONG ISLAND CITY            678
FAR ROCKAWAY                652
ELMHURST                    647
WOODSIDE                    609
EAST ELMHURST               562
QUEENS VILLAGE              549
FOREST HILLS                541
JACKSON HEIGHTS             541
SOUTH RICHMOND HILL         521
MASPETH                     473
WOODHAVEN                   464
FRESH MEADOWS               435
SPRINGFIELD GARDENS         434
BAYSIDE                     411
SOUTH OZONE PARK            410
RICHMOND HILL               404
REGO PARK                   402
MIDDLE VILLAGE              396
SAINT ALBANS                387
WHITESTONE                  348
                          ...  
ROSLYN  

**这些看起来都是合法的，所以我们留下来**

# 7.4 合并

**下面，我们把所有清理工作合并在一起**

In [28]:
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('data/311-service-requests.csv', 
                       na_values=na_values, 
                       dtype={'Incident Zip': str})

In [29]:
def fix_zip_codes(zips):
    # Truncate everything to length 5 
    zips = zips.str.slice(0, 5)
    
    # Set 00000 zip codes to nan
    zero_zips = zips == '00000'
    zips[zero_zips] = np.nan
    
    return zips

In [30]:
requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])

In [31]:
requests['Incident Zip'].unique()

array(['11432', '11378', '10032', '10023', '10027', '11372', '11419',
       '11417', '10011', '11225', '11218', '10003', '10029', '10466',
       '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016',
       '10305', '10312', '10026', '10309', '10036', '11433', '11235',
       '11213', '11379', '11101', '10014', '11231', '11234', '10457',
       '10459', '10465', '11207', '10002', '10034', '11233', '10453',
       '10456', '10469', '11374', '11221', '11421', '11215', '10007',
       '10019', '11205', '11418', '11369', '11249', '10005', '10009',
       '11211', '11412', '10458', '11229', '10065', '10030', '11222',
       '10024', '10013', '11420', '11365', '10012', '11214', '11212',
       '10022', '11232', '11040', '11226', '10281', '11102', '11208',
       '10001', '10472', '11414', '11223', '10040', '11220', '11373',
       '11203', '11691', '11356', '10017', '10452', '10280', '11217',
       '10031', '11201', '11358', '10128', '11423', '10039', '10010',
       '11209',