In [1]:
# Load the crime CSV into an RDD. use_unicode=False keeps the lines as raw,
# undecoded strings; they are decoded after csv parsing.
# Replace the path with the location of the file on your machine.
# Alternative input: 'file:/Users/zhuorulin/Documents/DataScience/datasets/crime.csv'
crime_csv_path = 'file:/Users/zhuorulin/Documents/DataScience/datasets/NYPD_Complaint_Data_Historic.csv'
crime_csv = sc.textFile(crime_csv_path, use_unicode=False)

In [19]:
# Convert csv to DataFrame
from csv import reader # Warning: csv.reader does not support unicode decode
from pyspark.sql import SQLContext
from collections import defaultdict
import datetime
import re

In [3]:
# Parse the raw lines of each partition with csv.reader, then decode every
# field of every row with the utf-8 codec.
parsed_rows = crime_csv.mapPartitions(reader)
lines_rdd = parsed_rows.map(lambda fields: [field.decode('utf-8') for field in fields])
# The first row holds the column names; it also serves as an
# index -> column name lookup table.
schemas = lines_rdd.take(1)[0]
# Keep only rows that differ from the header row.
lines = lines_rdd.filter(lambda row: row != schemas)

In [4]:
# Reverse lookup table: column name -> column index.
# Built as a plain dict: a defaultdict() created without a default_factory
# raises KeyError on missing keys exactly like a dict, so it only obscured
# the intent.
colname2idx = {colname: idx for idx, colname in enumerate(schemas)}
# Example
print(colname2idx['CMPLNT_TO_DT'])

3


# Null Checking Global

In [5]:
def checkNull(string):
    """Classify a single field as 'NULL' or 'VALID'.

    The field is assumed to be a unicode string. It counts as 'NULL' when it
    is empty (length 0) or is exactly the text 'nan'; any other value is
    'VALID'.
    """
    is_missing = len(string) == 0 or string == 'nan'
    return 'NULL' if is_missing else 'VALID'

In [6]:
# Example: label every field of every row as 'NULL' or 'VALID'
# and show the labels of the first row.
NULL_TABLE = lines.map(lambda row: [checkNull(field) for field in row])
first_row_labels = NULL_TABLE.take(1)
print(first_row_labels)

[['VALID', 'VALID', 'VALID', 'NULL', 'NULL', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID', 'NULL', 'NULL', 'VALID', 'VALID', 'VALID', 'VALID', 'VALID']]


In [18]:
# Peek at the KY_CD value of the first row.
ky_cd_idx = colname2idx['KY_CD']
lines.map(lambda row: row[ky_cd_idx]).take(1)

[u'113']

# Date Cleaning
Format Reference: http://strftime.org/

In [15]:
# Sanity check: parse a sample date string into a datetime object.
# The format is defined here so that this cell also runs on a fresh kernel;
# the cell that defines the global date_format comes later in the notebook.
date_format = '%m/%d/%Y'
testdate = '10/09/1992'
testdate = datetime.datetime.strptime(testdate,date_format)
testtime = '14:24:32'
# Show the parsed year (the recorded output of this cell is 1992).
testdate.year

1992

In [16]:
# strptime patterns used by checkDate / checkTime:
# dates look like 10/09/1992, times like 14:24:32.
date_format = '%m/%d/%Y'
time_format = '%H:%M:%S'
# If a date is in the wrong format, it cannot be successfully converted to a datetime object.
def checkDate(line,date_format,min_year=2005,max_year=2015):
    """Validate a date string and the year it falls in.

    Args:
        line: the date text to check.
        date_format: strptime pattern the text has to match.
        min_year: earliest accepted year, inclusive (default 2005).
        max_year: latest accepted year, inclusive (default 2015).

    Returns:
        'VALID' if the text parses and its year lies within the range,
        'INVALID/Year' if it parses but the year is out of range,
        'INVALID/FORMAT' if it cannot be parsed with date_format.
    """
    try:
        date = datetime.datetime.strptime(line,date_format)
    except (ValueError, TypeError):
        # ValueError: the text does not match the pattern.
        # TypeError: the input is not a string (e.g. None).
        # Anything else (KeyboardInterrupt, SystemExit, ...) is left to propagate.
        return 'INVALID/FORMAT'
    if min_year <= date.year <= max_year:
        return 'VALID'
    return 'INVALID/Year'
def checkTime(line,time_format):
    """Validate that a time string matches the given strptime pattern.

    Args:
        line: the time text to check.
        time_format: strptime pattern the text has to match.

    Returns:
        'VALID' if the text parses with time_format, otherwise 'INVALID/Format'.
    """
    try:
        datetime.datetime.strptime(line,time_format)
    except (ValueError, TypeError):
        # ValueError: the text does not match the pattern.
        # TypeError: the input is not a string (e.g. None).
        # Anything else (KeyboardInterrupt, SystemExit, ...) is left to propagate.
        return 'INVALID/Format'
    return 'VALID'
########################################
# Example: label the CMPLNT_FR_DT value of every row with checkDate
cmplnt_fr_dt_idx = colname2idx['CMPLNT_FR_DT']
CMPLNT_FR_DT = (lines
                .map(lambda row: row[cmplnt_fr_dt_idx])
                .map(lambda value: checkDate(value, date_format)))

In [17]:
# Testing the checkDate method: count how many rows received each validation label
CMPLNT_FR_DT.countByValue()

defaultdict(int,
            {'INVALID/FORMAT': 655, 'INVALID/Year': 18782, 'VALID': 5081794})

# Regular Expression Cleaning
Reference: https://docs.python.org/2/library/re.html

In [216]:
# Pattern for a three-digit code such as '113'.
# Written as a raw string so the backslash of \d is passed to the regex engine
# as-is instead of being read as an (invalid) string escape sequence.
regex_3_digits = r'^\d{3}$'

In [256]:
#Require re package
def checkRegex(line,regex):
    """Report whether a string matches a regular expression.

    Args:
        line: the string to check.
        regex: regular expression pattern, applied with re.match.

    Returns:
        'VALID' when re.match finds a match, otherwise 'INVALID'.
    """
    return 'VALID' if re.match(regex,line) else 'INVALID'

In [257]:
# KY_CD should consist of only 3 digits: label every row, then count per label
ky_cd_idx = colname2idx['KY_CD']
KY_CD = (lines
         .map(lambda row: row[ky_cd_idx])
         .map(lambda code: checkRegex(code, regex_3_digits)))
KY_CD.countByValue()

defaultdict(int, {'VALID': 5101231})

In [260]:
# Negative example: '11o' ends with the letter 'o', not a digit, so it is reported as 'INVALID'
checkRegex('11o',regex_3_digits)

'INVALID'

In [None]:
%%writefile ./crime-data-process/code/utilities.py
from collections import defaultdict
import datetime
import re
##########
#Some Global Variables
##########
# strptime patterns for dates such as 10/09/1992 and times such as 14:24:32
date_format = '%m/%d/%Y'
time_format = '%H:%M:%S'
# Pattern for a three-digit code. Raw string so the backslash of \d is passed
# to the regex engine as-is instead of being read as an (invalid) string escape.
regex_3_digits = r'^\d{3}$'

######Functions

def checkDate(line,date_format,min_year=2005,max_year=2015):
    """Validate a date string and the year it falls in.

    Args:
        line: the date text to check.
        date_format: strptime pattern the text has to match.
        min_year: earliest accepted year, inclusive (default 2005).
        max_year: latest accepted year, inclusive (default 2015).

    Returns:
        'VALID' if the text parses and its year lies within the range,
        'INVALID/Year' if it parses but the year is out of range,
        'INVALID/FORMAT' if it cannot be parsed with date_format.
    """
    try:
        date = datetime.datetime.strptime(line,date_format)
    except (ValueError, TypeError):
        # ValueError: the text does not match the pattern.
        # TypeError: the input is not a string (e.g. None).
        # Anything else (KeyboardInterrupt, SystemExit, ...) is left to propagate.
        return 'INVALID/FORMAT'
    if min_year <= date.year <= max_year:
        return 'VALID'
    return 'INVALID/Year'
    
def checkTime(line,time_format):
    """Validate that a time string matches the given strptime pattern.

    Args:
        line: the time text to check.
        time_format: strptime pattern the text has to match.

    Returns:
        'VALID' if the text parses with time_format, otherwise 'INVALID/Format'.
    """
    try:
        datetime.datetime.strptime(line,time_format)
    except (ValueError, TypeError):
        # ValueError: the text does not match the pattern.
        # TypeError: the input is not a string (e.g. None).
        # Anything else (KeyboardInterrupt, SystemExit, ...) is left to propagate.
        return 'INVALID/Format'
    return 'VALID'
    
def checkRegex(line,regex):
    """Check a string against a regular expression pattern.

    Args:
        line: the string to check.
        regex: regular expression pattern, applied with re.match.

    Returns:
        'INVALID' when re.match finds no match, otherwise 'VALID'.
    """
    if re.match(regex,line) is None:
        return 'INVALID'
    return 'VALID'

# Data Frame Actions

In [130]:
# Convert the RDD of rows to a DataFrame, using the header row as the column names
df = lines.toDF(schemas)
# Print the schema as a reminder of the field names
df.printSchema()

root
 |-- CMPLNT_NUM: string (nullable = true)
 |-- CMPLNT_FR_DT: string (nullable = true)
 |-- CMPLNT_FR_TM: string (nullable = true)
 |-- CMPLNT_TO_DT: string (nullable = true)
 |-- CMPLNT_TO_TM: string (nullable = true)
 |-- RPT_DT: string (nullable = true)
 |-- KY_CD: string (nullable = true)
 |-- OFNS_DESC: string (nullable = true)
 |-- PD_CD: string (nullable = true)
 |-- PD_DESC: string (nullable = true)
 |-- CRM_ATPT_CPTD_CD: string (nullable = true)
 |-- LAW_CAT_CD: string (nullable = true)
 |-- JURIS_DESC: string (nullable = true)
 |-- BORO_NM: string (nullable = true)
 |-- ADDR_PCT_CD: string (nullable = true)
 |-- LOC_OF_OCCUR_DESC: string (nullable = true)
 |-- PREM_TYP_DESC: string (nullable = true)
 |-- PARKS_NM: string (nullable = true)
 |-- HADEVELOPT: string (nullable = true)
 |-- X_COORD_CD: string (nullable = true)
 |-- Y_COORD_CD: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Lat_Lon: string (nullable = true)

In [27]:
# Create a SQLContext and register the DataFrame as the temp table 'crime'
# so that it can be queried with SQL
sqlContext = SQLContext(sc)
df.registerTempTable('crime')
# Use sqlContext.sql(<sql code>) to interact with the DataFrame

In [9]:
# Or use the DataFrame API directly to select a column.
# NOTE(review): 'Created Date' is not among the columns listed by
# df.printSchema() for this dataset (CMPLNT_NUM, CMPLNT_FR_DT, RPT_DT, ...),
# so this looks like a leftover from a different dataset - confirm which
# column is intended.
created_date = df.select('Created Date')