In [None]:
########################################
# Base Inputs
########################################
import warnings
import pandas as pd
from datetime import datetime
# Allow up to 2000 rows to be shown when a DataFrame is displayed.
pd.options.display.max_rows = 2000

In [None]:
########################################
# Importing Parking Tickets - TSV Method
########################################
# The TSV can be downloaded from:
#   https://data.lacity.org/A-Well-Run-City/Parking-Citations/wjz9-h9np
#
# The raw file must be named 'Parking_Citations.tsv' and must sit in the
# same folder as this jupyter notebook.

# Tab-separated input; the first row of the file supplies the column names.
df_raw = pd.read_csv(
    'Parking_Citations.tsv',
    sep='\t',
    header=0,
)

# Preview the first rows of the raw citations.
df_raw.head()

In [None]:
########################################
# Importing Agency Codes
########################################
# The raw file must be named 'agency_codes.tsv' and must sit in the same
# folder as this jupyter notebook.
#
# The original note listed this download location:
#   https://data.lacity.org/A-Well-Run-City/Parking-Citations/wjz9-h9np
# NOTE(review): that is the same URL given for the citations file --
# confirm where agency_codes.tsv actually comes from.

# Tab-separated input; the first row of the file supplies the column names.
agency_df_raw = pd.read_csv('agency_codes.tsv', sep='\t', header=0)

# Work on a copy: a plain assignment would only alias the raw frame, so
# renaming the columns below would silently rename them on agency_df_raw too.
agency_df = agency_df_raw.copy()

# Columns are renamed by position; 'Agency' matches the join key used when
# merging onto the citations. Assumes the file has exactly three columns in
# this order -- TODO confirm against the source file.
agency_df.columns = ['Agency', 'agency_name', 'agency_shortname']
agency_df.head()

In [None]:
########################################
# Cleaning Data
########################################

########################################
# Adding Data
########################################
# 'one' is a constant 1 per row, used instead of the ticket number to count rows.
# assign() returns a new frame, so the counter column is not written into
# df_raw (a plain `df = df_raw` would only alias it and mutate the raw data).
#
# The left merge keeps every citation and adds the agency name columns.
# NOTE(review): if agency_df ever holds duplicate 'Agency' codes, this merge
# would duplicate citation rows and inflate the counts -- verify the lookup.
df = pd.merge(df_raw.assign(one=1), agency_df, how='left', on=['Agency'])

########################################
# Removing Unnecessary Data
########################################
# Keep only rows with a known issue date and a known fine amount.
df = df[df['Issue Date'].notnull() & df['Fine amount'].notnull()]

# Drop every column that is not needed for the analysis.
df = df.drop(columns=[
    'VIN',                # if these are actually VINs, I'm not sure they should be released publicly in the first place
    'Meter Id',           # no need for this
    'Marked Time',        # not sure what this is; don't need it
    'Ticket number',      # no need to keep this right now since it's just a unique value per row
    'Route',              # don't know how I'd use this
    'Violation code',     # I can use the violation name instead
    'Plate Expiry Date',  # left out to keep things simple
    # Lat/Lon is in US Feet coordinates according to the
    # NAD1983StatePlaneCaliforniaVFIPS0405_Feet projection; removing for now
    'Latitude',
    'Longitude',
    'Location',           # would need a serious geocoding package to use this partial address -- might be able to use the Google Geocoding API
    'Agency',             # no need for the agency code now
    'agency_shortname',   # no need for the agency shortname right now
])

########################################
# Replacing Nulls
########################################

# This makes the group bys later easier, so that rows with nulls are included.
#
# The replacement is done with one DataFrame.fillna call and df is rebound to
# the result. The earlier `df[col].fillna(..., inplace=True)` form is chained
# in-place assignment: with pandas Copy-on-Write it only updates a temporary
# Series and leaves df unchanged, which would make the later astype(int) on
# 'Issue time' fail on the remaining NaN values.
df = df.fillna(value={
    'Issue time': -999,                  # numeric sentinel for a missing time
    'RP State Plate': 'Unknown',
    'Make': 'Unknown',
    'Body Style': 'Unknown',
    'Color': 'Unknown',
    'Violation Description': 'Unknown',
    'agency_name': 'Unknown',
})

########################################
# Time
########################################

# Turn the issue time into a left-zero-padded, 4-character string.
# The -999 null placeholder comes out as '-999'.
df['Issue time'] = (
    df['Issue time']
    .astype(int)
    .apply('{0:0>4}'.format)
    .astype(str)
)

# The hour is the first two characters; the null placeholder shows up as '-9'.
df['issue_hour'] = df['Issue time'].str.slice(stop=2)

# The next two lines of commented code are for minute-level information.
# It is excluded for now.
#df['issue_minute']=df['Issue time'].str[2:4]
#df['issue_h_m']=df['issue_hour']+':'+df['issue_minute']+':00'

########################################
# Last Deletes for Space
########################################
# These fields are removed to save space.
# Tableau Public can only handle 1M rows.
df = df.drop(columns=[
    'Issue time',      # too much granularity
    'RP State Plate',  # mostly california
    'Make',            # I can do without this
    'Body Style',
    'Color',
    'agency_name',
])
########################################
# Renaming Columns
########################################
# Map the source headers to short snake_case names. Labels in the mapping
# that are not present in df are simply left alone by rename().
df = df.rename(columns={
    'Issue Date': 'issue_date',
    'Issue time': 'issue_time',
    'RP State Plate': 'state_plate',
    'Body Style': 'veh_body',
    'Violation Description': 'violation',
    'Fine amount': 'fine',
})

# Summary of the cleaned frame: total row count (sum of the per-row counter
# column 'one'), non-null counts per column, and the frame's info report.
print('Total Rows: %d' % (df.one.sum()))
print('')
print(df.count())
print('')
print('')
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
df.info()
print('')
df.head(20)

In [None]:
#######################################################
# Aggregating Cleaned Data - reducing size of dataframe
#######################################################
# Group on every column except the per-row counter, then sum the counter so
# each distinct combination becomes one row with its number of tickets.
col = list(df.columns)
col.remove('one')
df_agg = df.groupby(col)['one'].sum().reset_index()

# Distinct values per column, then non-null counts per column.
print(df_agg.nunique())
print(df_agg.count())
df_agg.head()

In [None]:
########################################
# Writing to tab separated CSV
########################################
# The file is written to the same folder as the jupyter notebook.
# Despite the .csv extension, the fields are separated by tabs.
df_agg.to_csv('tableauData.csv', sep='\t')
print('Done')