In [2]:
import pandas as pd
import psycopg2 as pg
import numpy as np
import datetime

In [3]:
pd.options.display.max_columns = 30
!wget https://data.baltimorecity.gov/api/views/n4ma-fj3m/rows.csv?accessType=DOWNLOAD -O Parking_Citations.csv

--2016-12-02 16:01:02--  https://data.baltimorecity.gov/api/views/n4ma-fj3m/rows.csv?accessType=DOWNLOAD
Resolving data.baltimorecity.gov (data.baltimorecity.gov)... 52.206.68.26
Connecting to data.baltimorecity.gov (data.baltimorecity.gov)|52.206.68.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘Parking_Citations.csv’

Parking_Citations.c     [         <=>          ] 143.08M  2.34MB/s   in 70s    

Last-modified header invalid -- time-stamp ignored.
2016-12-02 16:02:13 (2.05 MB/s) - ‘Parking_Citations.csv’ saved [150033075]



In [4]:
df = pd.read_csv('Parking_Citations.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.dtypes


Citation             int64
Tag                 object
ExpMM               object
ExpYY              float64
State               object
Make                object
Address             object
ViolCode           float64
Description         object
ViolFine            object
ViolDate            object
Balance             object
PenaltyDate        float64
OpenFine            object
OpenPenalty         object
NoticeDate          object
ImportDate          object
Neighborhood        object
PoliceDistrict      object
CouncilDistrict    float64
Location            object
dtype: object

In [6]:
# create a 2016 subset
df.ViolDate = pd.to_datetime(df.ViolDate, format="%m/%d/%Y %H:%M:%S %p")
# split Location
df.set_index('ViolDate', inplace=True)
df = df['2016']
df.reset_index(inplace=True)


In [7]:
df.shape

(319688, 21)

In [8]:
# convert to date type
df.ImportDate = df.ImportDate.str.replace(' \+0000', '')
df.NoticeDate = pd.to_datetime(df.NoticeDate, format="%m/%d/%Y")
df.PenaltyDate = pd.to_datetime(df.PenaltyDate, format="%m/%d/%Y")
df.ImportDate = pd.to_datetime(df.ImportDate, format="%m/%d/%Y %H:%M:%S %p")
# split Location
df['Latitude'] = df.Location.str.split(',').str.get(0).str.replace('\(','').map(lambda s: float(s))
df['Longitude'] = df.Location.str.split(',').str.get(1).str.replace('\)','').map(lambda s: float(s))
# remove $ sign
df[['ViolFine', 'Balance','OpenFine','OpenPenalty']] = df[['ViolFine', 'Balance','OpenFine','OpenPenalty']].applymap(lambda s: float(str(s).replace('$', '')))


In [9]:
# Clean EXP_MM 
mm_map = {val: val for val in df.ExpMM.unique()}
mm_map

{nan: nan,
 1.0: 1.0,
 2.0: 2.0,
 3.0: 3.0,
 4.0: 4.0,
 5.0: 5.0,
 6.0: 6.0,
 7.0: 7.0,
 8.0: 8.0,
 9.0: 9.0,
 10.0: 10.0,
 '08': '08',
 12.0: 12.0,
 '01': '01',
 11.0: 11.0,
 '07': '07',
 '02': '02',
 '11': '11',
 '05': '05',
 '12': '12',
 0.0: 0.0,
 '04': '04',
 '00': '00',
 '06': '06',
 '09': '09',
 '10': '10',
 '03': '03',
 'PE': 'PE'}

#####  use 9999 to indictate not a number
#####   use 888 to indiecate permanent 
#####   leave 0 intact since i do not know what it means

In [10]:
mm_map[np.nan] = 9999
mm_map['PE'] = 888
df.ExpMM = df.ExpMM.map(mm_map)
df.ExpMM = df.ExpMM.astype(int)
df.ExpMM.unique()

array([   2,   11,    9,   10, 9999,   12,    1,    5,    4,    3,    8,
          6,    7,  888,    0])

In [11]:
df.ExpYY = df.ExpYY.astype(int)
df.Citation = df.Citation.astype(str)
df.ViolCode = df.ViolCode.astype(str)
df.CouncilDistrict = df.CouncilDistrict.astype(str)

In [12]:
df.dtypes

ViolDate           datetime64[ns]
Citation                   object
Tag                        object
ExpMM                       int64
ExpYY                       int64
State                      object
Make                       object
Address                    object
ViolCode                   object
Description                object
ViolFine                  float64
Balance                   float64
PenaltyDate        datetime64[ns]
OpenFine                  float64
OpenPenalty               float64
NoticeDate         datetime64[ns]
ImportDate         datetime64[ns]
Neighborhood               object
PoliceDistrict             object
CouncilDistrict            object
Location                   object
Latitude                  float64
Longitude                 float64
dtype: object

##### Replace null values based on datatype

In [13]:
rep_na = {
    "datetime64[ns]": datetime.datetime(9999,12,31),
    "object" :  "unknown",
    "float64" : np.nan,
    "int64" : np.nan,
}

In [14]:
for c, t in zip(df.columns, df.dtypes):
    df[c].fillna(rep_na[str(t)], inplace = True)

In [15]:
df.to_csv("clean_data.csv", index=False, header=True)

In [16]:
%load_ext sql
!echo 'redspot' | sudo -S service postgresql restart
!createdb -U dbuser project3

  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")


[sudo] password for jovyan: Restarting PostgreSQL 9.5 database server: main.


In [17]:
%sql postgresql://dbuser@localhost:5432/project3

'Connected: dbuser@project3'

In [20]:
%%sql
DROP TABLE IF EXISTS citation;
CREATE TABLE citation (
    violdate TIMESTAMP,
    citation Varchar(9),
    tag Varchar(9),
    expmm Integer,
    expyy Integer,
    state Varchar(2),
    make Varchar(20),
    address Varchar(255),
    violcode Varchar(25),
    description Varchar(255),
    violfine Real,
    balance Real,
    penaltydate TIMESTAMP,
    openfine Real,
    openpenalty Real,
    noticedate TIMESTAMP,
    importdate TIMESTAMP,
    neighborhood Varchar(100),
    policedistrict Varchar(100),
    councildistrict Varchar(25),
    location Varchar(255),
    latitude Float8,
    longitude Float8,
    Primary Key(citation)
);

Done.
Done.


[]

In [21]:
!pwd

/home/jovyan/work


In [22]:
%%sql
COPY citation FROM '/home/jovyan/work/clean_data.csv'
CSV
HEADER
QUOTE '"'
DELIMITER ',';

IntegrityError: (psycopg2.IntegrityError) duplicate key value violates unique constraint "citation_pkey"
DETAIL:  Key (citation)=(90668410) already exists.
CONTEXT:  COPY citation, line 152066
 [SQL: 'COPY citation FROM \'/home/jovyan/work/clean_data.csv\'\nCSV\nHEADER\nQUOTE \'"\'\nDELIMITER \',\';']

In [23]:
df[df.Citation == '90668410']

Unnamed: 0,ViolDate,Citation,Tag,ExpMM,ExpYY,State,Make,Address,ViolCode,Description,ViolFine,Balance,PenaltyDate,OpenFine,OpenPenalty,NoticeDate,ImportDate,Neighborhood,PoliceDistrict,CouncilDistrict,Location,Latitude,Longitude
122702,2016-03-31 11:59:00,90668410,HAZ2149,8,17,NY,VOLKS,4500 CHARLES ST,16.0,In Transit Zone/Stop,77.0,77.0,9999-12-31 00:00:00,77.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
152064,2016-03-31 11:59:00,90668410,HAZ2149,8,17,NY,VOLKS,4500 CHARLES ST,16.0,In Transit Zone/Stop,77.0,0.0,9999-12-31 00:00:00,0.0,0.0,9999-12-31 00:00:00,2016-04-21 04:02:00,unknown,unknown,,unknown,,


In [30]:
df.duplicated(['Citation'], keep=False).sum()

16

In [27]:
df[df.duplicated(['Citation'], keep=False)]

Unnamed: 0,ViolDate,Citation,Tag,ExpMM,ExpYY,State,Make,Address,ViolCode,Description,ViolFine,Balance,PenaltyDate,OpenFine,OpenPenalty,NoticeDate,ImportDate,Neighborhood,PoliceDistrict,CouncilDistrict,Location,Latitude,Longitude
2336,2016-03-07 04:59:00,872119,6AL4573,11,15,MD,NISS,1110 S POTOMAC ST,22.0,Expired Tags,32.0,32.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
2337,2016-03-07 05:26:00,872135,52601M9,2,16,MD,MAZDA,E/S 1000 BLK ROBINSON S,22.0,Expired Tags,32.0,32.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
2338,2016-03-07 05:28:00,872143,52601M9,2,16,MD,MAZDA,E/S 1000 S ROBINSON ST,19.0,Exceeding 48 Hours,32.0,32.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
2339,2016-03-07 06:48:00,872150,9AV6732,8,15,MD,HOND,1300 S ELLWOOD AV,22.0,Expired Tags,32.0,32.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
2340,2016-03-10 08:53:00,872192,49394M4,10,15,MD,JEEP,E/S 3032 ELLIOTT ST,22.0,Expired Tags,32.0,32.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
2341,2016-03-11 10:58:00,872226,JKN8018,9,15,PA,FORD,901 POTOMAC ST,22.0,Expired Tags,32.0,32.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
5202,2016-03-03 05:50:00,2006716,4CF9258,2,16,MD,HOND,100 ALBEMARLE ST,22.0,Expired Tags,32.0,32.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
122702,2016-03-31 11:59:00,90668410,HAZ2149,8,17,NY,VOLKS,4500 CHARLES ST,16.0,In Transit Zone/Stop,77.0,77.0,9999-12-31 00:00:00,77.0,0.0,9999-12-31 00:00:00,2016-04-02 11:01:00,unknown,unknown,,unknown,,
152064,2016-03-31 11:59:00,90668410,HAZ2149,8,17,NY,VOLKS,4500 CHARLES ST,16.0,In Transit Zone/Stop,77.0,0.0,9999-12-31 00:00:00,0.0,0.0,9999-12-31 00:00:00,2016-04-21 04:02:00,unknown,unknown,,unknown,,
161774,2016-03-07 04:59:00,872119,6AL4573,11,17,MD,NISS,1110 S POTOMAC ST,22.0,Expired Tags,32.0,144.0,9999-12-31 00:00:00,32.0,96.0,2016-04-20 00:00:00,2016-10-12 04:02:00,unknown,unknown,,unknown,,


In [31]:
df.drop_duplicates(['Citation'], keep='last', inplace=True)

In [32]:
df.to_csv("clean_data.csv", index=False, header=True)

In [33]:
%%sql
COPY citation FROM '/home/jovyan/work/clean_data.csv'
CSV
HEADER
QUOTE '"'
DELIMITER ',';

319680 rows affected.


[]

In [34]:
%%sql
Select *
From citation
Limit 5;

5 rows affected.


violdate,citation,tag,expmm,expyy,state,make,address,violcode,description,violfine,balance,penaltydate,openfine,openpenalty,noticedate,importdate,neighborhood,policedistrict,councildistrict,location,latitude,longitude
2016-07-02 09:49:00,97036785,6ETF80,2,17,MD,LEXUS,400 ST PAUL PL,18.0,All Other Parking Meter Violations,32.0,0.0,9999-12-31 00:00:00,32.0,48.0,2016-07-20 00:00:00,2016-10-12 04:02:00,unknown,unknown,,unknown,,
2016-06-15 01:31:00,97059282,GLB4684,11,17,NY,KIA,800 BATTERY AVE,11.0,Residential Parking Permit Only,52.0,0.0,9999-12-31 00:00:00,52.0,0.0,9999-12-31 00:00:00,2016-10-18 04:01:00,unknown,unknown,,unknown,,
2016-09-29 12:14:00,97061411,59P149,9,18,MD,CHEVR,700 CALVERT ST,18.0,All Other Parking Meter Violations,32.0,0.0,9999-12-31 00:00:00,32.0,0.0,9999-12-31 00:00:00,2016-10-01 04:02:00,unknown,unknown,,unknown,,
2016-03-11 02:26:00,96191623,9BP3607,10,16,MD,TOYOT,400 ST PAUL ST,18.0,All Other Parking Meter Violations,32.0,0.0,9999-12-31 00:00:00,0.0,0.0,9999-12-31 00:00:00,2016-03-31 04:02:00,unknown,unknown,,unknown,,
2016-03-11 02:39:00,96191631,20971CG,10,17,MD,FORD,100 ST PAUL ST,8.0,No Stopping/Standing Tow Away Zone,52.0,0.0,9999-12-31 00:00:00,0.0,0.0,9999-12-31 00:00:00,2016-04-08 04:02:00,unknown,unknown,,unknown,,
