In [1]:
import pandas as pd
import math, time, datetime
from dateutil.parser import parse
import dateutil
import locale
locale.setlocale(locale.LC_ALL, 'en_US')
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!head -n 10000 violations.csv > small-violations.csv 

In [3]:
df = pd.read_csv("small-violations.csv")

In [4]:
df.head()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1283294138,GBB9093,NY,PAS,08/04/2013,46,SUBN,AUDI,P,37250,...,GY,0,2013,-,0,,,,,
1,1283294151,62416MB,NY,COM,08/04/2013,46,VAN,FORD,P,37290,...,WH,0,2012,-,0,,,,,
2,1283294163,78755JZ,NY,COM,08/05/2013,46,P-U,CHEVR,P,37030,...,,0,0,-,0,,,,,
3,1283294175,63009MA,NY,COM,08/05/2013,46,VAN,FORD,P,37270,...,WH,0,2010,-,0,,,,,
4,1283294187,91648MC,NY,COM,08/08/2013,41,TRLR,GMC,P,37240,...,BR,0,2012,-,0,,,,,


In [5]:
df['Date First Observed']

0              0
1              0
2              0
3              0
4              0
5              0
6              0
7              0
8              0
9              0
10             0
11             0
12             0
13             0
14             0
15             0
16             0
17             0
18             0
19             0
20             0
21             0
22             0
23             0
24             0
25             0
26             0
27             0
28             0
29             0
          ...   
9969    20130723
9970           0
9971           0
9972           0
9973           0
9974           0
9975           0
9976           0
9977           0
9978           0
9979           0
9980           0
9981           0
9982           0
9983           0
9984           0
9985           0
9986           0
9987           0
9988           0
9989           0
9990           0
9991           0
9992           0
9993           0
9994           0
9995           0
9996          

In [6]:
df.columns

Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
       'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
       'Violation Time', 'Time First Observed', 'Violation County',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parking Violation'],
      dtype='object')

## 1. I want to make sure my Plate ID is a string. Can't lose the leading zeroes!

In [7]:
df.dtypes #dtype: Data type for data or columns

Summons Number                         int64
Plate ID                              object
Registration State                    object
Plate Type                            object
Issue Date                            object
Violation Code                         int64
Vehicle Body Type                     object
Vehicle Make                          object
Issuing Agency                        object
Street Code1                           int64
Street Code2                           int64
Street Code3                           int64
Vehicle Expiration Date                int64
Violation Location                   float64
Violation Precinct                     int64
Issuer Precinct                        int64
Issuer Code                            int64
Issuer Command                        object
Issuer Squad                           int64
Violation Time                        object
Time First Observed                   object
Violation County                      object
Violation 

In [8]:
print("The data type is",(type(df['Plate ID'][0])))

The data type is <class 'str'>


## 2. I don't think anyone's car was built in 0AD. Discard the '0's as NaN.

In [9]:
df['Vehicle Year'] = df['Vehicle Year'].replace("0","NaN") #str.replace(old, new[, max])
df.head()

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
0,1283294138,GBB9093,NY,PAS,08/04/2013,46,SUBN,AUDI,P,37250,...,GY,0,2013.0,-,0,,,,,
1,1283294151,62416MB,NY,COM,08/04/2013,46,VAN,FORD,P,37290,...,WH,0,2012.0,-,0,,,,,
2,1283294163,78755JZ,NY,COM,08/05/2013,46,P-U,CHEVR,P,37030,...,,0,,-,0,,,,,
3,1283294175,63009MA,NY,COM,08/05/2013,46,VAN,FORD,P,37270,...,WH,0,2010.0,-,0,,,,,
4,1283294187,91648MC,NY,COM,08/08/2013,41,TRLR,GMC,P,37240,...,BR,0,2012.0,-,0,,,,,


## 3. I want the dates to be dates! Read the read_csv documentation to find out how to make pandas automatically parse dates.

In [67]:
# Function to use for converting a sequence of string columns to an array of datetime instances: dateutil.parser.parser 

In [10]:
type(df['Issue Date'][0])

str

In [11]:
def to_dates(date):
    yourdate = dateutil.parser.parse(date)
    return yourdate
df['Issue Date Converted'] = df['Issue Date'].apply(to_dates) #DataFrame.apply(func):apply function to apply to each column/row
df['Issue Date Converted'].head()

0   2013-08-04
1   2013-08-04
2   2013-08-05
3   2013-08-05
4   2013-08-08
Name: Issue Date Converted, dtype: datetime64[ns]

## 4. "Date first observed" is a pretty weird column, but it seems like it has a date hiding inside. Using a function with .apply, transform the string (e.g. "20140324") into a Python date. Make the 0's show up as NaN.

In [12]:
df['Date First Observed'].tail()

9994    0
9995    0
9996    0
9997    0
9998    0
Name: Date First Observed, dtype: int64

In [13]:
import numpy as np #numpy object
def pydate(num):
    num = str(num) #to work with dateutil.parser.parse():it has to be a string
    print(num)
    if num == "0":
        print("replacing 0")
        return np.NaN #if number==0,replace 0 with NaN
    else:
        print("parsing date")
        yourdate = dateutil.parser.parse(num)#recognize the string as a time object
        strf = yourdate.strftime("%Y-%B-%d")#strftime turns a time object into a date and time format
        print(strf)
        return strf

In [14]:
df['Date First Observed Converted'] = df['Date First Observed'].apply(pydate)

0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
20130719
parsing date
2013-July-19
20130719
parsing date
2013-July-19
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
20130715
parsing date
2013-July-15
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0
0
replacing 0

## 5. "Violation time" is... not a time. Make it a time.

In [15]:
df['Violation Time'].head()

0    0752A
1    1240P
2    1243P
3    0232P
4    1239P
Name: Violation Time, dtype: object

In [16]:
type(df['Violation Time'][0])

str

In [17]:
def str_to_time(time_str):
    s = str(time_str).replace("P"," PM").replace("A"," AM") #str(time_str) because str.replace()
    x = s[:2] + ":" + s[2:] 
    return x
str_to_time("1239P")

'12:39 PM'

In [18]:
df['Violation Time Converted'] = df['Violation Time'].apply(str_to_time)

In [19]:
df['Violation Time Converted']

0       07:52 AM
1       12:40 PM
2       12:43 PM
3       02:32 PM
4       12:39 PM
5       06:17 PM
6       07:41 PM
7       04:25 AM
8       04:37 AM
9       08:39 AM
10      08:45 AM
11      09:07 AM
12      05:14 PM
13      06:56 PM
14      11:45 PM
15      05:46 PM
16      11:42 AM
17      07:24 AM
18      07:58 AM
19      07:36 AM
20      08:47 AM
21      11:20 AM
22      10:20 AM
23      03:24 PM
24      10:18 AM
25      07:43 AM
26      08:50 AM
27      09:15 AM
28      10:23 AM
29      11:20 AM
          ...   
9969    12:51 PM
9970    06:45 PM
9971    02:30 AM
9972    03:15 AM
9973    02:30 AM
9974    12:13 PM
9975    08:30 AM
9976    03:05 AM
9977    08:55 AM
9978    10:21 AM
9979    03:30 AM
9980    05:24 PM
9981    05:05 PM
9982    10:18 PM
9983    10:18 PM
9984    02:15 AM
9985    01:30 AM
9986    10:20 PM
9987    10:00 PM
9988    10:00 PM
9989    10:30 PM
9990    11:15 AM
9991    08:42 AM
9992    11:50 AM
9993    11:41 AM
9994    11:40 AM
9995    03:15 PM
9996    03:10 

### 6.  There sure are a lot of colors of cars, too bad so many of them are the same. Make "BLK" and "BLACK", "WT" and "WHITE", and any other combinations that you notice.

In [20]:
df['Vehicle Color'].value_counts()

WHITE    1753
BLACK     896
GY        637
BLUE      607
GRAY      553
RED       525
WH        513
BLK       487
BK        450
SILVE     392
GREY      363
BROWN     305
WHT       299
GREEN     268
BL        192
TAN       173
GRY       167
GOLD      116
BR        116
YELLO     114
RD        104
SILVR      83
WT         62
GR         57
GRN        51
TN         45
ORANG      41
SILV       34
BRN        34
SIL        29
         ... 
SROWN       1
SLIVE       1
TAN/B       1
NAVY/       1
BU          1
PUR         1
BRL         1
BEGE        1
WYH         1
METBL       1
GY/BE       1
BT          1
MARON       1
BLACL       1
GREEB       1
WOOD        1
BG          1
BWN         1
NLACK       1
BK/GY       1
OTHER       1
B;L         1
GD          1
YW/BL       1
GREW        1
PEARL       1
RUST        1
ONG         1
MAR         1
TEAL        1
Name: Vehicle Color, dtype: int64

In [21]:
def to_color(color):
    if color=='BLK'or color=='BK'or color=='BLCK':
        return "BLACK"
    if color=='WH' or color=='WHT'or color=='WT':
        return "WHITE"
    if color=='GY'or color=='GREY'or color=='GRY':
        return "GRAY"
    if color=='BL'or color=='BLE' or color=='B LUE' or color=='BU':
        return "BLUE"
    if color=='BR' or color == 'BRN' or color == 'BRWN':
        return "BROWN"
    if color== 'SILV' or color ==  'SIL' or color == 'SILVR' or color == 'SILVE' or color == 'SL':
        return "SILVER"
    else:
        return color
    

In [22]:
df['Vehicle Color'] = df['Vehicle Color'].apply(to_color)

### 7. Join the data with the Parking Violations Code dataset from the NYC Open Data site.

In [23]:
df2=pd.read_csv("DOF_Parking_Violation_Codes.csv")

In [24]:
df2.head()

Unnamed: 0,CODE,DEFINITION,Manhattan 96th St. & below,All Other Areas
0,10,"Stopping, standing or parking where a sign, st...",$115,$115
1,11,Hotel Loading/Unloading: Standing or parking w...,$115,$115
2,12,Snow Emergency: Standing or parking where stan...,$95,$95
3,13,Taxi Stand: Standing or parking where standing...,$115,$115
4,14,General No Standing: Standing or parking where...,$115,$115


In [25]:
df2.iloc[[38]]#how to select a row from dataframe

Unnamed: 0,CODE,DEFINITION,Manhattan 96th St. & below,All Other Areas
38,53,"Standing or parking in a safety zone, between ...",$115,$115


In [26]:
df2['CODE'].value_counts() #code 37-38 

24       1
93       1
33       1
47       1
37-38    1
86       1
69       1
05       1
56       1
42       1
92       1
74       1
34       1
43       1
04       1
58       1
98       1
53       1
45       1
79       1
51       1
66       1
75       1
22       1
32       1
52       1
72       1
14       1
84       1
97       1
        ..
73       1
99       1
23       1
63       1
59       1
65       1
35       1
07       1
94       1
62       1
48       1
91       1
78       1
49       1
82       1
06       1
21       1
39       1
12       1
96       1
67       1
09       1
31       1
50       1
40       1
10       1
16       1
20       1
11       1
13       1
Name: CODE, dtype: int64

In [27]:
def many_digits_to_single_digit(x):
    try:
        single__digit_code = x[0:2]
        return single_digit_code
    except:
        return None

In [28]:
one_code = df2['CODE'].apply(many_digits_to_single_digit)

### 8. How much money did NYC make off of parking violations?

### 9. What's the most lucrative kind of parking violation? The most frequent?

### 10. New Jersey has bad drivers, but does it have bad parkers, too? How much money does NYC make off of all non-New York vehicles?

### 11. Make a chart of the top few.

In [None]:
new_df['All Other Areas clean'].value_counts().head(5).plot(kind='barh')
plt.ylabel('Number of Fines')
plt.xlabel('Fine in $')

### 12. What time of day do people usually get their tickets? You can break the day up into several blocks - for example 12am-6am, 6am-12pm, 12pm-6pm, 6pm-12am.

In [None]:
time_of_day.value_counts().head(5).plot(kind='barh')

### 13. What's the average ticket cost in NYC?

### 14. Make a graph of the number of tickets per day.

In [None]:
plt.style.use('seaborn-poster')

graph_daily_tickets= new_df['Issue Date'].value_counts(sort=False).plot(kind='bar')
plt.ylabel('Number of tickets')
plt.xlabel('Days')
graph_daily_tickets.axes.get_xaxis().set_ticks([])

### 15. Make a graph of the amount of revenue collected per day.

### 16. Manually construct a dataframe out of https://dmv.ny.gov/statistic/2015licinforce-web.pdf (only NYC boroughts - bronx, queens, manhattan, staten island, brooklyn), having columns for borough name, abbreviation, and number of licensed drivers.

### 17. What's the parking-ticket-$-per-licensed-driver in each borough of NYC? Do this with pandas and the dataframe you just made, not with your head!