In [1]:
import glob
import pandas as pd
import numpy as np
from datetime import datetime
from ggplot import *
%matplotlib inline

In [2]:
# Columns kept from every yearly stop-and-frisk CSV.
columns = ['year', 'datestop', 'timestop', 'frisked', 'searched', 'contrabn', 'sex', 'race', 'age']

# Maximum number of rows read from each file.
ROWS_PER_FILE = 40000

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# row positions of the combined frame stable from run to run (later cells slice
# by absolute position).
file_list = sorted(glob.glob('nypd-sqf-data/*.csv'))

# Read each file into its own frame. na_values=' ' makes single-space cells count
# as missing.
frames = [
    pd.read_csv(path, usecols=columns, na_values=' ', low_memory=False, nrows=ROWS_PER_FILE)
    for path in file_list
]

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously loaded rows on every iteration. pd.concat raises on an empty list,
# so fall back to an empty frame with the expected columns when no file matched.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

df.head()

Unnamed: 0,year,datestop,timestop,frisked,searched,contrabn,sex,race,age
0,2003,1012003,03:00,Y,N,N,M,B,16
1,2003,1012003,03:00,Y,N,N,M,B,15
2,2003,1012003,03:00,Y,N,N,M,Q,39
3,2003,1012003,16:00,N,N,N,F,Q,20
4,2003,1022003,03:35,Y,N,N,M,B,19


In [3]:
# Scratch copy of just the date/time columns, so the timing experiments below
# do not modify df.
tmp = df[['year', 'datestop', 'timestop']].copy()


def add_zero(x):
    """Return str(x) left-padded with zeros to a width of at least 8 characters."""
    return str(x).zfill(8)


# Method 1: Map All values
# Pad every value first, then hand the whole column to a single to_datetime call.
# errors='coerce' turns every value that does not match MMDDYYYY into NaT.
started = datetime.now()
padded = tmp['datestop'].map(add_zero)
tmp['datetime'] = pd.to_datetime(padded, format="%m%d%Y", errors='coerce')
print(datetime.now() - started)

tmp[119998:120005]

0:00:01.829759


Unnamed: 0,year,datestop,timestop,datetime
119998,2005,8132005,245.0,2005-08-13
119999,2005,8132005,300.0,2005-08-13
120000,2006,,,NaT
120001,2006,,,NaT
120002,2006,2006-01-01,1410.0,NaT
120003,2006,2006-01-01,850.0,NaT
120004,2006,2006-01-02,1700.0,NaT


In [4]:
# Method 2: Map Each Element
def clean_time(x):
    """Zero-pad str(x) to 8 characters and parse that single value as MMDDYYYY.

    errors='ignore' is passed to pd.to_datetime; in the output recorded for this
    cell, values that do not match the format come back as the padded string
    rather than as a timestamp.
    """
    return pd.to_datetime(str(x).zfill(8), format="%m%d%Y", errors='ignore')


# One to_datetime call per row, timed for comparison with Method 1.
a = datetime.now()
tmp['datetime2'] = tmp['datestop'].apply(clean_time)
elapsed = datetime.now() - a
print(elapsed)

tmp[119998:120005]

0:00:51.241680


Unnamed: 0,year,datestop,timestop,datetime,datetime2
119998,2005,8132005,245.0,2005-08-13,2005-08-13 00:00:00
119999,2005,8132005,300.0,2005-08-13,2005-08-13 00:00:00
120000,2006,,,NaT,00000nan
120001,2006,,,NaT,00000nan
120002,2006,2006-01-01,1410.0,NaT,2006-01-01
120003,2006,2006-01-01,850.0,NaT,2006-01-01
120004,2006,2006-01-02,1700.0,NaT,2006-01-02


In [5]:
# Display the (rows, columns) size of the scratch frame used for the timing comparison.
tmp.shape

(480000, 5)

In [None]:
# Clean Age Column - Remove ages >100, and <3.
MIN_AGE, MAX_AGE = 3, 100

# Entries that cannot be read as numbers become NaN instead of raising.
age = pd.to_numeric(df['age'], errors='coerce')

# Series.where keeps a value where the mask is True and writes NaN elsewhere.
# between() includes both end points, so exactly 3 and exactly 100 are kept;
# NaN fails the range test and therefore stays NaN.
df['age'] = age.where(age.between(MIN_AGE, MAX_AGE))

In [None]:
# Drop missing dates, parse 'datestop' into datetime format
# .copy() gives an independent frame so the column assignment below never
# targets a view of the previous df.
df = df.dropna(subset=['datestop']).copy()

# Normalise every value to text. The trailing-".0" strip covers values that were
# read as floats (e.g. 8132005.0), which would otherwise not match the format.
datestop_text = (
    df['datestop']
    .map(str)
    .str.strip()
    .str.replace(r'\.0$', '', regex=True)
)

# Two layouts show up in the loaded data:
#   * MMDDYYYY stored as a number, so a leading zero is lost -> re-pad to 8 digits
#   * 'YYYY-MM-DD' text
# Each layout is parsed with one vectorised call; errors='coerce' yields NaT for
# values that do not match, so the result is a true datetime column rather than
# a mix of timestamps and leftover strings.
mmddyyyy = pd.to_datetime(datestop_text.str.zfill(8), format='%m%d%Y', errors='coerce')
iso = pd.to_datetime(datestop_text, format='%Y-%m-%d', errors='coerce')

# Prefer the MMDDYYYY result and fill its gaps from the ISO parse; values that
# match neither layout remain NaT.
df['date'] = mmddyyyy.fillna(iso)

In [None]:
# Map Y/N values for frisked/searched/contrabn columns
# 'Y' becomes 1 and 'N' becomes 0; any value not in the mapping becomes NaN.
# NOTE: this overwrites the columns in place, so running the cell a second time
# maps the already-converted 1/0 values to NaN.
yes_no_to_int = {'Y':1, 'N':0}
for flag_col in ['frisked', 'searched', 'contrabn']:
    df[flag_col] = df[flag_col].map(yes_no_to_int)

# Preview the converted columns.
df.head()

In [None]:
# Races of ~90% of people stopped
# Keep only rows whose race code is one of the four listed values; .copy() makes
# race90 independent of df.
major_race_codes = ['B', 'P', 'Q', 'W']
race90 = df.loc[df.race.isin(major_race_codes)].copy()


In [None]:
# Print the number of rows recorded for each year from 2003 through 2014.
for year in range(2003, 2015):
    stops_that_year = df.loc[df.year == year]
    print("stops in %i" % year, len(stops_that_year))