This script retrieves Stop and Frisk data from the NYPD website (https://www.nyc.gov/site/nypd/stats/reports-analysis/stopfrisk.page) and combines the selected years' data into a single file, stop_and_frisk.csv.

In [35]:
#import pandas - will write downloaded data to data frames
import pandas as pd
#import zipfile for zip file extraction
import zipfile
#import urlretrieve from urllib, which is packaged with Python and can retrieve files directly from the web
from urllib.request import urlretrieve

In [33]:
#files from 2003 through 2014 are published in zip format - csv within each zip with naming convention YYYY.csv
#only the years listed below are downloaded and combined

#years to download
years = ['2012','2013','2014']

#collect one dataframe per year; concatenating once after the loop avoids
#re-copying the accumulated rows on every iteration
frames = []

for year in years:
    zipurl = 'https://www.nyc.gov/assets/nypd/downloads/zip/analysis_and_planning/stop-question-frisk/sqf-' + year + '-csv.zip'
    csvfile = year + '.csv'

    #download the zip - the first element in data is the temp file with the zip contents
    data = urlretrieve(zipurl)

    #extract zip files into the current working directory
    #the with block closes the archive handle as soon as extraction is done
    #ref https://gist.github.com/ZeccaLehn/140edc75ff9d2c7cf9f660028763c9f5
    with zipfile.ZipFile(data[0]) as z:
        z.extractall()

    #read extracted csv file into dataframe
    #low_memory=False parses the file in one pass so column types are inferred consistently
    df = pd.read_csv(csvfile,low_memory=False)
    frames.append(df)

#stack the yearly dataframes row-wise; each file's original row index is kept
all_data = pd.concat(frames, axis=0)

In [34]:
#display the combined dataframe (the notebook renders a truncated preview of rows and columns)
all_data

Unnamed: 0,year,pct,ser_num,datestop,timestop,recstat,inout,trhsloc,perobs,crimsusp,...,beat,post,xcoord,ycoord,dettypcm,linecm,detailcm,dettypCM,lineCM,detailCM
0,2012,40,17,1012012,115,1,O,P,2.0,ROBBERY,...,*,,1008031,233036,CM,1,85,,,
1,2012,23,691,1012012,310,1,I,P,2.0,M,...,,12,1000852,228179,CM,1,9,,,
2,2012,81,3714,1012012,2000,1,O,P,1.0,ROBBERY,...,3,,1001869,190702,CM,1,85,,,
3,2012,81,633,1022012,1245,1,O,P,3.0,ROBBERY,...,*,,1005306,186668,CM,1,85,,,
4,2012,66,36,1042012,2220,A,O,P,2.0,FELONY,...,,,986887,173599,CM,1,46,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45782,2014,106,2283,12312014,1838,1,O,P,2.0,FELONY,...,,,1027746,177325,,,,CM,1.0,20.0
45783,2014,72,205,12312014,1910,A,O,P,3.0,FEL,...,10,,983722,179774,,,,CM,1.0,85.0
45784,2014,60,710,12312014,2016,1,I,H,1.0,MISD.,...,,,990272,154743,,,,CM,1.0,31.0
45785,2014,79,1475,12312014,2320,1,O,P,1.0,ASSAULT,...,13,,997168,192448,,,,CM,1.0,9.0


In [40]:
#files from 2015 and 2016 are published in csv format (only 2015 is selected below)
#csv files can be downloaded and read directly into a pandas dataframe using read_csv

#years to download
years = ['2015']

#start from the data combined so far, add one dataframe per year,
#and concatenate once after the loop
frames = [all_data]

for year in years:
    csvurl = 'https://www.nyc.gov/assets/nypd/downloads/excel/analysis_and_planning/stop-question-frisk/sqf-' + year + '.csv'

    #low_memory=False parses the file in one pass so column types are inferred consistently,
    #matching how the zipped years are read
    df = pd.read_csv(csvurl, low_memory=False)
    frames.append(df)

#stack row-wise; each file's original row index is kept
all_data = pd.concat(frames, axis=0)

In [41]:
#display the combined dataframe after the csv-format year has been appended
all_data

Unnamed: 0,year,pct,ser_num,datestop,timestop,recstat,inout,trhsloc,perobs,crimsusp,...,beat,post,xcoord,ycoord,dettypcm,linecm,detailcm,dettypCM,lineCM,detailCM
0,2012,40,17,1012012,115,1,O,P,2.0,ROBBERY,...,*,,1008031,233036,CM,1,85,,,
1,2012,23,691,1012012,310,1,I,P,2.0,M,...,,12,1000852,228179,CM,1,9,,,
2,2012,81,3714,1012012,2000,1,O,P,1.0,ROBBERY,...,3,,1001869,190702,CM,1,85,,,
3,2012,81,633,1022012,1245,1,O,P,3.0,ROBBERY,...,*,,1005306,186668,CM,1,85,,,
4,2012,66,36,1042012,2220,A,O,P,2.0,FELONY,...,,,986887,173599,CM,1,46,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22558,2015,73,302,12312015,2250,1,O,P,1.0,FELONY,...,,,1008129,179818,,,,CM,1.0,20.0
22559,2015,88,114,12312015,2305,1,I,H,1.0,FELONY,...,,,989931,192868,,,,CM,1.0,20.0
22560,2015,88,113,12312015,2305,1,I,H,1.0,FELONY,...,,,989931,192868,,,,CM,1.0,20.0
22561,2015,19,258,12312015,2345,1,O,P,1.0,FEL,...,,,995795,217850,,,,CM,1.0,14.0


In [42]:
#write every selected year's rows out as one combined csv file
#index=False keeps the dataframe's row index numbers out of the file

all_data.to_csv(
    "stop_and_frisk.csv",
    encoding='utf-8',
    index=False,
)