In [120]:
%matplotlib inline
import sys
print('Python version:', sys.version)

import numpy as np
print('Numpy version:', np.__version__)

import pandas as pd
print('Pandas version:', pd.__version__)

import matplotlib as mpl
import matplotlib.pyplot as plt
print('Matplotlib version:', mpl.__version__)

import seaborn as sns
print('Seaborn version:', sns.__version__)

import datetime
import time

CONST_NUM_BOROUGHS = 5
max_turn_borough = 0

sns.set()
pal = sns.hls_palette(10, h=.5)
sns.set_palette(pal)

# Avoid display of scientific notation and show precision of 4 decimals:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Set option for showing all columns for cell display of a dataframe
pd.options.display.max_columns = None

Python version: 3.7.5 (default, Oct 25 2019, 15:51:11) 
[GCC 7.3.0]
Numpy version: 1.17.4
Pandas version: 0.25.3
Matplotlib version: 3.1.1
Seaborn version: 0.9.0


In [2]:
# Load complete dataset. No data dictionary is available.
# Source: https://data.cityofnewyork.us/Business/Legally-Operating-Businesses/w7w3-xahh
# Direct source: https://data.cityofnewyork.us/api/views/w7w3-xahh/rows.csv?accessType=DOWNLOAD&bom=true&format=true
df_business = pd.read_csv("./data/Legally_Operating_Businesses.csv")

# Preliminary data check
# print(df_business.head())
# print(df_business.tail())
# print(df_business.dtypes)
df_business.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202013 entries, 0 to 202012
Data columns (total 27 columns):
DCA License Number               202013 non-null object
License Type                     202013 non-null object
License Expiration Date          201980 non-null object
License Status                   202013 non-null object
License Creation Date            202013 non-null object
Industry                         202013 non-null object
Business Name                    201976 non-null object
Business Name 2                  32889 non-null object
Address Building                 128495 non-null object
Address Street Name              129243 non-null object
Secondary Address Street Name    2000 non-null object
Address City                     201307 non-null object
Address State                    201184 non-null object
Address ZIP                      201283 non-null object
Contact Phone Number             192319 non-null object
Address Borough                  128305 non-null obj

In [3]:
# Change objects to datetime format
df_business['License Expiration Date'] = pd.to_datetime(df_business['License Expiration Date'])
df_business['License Creation Date'] = pd.to_datetime(df_business['License Creation Date'])
# print(list(df_business['Address Building'].unique()))
# df_business['Address Building'] = df_business['Address Building'].apply(pd.to_numeric, errors='coerce')

In [4]:
# Prelimanry data exploration - mean, median, mode (statistical analysis), and visualization


In [5]:
df1 = df_business[-11:-1]
# df1.info()

In [6]:
dfx = df1.drop(['Business Name', 'Business Name 2', 'Address Building', 'Address Street Name', 'Secondary Address Street Name', 'Address State', 'Contact Phone Number', 'Longitude', 'Latitude', 'Location'], axis=1)
dfx

Unnamed: 0,DCA License Number,License Type,License Expiration Date,License Status,License Creation Date,Industry,Address City,Address ZIP,Address Borough,Borough Code,Community Board,Council District,BIN,BBL,NTA,Census Tract,Detail
202002,2036428-DCA,Business,2016-12-31,Inactive,2016-04-20,Electronics Store,BROOKLYN,11220,Brooklyn,3.0,312.0,38.0,3137807.0,3056440001.0,BK34,108.0,
202003,1383504-DCA,Business,2016-12-31,Inactive,2011-03-01,Tobacco Retail Dealer,BROOKLYN,11207,Brooklyn,3.0,305.0,37.0,3083863.0,3037230038.0,BK85,1144.0,
202004,1456410-DCA,Business,2013-12-31,Inactive,2012-12-24,Tobacco Retail Dealer,ASTORIA,11103,Queens,4.0,401.0,22.0,4010651.0,4006600048.0,QN70,155.0,
202005,1419429-DCA,Business,2013-06-30,Inactive,2012-02-10,Home Improvement Contractor,BROOKLYN,11234,Brooklyn,3.0,318.0,46.0,3212139.0,3077040008.0,BK45,644.0,
202006,2053838-DCA,Business,2020-05-01,Active,2017-06-02,Employment Agency,OZONE PARK,11417,Queens,4.0,410.0,28.0,4201627.0,4095110040.0,QN55,98.0,
202007,2016525-DCA,Business,2021-07-31,Active,2014-12-16,Secondhand Dealer - Auto,BROOKLYN,11223,Brooklyn,3.0,313.0,47.0,3194295.0,3071670091.0,BK26,386.0,
202008,2026189-2-DCA,Business,2020-12-31,Active,2015-07-27,Tobacco Retail Dealer,NEW YORK,10003,Manhattan,1.0,106.0,2.0,1020489.0,1009240031.0,MN21,48.0,
202009,1003888-DCA,Business,2021-07-31,Active,1999-02-25,Secondhand Dealer - General,BROOKLYN,11237,Brooklyn,3.0,304.0,37.0,3075774.0,3033110001.0,BK77,441.0,
202010,1252203-DCA,Business,2013-09-15,Inactive,2007-04-14,Sidewalk Cafe,NEW YORK,10003,Manhattan,1.0,102.0,2.0,1084682.0,1005640019.0,MN23,61.0,"Sidewalk Cafe Type: Unenclosed, Square Feet: 7..."
202011,2082881-2-DCA,Business,2020-11-30,Active,2019-03-05,Electronic Cigarette Dealer,BROOKLYN,11201,Brooklyn,3.0,302.0,33.0,3002085.0,3002490035.0,BK09,502.0,


In [7]:
# ['individual', 'business']
list(df_business['License Type'].str.lower().unique())

# ['active', 'inactive']
list(df_business['License Status'].str.lower().unique())

# Around 30 different strings
print(list(df_business['Industry'].str.lower().unique()))

# Too many cities
list(df_business['Address City'].str.lower().unique())

# [nan, 'outside nyc', 'brooklyn', 'queens', 'bronx', 'manhattan', 'staten island']
list(df_business['Address Borough'].str.lower().unique())

# [nan, 3.0, 4.0, 2.0, 1.0, 5.0]
df_business['Borough Code'].unique()

# Around 50 different floats
df_business['Community Board'].unique()

# Around 30 different floats
df_business['Council District'].unique()


# math.isnan(x)

['tow truck driver', 'home improvement salesperson', 'debt collection agency', 'home improvement contractor', 'locksmith', 'pedicab driver', 'general vendor', 'electronics store', 'ticket seller', 'sightseeing guide', 'process server individual', 'auctioneer', 'laundries', 'dealer in products', 'electronic & appliance service', 'secondhand dealer - general', 'stoop line stand', 'electronic cigarette dealer', 'tobacco retail dealer', 'amusement device temporary', 'tow truck company', 'auction house premises', 'pedicab business', 'newsstand', 'pool or billiard room', 'horse drawn driver', 'process serving agency', 'parking lot', 'secondhand dealer - auto', 'laundry', 'scale dealer repairer', 'laundry jobber', 'pawnbroker', 'sidewalk cafe', 'garage', 'bingo game operator', 'commercial lessor', 'car wash', 'amusement arcade', 'amusement device portable', 'garage and parking lot', 'locksmith apprentice', 'gaming cafe', 'special sale', 'employment agency', 'sightseeing bus', 'games of chance

array([nan, 32.,  1., 30., 42.,  4., 40.,  3., 10.,  2., 33.,  7., 27.,
       31., 43., 19., 37., 38., 11., 28., 14., 17., 26., 44., 51., 41.,
       13., 36., 25., 15., 47., 22.,  8., 34.,  5., 49., 39.,  9., 29.,
       18.,  6., 48., 23., 24., 45., 12., 20., 21., 46., 16., 35., 50.])

In [111]:
df_application = pd.read_csv("./data/License_Applications.csv")
# print(df_application.head())
# print(df_application.tail())
# print(df_business.dtypes)
df_application['Start Date'] = pd.to_datetime(df_application['Start Date'])
df_application['End Date'] = pd.to_datetime(df_application['End Date'])
df_application['Temp Op Letter Issued'] = pd.to_datetime(df_application['Temp Op Letter Issued'])
df_application['Temp Op Letter Expiration'] = pd.to_datetime(df_application['Temp Op Letter Expiration'])
df_application['Zip'] = df_application['Zip'].apply(pd.to_numeric, errors='coerce')

# df_application['Address Building'] = df_application['Address Building'].apply(pd.to_numeric, errors='coerce')
# print(list(df_application['Building Number'].unique())) # Too many dirty values such as strings
# I could clean some other columns as well, such as active vehicles, but probably not needed for this assignment
# df_business['License Creation Date'] = pd.to_datetime(df_business['License Creation Date'])
# df_business['Address Building'] = df_business['Address Building'].apply(pd.to_numeric, errors='coerce')

df_application.info()
# print(list(df_application['Zip'])[0:1000])
print(df_application['Zip'].isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364238 entries, 0 to 364237
Data columns (total 25 columns):
Application ID               364238 non-null object
License Number               352642 non-null object
License Type                 364238 non-null object
Application or Renewal       364238 non-null object
Business Name                364016 non-null object
Status                       364238 non-null object
Start Date                   364238 non-null datetime64[ns]
End Date                     362749 non-null datetime64[ns]
Temp Op Letter Issued        751 non-null datetime64[ns]
Temp Op Letter Expiration    733 non-null datetime64[ns]
License Category             364238 non-null object
Application Category         364238 non-null object
Building Number              228467 non-null object
Street                       231213 non-null object
Street 2                     4877 non-null object
Unit Type                    27293 non-null object
Unit                         24957

In [9]:
df2 = df_application[-11:-1]
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 364227 to 364236
Data columns (total 25 columns):
Application ID               10 non-null object
License Number               9 non-null object
License Type                 10 non-null object
Application or Renewal       10 non-null object
Business Name                10 non-null object
Status                       10 non-null object
Start Date                   10 non-null datetime64[ns]
End Date                     8 non-null datetime64[ns]
Temp Op Letter Issued        0 non-null datetime64[ns]
Temp Op Letter Expiration    0 non-null datetime64[ns]
License Category             10 non-null object
Application Category         10 non-null object
Building Number              6 non-null object
Street                       7 non-null object
Street 2                     1 non-null object
Unit Type                    1 non-null object
Unit                         1 non-null object
Description                  0 non-null object
Ci

In [11]:
dfy = df2.drop(['Business Name', 'Building Number', 'Street', 'Street 2', 'State', 'Contact Phone', 'Longitude', 'Latitude', 'Active Vehicles'], axis=1)
dfy

Unnamed: 0,Application ID,License Number,License Type,Application or Renewal,Status,Start Date,End Date,Temp Op Letter Issued,Temp Op Letter Expiration,License Category,Application Category,Unit Type,Unit,Description,City,Zip
364227,1146-2020-AELE,,Business,Application,Pending,2020-02-06,NaT,NaT,NaT,Electronics Store,Basic,,,,STATEN ISLAND,10304
364228,2713-2020-RPSI,2052124-DCA,Individual,Renewal,Pending,2020-02-05,NaT,NaT,NaT,Process Server Individual,Basic,,,,BRONX,10463
364229,4635-2017-RDEB,2030459-DCA,Business,Renewal,Issued,2017-01-31,2017-02-01,NaT,NaT,Debt Collection Agency,Basic,STE,124.0,,VIRGINIA BEACH,23452
364230,691-2020-AHIC,2094086-DCA,Business,Application,Issued,2020-01-24,2020-01-31,NaT,NaT,Home Improvement Contractor,Special,,,,QUEENS VILLAGE,11427
364231,1024-2020-ALAU,2094237-DCA,Business,Application,Issued,2020-02-04,2020-02-06,NaT,NaT,Laundries,Basic,,,,NEW YORK,10019
364232,930-2020-ALCK,2094150-DCA,Individual,Application,Issued,2020-01-31,2020-02-04,NaT,NaT,Locksmith,Basic,,,,HOLBROOK,11741
364233,14806-2018-RNWS,0815544-DCA,Business,Renewal,Issued,2018-09-21,2018-09-22,NaT,NaT,Newsstand,Special,,,,NEW YORK,10023
364234,1297-2018-RSSG,2042693-DCA,Individual,Renewal,Issued,2018-01-23,2018-01-24,NaT,NaT,Sightseeing Guide,Basic,,,,GLEN MILLS,19342
364235,743-2020-AECD,2094158-1-DCA,Business,Application,Issued,2020-01-27,2020-02-04,NaT,NaT,Electronic Cigarette Dealer,Basic,,,,NEW YORK,10001
364236,1852-2018-RSLS,2010885-DCA,Business,Renewal,Issued,2018-01-29,2018-01-30,NaT,NaT,Stoop Line Stand,Basic,,,,BROOKLYN,11223


In [12]:
df_zip = pd.read_csv("./data/zip_borough.csv")

# Preliminary data check
print(df_zip.head())
# print(df_business.tail())
# print(df_business.dtypes)
df_zip.info()

     zip    borough
0  10001  Manhattan
1  10002  Manhattan
2  10003  Manhattan
3  10004  Manhattan
4  10005  Manhattan
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 2 columns):
zip        240 non-null int64
borough    240 non-null object
dtypes: int64(1), object(1)
memory usage: 3.9+ KB


In [34]:
# zip_dict = df_zip.to_dict()
# zip_dict
# print(df_zip['borough'])
zip_dict = pd.Series(df_zip['borough'].values, index=df_zip['zip']).to_dict()
# zip_dict = dict(zip(df_zip['zip'], df_zip['borough']))
print(zip_dict)
# for key in zip_dict.values():
#     print(type(key))
# zip_dict['borough']
# pd.Series(df.Letter.values,index=df.Position).to_dict()

{10001: 'Manhattan', 10002: 'Manhattan', 10003: 'Manhattan', 10004: 'Manhattan', 10005: 'Manhattan', 10006: 'Manhattan', 10007: 'Manhattan', 10009: 'Manhattan', 10010: 'Manhattan', 10011: 'Manhattan', 10012: 'Manhattan', 10013: 'Manhattan', 10014: 'Manhattan', 10015: 'Manhattan', 10016: 'Manhattan', 10017: 'Manhattan', 10018: 'Manhattan', 10019: 'Manhattan', 10020: 'Manhattan', 10021: 'Manhattan', 10022: 'Manhattan', 10023: 'Manhattan', 10024: 'Manhattan', 10025: 'Manhattan', 10026: 'Manhattan', 10027: 'Manhattan', 10028: 'Manhattan', 10029: 'Manhattan', 10030: 'Manhattan', 10031: 'Manhattan', 10032: 'Manhattan', 10033: 'Manhattan', 10034: 'Manhattan', 10035: 'Manhattan', 10036: 'Manhattan', 10037: 'Manhattan', 10038: 'Manhattan', 10039: 'Manhattan', 10040: 'Manhattan', 10041: 'Manhattan', 10044: 'Manhattan', 10045: 'Manhattan', 10048: 'Manhattan', 10055: 'Manhattan', 10060: 'Manhattan', 10069: 'Manhattan', 10090: 'Manhattan', 10095: 'Manhattan', 10098: 'Manhattan', 10099: 'Manhattan',

In [156]:
int(df_application.loc[364231]['Zip']) in zip_dict
zip_val = int(df_application.loc[364230]['Zip'])
# turnaround_dict = [[] for i in range(CONST_NUM_BOROUGHS)]
turnaround_dict = {}
print(type(turnaround_dict))
# print(df_application.loc[11423]['Zip'])
print(zip)
borough_val = zip_dict[zip_val]
borough_val

<class 'dict'>
11741


'Queens'

In [157]:
count = 0
for i in range(100):
    
    start = df_application.loc[i]['Start Date']
#     print(start.year)
    end = df_application.loc[i]['End Date']
#     print(int(df_application.loc[i]['Zip']) in zip_dict)
    zip_val = int(df_application.loc[i]['Zip'])
    if start is not pd.NaT and end is not pd.NaT and zip_val in zip_dict and start.year==2017: # I could join these two datasets
        borough_val = zip_dict[zip_val]
#         print(type(borough_val))
        if borough_val not in turnaround_dict.keys():
            turnaround_dict[borough_val] = list()
#         print(type(turnaround_dict[borough_val]))
        turnaround_dict[borough_val].append(end - start)
#         print("adding", count)
        count += 1
# print(turnaround_dict)
# print(count)
# for k, v in turnaround_dict.items():
#     print(k, v)

In [351]:
avg_list = []

max_turn = 0
for key in turnaround_dict.keys():
#     print(avg(turnaround_dict[key]))
#     a = turnaround_dict[key][0]
#     b = turnaround_dict[key][1]
#     print((a+b)/2)
#     avg_list = []
    avg_list.append(sum(turnaround_dict[key], datetime.timedelta(0))/len(turnaround_dict[key]))
    print("Avg for", key, "is =", avg_list[-1])
#     print(type(avg_list[-1]))
    if len(avg_list)==1:
        max_turn = avg_list[0]
    if avg_list[-1] > max_turn:
        max_turn = avg_list[-1]
        max_turn_borough = key
avg_list.sort(reverse=True)
avg_list
print(max_turn, max_turn_borough)

Avg for Bronx is = 7 days 12:00:00
Avg for Brooklyn is = 3 days 13:20:00
Avg for Queens is = 1 days 20:34:17.142857
Avg for Manhattan is = 10 days 06:51:25.714285
Avg for Staten is = 3 days 00:00:00
10 days 06:51:25.714285 Manhattan


In [349]:
df_join = pd.merge(df_application, df_zip, left_on='Zip', right_on='zip', how='inner', indicator=True)

In [161]:
df_join

Unnamed: 0,Application ID,License Number,License Type,Application or Renewal,Business Name,Status,Start Date,End Date,Temp Op Letter Issued,Temp Op Letter Expiration,License Category,Application Category,Building Number,Street,Street 2,Unit Type,Unit,Description,City,State,Zip,Contact Phone,Longitude,Latitude,Active Vehicles,zip,borough,_merge
0,1066-2017-RHIC,1294131-DCA,Business,Renewal,PEYKO TZENOV,Issued,2017-01-09,2017-01-10,NaT,NaT,Home Improvement Contractor,Special,3280,RESERVOIR OVAL E,,,,,BRONX,NY,10467.0000,9178047161,-73.8771,40.8762,,10467,Bronx,both
1,1097-2017-RHIC,2028349-DCA,Business,Renewal,Standard Builders Inc,Issued,2017-01-09,2017-01-10,NaT,NaT,Home Improvement Contractor,Special,831,ARNOW AVE,,APT,A,,BRONX,NY,10467.0000,917-310-2100,-73.8624,40.8678,,10467,Bronx,both
2,19397-2017-ASLS,2064576-DCA,Business,Application,3161 NORWOOD DELI CORP,Issued,2018-01-08,2018-01-09,NaT,NaT,Stoop Line Stand,Basic,3161,BAINBRIDGE AVE,,,,,BRONX,NY,10467.0000,917-370-9549,-73.8795,40.8751,,10467,Bronx,both
3,757-2017-RELE,2033204-DCA,Business,Renewal,"HI-TECH WIRELESS & REPAIRS, INC.",Issued,2017-01-05,2017-01-12,NaT,NaT,Electronics Store,Basic,767,E GUN HILL RD,,,,,BRONX,NY,10467.0000,3477592646,-73.8631,40.8763,,10467,Bronx,both
4,32030-2016-RELE,1291846-DCA,Business,Renewal,"CELL CITY COMMUNICATION, INC.",Issued,2016-12-20,2016-12-29,NaT,NaT,Electronics Store,Basic,3439,JEROME AVE,,,,,BRONX,NY,10467.0000,718-231-2115,-73.8828,40.8814,,10467,Bronx,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307218,10960-2019-AEMP,2090972-DCA,Business,Application,NY EFFICIENT STAFFING LLC,Issued,2019-09-26,2019-09-28,NaT,NaT,Employment Agency,Special,250,PARK AVE,,FL,7,,NEW YORK,NY,10177.0000,9086728556,-73.9756,40.7550,,10177,Manhattan,both
307219,1297647-ACRD,1297647-DCA,Business,Application,"EVANS, WILLIE E",Issued,2008-08-28,2008-08-27,NaT,NaT,Tobacco Retail Dealer,Basic,341,9TH AVE,,,,,NEW YORK,NY,10199.0000,9173181158,-73.9984,40.7505,,10199,Manhattan,both
307220,8786-2019-ASTF,,Individual,Application,Hanson Place Orthodontics,Denied,2019-07-24,2019-08-25,NaT,NaT,Temporary Street Fair Vendor,Basic,,,,,,,BROOKLYN,NY,11243.0000,,,,,11243,Brooklyn,both
307221,1367667-ACRD,1367667-DCA,Business,Application,"HUDSON GROUP (HG) RETAIL, LLC",Issued,2010-08-20,2010-08-20,NaT,NaT,Tobacco Retail Dealer,Basic,,WEST BROADWAY,VESEY STREET,,,WORLD TRADE CENTER PATH STATION,NEW YORK,,10048.0000,2125631030,-74.0117,40.7128,,10048,Manhattan,both


In [353]:
count = 0
lic_category_set = set(df_join['License Category'].unique())
# print(len(lic_category_set))
print(lic_category_set)
lic_category_dict = {x: [] for x in list(lic_category_set)}
# len(lic_category_dict.keys())

for i in range(4000):
    start = df_join.loc[i]['Start Date']
    end = df_join.loc[i]['End Date']
#     print(start)
    if df_join.loc[i]['borough'] == max_turn_borough and start.year==2016 and start is not pd.NaT and end is not pd.NaT:
        count += 1
        new_turn_val = end - start
        pos = df_join.loc[i]['License Category']
        lic_category_dict[pos].append(new_turn_val)

{'Secondhand Dealer - General', 'Home Improvement Contractor', 'Amusement Device Temporary', 'Sightseeing Bus', 'Laundry', 'Process Server Individual', 'Amusement Arcade', 'Gaming Cafe', 'Pedicab Business', 'Locksmith Apprentice', 'Amusement Device Permanent', 'Tow Truck Company', 'Pawnbroker', 'Garage', 'Horse Drawn Cab Owner', 'Booting Company', 'General Vendor Distributor', 'Games of Chance', 'Bingo Game Operator', 'Electronic & Appliance Service', 'Amusement Device Portable', 'Catering Establishment', 'Process Serving Agency', 'Tobacco Retail Dealer', 'Sidewalk Cafe', 'Scrap Metal Processor', 'Laundry Jobber', 'Garage and Parking Lot', 'Commercial Lessor', 'Secondhand Dealer - Auto', 'Locksmith', 'Auctioneer', 'Tow Truck Driver', 'Pool or Billiard Room', 'Laundries', 'Tow Truck Exemption', 'Sightseeing Guide', 'Electronics Store', 'Storage Warehouse', 'Auction House Premises', 'Temporary Street Fair Vendor', 'Ticket Seller', 'Debt Collection Agency', 'Secondhand Dealer - Firearms',

In [354]:
count = 0
# print(datetime.timedelta(0))
max_turn_in_borough = datetime.timedelta(0)
max_turn_category_in_borough = list(lic_category_dict.keys())[0]
# print(max_turn_category_in_borough)
for key in lic_category_dict.keys():
    if len(lic_category_dict[key]) == 1:
        count += 1
#         print(lic_category_dict[key])
        avg_value = lic_category_dict[key][0]
    elif len(lic_category_dict[key]) > 1:
        count += 1
        avg_value = (sum(lic_category_dict[key], datetime.timedelta(0))/len(lic_category_dict[key]))
#     print(avg_value, key)
    if max_turn_in_borough < avg_value:
        max_turn_in_borough = avg_value
        max_turn_category_in_borough = key    
#         print("Avg for", key, "is =", avg_value)

# print(avg_list)
print("max=", max_turn_in_borough, max_turn_category_in_borough)
# print(count)

max= 42 days 06:00:00 Sidewalk Cafe


In [218]:
count = 0
borough_set = set(df_join['borough'].unique())
print(len(borough_set))
print(borough_set)
borough_dict = {x: 0 for x in list(borough_set)}
print(borough_dict)

for i in range(5000):
    cur_borough = df_join.loc[i]['borough']
    start = df_join.loc[i]['Start Date']
    end = df_join.loc[i]['End Date']
#     print(start)
    if start.year==2018 and df_join.loc[i]['Status']=='Denied' and start is not pd.NaT and end is not pd.NaT:
        count += 1
#         print(borough_dict[cur_borough])
        borough_dict[cur_borough] += 1
print(borough_dict)
print(count)
# max_borough_dict = {k: v for k, v in sorted(borough_dict.items(), key=lambda item: item[1])}
# max_borough_dict
max_denial = 0
max_denial_borough = ''
for key,value in borough_dict.items():
    if value > max_denial:
        max_denial = value
        max_denial_borough = key
print(max_denial, max_denial_borough)

5
{'Bronx', 'Brooklyn', 'Staten', 'Manhattan', 'Queens'}
{'Bronx': 0, 'Brooklyn': 0, 'Staten': 0, 'Manhattan': 0, 'Queens': 0}
{'Bronx': 28, 'Brooklyn': 0, 'Staten': 0, 'Manhattan': 6, 'Queens': 0}
34
28 Bronx


In [228]:
count = 0
borough_set = set(df_join['borough'].unique())
print(len(borough_set))
print(borough_set)
borough_dict = {x: 0 for x in list(borough_set)}
print(borough_dict)

# make this into a function based on application or renewal
# I can make these dicts into DF and join them

for i in range(5000):
    cur_borough = df_join.loc[i]['borough']
    start = df_join.loc[i]['Start Date']
    end = df_join.loc[i]['End Date']
#     print(start)
    if start.year==2018 and df_join.loc[i]['Status']=='Denied' and df_join.loc[i]['Application or Renewal']=='Renewal' and start is not pd.NaT and end is not pd.NaT:
        count += 1
#         print(borough_dict[cur_borough])
        borough_dict[cur_borough] += 1
print(borough_dict)
print(count)
# max_borough_dict = {k: v for k, v in sorted(borough_dict.items(), key=lambda item: item[1])}
# max_borough_dict
max_denial = 0
max_denial_borough = ''
for key,value in borough_dict.items():
    if value > max_denial:
        max_denial = value
        max_denial_borough = key
print(max_denial, max_denial_borough)

new_df = pd.DataFrame.from_dict([borough_dict])
new_df

5
{'Bronx', 'Brooklyn', 'Staten', 'Manhattan', 'Queens'}
{'Bronx': 0, 'Brooklyn': 0, 'Staten': 0, 'Manhattan': 0, 'Queens': 0}
{'Bronx': 5, 'Brooklyn': 0, 'Staten': 0, 'Manhattan': 1, 'Queens': 0}
6
5 Bronx


Unnamed: 0,Bronx,Brooklyn,Staten,Manhattan,Queens
0,5,0,0,1,0


2020-12-31 00:00:00


In [226]:
turnaround_time = []
for i in range(100):
    start = df_application.loc[i]['Start Date']
    end = df_application.loc[i]['End Date']
    if start is not pd.NaT and end is not pd.NaT:
        turnaround_time.append(end - start)
len(turnaround_time)

99

In [240]:
df_temp = pd.DataFrame()
df_temp
for i in range(len(df_business)):
    if "delacruz" in str(df_business.loc[i]['Business Name']).lower():
#         print("yes")
        df_temp = df_temp.append(df_business.loc[i])
#         pass
df_temp

Unnamed: 0,Address Borough,Address Building,Address City,Address State,Address Street Name,Address ZIP,BBL,BIN,Borough Code,Business Name,Business Name 2,Census Tract,Community Board,Contact Phone Number,Council District,DCA License Number,Detail,Industry,Latitude,License Creation Date,License Expiration Date,License Status,License Type,Location,Longitude,NTA,Secondary Address Street Name
0,,,NEW YORK,NY,,10027,,,,"DELACRUZ VARGAS, CARLOS",,,,9173731141,,2080385-DCA,,Tow Truck Driver,,2018-12-03,2020-10-31,Active,Individual,,,,
9845,,,NEW YORK,NY,,10011,,,,"DELACRUZ, MICHAEL",,,,2129240404,,1274270-DCA,,Locksmith,,2007-12-14,2021-05-31,Active,Individual,,,,
9910,,,BRONX,NY,,10458,,,,"DELACRUZ, JUAN",,,,6469388068,,2068568-DCA,,Tow Truck Driver,,2018-03-29,2020-10-31,Active,Individual,,,,
12809,,,YONKERS,NY,,10710,,,,"SANTOS DELACRUZ, JONATHAN J",,,,6466837691,,2059109-DCA,General Vendor Type: Citywide Specialized Vend...,General Vendor,,2017-10-06,2018-09-30,Inactive,Individual,,,,
30312,,,COPIAGUE,NY,,11726,,,,"DELACRUZ-INFANTE, RADHAMES",,,,6312159221,,2093747-DCA,,Home Improvement Salesperson,,2020-01-17,2021-02-28,Active,Individual,,,,
32254,,,BRONX,NY,,10461,,,,"DELACRUZ, CHRISTIAN",,,,9175040189,,1468580-DCA,,Tow Truck Driver,,2013-07-08,2018-10-31,Inactive,Individual,,,,
70686,,,OZONE PARK,NY,,11416,,,,"DELACRUZ, YANCEY",,,,9164123988,,1372829-DCA,,Tow Truck Driver,,2010-09-29,2012-10-31,Inactive,Individual,,,,
81132,,,BRONX,NY,,10452,,,,"DELACRUZ, ISIDRO J",,,,347-825-7976,,1454290-DCA,,Tow Truck Driver,,2013-01-18,2016-10-31,Inactive,Individual,,,,
91579,,,BRONX,NY,,10468,,,,"DELACRUZ, JOSE",,,,6467349326,,1300808-DCA,,Tow Truck Driver,,2008-10-01,2014-10-31,Inactive,Individual,,,,
95992,,,OZONE PARK,NY,,11416,,,,"DELACRUZ, GERSHON",,,,9175184419,,1247793-DCA,,Tow Truck Driver,,2007-02-05,2012-10-31,Inactive,Individual,,,,


In [345]:
# From the language of Q4, it seems that the assignment has been created in 2018 and has not been updated.
# Therefore, there were 2 possibilities for me.
# 1. Follow Q4 word-by-word. This would result in an empty file since no business with license expiring in 2018/2019 will have active status in the dataset
# 2. Change the date. Take the date as, say, Late 2020 and go by that.
# Q4 does not specify that garage businesses have to be inside NYC
# More comments

end_2020_datetime = pd.to_datetime('20201231', format='%Y%m%d', errors='coerce')
start_2020_datetime = pd.to_datetime('20200209', format='%Y%m%d', errors='coerce')
# print(dummy_time)

df_parking = pd.DataFrame()
count = 0
for i in range(10000):
    if df_merge.loc[i]['License Status']=="Active" and df_merge.loc[i]['License Expiration Date']<=end_2020_datetime \
    and ("garage" in str(df_merge.loc[i]['License Category']).lower() or "parking" in str(df_merge.loc[i]['License Category']).lower()):
        count += 1
        df_parking = df_parking.append(df_merge.loc[i])
#         print(str(df_merge.loc[i]['License Category']).lower())
print(count)


# for 

# df_parking.drop(df_parking.columns.difference(['Start Date','End Date']), 1, inplace=True)

# header = ["DCA License Number", "License Expiration Date", "License Status", "License Category", "End Date"]
# df_parking.to_csv('parking.csv', columns = header)

df_parking

8


Unnamed: 0,Active Vehicles,Address Borough,Address Building,Address City,Address State,Address Street Name,Address ZIP,Application Category,Application ID,Application or Renewal,BBL,BIN,Borough Code,Building Number,Business Name 2,Business Name_x,Business Name_y,Census Tract,City,Community Board,Contact Phone,Contact Phone Number,Council District,DCA License Number,Description,Detail,End Date,Industry,Latitude_x,Latitude_y,License Category,License Creation Date,License Expiration Date,License Number,License Status,License Type_x,License Type_y,Location,Longitude_x,Longitude_y,NTA,Secondary Address Street Name,Start Date,State,Status,Street,Street 2,Temp Op Letter Expiration,Temp Op Letter Issued,Unit,Unit Type,Zip,_merge
3747,,Brooklyn,247,BROOKLYN,NY,METROPOLITAN AVE,11211,Basic,20160-2017-AGAR,Application,3023520020,3424287,3.0,247,,LM DRIGGS PARKING LLC,LM DRIGGS PARKING LLC,553.0,BROOKLYN,301.0,212-714-3571,212-714-3571,34.0,2063931-DCA,AKA: 626 DRIGGS AVE,"Vehicle Spaces: 81, Bicycle Spaces: 9",2017-12-26,Garage,40.7149,40.7149,Garage,2017-12-26,2020-02-24,2063931-DCA,Active,Business,Business,"(40.71489314441472, -73.95903958561274)",-73.959,-73.959,BK73,,2019-03-21,NY,Pending,METROPOLITAN AVE,,2020-02-24,2019-12-24,,,11211.0,both
3946,,Manhattan,7,NEW YORK,NY,W 21ST ST,10010,Basic,367-2017-AGAR,Application,1008230031,1015532,1.0,7,,7 WEST 21 PARKING LLC,7 WEST 21 PARKING LLC,58.0,NEW YORK,105.0,212-888-7400,212-888-7400,3.0,2047628-DCA,,"Vehicle Spaces: 200, Bicycle Spaces: 20",2017-01-24,Garage,40.7405,40.7405,Garage,2017-01-24,2020-03-19,2047628-DCA,Active,Business,Business,"(40.74053062325372, -73.99112994838053)",-73.9911,-73.9911,MN13,,2019-03-27,NY,Pending,W 21ST ST,,2020-03-19,2019-12-23,,,10010.0,both
4212,,Manhattan,520,NEW YORK,NY,W 30TH ST,10001,Basic,11231-2018-AGAR,Application,1007010016,1089836,1.0,520,,MP 30 YARDS LLC,MP 30 YARDS LLC,99.0,NEW YORK,104.0,2124903460,2124903460,3.0,2072755-DCA,,"Vehicle Spaces: 40, Bicycle Spaces: 0",2018-06-06,Garage,40.7526,40.7526,Garage,2018-06-06,2020-02-04,2072755-DCA,Active,Business,Business,"(40.752585880533076, -74.00225220254441)",-74.0023,-74.0023,MN13,,2019-02-20,NY,Pending,W 30TH ST,,2020-04-27,2020-02-05,,,10001.0,both
4260,,Manhattan,501,NEW YORK,NY,W 30TH ST,10001,Basic,6530-2016-AGAR,Application,1007020010,1089323,1.0,501,,MP HUDSON LLC,MP HUDSON LLC,99.0,NEW YORK,104.0,2124903460,2124903460,3.0,2037749-DCA,,"Vehicle Spaces: 240, Bicycle Spaces: 24",2016-05-17,Garage,40.7521,40.7521,Garage,2016-05-17,2020-02-18,2037749-DCA,Active,Business,Business,"(40.75213850311293, -74.00114774935764)",-74.0011,-74.0011,MN13,,2019-02-19,NY,Pending,W 30TH ST,,2020-02-18,2019-12-23,,,10001.0,both
4481,,Manhattan,549,NEW YORK,NY,W 23RD ST,10011,Basic,1214704-RGAR,Renewal,1006950503,1012345,1.0,549,,555 WEST GARAGE CORP.,555 WEST GARAGE CORP.,99.0,NEW YORK,104.0,2127367171,2127367171,3.0,1214704-DCA,,"Vehicle Spaces: 70, Bicycle Spaces: 7",2013-08-19,Garage,40.7484,40.7484,Garage,2009-03-17,2020-03-18,1214704-DCA,Active,Business,Business,"(40.748353359866705, -74.00572037538063)",-74.0057,-74.0057,MN13,,2019-02-19,NY,Issued,W 23RD ST,,NaT,NaT,,,10011.0,both
4849,,Manhattan,221,NEW YORK,NY,W 29TH ST,10001,Basic,5437-2018-AGAR,Application,1007790027,1000000,1.0,221,,LM 29 PARK LLC,LM 29 PARK LLC,95.0,NEW YORK,105.0,2129672848,2129672848,3.0,2068346-DCA,,"Vehicle Spaces: 45, Bicycle Spaces: 0",2018-03-26,Garage,40.7482,40.7482,Garage,2018-03-26,2020-03-22,2068346-DCA,Active,Business,Business,"(40.7482133493464, -73.99373466827033)",-73.9937,-73.9937,MN17,,2019-02-06,NY,Pending,W 29TH ST,,2020-03-22,2019-12-24,,,10001.0,both
4992,,Queens,13355,FLUSHING,NY,41ST AVE,11355,Basic,943-2019-AGAR,Application,4050377507,4618345,4.0,13355,,MP FLUSHING 41 LLC,MP FLUSHING 41 LLC,871.0,FLUSHING,407.0,212-490-3460,212-490-3460,20.0,2081940-DCA,,"Vehicle Spaces: 156, Bicycle Spaces: 16",2019-02-05,Garage,40.7579,40.7579,Garage,2019-02-05,2020-02-06,2081940-DCA,Active,Business,Business,"(40.75786519962758, -73.83022306254865)",-73.8302,-73.8302,QN22,,2019-01-30,NY,Pending,41ST AVE,,2020-04-30,2020-02-05,,,11355.0,both
8265,,Manhattan,31,NEW YORK,NY,W 52ND ST,10019,Basic,828-2015-AGAR,Application,1012687501,1071418,1.0,31,,SP PLUS CORPORATION,SP PLUS CORPORATION,104.0,NEW YORK,105.0,2125025490,2125025490,4.0,2017914-DCA,,"Vehicle Spaces: 120, Bicycle Spaces: 12",2015-02-03,Garage,40.7605,40.7605,Garage,2015-02-03,2020-03-22,2017914-DCA,Active,Business,Business,"(40.76049140051342, -73.97793734629516)",-73.9779,-73.9779,MN17,,2019-02-20,NY,Issued,W 52ND ST,,NaT,NaT,,,10019.0,both


In [347]:
# df_parking["Last License Activity Date"] = pd.NaT
df_parking = df_parking.copy()
df_parking.drop(df_parking.columns.difference(["DCA License Number", "License Expiration Date", "License Status", "License Category",'Start Date','End Date', 'Temp Op Letter Issued', "Last License Activity Date"]), 1, inplace=True)

for i in range(len(df_parking)):
    if df_parking.iloc[i]["End Date"] is not pd.NaT:
        if df_parking.iloc[i]["Temp Op Letter Issued"] is pd.NaT:
            df_parking.iloc[i]["End Date"] = end_2020_datetime
            print("i'm here")
            print(df_parking.iloc[i])
        
# print(df_parking.loc[4481]["Temp Op Letter Issued"]<df_parking.iloc[i]["Start Date"])


df_parking

i'm here
DCA License Number                 1214704-DCA
End Date                   2013-08-19 00:00:00
License Category                        Garage
License Expiration Date    2020-03-18 00:00:00
License Status                          Active
Start Date                 2019-02-19 00:00:00
Temp Op Letter Issued                      NaT
Name: 4481, dtype: object
i'm here
DCA License Number                 2017914-DCA
End Date                   2015-02-03 00:00:00
License Category                        Garage
License Expiration Date    2020-03-22 00:00:00
License Status                          Active
Start Date                 2019-02-20 00:00:00
Temp Op Letter Issued                      NaT
Name: 8265, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,DCA License Number,End Date,License Category,License Expiration Date,License Status,Start Date,Temp Op Letter Issued
3747,2063931-DCA,2017-12-26,Garage,2020-02-24,Active,2019-03-21,2019-12-24
3946,2047628-DCA,2017-01-24,Garage,2020-03-19,Active,2019-03-27,2019-12-23
4212,2072755-DCA,2018-06-06,Garage,2020-02-04,Active,2019-02-20,2020-02-05
4260,2037749-DCA,2016-05-17,Garage,2020-02-18,Active,2019-02-19,2019-12-23
4481,1214704-DCA,2013-08-19,Garage,2020-03-18,Active,2019-02-19,NaT
4849,2068346-DCA,2018-03-26,Garage,2020-03-22,Active,2019-02-06,2019-12-24
4992,2081940-DCA,2019-02-05,Garage,2020-02-06,Active,2019-01-30,2020-02-05
8265,2017914-DCA,2015-02-03,Garage,2020-03-22,Active,2019-02-20,NaT


In [262]:
df_temp2 = pd.DataFrame()
df_business2 = df_application[0:2000]
duplicate_in_student = df_business2.duplicated(subset=['Business Name'])
for i in range(2000):
    if duplicate_in_student[i]==True:
#     print(df.loc[~duplicate_in_student], end='\n\n')
        df_temp2 = df_temp2.append(df_business2.loc[i])
df_temp2.to_csv('dups.csv')

In [323]:
for i in range(len(df_application)):
    if df_application.loc[i]['License Number']=='2063931-DCA':
        print(df_application.loc[i])

Application ID                     20160-2017-AGAR
License Number                         2063931-DCA
License Type                              Business
Application or Renewal                 Application
Business Name                LM DRIGGS PARKING LLC
Status                                     Pending
Start Date                     2019-03-21 00:00:00
End Date                       2017-12-26 00:00:00
Temp Op Letter Issued          2019-12-24 00:00:00
Temp Op Letter Expiration      2020-02-24 00:00:00
License Category                            Garage
Application Category                         Basic
Building Number                                247
Street                            METROPOLITAN AVE
Street 2                                       NaN
Unit Type                                      NaN
Unit                                           NaN
Description                    AKA: 626 DRIGGS AVE
City                                      BROOKLYN
State                          

In [None]:
# df_application.loc[364232]['End Date'] - df_application.loc[364232]['Start Date']
# type(df_application.loc[364232]['Temp Op Letter Expiration'])
x = df_application.loc[364232]['Temp Op Letter Issued'] 
x is pd.NaT
# type(pd.NaT)
for i in range(99):
    print(turnaround_time[i])

In [None]:
# Similar to Industry in df_business - around 30 different strings
list(df_application['License Category'].str.lower().unique())

# ['special', 'basic']
list(df_application['Application Category'].str.lower().unique())

# Too many cities
list(df_application['City'].str.lower().unique())

list(df_application['Zip'].str.lower().unique())

list(df_application['Temp Op Letter Issued'].unique())

# License number is common between 2 data sets. So we can use license number+license expiry date to cross-check between 2 datasets

# Not sure how to use temp op letter dates in DF2

In [324]:
# There are different naming conventions for name of the business between 2 data sets. So we cannot merge on business name
# Inner join makes the most sense, because otherwise we would have much redundant info. 
# The application dataset (one business can have multiple license application info) has close to double data than the business dataset
# We need Borough information from business dataset which are related to a business whose application dates are in the application dataset

# Theoretically, we could use another dataset to match the address of business from df2 to find out which borough that business belongs to

df_merge = pd.merge(df_business, df_application, left_on='DCA License Number', right_on='License Number', how='inner',indicator=True)
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312876 entries, 0 to 312875
Data columns (total 53 columns):
DCA License Number               312876 non-null object
License Type_x                   312876 non-null object
License Expiration Date          312821 non-null datetime64[ns]
License Status                   312876 non-null object
License Creation Date            312876 non-null datetime64[ns]
Industry                         312876 non-null object
Business Name_x                  312824 non-null object
Business Name 2                  51372 non-null object
Address Building                 200266 non-null object
Address Street Name              201681 non-null object
Secondary Address Street Name    4350 non-null object
Address City                     312422 non-null object
Address State                    312174 non-null object
Address ZIP                      312416 non-null object
Contact Phone Number             294762 non-null object
Address Borough                  201

In [270]:
df_merge.tail()

Unnamed: 0,DCA License Number,License Type_x,License Expiration Date,License Status,License Creation Date,Industry,Business Name_x,Business Name 2,Address Building,Address Street Name,Secondary Address Street Name,Address City,Address State,Address ZIP,Contact Phone Number,Address Borough,Borough Code,Community Board,Council District,BIN,BBL,NTA,Census Tract,Detail,Longitude_x,Latitude_x,Location,Application ID,License Number,License Type_y,Application or Renewal,Business Name_y,Status,Start Date,End Date,Temp Op Letter Issued,Temp Op Letter Expiration,License Category,Application Category,Building Number,Street,Street 2,Unit Type,Unit,Description,City,State,Zip,Contact Phone,Longitude_y,Latitude_y,Active Vehicles,_merge
312871,1054840-DCA,Business,2020-12-31,Active,2000-12-05,Tobacco Retail Dealer,K P DELI CORP.,,255,HYLAN BLVD,,STATEN ISLAND,NY,10305,718-448-7730,Staten Island,5.0,501.0,49.0,5090837.0,5028530001.0,SI37,8.0,,-74.0704,40.6119,"(40.61193872381227, -74.07037351766044)",28961-2016-RCRD,1054840-DCA,Business,Renewal,K P DELI CORP.,Issued,2016-12-09,2016-12-10,NaT,NaT,Tobacco Retail Dealer,Basic,255,HYLAN BLVD,,,,,STATEN ISLAND,NY,10305.0,718-448-7730,-74.0704,40.6119,,both
312872,1054840-DCA,Business,2020-12-31,Active,2000-12-05,Tobacco Retail Dealer,K P DELI CORP.,,255,HYLAN BLVD,,STATEN ISLAND,NY,10305,718-448-7730,Staten Island,5.0,501.0,49.0,5090837.0,5028530001.0,SI37,8.0,,-74.0704,40.6119,"(40.61193872381227, -74.07037351766044)",23919-2014-RCRD,1054840-DCA,Business,Renewal,K P DELI CORP.,Issued,2014-11-12,2014-11-14,NaT,NaT,Tobacco Retail Dealer,Basic,255,HYLAN BLVD,,,,,STATEN ISLAND,NY,10305.0,718-448-7730,-74.0704,40.6119,,both
312873,1054840-DCA,Business,2020-12-31,Active,2000-12-05,Tobacco Retail Dealer,K P DELI CORP.,,255,HYLAN BLVD,,STATEN ISLAND,NY,10305,718-448-7730,Staten Island,5.0,501.0,49.0,5090837.0,5028530001.0,SI37,8.0,,-74.0704,40.6119,"(40.61193872381227, -74.07037351766044)",24491-2018-RTRD,1054840-DCA,Business,Renewal,K P DELI CORP.,Issued,2018-10-30,2018-10-31,NaT,NaT,Tobacco Retail Dealer,Basic,255,HYLAN BLVD,,,,,STATEN ISLAND,NY,10305.0,718-448-7730,-74.0704,40.6119,,both
312874,1054840-DCA,Business,2020-12-31,Active,2000-12-05,Tobacco Retail Dealer,K P DELI CORP.,,255,HYLAN BLVD,,STATEN ISLAND,NY,10305,718-448-7730,Staten Island,5.0,501.0,49.0,5090837.0,5028530001.0,SI37,8.0,,-74.0704,40.6119,"(40.61193872381227, -74.07037351766044)",1054840-ACRD,1054840-DCA,Business,Application,K P DELI CORP.,Issued,2000-12-06,2000-12-05,NaT,NaT,Tobacco Retail Dealer,Basic,255,HYLAN BLVD,,,,,STATEN ISLAND,NY,10305.0,7184487730,-74.0704,40.6119,,both
312875,1054840-DCA,Business,2020-12-31,Active,2000-12-05,Tobacco Retail Dealer,K P DELI CORP.,,255,HYLAN BLVD,,STATEN ISLAND,NY,10305,718-448-7730,Staten Island,5.0,501.0,49.0,5090837.0,5028530001.0,SI37,8.0,,-74.0704,40.6119,"(40.61193872381227, -74.07037351766044)",1054840-RCRD,1054840-DCA,Business,Renewal,K P DELI CORP.,Issued,2012-10-23,2012-10-24,NaT,NaT,Tobacco Retail Dealer,Basic,255,HYLAN BLVD,,,,,STATEN ISLAND,NY,10305.0,7184487730,-74.0704,40.6119,,both


In [None]:
df_merge.to_csv('out.csv', index=False)