In [1]:
import requests
import json
import pandas as pd

In [2]:
pd.options.display.max_rows = 50
pd.options.display.max_columns = 100

In [3]:
# This is the curl code given by https://iservices.dca.ca.gov/docs/bcclicensereadservices#/
# curl -X GET --header "Accept: */*" --header "app_id: 6c4528f5" --header "app_key: f82d0e73e4b21beb466a0e10f19339dc" "https://iservices.dca.ca.gov/api/bcclicenseread/getAllBccLicenses"

# This is the curl code translated to Python with https://curl.trillworks.com/

headers = {
    'Accept': '*/*',
    'app_id': '6c4528f5',
    'app_key': 'f82d0e73e4b21beb466a0e10f19339dc',
}

response = requests.get('https://iservices.dca.ca.gov/api/bcclicenseread/getAllBccLicenses', headers=headers)

In [4]:
# This translates it to a Pandas dataframe

bcc_df = pd.DataFrame(response.json())

In [5]:
bcc_df.columns

Index(['licenseNumber', 'licenseType', 'issuedDate', 'addressLine1',
       'addressLine2', 'premiseCity', 'premiseState', 'premiseZip',
       'premiseCounty', 'licenseStatus', 'businessStructure', 'medicinal',
       'adultUse', 'microActivityRetailerNonStorefront',
       'microActivityRetailer', 'microActivityDistributor',
       'microActivityDistributorTransportOnly',
       'microActivityLevel1Manufacturer', 'microActivityCultivator',
       'expiryDate', 'businessName', 'businessDBA', 'businessOwner', 'website',
       'phone', 'email'],
      dtype='object')

## Data Cleaning

In [6]:
# This changes all the yes/blank columns to binary
bcc_df['microActivityRetailerNonStorefront'] = bcc_df['microActivityRetailerNonStorefront'].replace('YES', 1).replace('', 0)
bcc_df['microActivityRetailer'] = bcc_df['microActivityRetailer'].replace('YES', 1).replace('', 0)
bcc_df['microActivityDistributor'] = bcc_df['microActivityDistributor'].replace('YES', 1).replace('', 0)
bcc_df['microActivityDistributorTransportOnly'] = bcc_df['microActivityDistributorTransportOnly'].replace('YES', 1).replace('', 0)
bcc_df['microActivityLevel1Manufacturer'] = bcc_df['microActivityLevel1Manufacturer'].replace('YES', 1).replace('', 0)
bcc_df['microActivityCultivator'] = bcc_df['microActivityCultivator'].replace('YES', 1).replace('', 0)

In [7]:
bcc_df['microActivityLevel1Manufacturer'].value_counts()

0    5877
1     622
Name: microActivityLevel1Manufacturer, dtype: int64

In [8]:
bcc_df['microActivityCultivator'].value_counts()

0    6021
1     478
Name: microActivityCultivator, dtype: int64

In [9]:
# This changes all the yes/no columns to binary
bcc_df['medicinal'] = bcc_df['medicinal'].replace('YES', 1).replace('NO', 0)
bcc_df['adultUse'] = bcc_df['adultUse'].replace('YES', 1).replace('NO', 0)

In [10]:
bcc_df[bcc_df.licenseNumber == 'C11-0001269-LIC']

Unnamed: 0,licenseNumber,licenseType,issuedDate,addressLine1,addressLine2,premiseCity,premiseState,premiseZip,premiseCounty,licenseStatus,businessStructure,medicinal,adultUse,microActivityRetailerNonStorefront,microActivityRetailer,microActivityDistributor,microActivityDistributorTransportOnly,microActivityLevel1Manufacturer,microActivityCultivator,expiryDate,businessName,businessDBA,businessOwner,website,phone,email
6474,C11-0001269-LIC,Cannabis - Distributor License,11/17/2020,,,MARYSVILLE,CA,959014800,YUBA,Active,Corporation,1,1,0,0,0,0,0,0,11/17/2021,Bio Cani Rx Inc,BioCani Rx Inc,michael shipp,,7143171101,michael@biocanirx.com


## Change Column Names

In [11]:
# Clean Control section of clean_combine ends with dataframe called df_control
# This dataframe has the following columns

intended_columns = ['license_number', 'license_description', 'business_company_type',
                    'premise_address', 'date_issue', 'date_expiration',
                    'business_description', 'in_db', 'license_adult', 'license_medicinal',
                    'status_active', 'status_canceled', 'status_expired', 'status_inactive',
                    'status_revoked', 'status_surrendered', 'status_suspended',
                    'license_category', 'name_legal', 'name_dba', 'contact_email',
                    'contact_phone', 'contact_website', 'contact_owner_1', 'contact_owner_2']

In [12]:
# bcc_df.head(3)

In [13]:
# Matching columns to df_final_date_to_edit format and column titles

bcc_edited = pd.DataFrame(bcc_df['licenseNumber'])
bcc_edited.rename(columns={'licenseNumber': 'license_number'}, inplace=True)

In [14]:
# This renames licenseType for license_descriptions, and then drops all of the rows for the temp files

bcc_edited['license_description'] = bcc_df['licenseType'].replace('Cannabis - Event Organizer License', 'Event Organizer').replace('Cannabis - Retailer Nonstorefront License', 'Retailer Nonstorefront').replace('Cannabis - Testing Laboratory License', 'Testing Laboratory').replace('Cannabis - Distributor-Transport Only License', 'Distributor-Transport Only').replace('Cannabis - Microbusiness License', 'Microbusiness').replace('Cannabis - Distributor License', 'Distributor').replace('Cannabis - Retailer License', 'Retailer')

bcc_edited = bcc_edited[~bcc_edited['license_description'].str.contains('Temp')]

In [15]:
bcc_edited['business_company_type'] = bcc_df['businessStructure']

bcc_edited['premise_address'] = bcc_df['addressLine1'] + '' + bcc_df['premiseCity'] + ', ' + bcc_df['premiseState'] + bcc_df['premiseZip'] + ' County: ' + bcc_df['premiseCounty']

bcc_edited['date_issue'] = bcc_df['issuedDate']

bcc_edited['date_expiration'] = bcc_df['expiryDate']

In [16]:
bcc_edited

Unnamed: 0,license_number,license_description,business_company_type,premise_address,date_issue,date_expiration
0,CEO14-0000115-LIC,Event Organizer,Limited Liability Company,"OAKLAND, CA94608 County: ALAMEDA",12/20/2019,12/19/2020
1,CEO14-0000114-LIC,Event Organizer,undefined,"HESPERIA, CA92394 County: null",12/12/2019,12/11/2020
2,CEO14-0000111-LIC,Event Organizer,Corporation,"MURRIETA, CA92562 County: null",12/02/2019,12/01/2020
3,CEO14-0000109-LIC,Event Organizer,Foreign Corporation,"SAN FRANCISCO, CA94111 County: SAN FRANCISCO",10/22/2019,10/21/2021
4,CEO14-0000110-LIC,Event Organizer,Limited Liability Company,"HAYWARD, CA94541 County: ALAMEDA",11/26/2019,11/25/2020
...,...,...,...,...,...,...
6494,C10-0000765-LIC,Retailer,Corporation,"872 WASHINGTON STPERRIS, CA925718835 County: ...",12/07/2020,12/07/2021
6495,C11-0001277-LIC,Distributor,Corporation,"LONG BEACH, CA90813 County: LOS ANGELES",12/08/2020,12/08/2021
6496,C9-0000391-LIC,Retailer Nonstorefront,Corporation,"TRUCKEE, CA961610263 County: NEVADA",12/09/2020,12/09/2021
6497,C12-0000333-LIC,Microbusiness,Limited Liability Company,"SAN JOSE, CA951113103 County: SANTA CLARA",12/09/2020,12/09/2021


In [17]:
# This doesn't seem to have as much data as the other file

bcc_edited['business_description'] = ''

for n in range(len(bcc_df)):
    description = []
    if bcc_df['microActivityRetailer'][n] == 1:
        description.append('Retailer')
    if bcc_df['microActivityRetailerNonStorefront'][n] == 1:
        description.append('Retailer Non-Storefront')
    if bcc_df['microActivityDistributor'][n] == 1:
        description.append('Distributor')
    if bcc_df['microActivityDistributorTransportOnly'][n] == 1:
        description.append('Distributor-Transport Only')
    if bcc_df['microActivityLevel1Manufacturer'][n] == 1:
        description.append('Level 1 Manufacturer')    
    if bcc_df['microActivityCultivator'][n] == 1:
        description.append('Cultivator (less than 10K sq ft)')

    bcc_edited['business_description'][n] = description

In [18]:
bcc_edited['in_db'] = 0

In [19]:
# This renames the adultUse and medicinal columns

bcc_edited['license_adult'] = bcc_df['adultUse']
bcc_edited['license_medicinal'] = bcc_df['medicinal']

In [20]:
# This splits up the licenseStatus into seperate columns

bcc_edited['status_active'] = (bcc_df['licenseStatus'] == 'Active').astype(int)
bcc_edited['status_canceled'] = (bcc_df['licenseStatus'] == 'Canceled').astype(int)
bcc_edited['status_expired'] = (bcc_df['licenseStatus'] == 'Expired').astype(int)
bcc_edited['status_inactive'] = (bcc_df['licenseStatus'] == 'Inactive').astype(int)
bcc_edited['status_revoked'] = (bcc_df['licenseStatus'] == 'Revoked').astype(int)
bcc_edited['status_surrendered'] = (bcc_df['licenseStatus'] == 'Surrendered').astype(int)
bcc_edited['status_suspended'] = (bcc_df['licenseStatus'] == 'Suspended').astype(int)

In [21]:
# This just indicates where the data comes from for when it is combined with other data

bcc_edited['license_category'] = 'Bureau of Cannabis Control'

In [22]:
# This moves the name's over to the new dataframe

bcc_edited['name_legal'] = bcc_df['businessName']
bcc_edited['name_dba'] = bcc_df['businessDBA']

In [23]:
# This moves the contact info over to the new dataframe

bcc_edited['contact_email'] = bcc_df['email']
bcc_edited['contact_phone'] = bcc_df['phone']
bcc_edited['contact_website'] = bcc_df['website']

In [24]:
# This moves over and slits the contact list

bcc_edited['contact_owners'] = bcc_df['businessOwner']
for n in range(len(bcc_edited)):
    bcc_edited['contact_owners'][n] = bcc_edited['contact_owners'][n].split(',')

bcc_edited['contact_owner_1'] = bcc_edited['contact_owners']
bcc_edited['contact_owner_2'] = bcc_edited['contact_owners']
for n in range(len(bcc_edited)):
    bcc_edited['contact_owner_1'][n] = bcc_edited['contact_owner_1'][n][0]
    if len(bcc_edited['contact_owner_2'][n]) > 1:
        bcc_edited['contact_owner_2'][n] = bcc_edited['contact_owner_2'][n][1]
    else:
        bcc_edited['contact_owner_2'][n] = ''

# bcc_edited = bcc_edited.drop(columns='contact_owners')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


KeyError: 2879

In [None]:
bcc_df[bcc_df['licenseNumber'] == 'C12-0000301-LIC']

In [None]:
bcc_edited[bcc_edited['license_number'] == 'C12-0000301-LIC']

In [None]:
bcc_edited.head(3)