In [1]:
import pandas as pd
import pyodbc
import pymssql
from pathlib import Path
from sqlalchemy import create_engine
import numpy as np
import urllib
import datetime as dt
import re

In [2]:
# Just so I see the whole DataFrame
pd.options.display.max_columns = 50
pd.options.display.min_rows = 100
pd.options.display.max_rows = 100

# Date

In [150]:
# Change this to the current date
# It will be used in file names throughout the notebook

date = dt.date(2020, 11, 4)

date_day = date.strftime('%d')
date_month = date.strftime('%m')
date_year = date.strftime('%y')

### Reading in and Combining the License Files

In [4]:
# I'll need to change this read in method, but I'm keeping it simple for now.
provisional_licenses = pd.read_csv('licenses/provisional/provisional_licenses_10_28_2020.csv')
annual_licenses = pd.read_csv('licenses/annual/annual_licenses_10_28_2020.csv')
df_all = provisional_licenses.append(annual_licenses)

In [5]:
# Resetting the index because it was messy.
df_all.reset_index(drop=True, inplace=True)

In [6]:
df_all.tail(3)

Unnamed: 0,id,licenseNumber,licenseStatus,licenseTerm,licenseType,licenseDesignation,issueDate,expirationDate,licenseStatusDate,licensingAuthorityId,licensingAuthority,businessLegalName,businessDbaName,businessOwnerName,businessStructure,activity,premiseStreetAddress,premiseCity,premiseState,premiseCounty,premiseZipCode,businessEmail,businessPhone,parcelNumber,PremiseLatitude,PremiseLongitude
10695,5655,CEO14-0000105-LIC,Expired,Annual,Commercial - Event Organizer,Data Not Available,10/03/2019 00:00:00,10/02/2020 00:00:00,,BCC,Bureau of Cannabis Control (BCC),"Weedcon Productions, LLC",Weedcon,John Williams,Limited Liability Company,Data Not Available,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,John@weedconproductions.Com,(310) 699-1416,Data Not Available,,
10696,5660,CEO14-0000110-LIC,Active,Annual,Commercial - Event Organizer,Data Not Available,11/26/2019 00:00:00,11/25/2020 00:00:00,,BCC,Bureau of Cannabis Control (BCC),Se And Sa Holdings,Data Not Available,"Pamela Epstein, Shareef El-Sissi, Soufyan Abo...",Limited Liability Company,Data Not Available,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Edeninfusions@gmail.Com,(323) 645-7669,Data Not Available,,
10697,5663,CEO14-0000113-LIC,Active,Annual,Commercial - Event Organizer,Data Not Available,12/05/2019 00:00:00,12/04/2020 00:00:00,,BCC,Bureau of Cannabis Control (BCC),"Green Flower Events, LLC",Data Not Available,Marlinda Girley,Limited Liability Company,Data Not Available,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Marlee.Girley@gmail.Com,(209) 518-6324,Data Not Available,,


### Cleaning the Data

#### Change Column Names

In [7]:
df_all.columns

Index(['id', 'licenseNumber', 'licenseStatus', 'licenseTerm', 'licenseType',
       'licenseDesignation', 'issueDate', 'expirationDate',
       'licenseStatusDate', 'licensingAuthorityId', 'licensingAuthority',
       'businessLegalName', 'businessDbaName', 'businessOwnerName',
       'businessStructure', 'activity', 'premiseStreetAddress', 'premiseCity',
       'premiseState', 'premiseCounty', 'premiseZipCode', 'businessEmail',
       'businessPhone', 'parcelNumber', 'PremiseLatitude', 'PremiseLongitude'],
      dtype='object')

In [8]:
# Changing the columns names to match the SQL column names

df_all.rename(
    
    columns={
                    
        # License Info
        'licenseNumber': 'license_number', 
        'licenseType': 'license_description',
        'licensingAuthorityId': 'license_category',
        'licenseDesignation': 'adult_medicinal',
        'licenseStatus': 'status_curr',
        'licenseTerm': 'license_term',
                   
        # Business Info
        'businessOwnerName': 'business_owner',
        'businessStructure': 'business_structure',
        'activity': 'business_description',
        'businessLegalName': 'name_legal',
        'businessDbaName': 'name_dba',
                    
        # Contact Info
        'businessEmail': 'contact_email',
        'businessPhone': 'contact_phone',
        'premiseStreetAddress': 'contact_address',
        'premiseCity': 'contact_city',
        'premiseState': 'contact_state',
        'premiseCounty': 'contact_county',
        'premiseZipCode': 'contact_zip',
        
        # Dates
        'issueDate': 'date_issue', 
        'expirationDate': 'date_expiration',
                    
            }, inplace=True)

In [9]:
df_all.columns

Index(['id', 'license_number', 'status_curr', 'license_term',
       'license_description', 'adult_medicinal', 'date_issue',
       'date_expiration', 'licenseStatusDate', 'license_category',
       'licensingAuthority', 'name_legal', 'name_dba', 'business_owner',
       'business_structure', 'business_description', 'contact_address',
       'contact_city', 'contact_state', 'contact_county', 'contact_zip',
       'contact_email', 'contact_phone', 'parcelNumber', 'PremiseLatitude',
       'PremiseLongitude'],
      dtype='object')

#### Change Data

In [10]:
# This changes the License Category section from initials to names

df_all.license_category.replace('BCC', 'Bureau of Cannabis Control', inplace=True)
df_all.license_category.replace('CCL', 'Cannabis Cultivation License', inplace=True)
df_all.license_category.replace('MCSB', 'Manufactured Cannabis License', inplace=True)

In [11]:
# This creates columns for Annual and Procisional type licenses by extracting from license_term

df_all['license_annual'] = df_all.license_term.replace('Annual', '1').replace('Provisional', '0')
df_all['license_provisional'] = df_all.license_term.replace('Provisional', '1').replace('Annual', '0')

In [12]:
# This creates columns for Medical and Adult Use type licenses by extracting from adult_medical

df_all['license_adult'] = 0
df_all['license_medical'] = 0

for n in range(len(df_all)):
    if "adult" in df_all.adult_medicinal[n].lower():
        df_all.license_adult[n] = 1
    if "med" in df_all.adult_medicinal[n].lower():
        df_all.license_medical[n] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [13]:
# This removes the unnecessary information from the license_description column

# idk why inplace=True didn't work, but w/e, this'll do
df_all.license_description = df_all.license_description.str.replace("Commercial -  ", "")
df_all.license_description = df_all.license_description.str.replace("Cultivation -  ", "")
df_all.license_description = df_all.license_description.str.replace("Manufacturer - ", "")

# This is just a minor formating thing
df_all.license_description = df_all.license_description.str.replace(" - ", "-")

In [14]:
# This creates columns for each status type by creating boolian values and converting them to int

df_all['status_active'] = (df_all.status_curr == 'Active').astype(int)
df_all['status_canceled'] = (df_all.status_curr == 'Canceled').astype(int)
df_all['status_expired'] = (df_all.status_curr == 'Expired').astype(int)
df_all['status_inactive'] = (df_all.status_curr == 'Inactive').astype(int)
df_all['status_revoked'] = (df_all.status_curr == 'Revoked').astype(int)
df_all['status_surrendered'] = (df_all.status_curr == 'Surrendered').astype(int)
df_all['status_suspended'] = (df_all.status_curr == 'Suspended').astype(int)

In [15]:
# This changes the date data to the correct datetime format
# It looks like the wrong format in Pandas, but when converted to csv it is in the correct format

df_all['date_issue'] = pd.to_datetime(df_all["date_issue"])
df_all['date_expiration'] = pd.to_datetime(df_all["date_expiration"])

In [16]:
# This may not be necessary, but makes the code a bit cleaner

df_all['business_description'] = df_all['business_description'].replace(to_replace='Data Not Available', value='')

In [17]:
number = '(555) 555-1212'
''.join(re.findall(r'\d+', number))

'5555551212'

In [18]:
# This changes the contact_phone into a number instead of a string

for n in range(len(df_all)):
    df_all['contact_phone'].iloc[n] = ''.join(re.findall(r'\d+', str(df_all['contact_phone'].iloc[n])))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [19]:
# This turns the string of business owners into a list so it can be converted to business contacts 1 and 2

for n in range(len(df_all)):
    df_all.business_owner[n] = df_all.business_owner[n].split(',')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [20]:
df_all.head(3)

Unnamed: 0,id,license_number,status_curr,license_term,license_description,adult_medicinal,date_issue,date_expiration,licenseStatusDate,license_category,licensingAuthority,name_legal,name_dba,business_owner,business_structure,business_description,contact_address,contact_city,contact_state,contact_county,contact_zip,contact_email,contact_phone,parcelNumber,PremiseLatitude,PremiseLongitude,license_annual,license_provisional,license_adult,license_medical,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended
0,3218,C10-0000010-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-03-19,2021-03-18,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),"Alternatives, A Health Collective","Alternatives, A Health Collective",[KAREN KISSLER],Corporation,,1603 Hampton Way,Santa Rosa,CA,Sonoma,95407,Mskslr@comcast.Net,4152508888,Data Not Available,38.4283,-122.738,0,1,1,1,1,0,0,0,0,0,0
1,3219,C10-0000011-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-03-28,2021-03-27,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),MY Golden Green INC.,The Humboldt County Collective,"[Collin Hammans, JoAnn Hammans]",Corporation,,1670 Myrtle Ave,Eureka,CA,Humboldt,95501,Goldengreen420@gmail.Com,7074422420,Data Not Available,40.7939,-124.136,0,1,1,1,1,0,0,0,0,0,0
2,3220,C10-0000012-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-02,2021-04-01,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Us Bloom INC.,Us Bloom,[Nicholas Foster],Corporation,,1201 Springs Rd,Vallejo,CA,Solano,94591,Usbloom707@gmail.Com,7075610716,Data Not Available,38.1058,-122.228,0,1,1,1,1,0,0,0,0,0,0


#### Add Columns

In [21]:
# This creates the name_legal_clean column by using the name_legal column as a starting point and cleaning from there

df_all['name_legal_clean'] = df_all['name_legal']

df_all['name_legal_clean'] = df_all['name_legal_clean'].str.lower() 
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\b, inc.\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\binc.\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\binc\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\b, llc.\b", "") 
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\b, llc\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\b, l.l.c.\b", "") 
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\bllc\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\bcorp.\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\bcorp\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\bcorporation\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\bco.\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(r"\bco\b", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(",", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace(".", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.replace("'", "")
df_all['name_legal_clean'] = df_all['name_legal_clean'].str.strip()

In [22]:
# Let's just give this a glance

df_all[['name_legal', 'name_legal_clean']].head()

Unnamed: 0,name_legal,name_legal_clean
0,"Alternatives, A Health Collective",alternatives a health collective
1,MY Golden Green INC.,my golden green
2,Us Bloom INC.,us bloom
3,Paula Deeter,paula deeter
4,Hah Coalinga LLC,hah coalinga


## Add Data From SQL

In [23]:
# This code connects us to the SQL data

ca_cannabis = pyodbc.connect('Driver={SQL Server};'
                      'Server=LAPTOP-E6QKON1L;'
                      'Database=ca_cannabis;'
                      'Trusted_Connection=yes;')

In [24]:
# This reads in the 3 old SQL tables that we're working with

ca_main_old = pd.read_sql("SELECT * FROM ca_main", ca_cannabis)
ca_roll_old = pd.read_sql("SELECT * FROM ca_roll", ca_cannabis)
ca_contact_old = pd.read_sql("SELECT * FROM ca_contact", ca_cannabis)

In [25]:
# Will probably change this to just one of the 3, but I need to see which of them matter

ca_main_old['in_main'] = 1
ca_roll_old['in_roll'] = 1
ca_contact_old['in_contact'] = 1

In [26]:
df_all.columns

Index(['id', 'license_number', 'status_curr', 'license_term',
       'license_description', 'adult_medicinal', 'date_issue',
       'date_expiration', 'licenseStatusDate', 'license_category',
       'licensingAuthority', 'name_legal', 'name_dba', 'business_owner',
       'business_structure', 'business_description', 'contact_address',
       'contact_city', 'contact_state', 'contact_county', 'contact_zip',
       'contact_email', 'contact_phone', 'parcelNumber', 'PremiseLatitude',
       'PremiseLongitude', 'license_annual', 'license_provisional',
       'license_adult', 'license_medical', 'status_active', 'status_canceled',
       'status_expired', 'status_inactive', 'status_revoked',
       'status_surrendered', 'status_suspended', 'name_legal_clean'],
      dtype='object')

In [27]:
ca_main_old.columns

Index(['license_number', 'license_category', 'license_description',
       'license_adult', 'license_medicinal', 'license_annual',
       'license_provisional', 'name_legal', 'name_dba', 'date_issue',
       'date_expiration', 'status_active', 'status_canceled', 'status_expired',
       'status_revoked', 'status_surrendered', 'status_suspended',
       'business_description', 'business_company_type', 'date_uploaded',
       'in_db', 'in_main'],
      dtype='object')

### From CA Roll

In [28]:
# This adds the roll_up_id, company_roll_up, and in_roll columns

df_with_roll = df_all.merge(ca_roll_old, on='license_number', how='left')

In [29]:
df_with_roll.in_roll.value_counts()

1.0    10676
Name: in_roll, dtype: int64

In [30]:
# This fills the in_roll columns

df_with_roll['in_roll'] = df_with_roll.in_roll.fillna(0).astype(int)

In [31]:
df_with_roll.in_roll.value_counts()

1    10676
0       22
Name: in_roll, dtype: int64

In [32]:
df_with_roll[df_with_roll.in_roll == 0]

Unnamed: 0,id,license_number,status_curr,license_term,license_description,adult_medicinal,date_issue,date_expiration,licenseStatusDate,license_category,licensingAuthority,name_legal,name_dba,business_owner,business_structure,business_description,contact_address,contact_city,contact_state,contact_county,contact_zip,contact_email,contact_phone,parcelNumber,PremiseLatitude,PremiseLongitude,license_annual,license_provisional,license_adult,license_medical,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,name_legal_clean,roll_up_id,company_roll_up,in_roll
7751,1675,CCL20-0000338,Active,Provisional,Nursery,Adult-Use,2020-07-29,2021-07-29,07/29/2020 00:00:00,Cannabis Cultivation License,CalCannabis Cultivation Licensing (CCL),"Coastal Prairie, LLC",Data Not Available,[Iris Carpenter],Limited Liability Company,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Data Not Available,,209-311-008,,,0,1,1,0,1,0,0,0,0,0,0,coastal prairie,,,0
8377,2135,CDPH-10002356,Canceled,Provisional,Type 6,Medicinal,2019-04-08,2020-03-25,03/25/2020 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"S.a.m.'s Management Venture, LLC",Data Not Available,[Stephen Garcia],Limited Liability Company,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Stephengar9@gmail.Com,7608596457.0,Data Not Available,,,0,1,0,1,0,1,0,0,0,0,0,sams management venture,,,0
8390,2339,CDPH-10002422,Surrendered,Provisional,Type 6,Adult-Use,2019-04-12,2019-10-21,04/12/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Regen West, INC.",The Resourcery,[Ashley Nelson],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Ashley.Nelson@theresourcery.Com,7072008257.0,Data Not Available,,,0,1,1,0,0,0,0,0,0,1,0,regen west,,,0
8407,2343,CDPH-10002527,Surrendered,Provisional,Type N,Medicinal,2019-04-18,2019-12-08,04/18/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Humanity Products, INC",Humanity Holdings,[Jessica Mcelfresh],Other,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Jessica@mcelfreshlaw.Com,8588637142.0,Data Not Available,,,0,1,0,1,0,0,0,0,0,1,0,humanity products,,,0
8419,2732,CDPH-10002586,Surrendered,Provisional,Type P,Adult-Use and Medicinal,2019-04-19,2019-08-02,04/19/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Stanley Brothers California, LLC","Stanley Brothers California, LLC",[J Stanley],Limited Liability Company,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Sbcalifornia@stanleybrothers.Co,3037462922.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,stanley brothers california,,,0
8462,2319,CDPH-10002783,Surrendered,Provisional,Type N,Adult-Use and Medicinal,2019-04-24,2019-11-01,04/24/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),PT Kor INC,Korova,[Cathy Knowles],501 Non-Profit,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Cathyk@like-Clock-Work.Com,7076035277.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,pt kor,,,0
8466,2741,CDPH-10002794,Surrendered,Provisional,Type S,Adult-Use and Medicinal,2019-04-24,2019-11-20,04/24/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Alap Enterprises, INC.",Roam Supply Co.,[Amory Langmo],PvtCorpSubS,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Amory@roamsupplycompany.Com,8312462184.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,alap enterprises,,,0
8481,2742,CDPH-10002857,Surrendered,Provisional,Type 6,Adult-Use and Medicinal,2019-04-25,2019-10-31,04/25/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),Ace Wield INC.,West Of West,[Nigam Arora],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Nigam@humboldtlegends.Com,6097132636.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,ace wield,,,0
8526,2347,CDPH-10003055,Surrendered,Provisional,Type 7,Medicinal,2019-05-01,2019-10-31,05/01/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),Zyfa INC,Zyfa INC,[Yevgeniy Zebrov],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Yev@zyfainc.Com,9169695451.0,Data Not Available,,,0,1,0,1,0,0,0,0,0,1,0,zyfa,,,0
8540,2734,CDPH-10003083,Surrendered,Provisional,Type 7,Adult-Use and Medicinal,2019-05-03,2020-01-02,05/03/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Mme MFDST, INC.",Medmen Manufacturing,[Adam Bierman],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Adamb@medmen.Com,3235935110.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,mme mfdst,,,0


### CA Contact

In [33]:
# Don't know if this matters, but the numbers were coiming up weird

ca_contact_old.contact_phone = ca_contact_old.contact_phone.astype(str)

for n in range(len(ca_contact_old)):
    ca_contact_old.contact_phone[n] = ca_contact_old.contact_phone[n][:10]
    
ca_contact_old.contact_phone.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


10677    8056948594
10678    8056948594
10679    8056948594
10680    8056948594
10681    8056948594
Name: contact_phone, dtype: object

In [34]:
# This adds the needed info from the ca_contact database in SQL

df_with_contact = df_with_roll.merge(ca_contact_old[['license_number', 'contact_website', 'contact_owner_1', 'contact_owner_2', 'in_contact']], on='license_number', how='left')

In [35]:
df_with_contact.in_contact.value_counts()

1.0    10676
Name: in_contact, dtype: int64

In [36]:
# This fills the in_contact columns

df_with_contact['in_contact'] = df_with_contact.in_contact.fillna(0).astype(int)

In [37]:
df_with_contact.in_contact.value_counts()

1    10676
0       22
Name: in_contact, dtype: int64

In [38]:
df_with_contact.in_roll.value_counts()

1    10676
0       22
Name: in_roll, dtype: int64

In [39]:
# This adds contact_owners for the newly added license numbers

for n in range(len(df_with_contact)):
    if df_with_contact['in_contact'][n] == 0:
        df_with_contact['contact_owner_1'][n] = df_with_contact['business_owner'][n][0]
        if len(df_with_contact['business_owner'][n]) > 1:
            df_with_contact['contact_owner_1'][n] = df_with_contact['business_owner'][n][1]       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [40]:
# df_with_contact.columns

In [41]:
df_with_contact[df_with_contact.in_contact == 0]

Unnamed: 0,id,license_number,status_curr,license_term,license_description,adult_medicinal,date_issue,date_expiration,licenseStatusDate,license_category,licensingAuthority,name_legal,name_dba,business_owner,business_structure,business_description,contact_address,contact_city,contact_state,contact_county,contact_zip,contact_email,contact_phone,parcelNumber,PremiseLatitude,PremiseLongitude,license_annual,license_provisional,license_adult,license_medical,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,name_legal_clean,roll_up_id,company_roll_up,in_roll,contact_website,contact_owner_1,contact_owner_2,in_contact
7751,1675,CCL20-0000338,Active,Provisional,Nursery,Adult-Use,2020-07-29,2021-07-29,07/29/2020 00:00:00,Cannabis Cultivation License,CalCannabis Cultivation Licensing (CCL),"Coastal Prairie, LLC",Data Not Available,[Iris Carpenter],Limited Liability Company,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Data Not Available,,209-311-008,,,0,1,1,0,1,0,0,0,0,0,0,coastal prairie,,,0,,Iris Carpenter,,0
8377,2135,CDPH-10002356,Canceled,Provisional,Type 6,Medicinal,2019-04-08,2020-03-25,03/25/2020 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"S.a.m.'s Management Venture, LLC",Data Not Available,[Stephen Garcia],Limited Liability Company,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Stephengar9@gmail.Com,7608596457.0,Data Not Available,,,0,1,0,1,0,1,0,0,0,0,0,sams management venture,,,0,,Stephen Garcia,,0
8390,2339,CDPH-10002422,Surrendered,Provisional,Type 6,Adult-Use,2019-04-12,2019-10-21,04/12/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Regen West, INC.",The Resourcery,[Ashley Nelson],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Ashley.Nelson@theresourcery.Com,7072008257.0,Data Not Available,,,0,1,1,0,0,0,0,0,0,1,0,regen west,,,0,,Ashley Nelson,,0
8407,2343,CDPH-10002527,Surrendered,Provisional,Type N,Medicinal,2019-04-18,2019-12-08,04/18/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Humanity Products, INC",Humanity Holdings,[Jessica Mcelfresh],Other,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Jessica@mcelfreshlaw.Com,8588637142.0,Data Not Available,,,0,1,0,1,0,0,0,0,0,1,0,humanity products,,,0,,Jessica Mcelfresh,,0
8419,2732,CDPH-10002586,Surrendered,Provisional,Type P,Adult-Use and Medicinal,2019-04-19,2019-08-02,04/19/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Stanley Brothers California, LLC","Stanley Brothers California, LLC",[J Stanley],Limited Liability Company,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Sbcalifornia@stanleybrothers.Co,3037462922.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,stanley brothers california,,,0,,J Stanley,,0
8462,2319,CDPH-10002783,Surrendered,Provisional,Type N,Adult-Use and Medicinal,2019-04-24,2019-11-01,04/24/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),PT Kor INC,Korova,[Cathy Knowles],501 Non-Profit,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Cathyk@like-Clock-Work.Com,7076035277.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,pt kor,,,0,,Cathy Knowles,,0
8466,2741,CDPH-10002794,Surrendered,Provisional,Type S,Adult-Use and Medicinal,2019-04-24,2019-11-20,04/24/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Alap Enterprises, INC.",Roam Supply Co.,[Amory Langmo],PvtCorpSubS,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Amory@roamsupplycompany.Com,8312462184.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,alap enterprises,,,0,,Amory Langmo,,0
8481,2742,CDPH-10002857,Surrendered,Provisional,Type 6,Adult-Use and Medicinal,2019-04-25,2019-10-31,04/25/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),Ace Wield INC.,West Of West,[Nigam Arora],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Nigam@humboldtlegends.Com,6097132636.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,ace wield,,,0,,Nigam Arora,,0
8526,2347,CDPH-10003055,Surrendered,Provisional,Type 7,Medicinal,2019-05-01,2019-10-31,05/01/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),Zyfa INC,Zyfa INC,[Yevgeniy Zebrov],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Yev@zyfainc.Com,9169695451.0,Data Not Available,,,0,1,0,1,0,0,0,0,0,1,0,zyfa,,,0,,Yevgeniy Zebrov,,0
8540,2734,CDPH-10003083,Surrendered,Provisional,Type 7,Adult-Use and Medicinal,2019-05-03,2020-01-02,05/03/2019 00:00:00,Manufactured Cannabis License,Manufactured Cannabis Safety Branch (MCSB),"Mme MFDST, INC.",Medmen Manufacturing,[Adam Bierman],PvtCorpSubC,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Adamb@medmen.Com,3235935110.0,Data Not Available,,,0,1,1,1,0,0,0,0,0,1,0,mme mfdst,,,0,,Adam Bierman,,0


In [42]:
df_with_contact

Unnamed: 0,id,license_number,status_curr,license_term,license_description,adult_medicinal,date_issue,date_expiration,licenseStatusDate,license_category,licensingAuthority,name_legal,name_dba,business_owner,business_structure,business_description,contact_address,contact_city,contact_state,contact_county,contact_zip,contact_email,contact_phone,parcelNumber,PremiseLatitude,PremiseLongitude,license_annual,license_provisional,license_adult,license_medical,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,name_legal_clean,roll_up_id,company_roll_up,in_roll,contact_website,contact_owner_1,contact_owner_2,in_contact
0,3218,C10-0000010-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-03-19,2021-03-18,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),"Alternatives, A Health Collective","Alternatives, A Health Collective",[KAREN KISSLER],Corporation,,1603 Hampton Way,Santa Rosa,CA,Sonoma,95407,Mskslr@comcast.Net,4152508888,Data Not Available,38.4283,-122.738,0,1,1,1,1,0,0,0,0,0,0,alternatives a health collective,194.0,alternatives a health collective,1,www.alternativescollective.com,karen kissler,,1
1,3219,C10-0000011-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-03-28,2021-03-27,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),MY Golden Green INC.,The Humboldt County Collective,"[Collin Hammans, JoAnn Hammans]",Corporation,,1670 Myrtle Ave,Eureka,CA,Humboldt,95501,Goldengreen420@gmail.Com,7074422420,Data Not Available,40.7939,-124.136,0,1,1,1,1,0,0,0,0,0,0,my golden green,2600.0,my golden green,1,,joann hammans,collin hammans,1
2,3220,C10-0000012-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-02,2021-04-01,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Us Bloom INC.,Us Bloom,[Nicholas Foster],Corporation,,1201 Springs Rd,Vallejo,CA,Solano,94591,Usbloom707@gmail.Com,7075610716,Data Not Available,38.1058,-122.228,0,1,1,1,1,0,0,0,0,0,0,us bloom,3877.0,us bloom collective,1,,nicholas foster,michelle sevier,1
3,3221,C10-0000013-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-04,2021-04-03,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Paula Deeter,Data Not Available,"[Amy Deeter, Paula Deeter]",Sole Proprietorship,,17875 Hwy One Hwy,FT Bragg,CA,Mendocino,95437,Herbanlegend@live.Com,7079610113,Data Not Available,39.4037,-123.809,0,1,1,1,1,0,0,0,0,0,0,paula deeter,2868.0,paula deeter,1,,paula deeter,amy deeter,1
4,3842,C10-0000014-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-04,2021-04-03,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Hah Coalinga LLC,Have A Heart CC,[Ryan Kunkel],Limited Liability Company,,286 5th St N,Coalinga,CA,Fresno,93210,Core@haveaheartcc.Com,2068890583,Data Not Available,36.1405,-120.361,0,1,1,1,1,0,0,0,0,0,0,hah coalinga,1600.0,hah,1,,ryan kunkel,,1
5,3843,C10-0000015-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-05,2021-04-04,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Green Spirit Mendocino LLC,Data Not Available,"[Christian Briggs, Harlan Ribnik, Leslie Bal...",Limited Liability Company,,138 Main St,Point Arena,CA,Mendocino,95468,Tom@greenspiritrx.Com,2148088649,Data Not Available,38.9078,-123.693,0,1,1,1,1,0,0,0,0,0,0,green spirit mendocino,1526.0,green spirit mendocino,1,www.thegreenroomcollective.com,tom gingerich,leslie ball,1
6,3844,C10-0000016-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-09,2021-04-08,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Mission Herbal Care INC,The Cookie Company,[Arthur Vugelman],Corporation,,3139 Mission St,San Francisco,CA,San Francisco,94110,Arthurvugelman@yahoo.Com,3233099090,Data Not Available,37.7470,-122.419,0,1,1,1,1,0,0,0,0,0,0,mission herbal care,2510.0,mission herbal care,1,www.cookieco415.com,arthur vugelman,,1
7,3845,C10-0000017-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-09,2021-04-08,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Northeastern Management INC,Herbal Cruz,[John Hadayia],Corporation,,1051 41st Ave,Santa Cruz,CA,Santa Cruz,95062,Unitedoil@msn.Com,6177174453,Data Not Available,36.9672,-121.965,0,1,1,1,1,0,0,0,0,0,0,northeastern management,2708.0,northeastern management,1,,john hadayia,,1
8,3846,C10-0000018-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-11,2021-04-10,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),"Malibu Community Collective, INC.",Data Not Available,[Michael Sutton],Corporation,,22523 Pacific Coast Hwy,Malibu,CA,Los Angeles,90265,Michael@malibucapLLC.Com,3108648965,Data Not Available,34.0399,-118.666,0,1,1,1,1,0,0,0,0,0,0,malibu community collective,2322.0,malibu community collective,1,,michael sutton,,1
9,3847,C10-0000019-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-12,2021-04-11,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),A.t.a.c.h.s. INC.,A Therapeutic Alternative,"[Kimberly Cargile, Richard Cormier]",Corporation,,3015 H St,Sacramento,CA,Sacramento,95816,Atafrontdesk@gmail.Com,9168224717,Data Not Available,38.5754,-121.465,0,1,1,1,1,0,0,0,0,0,0,atachs,101.0,atachs,1,,kimberly cargile,,1


### CA Main

In [43]:
# This adds the needed columns from ca_main 

df_with_main = df_with_contact.merge(ca_main_old[['license_number', 'date_uploaded', 'in_main']], on='license_number', how='left')

In [44]:
df_with_main.in_main.value_counts()

1.0    10696
Name: in_main, dtype: int64

In [45]:
# This fills the 

df_with_main['in_main'] = df_with_main.in_main.fillna(0).astype(int)

In [46]:
df_with_main.in_main.value_counts()

1    10696
0       22
Name: in_main, dtype: int64

In [47]:
df_with_main.in_contact.value_counts()

1    10696
0       22
Name: in_contact, dtype: int64

In [48]:
df_with_main.in_roll.value_counts()

1    10696
0       22
Name: in_roll, dtype: int64

In [49]:
df_with_main[df_with_main.license_number == 'CCL20-0000338']

Unnamed: 0,id,license_number,status_curr,license_term,license_description,adult_medicinal,date_issue,date_expiration,licenseStatusDate,license_category,licensingAuthority,name_legal,name_dba,business_owner,business_structure,business_description,contact_address,contact_city,contact_state,contact_county,contact_zip,contact_email,contact_phone,parcelNumber,PremiseLatitude,PremiseLongitude,license_annual,license_provisional,license_adult,license_medical,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,name_legal_clean,roll_up_id,company_roll_up,in_roll,contact_website,contact_owner_1,contact_owner_2,in_contact,date_uploaded,in_main
7757,1675,CCL20-0000338,Active,Provisional,Nursery,Adult-Use,2020-07-29,2021-07-29,07/29/2020 00:00:00,Cannabis Cultivation License,CalCannabis Cultivation Licensing (CCL),"Coastal Prairie, LLC",Data Not Available,[Iris Carpenter],Limited Liability Company,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,Data Not Available,,209-311-008,,,0,1,1,0,1,0,0,0,0,0,0,coastal prairie,,,0,,Iris Carpenter,,0,NaT,0


## Converting File to df_to_edit Format

In [50]:
df_with_sql = df_with_main

In [51]:
# Not fully sure what this is for, but I'm following the steps in the orignial file

df_with_sql['changed_contact_info'] = 0
df_with_sql['in_db'] = 0

In [52]:
# Just wanna check that theses are the same

differences = (df_with_sql['in_roll'] == df_with_sql['in_contact']).astype(int).sum() - len(df_with_sql)

if differences == 0:
    print(differences)
else:
    print(error)

0


In [53]:
# This adds the in_sql column by picking the max of in_roll and in_contact
# They are usually the same,  but this in just in case

df_with_sql['in_sql'] = df_with_sql['in_roll']

df_with_sql.in_sql.value_counts()

1    10696
0       22
Name: in_sql, dtype: int64

In [54]:
# First let's create the starter DataFrame

df_to_edit = pd.DataFrame()

In [55]:
# Then look at the columns we need to convert

pd.read_csv('result/df_final_20201028_to_edit.csv').columns

Index(['license_number', 'license_category', 'license_description',
       'license_adult', 'license_medicinal', 'license_annual',
       'license_provisional', 'name_legal', 'name_dba', 'name_legal_clean',
       'roll_up_id', 'company_roll_up', 'date_issue', 'date_expiration',
       'status_active', 'status_canceled', 'status_expired', 'status_inactive',
       'status_revoked', 'status_surrendered', 'status_suspended',
       'business_description', 'business_company_type', 'contact_email',
       'contact_phone', 'contact_website', 'contact_owner_1',
       'contact_owner_2', 'contact_street', 'contact_city', 'contact_county',
       'contact_state', 'contact_zip', 'changed_contact_info', 'date_uploaded',
       'in_db', 'in_sql'],
      dtype='object')

In [56]:
# This is easier than removing all the unnneded columns manually

df_to_edit['license_number'] = df_with_sql['license_number']
df_to_edit['license_category'] = df_with_sql['license_category']
df_to_edit['license_description'] = df_with_sql['license_description']
df_to_edit['license_adult'] = df_with_sql['license_adult']
df_to_edit['license_medicinal'] = df_with_sql['license_medical']
df_to_edit['license_annual'] = df_with_sql['license_annual']
df_to_edit['license_provisional'] = df_with_sql['license_provisional']

df_to_edit['name_legal'] = df_with_sql['name_legal']
df_to_edit['name_dba'] = df_with_sql['name_dba']
df_to_edit['name_legal_clean'] = df_with_sql['name_legal_clean']
df_to_edit['roll_up_id'] = df_with_sql['roll_up_id']
df_to_edit['company_roll_up'] = df_with_sql['company_roll_up']
df_to_edit['date_issue'] = df_with_sql['date_issue']
df_to_edit['date_expiration'] = df_with_sql['date_expiration']

df_to_edit['status_active'] = df_with_sql['status_active']
df_to_edit['status_canceled'] = df_with_sql['status_canceled']
df_to_edit['status_expired'] = df_with_sql['status_expired']
df_to_edit['status_inactive'] = df_with_sql['status_inactive']
df_to_edit['status_revoked'] = df_with_sql['status_revoked']
df_to_edit['status_surrendered'] = df_with_sql['status_surrendered']
df_to_edit['status_suspended'] = df_with_sql['status_suspended']

df_to_edit['business_description'] = df_with_sql['business_description']
df_to_edit['business_company_type'] = df_with_sql['business_structure']

df_to_edit['contact_email'] = df_with_sql['contact_email']
df_to_edit['contact_phone'] = df_with_sql['contact_phone']
df_to_edit['contact_website'] = df_with_sql['contact_website']
df_to_edit['contact_owner_1'] = df_with_sql['contact_owner_1']
df_to_edit['contact_owner_2'] = df_with_sql['contact_owner_2']
df_to_edit['contact_street'] = df_with_sql['contact_address']
df_to_edit['contact_city'] = df_with_sql['contact_city']
df_to_edit['contact_county'] = df_with_sql['contact_county']
df_to_edit['contact_state'] = df_with_sql['contact_state']
df_to_edit['contact_zip'] = df_with_sql['contact_zip']
df_to_edit['changed_contact_info'] = df_with_sql['changed_contact_info']

df_to_edit['date_uploaded'] = df_with_sql['date_uploaded']

df_to_edit['in_db'] = df_with_sql['in_main']
df_to_edit['in_sql'] = df_with_sql['in_sql']

In [57]:
df_with_sql.in_sql.value_counts()

1    10696
0       22
Name: in_sql, dtype: int64

In [58]:
df_to_edit.in_db.value_counts()

1    10696
0       22
Name: in_db, dtype: int64

In [59]:
df_to_edit.sort_values(by='date_issue', ascending=False).head()

Unnamed: 0,license_number,license_category,license_description,license_adult,license_medicinal,license_annual,license_provisional,name_legal,name_dba,name_legal_clean,roll_up_id,company_roll_up,date_issue,date_expiration,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,business_description,business_company_type,contact_email,contact_phone,contact_website,contact_owner_1,contact_owner_2,contact_street,contact_city,contact_county,contact_state,contact_zip,changed_contact_info,date_uploaded,in_db,in_sql
8267,CCL20-0001574,Cannabis Cultivation License,Medium Outdoor,1,0,0,1,"Sugarbear Farms, INC.",Data Not Available,sugarbear farms,5296.0,sugarbear farms,2020-10-27,2021-10-27,1,0,0,0,0,0,0,,Corporation,Data Not Available,4156102245,0.0,0.0,0.0,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-10-21,1,1
9132,CDPH-10004395,Manufactured Cannabis License,Type 6,1,1,0,1,Ball Family Farms Corporation,Ball Family Farms Corporation,ball family farms,301.0,ball family farms,2020-10-27,2021-10-27,1,0,0,0,0,0,0,,PvtCorpSubC,Cball55555@gmail.Com,7148290971,0.0,0.0,0.0,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-10-21,1,1
10684,CDPH-10004392,Manufactured Cannabis License,Type P,1,1,1,0,"Next Green Wave, LLC","Next Green Wave, LLC",next green wave,2662.0,next green wave,2020-10-26,2021-10-26,1,0,0,0,0,0,0,,Limited Liability Company,Mjennings@nextgreenwave.Com,5599172222,0.0,0.0,0.0,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-10-21,1,1
7655,CCL20-0000130,Cannabis Cultivation License,Small Mixed-Light Tier 1,0,1,0,1,Mendo Blendo LLC,Data Not Available,mendo blendo,2433.0,mendo blendo,2020-10-25,2021-10-25,1,0,0,0,0,0,0,,Limited Liability Company,Data Not Available,5599707250,0.0,0.0,0.0,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-10-21,1,1
10632,CDPH-10003938,Manufactured Cannabis License,Type P,1,0,1,0,Dolo Company,Dolo Rolling Company,dolo company,989.0,dolo company,2020-10-25,2021-10-25,1,0,0,0,0,0,0,,PvtCorpSubC,Doloproject@gmail.Com,6267806195,,,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-03-12,1,1


In [60]:
# Sorting by name_legal_clean

df_to_edit.sort_values(by='name_legal_clean', inplace=True)

In [61]:
df_to_edit.reset_index(inplace=True)

In [62]:
df_to_edit.drop(axis='columns', labels='index', inplace=True)

In [63]:
df_to_edit.head()

Unnamed: 0,license_number,license_category,license_description,license_adult,license_medicinal,license_annual,license_provisional,name_legal,name_dba,name_legal_clean,roll_up_id,company_roll_up,date_issue,date_expiration,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,business_description,business_company_type,contact_email,contact_phone,contact_website,contact_owner_1,contact_owner_2,contact_street,contact_city,contact_county,contact_state,contact_zip,changed_contact_info,date_uploaded,in_db,in_sql
0,CCL19-0003643,Cannabis Cultivation License,Small Mixed-Light Tier 1,1,0,0,1,"00G, LLC.",Data Not Available,00g,1.0,00g,2019-12-19,2020-12-19,1,0,0,0,0,0,0,,Limited Liability Company,Data Not Available,,,Drew Plebani,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-03-12,1,1
1,CDPH-10003551,Manufactured Cannabis License,Type 6,1,1,0,1,1000 Palms Associates Group INC,Cookies,1000 palms associates group,2.0,1000 palms associates group,2020-06-25,2021-06-25,1,0,0,0,0,0,0,,Other,Arthurvugelman@yahoo.Com,8186944195.0,,,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-03-12,1,1
2,C11-0000630-LIC,Bureau of Cannabis Control,Distributor,1,1,0,1,1000 Palms Associates Group INC,Cookies,1000 palms associates group,2.0,1000 palms associates group,2019-07-03,2021-07-02,1,0,0,0,0,0,0,,Corporation,Cookieco415@icloud.Com,8186944195.0,,arthur vugelman,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-03-12,1,1
3,CCL19-0000084,Cannabis Cultivation License,Medium Indoor,1,0,0,1,1000 Palms Associates Group DBA Cookies,Data Not Available,1000 palms associates group dba cookies,2.0,1000 palms associates group,2020-01-13,2021-01-13,1,0,0,0,0,0,0,,Corporation,Data Not Available,3104801200.0,,arthur vugelman,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-03-12,1,1
4,C11-0000596-LIC,Bureau of Cannabis Control,Distributor,1,1,0,1,10TH Street Facilities LLC,Data Not Available,10th street facilities,3.0,10th street facilities,2019-07-01,2021-06-30,1,0,0,0,0,0,0,,Limited Liability Company,Jad2129@columbia.Edu,2055234966.0,,jason dixon,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-03-12,1,1


In [64]:
df_with_sql.head()

Unnamed: 0,id,license_number,status_curr,license_term,license_description,adult_medicinal,date_issue,date_expiration,licenseStatusDate,license_category,licensingAuthority,name_legal,name_dba,business_owner,business_structure,business_description,contact_address,contact_city,contact_state,contact_county,contact_zip,contact_email,contact_phone,parcelNumber,PremiseLatitude,PremiseLongitude,license_annual,license_provisional,license_adult,license_medical,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,name_legal_clean,roll_up_id,company_roll_up,in_roll,contact_website,contact_owner_1,contact_owner_2,in_contact,date_uploaded,in_main,changed_contact_info,in_db,in_sql
0,3218,C10-0000010-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-03-19,2021-03-18,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),"Alternatives, A Health Collective","Alternatives, A Health Collective",[KAREN KISSLER],Corporation,,1603 Hampton Way,Santa Rosa,CA,Sonoma,95407,Mskslr@comcast.Net,4152508888,Data Not Available,38.4283,-122.738,0,1,1,1,1,0,0,0,0,0,0,alternatives a health collective,194.0,alternatives a health collective,1,www.alternativescollective.com,karen kissler,,1,2020-03-12,1,0,0,1
1,3219,C10-0000011-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-03-28,2021-03-27,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),MY Golden Green INC.,The Humboldt County Collective,"[Collin Hammans, JoAnn Hammans]",Corporation,,1670 Myrtle Ave,Eureka,CA,Humboldt,95501,Goldengreen420@gmail.Com,7074422420,Data Not Available,40.7939,-124.136,0,1,1,1,1,0,0,0,0,0,0,my golden green,2600.0,my golden green,1,,joann hammans,collin hammans,1,2020-03-12,1,0,0,1
2,3220,C10-0000012-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-02,2021-04-01,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Us Bloom INC.,Us Bloom,[Nicholas Foster],Corporation,,1201 Springs Rd,Vallejo,CA,Solano,94591,Usbloom707@gmail.Com,7075610716,Data Not Available,38.1058,-122.228,0,1,1,1,1,0,0,0,0,0,0,us bloom,3877.0,us bloom collective,1,,nicholas foster,michelle sevier,1,2020-03-12,1,0,0,1
3,3221,C10-0000013-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-04,2021-04-03,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Paula Deeter,Data Not Available,"[Amy Deeter, Paula Deeter]",Sole Proprietorship,,17875 Hwy One Hwy,FT Bragg,CA,Mendocino,95437,Herbanlegend@live.Com,7079610113,Data Not Available,39.4037,-123.809,0,1,1,1,1,0,0,0,0,0,0,paula deeter,2868.0,paula deeter,1,,paula deeter,amy deeter,1,2020-03-12,1,0,0,1
4,3842,C10-0000014-LIC,Active,Provisional,Retailer,Adult-Use and Medicinal,2019-04-04,2021-04-03,,Bureau of Cannabis Control,Bureau of Cannabis Control (BCC),Hah Coalinga LLC,Have A Heart CC,[Ryan Kunkel],Limited Liability Company,,286 5th St N,Coalinga,CA,Fresno,93210,Core@haveaheartcc.Com,2068890583,Data Not Available,36.1405,-120.361,0,1,1,1,1,0,0,0,0,0,0,hah coalinga,1600.0,hah,1,,ryan kunkel,,1,2020-03-12,1,0,0,1


## Convert to CSV and Edit

In [65]:
# Date should change to Date entered at the begining of the notebook

df_to_edit.to_csv(f'result/df_final_{date.strftime("%Y")}{date.strftime("%m")}{date.strftime("%d")}_to_edit_v2.csv')

##### Make Changes to the company_roll_up File in the CSV Before Reuploading

In [66]:
# This will auto-upload the CSV that was created and changed

df_edited = pd.read_csv(f'result/df_final_{date.strftime("%Y")}{date.strftime("%m")}{date.strftime("%d")}_to_edit.csv')

In [67]:
# This checks that changes were made properly

if df_edited.company_roll_up.isnull().sum() == 0:
    print('Ready to Continue')
else:
    print(error)

Ready to Continue


In [68]:
# This removes the Unnamed: 0 Column

df_edited = df_edited.drop(axis='columns', labels='Unnamed: 0')

In [69]:
# This splits the edited dataframe into two seperate dataframes to work with

df_edited_fill = df_edited.dropna(subset=['roll_up_id'])
df_edited_null = df_edited[df_edited.roll_up_id.isnull()]

In [70]:
# This makes a list of the company_roll_ups of the licenses being added

names_missing = df_edited_null['company_roll_up'].unique()

In [71]:
names_missing

array(['alap enterprises', 'calabasas edibles company', 'foco',
       'lba california manufacturing', 'marbl', 'mme mfdst', 'regen west',
       'rome flower', 'sams management venture',
       'stanley brothers california', 'winji', 'zyfa'], dtype=object)

In [72]:
df_edited_null

Unnamed: 0,license_number,license_category,license_description,license_adult,license_medicinal,license_annual,license_provisional,name_legal,name_dba,name_legal_clean,roll_up_id,company_roll_up,date_issue,date_expiration,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,business_description,business_company_type,contact_email,contact_phone,contact_website,contact_owner_1,contact_owner_2,contact_street,contact_city,contact_county,contact_state,contact_zip,changed_contact_info,date_uploaded,in_db,in_sql
477,CDPH-10002794,Manufactured Cannabis License,Type S,1,1,0,1,"Alap Enterprises, INC.",Roam Supply Co.,alap enterprises,,alap enterprises,4/24/2019,11/20/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubS,Amory@roamsupplycompany.Com,(831) 246-2184,,Amory Langmo,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
1461,CDPH-10003944,Manufactured Cannabis License,Type N,1,1,0,1,Calabasas Edibles Company,Calabasas Edibles Company,calabasas edibles company,,calabasas edibles company,10/29/2019,12/16/2019,0,1,0,0,0,0,0,Data Not Available,Limited Liability Company,Calabasascandyco@yahoo.Com,(818) 681-2614,,Cheryl De Rose,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
3401,CDPH-10003681,Manufactured Cannabis License,Type N,1,1,0,1,"Foco, INC.",Data Not Available,foco,,foco,7/11/2019,10/31/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubS,Ryan@bseenco.Com,(916) 502-6631,,Ryan Dearkland,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
5958,CDPH-10003167,Manufactured Cannabis License,Type N,1,1,0,1,"Lba California Manufacturing, INC.",Data Not Available,lba california manufacturing,,lba california manufacturing,5/13/2019,10/1/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubC,Burl@lunchboxalchemy.Com,(541) 797-1700,,Douglas Bryson,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
6374,CDPH-10003910,Manufactured Cannabis License,Type S,1,1,0,1,Marbl Corporation,Marbl Corporation,marbl,,marbl,10/8/2019,12/16/2019,0,1,0,0,0,0,0,Data Not Available,PvtCorpSubC,Eric.Piearcy@gmail.Com,(844) 855-2254,,Eric Piearcy,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
6774,CDPH-10003083,Manufactured Cannabis License,Type 7,1,1,0,1,"Mme MFDST, INC.",Medmen Manufacturing,mme mfdst,,mme mfdst,5/3/2019,1/2/2020,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubC,Adamb@medmen.Com,(323) 593-5110,,Adam Bierman,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
8341,CDPH-10002422,Manufactured Cannabis License,Type 6,1,0,0,1,"Regen West, INC.",The Resourcery,regen west,,regen west,4/12/2019,10/21/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubC,Ashley.Nelson@theresourcery.Com,(707) 200-8257,,Ashley Nelson,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
8480,CDPH-10003412,Manufactured Cannabis License,Type P,1,1,0,1,Rome Flower,Rome Flower Company,rome flower,,rome flower,6/10/2019,11/25/2019,0,0,0,0,0,1,0,Data Not Available,Limited Liability Company,Mk@romeflower.Com,(916) 385-1917,,Musie Kidane,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
8636,CDPH-10002356,Manufactured Cannabis License,Type 6,0,1,0,1,"S.a.m.'s Management Venture, LLC",Data Not Available,sams management venture,,sams management venture,4/8/2019,3/25/2020,0,1,0,0,0,0,0,Data Not Available,Limited Liability Company,Stephengar9@gmail.Com,(760) 859-6457,,Stephen Garcia,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0
9159,CDPH-10002586,Manufactured Cannabis License,Type P,1,1,0,1,"Stanley Brothers California, LLC","Stanley Brothers California, LLC",stanley brothers california,,stanley brothers california,4/19/2019,8/2/2019,0,0,0,0,0,1,0,Data Not Available,Limited Liability Company,Sbcalifornia@stanleybrothers.Co,(303) 746-2922,,J Stanley,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,,0,0


### Makes Changes to df_edited_null

In [73]:
# Making this list lets up group up new additions with the same company_roll_up name so they can get the id number

new_roll_ups = df_edited_null.company_roll_up.unique()
new_roll_ups

array(['alap enterprises', 'calabasas edibles company', 'foco',
       'lba california manufacturing', 'marbl', 'mme mfdst', 'regen west',
       'rome flower', 'sams management venture',
       'stanley brothers california', 'winji', 'zyfa'], dtype=object)

In [74]:
# This adds the new roll_up_ids

new_roll_up_id = int(max(df_edited.roll_up_id)) + 1

for new in new_roll_ups:
    df_edited_null['roll_up_id'][df_edited_null['company_roll_up'] == new] = new_roll_up_id
    new_roll_up_id += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [75]:
# This adds the upload date for the newly added files

df_edited_null.date_uploaded = date
df_edited_null.date_uploaded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


477     2020-11-04
1461    2020-11-04
3401    2020-11-04
5958    2020-11-04
6374    2020-11-04
Name: date_uploaded, dtype: object

In [76]:
df_edited_fill.date_uploaded.value_counts()

3/12/2020     8840
8/3/2020       726
10/6/2020      397
10/21/2020     107
5/27/2020       73
4/22/2020       69
10/14/2020      64
3/19/2020       56
5/13/2020       52
2/12/2020       49
4/1/2020        45
5/6/2020        45
4/8/2020        44
4/29/2020       44
3/25/2020       43
4/15/2020       42
Name: date_uploaded, dtype: int64

In [77]:
df_edited_null.head()

Unnamed: 0,license_number,license_category,license_description,license_adult,license_medicinal,license_annual,license_provisional,name_legal,name_dba,name_legal_clean,roll_up_id,company_roll_up,date_issue,date_expiration,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,business_description,business_company_type,contact_email,contact_phone,contact_website,contact_owner_1,contact_owner_2,contact_street,contact_city,contact_county,contact_state,contact_zip,changed_contact_info,date_uploaded,in_db,in_sql
477,CDPH-10002794,Manufactured Cannabis License,Type S,1,1,0,1,"Alap Enterprises, INC.",Roam Supply Co.,alap enterprises,5301.0,alap enterprises,4/24/2019,11/20/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubS,Amory@roamsupplycompany.Com,(831) 246-2184,,Amory Langmo,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
1461,CDPH-10003944,Manufactured Cannabis License,Type N,1,1,0,1,Calabasas Edibles Company,Calabasas Edibles Company,calabasas edibles company,5302.0,calabasas edibles company,10/29/2019,12/16/2019,0,1,0,0,0,0,0,Data Not Available,Limited Liability Company,Calabasascandyco@yahoo.Com,(818) 681-2614,,Cheryl De Rose,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
3401,CDPH-10003681,Manufactured Cannabis License,Type N,1,1,0,1,"Foco, INC.",Data Not Available,foco,5303.0,foco,7/11/2019,10/31/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubS,Ryan@bseenco.Com,(916) 502-6631,,Ryan Dearkland,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
5958,CDPH-10003167,Manufactured Cannabis License,Type N,1,1,0,1,"Lba California Manufacturing, INC.",Data Not Available,lba california manufacturing,5304.0,lba california manufacturing,5/13/2019,10/1/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubC,Burl@lunchboxalchemy.Com,(541) 797-1700,,Douglas Bryson,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
6374,CDPH-10003910,Manufactured Cannabis License,Type S,1,1,0,1,Marbl Corporation,Marbl Corporation,marbl,5305.0,marbl,10/8/2019,12/16/2019,0,1,0,0,0,0,0,Data Not Available,PvtCorpSubC,Eric.Piearcy@gmail.Com,(844) 855-2254,,Eric Piearcy,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0


### New SQL Files

In [78]:
# This creates the dataframes for the new additions to the ca_roll and ca_contact databases

ca_roll_new = df_edited_null[['license_number', 'roll_up_id', 'company_roll_up']].reset_index()

ca_contact_new = df_edited_null[['license_number', 'contact_email', 'contact_phone', 'contact_website',
                                 'contact_owner_1', 'contact_owner_2', 'contact_street', 'contact_city',
                                 'contact_county', 'contact_state', 'contact_zip']].reset_index()# This

In [79]:
# Let's read back in the SQL databse for ca_roll so we can add to it

sql_ca_roll = pd.read_sql("SELECT * FROM ca_cannabis.dbo.ca_roll", ca_cannabis)

In [80]:
# Just making sure everything is adding right

print(len(sql_ca_roll.append(ca_roll_new, sort=True).drop(axis='columns', labels='index')))
print(len(sql_ca_roll))
print(len(ca_roll_new))
len(sql_ca_roll.append(ca_roll_new, sort=True).drop(axis='columns', labels='index')) - len(sql_ca_roll) - len(ca_roll_new)

10694
10682
12


0

In [81]:
# This makes the final ca_roll dataframe to send back to SQL

ca_roll_final = sql_ca_roll.append(ca_roll_new, ignore_index=True, sort=False).drop(axis='columns', labels='index')

In [82]:
# Let's read back in the SQL databse for ca_roll so we can add to it

sql_ca_contact = pd.read_sql("SELECT * FROM ca_cannabis.dbo.ca_contact", ca_cannabis)

In [83]:
# Just making sure everything is adding right

print(len(sql_ca_contact.append(ca_contact_new, sort=True).drop(axis='columns', labels='index')))
print(len(sql_ca_contact))
print(len(ca_contact_new))
len(sql_ca_contact.append(ca_contact_new, sort=True).drop(axis='columns', labels='index')) - len(sql_ca_contact) - len(ca_contact_new)

10694
10682
12


0

In [84]:
# This is for differenciating in a bit
sql_ca_contact['in_db'] = 1
ca_contact_new['in_db'] = 0

In [85]:
# This makes the final ca_roll dataframe to send back to SQL

ca_contact_final = sql_ca_contact.append(ca_contact_new, ignore_index=True, sort=False).drop(axis='columns', labels='index')

In [86]:
# This makes the new dataframe for the ca_main database

ca_main_new = df_edited_null.append(df_edited_null).reset_index()
ca_main_new = ca_main_new.drop(axis='columns', labels='index')

In [87]:
df_edited_null.head()

Unnamed: 0,license_number,license_category,license_description,license_adult,license_medicinal,license_annual,license_provisional,name_legal,name_dba,name_legal_clean,roll_up_id,company_roll_up,date_issue,date_expiration,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended,business_description,business_company_type,contact_email,contact_phone,contact_website,contact_owner_1,contact_owner_2,contact_street,contact_city,contact_county,contact_state,contact_zip,changed_contact_info,date_uploaded,in_db,in_sql
477,CDPH-10002794,Manufactured Cannabis License,Type S,1,1,0,1,"Alap Enterprises, INC.",Roam Supply Co.,alap enterprises,5301.0,alap enterprises,4/24/2019,11/20/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubS,Amory@roamsupplycompany.Com,(831) 246-2184,,Amory Langmo,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
1461,CDPH-10003944,Manufactured Cannabis License,Type N,1,1,0,1,Calabasas Edibles Company,Calabasas Edibles Company,calabasas edibles company,5302.0,calabasas edibles company,10/29/2019,12/16/2019,0,1,0,0,0,0,0,Data Not Available,Limited Liability Company,Calabasascandyco@yahoo.Com,(818) 681-2614,,Cheryl De Rose,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
3401,CDPH-10003681,Manufactured Cannabis License,Type N,1,1,0,1,"Foco, INC.",Data Not Available,foco,5303.0,foco,7/11/2019,10/31/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubS,Ryan@bseenco.Com,(916) 502-6631,,Ryan Dearkland,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
5958,CDPH-10003167,Manufactured Cannabis License,Type N,1,1,0,1,"Lba California Manufacturing, INC.",Data Not Available,lba california manufacturing,5304.0,lba california manufacturing,5/13/2019,10/1/2019,0,0,0,0,0,1,0,Data Not Available,PvtCorpSubC,Burl@lunchboxalchemy.Com,(541) 797-1700,,Douglas Bryson,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0
6374,CDPH-10003910,Manufactured Cannabis License,Type S,1,1,0,1,Marbl Corporation,Marbl Corporation,marbl,5305.0,marbl,10/8/2019,12/16/2019,0,1,0,0,0,0,0,Data Not Available,PvtCorpSubC,Eric.Piearcy@gmail.Com,(844) 855-2254,,Eric Piearcy,,Not Published,Data Not Available,Data Not Available,Data Not Available,Data Not Available,0,2020-11-04,0,0


## Upload to SQL

### Create the Connection

In [88]:
# Typing the driver directly into the create_engine kept not working, so I'm trying it this way
driver="ODBC Driver 17 for SQL Server"

In [89]:
# This creates the engine needed
engine = create_engine(f'mssql://LAPTOP-E6QKON1L/ca_cannabis?driver={driver}')

In [90]:
# This connects the engine
engine_con = engine.connect()

### CA Roll

In [91]:
# This sends the file to the correct location, which is currently the test file
ca_roll_final.to_sql('ca_roll_test', con=engine_con, if_exists='append', index=False)

In [92]:
ca_roll_final

Unnamed: 0,license_number,roll_up_id,company_roll_up
0,C11-0000615-LIC,39.0,300
1,CDPH-10003352,39.0,300
2,CCL19-0003643,1.0,00g
3,CDPH-10003551,2.0,1000 palms associates group
4,C11-0000630-LIC,2.0,1000 palms associates group
5,CCL19-0000084,2.0,1000 palms associates group
6,C11-0000596-LIC,3.0,10th street facilities
7,CCL19-0002577,3.0,10th street facilities
8,CDPH-10003795,3.0,10th street facilities
9,C11-0000508-LIC,4.0,12/12 management


### CA Contact

In [93]:
ca_contact_final

Unnamed: 0,license_number,contact_email,contact_phone,contact_website,contact_owner_1,contact_owner_2,contact_street,contact_city,contact_county,contact_state,contact_zip,in_db
0,C11-0000615-LIC,kevin@caliherba.com,9.49631e+09,,kevin bacon,,,adelanto,san bernardino,CA,92301,1
1,CDPH-10003352,kevin@caliherba.com,9.49631e+09,,,,,adelanto,san bernardino,CA,,1
2,CCL19-0003643,contact@00g.com,5.30914e+09,,Drew Plebani,,17840 Wildwood Rd,Unincorporated,trinity,CA,96041,1
3,CDPH-10003551,arthurvugelman@yahoo.com,8.18694e+09,,,,,los angeles,los angeles,CA,,1
4,C11-0000630-LIC,cookieco415@icloud.com,8.18694e+09,,arthur vugelman,,,los angeles,los angeles,CA,90001,1
5,CCL19-0000084,arthurvugelman@yahoo.com,8.18694e+09,,arthur vugelman,,935 E 59th Street,Los Angeles,los angeles,CA,90001,1
6,C11-0000596-LIC,jad2129@columbia.edu,2.05523e+09,,jason dixon,,2180 E. 10TH Street,los angeles,los angeles,CA,90021,1
7,CCL19-0002577,jad2129@columbia.edu,2.05523e+09,,JASON DIXON,,2180 E. 10TH Street,Los Angeles,los angeles,CA,90021,1
8,CDPH-10003795,jad2129@columbia.edu,2.05523e+09,,,,,los angeles,los angeles,CA,,1
9,C11-0000508-LIC,info@1212distro.com,8.3342e+09,,garrett gervais,,,fortuna,humboldt,CA,95540,1


In [94]:
# This drops all 

ca_contact_final = ca_contact_final[ca_contact_final['in_db'] == 0]
ca_contact_final = ca_contact_final.drop(columns='in_db')

In [95]:
# This changes the format of the phone number so that contact_phone can be changed to a float, and then changes it

for n in range(len(ca_contact_final)):
    ca_contact_final['contact_phone'].iloc[n] = ''.join(re.findall(r'\d+', str(ca_contact_final['contact_phone'].iloc[n])))
    
ca_contact_final['contact_phone'] = ca_contact_final['contact_phone'].astype(float)

In [96]:
# This changes the format of the zip code so that contact_zip can be changed to a float, and then changes it

for n in range(len(ca_contact_final)):
    if len(re.findall(r'\d+', str(ca_contact_final['contact_zip'].iloc[n]))) == 0:
        ca_contact_final['contact_zip'].iloc[n] = 0
    else:
        ca_contact_final['contact_zip'].iloc[n] = ''.join(re.findall(r'\d+', str(ca_contact_final['contact_zip'].iloc[n])))
        
ca_contact_final['contact_zip'] = ca_contact_final['contact_zip'].astype(float)

In [97]:
sql_ca_contact.dtypes

license_number      object
contact_email       object
contact_phone      float64
contact_website     object
contact_owner_1     object
contact_owner_2     object
contact_street      object
contact_city        object
contact_county      object
contact_state       object
contact_zip        float64
in_db                int64
dtype: object

In [98]:
ca_contact_final.dtypes

license_number      object
contact_email       object
contact_phone      float64
contact_website     object
contact_owner_1     object
contact_owner_2     object
contact_street      object
contact_city        object
contact_county      object
contact_state       object
contact_zip        float64
dtype: object

In [99]:
# This is just so I can test if the command will change the old info
ca_contact_final['new_contact'] = 1

In [100]:
ca_contact_final

Unnamed: 0,license_number,contact_email,contact_phone,contact_website,contact_owner_1,contact_owner_2,contact_street,contact_city,contact_county,contact_state,contact_zip,new_contact
10682,CDPH-10002794,Amory@roamsupplycompany.Com,8312462000.0,,Amory Langmo,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10683,CDPH-10003944,Calabasascandyco@yahoo.Com,8186813000.0,,Cheryl De Rose,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10684,CDPH-10003681,Ryan@bseenco.Com,9165027000.0,,Ryan Dearkland,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10685,CDPH-10003167,Burl@lunchboxalchemy.Com,5417972000.0,,Douglas Bryson,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10686,CDPH-10003910,Eric.Piearcy@gmail.Com,8448552000.0,,Eric Piearcy,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10687,CDPH-10003083,Adamb@medmen.Com,3235935000.0,,Adam Bierman,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10688,CDPH-10002422,Ashley.Nelson@theresourcery.Com,7072008000.0,,Ashley Nelson,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10689,CDPH-10003412,Mk@romeflower.Com,9163852000.0,,Musie Kidane,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10690,CDPH-10002356,Stephengar9@gmail.Com,7608596000.0,,Stephen Garcia,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1
10691,CDPH-10002586,Sbcalifornia@stanleybrothers.Co,3037463000.0,,J Stanley,,Not Published,Data Not Available,Data Not Available,Data Not Available,0.0,1


In [101]:
# This sends the file to the correct location, which is currently the test file
ca_contact_final.to_sql('ca_contact_test', con=engine_con, if_exists='append', index=False)

### Create main_alter Excel File

In [102]:
main_alter_goal = pd.read_excel('result/main_alter_10_28_20.xlsx')

In [103]:
main_alter_goal

Unnamed: 0,business_company_type,business_description,changed_contact_info,company_roll_up,contact_city,contact_county,contact_email,contact_owner_1,contact_owner_2,contact_phone,contact_state,contact_street,contact_website,contact_zip,date_expiration,date_issue,date_uploaded,in_db,in_sql,license_adult,license_annual,license_category,license_description,license_medicinal,license_number,license_provisional,name_dba,name_legal,name_legal_clean,roll_up_id,status_active,status_canceled,status_expired,status_inactive,status_revoked,status_surrendered,status_suspended
0,0,0,0,17k muskrat 2,0,0,donnieacker3@yahoo.com,0,0,0,CA,0,0,0,2021-06-09,2020-06-09,10/21/2020,1,1,0,0,Cannabis Cultivation License,Medium Indoor,1,CCL20-0000276,0,0,"17k muskrat 1, inc.",17k muskrat 1,4359,1,0,0,0,0,0,0
1,Limited Liability Company,N/A for this license type,0,2018hmo,0,san diego,aaronmagagna@gmail.com,aaron magagna,0,6194050298,CA,3940 home ave san diego,0,92105,2021-10-22,2019-10-23,10/21/2020,1,1,1,0,Bureau of Cannabis Control,Retailer,1,C10-0000642-LIC,0,0,2018hmo llc,2018hmo,29,1,0,0,0,0,0,0
2,undefined,N/A for this license type,0,3011 sr ave,0,sonoma,brandon@doobienights.com,brandon levine,0,7073211800,CA,3011 santa rosa ave santa rosa,www.doobienights.com,95407,2021-11-17,2019-11-18,10/21/2020,1,1,1,0,Bureau of Cannabis Control,Retailer,1,C10-0000656-LIC,0,doobie nights,"3011 sr ave., inc.",3011 sr ave,41,1,0,0,0,0,0,0
3,Corporation,"Level 1 Manufacturer, Distributor, Retailer No...",0,420 strains,oceano,san luis obispo,products@420strains.net,ron laurence,0,8054204250,CA,0,420strains.net,93445,2021-10-02,2019-10-03,10/21/2020,1,1,1,0,Bureau of Cannabis Control,Microbusiness,1,C12-0000270-LIC,0,420 strains inc.,420 strains inc,420 strains,59,1,0,0,0,0,0,0
4,0,0,0,420-1,perris,riverside,jonathan@levyre.com,0,0,3108837900,CA,0,0,0,2021-04-26,2020-04-26,10/21/2020,1,1,1,0,Manufactured Cannabis License,Type 6,1,CDPH-10002903,1,"420-1, llc","420-1, llc",420-1,60,1,0,0,0,0,0,0
5,0,0,0,legion of bloom,Oakland,alameda,marcos@thelegionofbloom.com,Marcos Morales,0,7075085806,CA,5601 San Leandro Street,0,94621,2021-11-15,2019-11-15,10/21/2020,1,1,1,0,Cannabis Cultivation License,Medium Indoor,0,CCL19-0000077,1,0,"5601-a, llc",5601-a,70,1,0,0,0,0,0,0
6,Corporation,N/A for this license type,0,562 discount med,0,los angeles,connectedbelmontshore@gmail.com,elliot lewis,0,5623703780,CA,5227 2nd st long beach,www.connectedcannabisco.com,90803,2021-06-17,2019-06-18,10/21/2020,1,1,1,0,Bureau of Cannabis Control,Retailer,1,C10-0000227-LIC,0,"562 discount med, inc.","562 discount med, inc.",562 discount med,71,1,0,0,0,0,0,0
7,Sole Proprietorship,N/A for this license type,0,nguey lay,san francisco,san francisco,Ngueylay@msn.com,nguey lay,0,4157067014,CA,0,0,94107,2021-05-19,2019-05-20,10/21/2020,1,1,1,0,Bureau of Cannabis Control,Distributor,1,C11-0000129-LIC,0,golden dragon distribution,"888 lay, llc",888 lay,2666,1,0,0,0,0,0,0
8,Corporation,N/A for this license type,0,a tribe of us collective,0,0,nathan@roselosangeles.com,nathan cozzolino,0,6509962926,0,0,0,0,2021-07-28,2020-07-28,10/21/2020,1,1,1,0,Bureau of Cannabis Control,Distributor,1,C11-0001244-LIC,0,rose delights,a tribe of us collective,a tribe of us collective,98,1,0,0,0,0,0,0
9,0,0,0,accentian,oakland,alameda,Joseph@accentian.com,0,0,3054206945,CA,0,0,0,2021-05-31,2020-05-31,10/21/2020,1,1,1,0,Manufactured Cannabis License,Type 7,1,CDPH-10003335,1,accentian inc.,accentian inc.,accentian,117,1,0,0,0,0,0,0


In [104]:
sql_main = pd.read_sql("SELECT * FROM ca_main", ca_cannabis)

In [105]:
sql_main.columns

Index(['license_number', 'license_category', 'license_description',
       'license_adult', 'license_medicinal', 'license_annual',
       'license_provisional', 'name_legal', 'name_dba', 'date_issue',
       'date_expiration', 'status_active', 'status_canceled', 'status_expired',
       'status_revoked', 'status_surrendered', 'status_suspended',
       'business_description', 'business_company_type', 'date_uploaded',
       'in_db'],
      dtype='object')

In [106]:
sql_main.columns

Index(['license_number', 'license_category', 'license_description',
       'license_adult', 'license_medicinal', 'license_annual',
       'license_provisional', 'name_legal', 'name_dba', 'date_issue',
       'date_expiration', 'status_active', 'status_canceled', 'status_expired',
       'status_revoked', 'status_surrendered', 'status_suspended',
       'business_description', 'business_company_type', 'date_uploaded',
       'in_db'],
      dtype='object')

In [107]:
df_main = df_edited[['license_number', 'license_category', 'license_description',
       'license_adult', 'license_medicinal', 'license_annual',
       'license_provisional', 'name_legal', 'name_dba', 'date_issue',
       'date_expiration', 'status_active', 'status_canceled', 'status_expired',
       'status_revoked', 'status_surrendered', 'status_suspended',
       'business_description', 'business_company_type', 'date_uploaded',
       'in_db']]

In [108]:
sql_main.dtypes

license_number                   object
license_category                 object
license_description              object
license_adult                   float64
license_medicinal               float64
license_annual                  float64
license_provisional             float64
name_legal                       object
name_dba                         object
date_issue               datetime64[ns]
date_expiration          datetime64[ns]
status_active                   float64
status_canceled                 float64
status_expired                  float64
status_revoked                  float64
status_surrendered              float64
status_suspended                float64
business_description             object
business_company_type            object
date_uploaded            datetime64[ns]
in_db                           float64
dtype: object

In [109]:
df_main.dtypes

license_number           object
license_category         object
license_description      object
license_adult             int64
license_medicinal         int64
license_annual            int64
license_provisional       int64
name_legal               object
name_dba                 object
date_issue               object
date_expiration          object
status_active             int64
status_canceled           int64
status_expired            int64
status_revoked            int64
status_surrendered        int64
status_suspended          int64
business_description     object
business_company_type    object
date_uploaded            object
in_db                     int64
dtype: object

In [110]:
# This changes all the column types to match the sql column types

# df_main.license_number.astype(object)
# df_main.license_category.astype(objec)
# df_main.license_description.astype(object)
df_main.license_adult = df_main.license_adult.astype(float)
df_main.license_medicinal = df_main.license_medicinal.astype(float)
df_main.license_annual = df_main.license_annual.astype(float)
df_main.license_provisional = df_main.license_provisional.astype(float)
# df_main.name_legal.astype(object)
# df_main.name_dba.astype(object)
df_main.date_issue = df_main.date_issue = pd.to_datetime(df_main.date_issue)
df_main.date_expiration = df_main.date_expiration = pd.to_datetime(df_main.date_expiration)
df_main.status_active = df_main.status_active.astype(float)
df_main.status_canceled = df_main.status_canceled.astype(float)
df_main.status_expired = df_main.status_expired.astype(float)
df_main.status_revoked = df_main.status_revoked.astype(float)
df_main.status_surrendered = df_main.status_surrendered.astype(float)
df_main.status_suspended = df_main.status_suspended.astype(float)
# df_main.business_description.astype(object)
# df_main.business_company_type.astype(object)
df_main.date_uploaded = pd.to_datetime(df_main.date_uploaded)
df_main.in_db = df_main.in_db.astype(float)

In [111]:
df_main[df_main.license_number == 'CCL19-0003643']

Unnamed: 0,license_number,license_category,license_description,license_adult,license_medicinal,license_annual,license_provisional,name_legal,name_dba,date_issue,date_expiration,status_active,status_canceled,status_expired,status_revoked,status_surrendered,status_suspended,business_description,business_company_type,date_uploaded,in_db
0,CCL19-0003643,Cannabis Cultivation License,Small Mixed-Light Tier 1,1.0,0.0,0.0,1.0,"00G, LLC.",Data Not Available,2019-12-19,2020-12-19,1.0,0.0,0.0,0.0,0.0,0.0,Data Not Available,Limited Liability Company,2020-03-12,1.0


In [112]:
sql_main[sql_main.license_number == 'CCL19-0003643']

Unnamed: 0,license_number,license_category,license_description,license_adult,license_medicinal,license_annual,license_provisional,name_legal,name_dba,date_issue,date_expiration,status_active,status_canceled,status_expired,status_revoked,status_surrendered,status_suspended,business_description,business_company_type,date_uploaded,in_db
252,CCL19-0003643,Cannabis Cultivation License,Small Mixed-Light Tier 1,1.0,0.0,,1.0,"00G, LLC.",,2019-12-19,2020-12-19,1.0,,,0.0,0.0,,,,2020-03-12,1.0


In [119]:
# This creates a list of all of the license_numbers of rows that belong in the main_alter csv

to_alter = []

for lic in df_main['license_number']:
    if lic not in list(sql_main['license_number']):
        to_alter.append(lic)
    elif lic in list(sql_main['license_number']):
        if False in (sql_main[sql_main['license_number'] == lic].values == df_main[df_main['license_number'] == lic].values):
            to_alter.append(lic)
        else:
            print(sql_main[sql_main['license_number'] == lic].values == df_main[df_main['license_number'] == lic].values)

In [123]:
# This creates the dataframe the main_alter csv

main_alter = pd.DataFrame(to_alter, columns=['license_number'])

main_alter = main_alter.merge(df_main, on='license_number')

In [157]:
# And let's toss this into a csv 

main_alter.to_excel(f'result/main_alter_{date_day}_{date_month}_{date_year}.xlsx', index=False)

##### So, I think that because of the differences in how the new source shows data, there are differences in literally every row.
##### I assume that will just mean that the first change will be major, but future changes might only be the actual changes instead of missing data.