In [None]:
import getpass
import os
import urllib
from pathlib import Path

import numpy as np
import pandas as pd
import pyodbc
from sqlalchemy import create_engine

## Clean Control

Works whether the folder contains a single file or several.

In [None]:
# Load every control CSV export from ../control_files into one frame.
# The glob result is materialised into a sorted list: a bare generator would be
# exhausted by the read loop, leaving nothing to pair each frame with its path.
filenames = sorted(Path('../control_files').glob('*.csv'))
if not filenames:
    raise FileNotFoundError("no CSV files found in ../control_files")

list_of_dfs = [pd.read_csv(file) for file in filenames]
for dataframe, file in zip(list_of_dfs, filenames):
    # Record which export each row came from.
    dataframe['file'] = str(file)
df_file = pd.concat(list_of_dfs, ignore_index=True, sort=False)

In [None]:
# Map the export's human-readable headers to the snake_case names used downstream.
control_column_names = {
    'License Number': 'license_number',
    'License Type': 'license_type',
    'Business Owner': 'business_owner',
    'Business Contact Information': 'business_contact_information',
    'Business Structure': 'business_structure',
    'Premise Address': 'premise_address',
    'Status': 'status_curr',
    'Issue Date': 'date_issue',
    'Expiration Date': 'date_expiration',
    'Activities': 'business_description',
    'Adult-Use/Medicinal': 'adult_medicinal',
}
df_file = df_file.rename(columns=control_column_names)

drop "TEMP" licenses. These were all created in 2018 when industry started.

In [None]:
# Drop temporary ("TEMP") licenses, then renumber the rows.
# na=False counts a missing license number as "not TEMP", so the boolean mask
# never contains NaN (a NaN in the mask raises when it is used to filter).
is_temp = df_file['license_number'].str.contains("TEMP", na=False)
df_file.drop(df_file[is_temp].index, inplace=True)
df_file.reset_index(drop=True, inplace=True)

There are multiple lines that have the header name within the file. Reset index after dropping

In [None]:
# Rows whose license_number is the literal header text are repeated header
# lines inside the data; keep everything else and renumber the rows.
is_header_row = df_file['license_number'] == 'License Number'
df_file = df_file[~is_header_row].reset_index(drop=True)

In the date format string, capital `%Y` is a four-digit year (yyyy) and lowercase `%y` is a two-digit year (yy).

In [None]:
# Parse both date columns from mm/dd/yyyy text into datetimes.
for date_col in ('date_issue', 'date_expiration'):
    df_file[date_col] = pd.to_datetime(df_file[date_col], format="%m/%d/%Y")

In [None]:
# Initialise the in_db flag to 0 on every control row.
# NOTE(review): presumably marks whether the row is already in the database — confirm.
df_file['in_db'] = 0

Create dummy columns for adult_medicinal. Rows marked "BOTH" are set to 1 in both the adult and the medicinal column, and the "BOTH" column is then removed. Finally the dummies are added back to the original frame.

In [None]:
# One-hot encode adult_medicinal, then fold the "BOTH" category into both flags.
df1_dum = pd.get_dummies(df_file['adult_medicinal'])

# Make sure all three expected categories exist even if an export lacks one.
for category in ('Adult-Use', 'Medicinal', 'BOTH'):
    if category not in df1_dum.columns:
        df1_dum[category] = 0

# Vectorised OR replaces the per-row loop, which relied on chained assignment
# (df[col][i] = ...) to write into the frame.
df1_dum['Adult-Use'] = df1_dum['Adult-Use'] | df1_dum['BOTH']
df1_dum['Medicinal'] = df1_dum['Medicinal'] | df1_dum['BOTH']

df1_dum = df1_dum.drop(columns=["BOTH"])
df1_dum = df1_dum.rename(columns={'Adult-Use': 'license_adult', 'Medicinal': 'license_medicinal'})
df_file = pd.concat([df_file, df1_dum], axis=1)

Create dummy columns for status. Prefix each with "status" and lowercase the names so they don't need to be changed later. Add the dummies back to the original frame.

In [None]:
# One-hot encode the current status with a "status_" prefix and lowercase names.
df1_dum_status = pd.get_dummies(df_file['status_curr'], prefix="status")
df1_dum_status.columns = [col.lower() for col in df1_dum_status.columns]
df_file = pd.concat([df_file, df1_dum_status], axis=1)

add bureau of cannabis control to differentiate it when combined.
Remove redundant cannabis and license words from the license type

In [None]:
# Tag every control row with its issuing bureau, then trim the redundant
# "Cannabis - " and "License" wording from the license type.
df_file['main_license_type'] = "Bureau of Cannabis Control"
df_file['license_type'] = (
    df_file['license_type']
    .str.replace("Cannabis - ", "")
    .str.replace("License", "")
    .str.strip()
)

there are several owner names that have been merged into one column. Need to split each owner up individually.

In [None]:
# Collapse the doubled separator, split owners on ":" into one column each,
# then lowercase and trim every resulting column.
df_file['business_owner'] = df_file['business_owner'].str.replace(": :", ":")
df_names_nan = df_file['business_owner'].str.split(':', expand=True)
for owner_col in df_names_nan.columns:
    df_names_nan[owner_col] = df_names_nan[owner_col].str.lower().str.strip()

dropping duplicates across the row. sometimes owner name listed multiple times in same row. after duplicate names have been deleted, move the names one over to the left so there are not blanks in between names

In [None]:
# Row-wise: drop owner names repeated within the same row.
df_names_removed = df_names_nan.apply(lambda x: x.drop_duplicates(), axis=1)
# Row-wise: discard missing entries and re-pack the remaining names starting at
# column 0, so there are no gaps between names.
df2 = df_names_removed.apply(lambda x:pd.Series(x.dropna().values), axis=1)

renaming to make it easier when combining. only grabbing first two owners. the third column was only 0.42% (12 / 2812) of the whole df.

In [None]:
# Per-column non-null counts show how many rows have a 1st, 2nd, 3rd, ... owner.
df2.info()

In [None]:
# Keep only the first two owners per license. reindex (rather than a positional
# slice) guarantees both columns exist - NaN-filled when no row lists a second
# owner - and returns an independent frame, so renaming it does not touch df2.
pd_owner = df2.reindex(columns=[0, 1])

In [None]:
# Silence chained-assignment warnings, then label the owner columns
# contact_owner_1, contact_owner_2, ... in a single rename.
pd.set_option('mode.chained_assignment', None)
owner_labels = {
    position: 'contact_owner_{}'.format(position + 1)
    for position in range(len(pd_owner.columns))
}
pd_owner.rename(columns=owner_labels, inplace=True)

loop through the split business contact information and if the column contains "Email" then append it to the email list.

In [None]:
# Split the combined contact string on " :" into one column per field.
df_biz_nan = df_file['business_contact_information'].str.split(' :', expand=True)

In [None]:
# From every split column, collect the cells that hold an "Email" entry.
email_parts = [
    df_biz_nan.loc[df_biz_nan[i].str.contains("Email", na=False), i]
    for i in range(len(df_biz_nan.columns))
]
# pd.concat replaces the Series.append chain (Series.append was removed in
# pandas 2.0); the original row labels are preserved.
pd_email = pd.concat(email_parts)
# If a row carries more than one email keep the left-most one, so the index
# stays unique for the later column-wise concat.
pd_email = pd_email[~pd_email.index.duplicated(keep='first')]

if a company were to have a dba it would be in the second column

In [None]:
# The second split field is the DBA name unless it is actually the email entry;
# the first field is always the legal name.
second_field = df_biz_nan[1]
pd_dba = second_field[~second_field.str.contains("Email", na=False)]
pd_name = df_biz_nan[0]

loop through to add phone and website. concat all the series into one.

In [None]:
# From every split column, collect the cells that hold a "Phone" entry.
phone_parts = [
    df_biz_nan.loc[df_biz_nan[i].str.contains("Phone", na=False), i]
    for i in range(len(df_biz_nan.columns))
]
# pd.concat replaces the Series.append chain (Series.append was removed in
# pandas 2.0); the original row labels are preserved.
pd_phone = pd.concat(phone_parts)
# If a row carries more than one phone keep the left-most one, so the index
# stays unique for the later column-wise concat.
pd_phone = pd_phone[~pd_phone.index.duplicated(keep='first')]

In [None]:
# From every split column, collect the cells that hold a "Website" entry.
website_parts = [
    df_biz_nan.loc[df_biz_nan[i].str.contains("Website", na=False), i]
    for i in range(len(df_biz_nan.columns))
]
# pd.concat replaces the Series.append chain (Series.append was removed in
# pandas 2.0); the original row labels are preserved.
pd_website = pd.concat(website_parts)
# If a row carries more than one website keep the left-most one, so the index
# stays unique for the later column-wise concat.
pd_website = pd_website[~pd_website.index.duplicated(keep='first')]

In [None]:
# Align the five contact series on their row labels, one output column each.
contact_parts = [
    pd_name.rename("name_legal"),
    pd_dba.rename("name_dba"),
    pd_email.rename("contact_email"),
    pd_phone.rename("contact_phone"),
    pd_website.rename("contact_website"),
]
pd_biz_all = pd.concat(contact_parts, axis=1)

cleaning up contact info

In [None]:
# Trim the names.
pd_biz_all['name_legal'] = pd_biz_all['name_legal'].str.strip()
pd_biz_all['name_dba'] = pd_biz_all['name_dba'].str.strip()

# Every replacement below is a literal substring removal. regex=False says so
# explicitly, so "(" and ")" are never parsed as regular-expression syntax
# regardless of the pandas version's default.
pd_biz_all['contact_email'] = (
    pd_biz_all['contact_email'].str.replace("Email- ", "", regex=False).str.strip()
)

# Strip the label first, then every separator character, leaving digits only.
phone = pd_biz_all['contact_phone'].str.replace("Phone- ", "", regex=False)
for separator in ("-", "(", ")", " "):
    phone = phone.str.replace(separator, "", regex=False)
pd_biz_all['contact_phone'] = phone.str.strip()

pd_biz_all['contact_website'] = (
    pd_biz_all['contact_website'].str.replace("Website- ", "", regex=False).str.strip()
)

In [None]:
# Sanity check: columns, dtypes and non-null counts of the cleaned control frame.
df_file.info()

In [None]:
# Earlier variant that also attached zip/county data (pd_zip_county), kept for reference:
# pd_all = pd.concat([pd_biz_all, pd_owner, pd_zip_county], axis=1)
# Join the cleaned contact fields and the owner columns side by side on the row labels.
pd_all = pd.concat([pd_biz_all, pd_owner], axis=1)

In [None]:
# Attach the parsed contact/owner columns to the control rows (aligned on row labels).
df_control = pd.concat([df_file, pd_all], axis=1)

In [None]:
# Sanity check: columns, dtypes and non-null counts after attaching contact info.
df_control.info()

In [None]:
# Remove the raw source columns that have been expanded into cleaned columns.
control_source_columns = [
    'business_owner',
    'business_contact_information',
    # 'premise_address',
    'status_curr',
    'adult_medicinal',
]
df_control = df_control.drop(columns=control_source_columns)

In [None]:
# Align the control column names with the names used by the other two sources.
control_final_names = {
    'license_type': 'license_description',
    'business_structure': 'business_company_type',
    'main_license_type': 'license_category',
}
df_control = df_control.rename(columns=control_final_names)

In [None]:
# df_control.columns

In [None]:
# (rows, columns) of the finished control frame.
df_control.shape

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_control.head()

# Clean Manufacture

In [None]:
# Load every manufacture CSV export from ../manufacture_files into one frame.
# The glob result is materialised into a sorted list: a bare generator would be
# exhausted by the read loop, leaving nothing to pair each frame with its path.
filenames = sorted(Path('../manufacture_files').glob('*.csv'))
if not filenames:
    raise FileNotFoundError("no CSV files found in ../manufacture_files")

# header=1: the column names are on the second line of each file.
list_of_dfs = [pd.read_csv(file, header=1) for file in filenames]
for dataframe, file in zip(list_of_dfs, filenames):
    # Record which export each row came from.
    dataframe['file'] = str(file)

df_file_m = pd.concat(list_of_dfs, ignore_index=True, sort=False)
len(df_file_m)

In [None]:
# Map the manufacture export's upper-case headers to the shared snake_case names.
manufacture_column_names = {
    'BUSINESS LEGAL NAME': 'name_legal',
    'BUSINESS DBA NAME': 'name_dba',
    'LICENSE NUMBER': 'license_number',
    'PREMISES CITY': 'contact_city',
    'STATUS': 'status',
    'LICENSE CATEGORY': 'license_category',
    'PREMISES EMAIL': 'contact_email',
    'EFFECTIVE DATE': 'date_issue',
    'LICENSE TYPE': 'license_description',
    'PREMISES PHONE': 'contact_phone',
    'EXPIRATION DATE': 'date_expiration',
    'PREMISES COUNTY': 'contact_county',
    'ANNUAL/PROVISIONAL': 'annual_provisional',
}
df_file_m = df_file_m.rename(columns=manufacture_column_names)

In [None]:
# Parse both date columns from mm/dd/yyyy text into datetimes.
for date_col in ('date_issue', 'date_expiration'):
    df_file_m[date_col] = pd.to_datetime(df_file_m[date_col], format="%m/%d/%Y")

if phone type is int then dont need to remove "-"

In [None]:
# df_file_m['contact_phone'].dtype
# pd_biz_all['contact_phone'] = pd_biz_all['contact_phone'].str.replace("-", "")
# df_file_m['contact_phone'].str.contains("-")

In [None]:
# Initialise the in_db flag to 0 on every manufacture row.
# NOTE(review): presumably marks whether the row is already in the database — confirm.
df_file_m['in_db'] = 0

get dummies for status column

In [None]:
# One-hot encode status (prefixed "status_", lowercased) and swap it in for the
# original status column.
active_dum = pd.get_dummies(df_file_m['status'], prefix="status")
active_dum.columns = [col.lower() for col in active_dum.columns]
df_file_m = pd.concat([df_file_m, active_dum], axis=1).drop(columns='status')

get dummies for license category. basically adds the column if it doesnt exist so it can loop through.

In [None]:
# One-hot encode the license category; column names are the raw category values.
category_dum = pd.get_dummies(df_file_m['license_category'])
# Preview which category columns this export produced.
category_dum.head()

In [None]:
# Normalise the category dummies to exactly license_adult / license_medicinal.
category_dum = category_dum.rename(
    columns={'Adult Use': 'license_adult', 'Medicinal': 'license_medicinal'}
)
# A category that does not occur in this export still needs its (all-zero) column.
for flag in ('license_adult', 'license_medicinal'):
    if flag not in category_dum.columns:
        category_dum[flag] = 0

# "Adult and Medicinal" counts for both flags. A vectorised OR replaces the
# per-row loop, which relied on chained assignment (df[col][i] = ...).
if 'Adult and Medicinal' in category_dum.columns:
    both = category_dum['Adult and Medicinal']
    category_dum['license_adult'] = category_dum['license_adult'] | both
    category_dum['license_medicinal'] = category_dum['license_medicinal'] | both
    category_dum = category_dum.drop(columns='Adult and Medicinal')

df_file_m = pd.concat([df_file_m, category_dum], axis=1)
df_file_m.drop(columns='license_category', inplace=True)

get dummies for annual / provisional

In [None]:
# One-hot encode annual/provisional (prefixed "license_", lowercased) and swap
# it in for the original column.
annual_dum = pd.get_dummies(df_file_m['annual_provisional'], prefix='license')
annual_dum.columns = [col.lower() for col in annual_dum.columns]
df_file_m = pd.concat([df_file_m, annual_dum], axis=1).drop(columns='annual_provisional')

Modify the contact info.

In [None]:
# Lowercase and trim the city.
df_file_m['contact_city'] = df_file_m['contact_city'].str.lower().str.strip()

# Remove the word "County", then lowercase and trim.
df_file_m['contact_county'] = (
    df_file_m['contact_county']
    .str.replace("County", "")
    .str.lower()
    .str.strip()
)

# Constant columns for this source.
df_file_m['contact_state'] = 'CA'
df_file_m['license_category'] = 'Manufactured Cannabis License'

# Clean Cultivation

Two versions. Can either manually combine the two tabs into one or can treat each tab as a separate workbook. The provisional DRP typically has about 25 that the business one does not have.

Also the lists don't include "Expired - Pending Renewal"

In [None]:
# Load every cultivation CSV export from ../cultivation_files into one frame.
# The glob result is materialised into a sorted list: a bare generator would be
# exhausted by the read loop, leaving nothing to pair each frame with its path.
filenames = sorted(Path('../cultivation_files').glob('*.csv'))
if not filenames:
    raise FileNotFoundError("no CSV files found in ../cultivation_files")
# list_of_dfs = [pd.read_csv(file) for file in filenames]

# Only use this when the above code doesn't work.
list_of_dfs = [pd.read_csv(file, encoding="ISO-8859-1") for file in filenames]

for dataframe, file in zip(list_of_dfs, filenames):
    # Record which export each row came from.
    dataframe['file'] = str(file)

cult_df = pd.concat(list_of_dfs, ignore_index=True, sort=False)
len(cult_df)

reminder - when editing the file just use issuance date and then can remove the below

In [None]:
# Peek at one raw row to see which headers this export uses.
cult_df.head(1)

In [None]:
# Some exports call the issue date "Valid From Date"; standardise on
# "Issuance Date". When the combined frame has both headers (mixed exports),
# a plain rename would create two columns with the same name, so fill the gaps
# in "Issuance Date" from "Valid From Date" and drop the latter instead.
if 'Valid From Date' in cult_df.columns:
    if 'Issuance Date' in cult_df.columns:
        cult_df['Issuance Date'] = cult_df['Issuance Date'].fillna(cult_df['Valid From Date'])
        cult_df = cult_df.drop(columns='Valid From Date')
    else:
        cult_df = cult_df.rename(columns={'Valid From Date': 'Issuance Date'})

drop temp licenses, then reset index

In [None]:
# Keep everything except temporary cultivation licenses, then renumber the rows.
is_temporary = cult_df['License Type'] == 'Temporary Cannabis Cultivation License'
cult_df = cult_df[~is_temporary].reset_index(drop=True)

In [None]:
# Keep everything except "Cultivation License Renewal" rows, then renumber the rows.
is_renewal = cult_df['License Type'] == 'Cultivation License Renewal'
cult_df = cult_df[~is_renewal].reset_index(drop=True)

In [None]:
# Parse both date columns from mm/dd/yyyy text into datetimes.
for date_col in ('Issuance Date', 'Expiration Date'):
    cult_df[date_col] = pd.to_datetime(cult_df[date_col], format="%m/%d/%Y")
# cult_df['contact_owner_1'] = cult_df['First Name'] + " " + cult_df['Last Name']

# Initialise the in_db flag to 0 on every cultivation row.
cult_df['in_db'] = 0

drop and rename columns

In [None]:
# cult_df.drop(columns=['APN', 'First Name', 'Last Name', 'Title'], inplace=True)

# The business-name header shows up under different spellings depending on how
# the CSV was decoded: plain, prefixed with a UTF-8 byte-order mark read as
# ISO-8859-1 ("ï»¿"), or prefixed with the BOM character itself. Accept all of
# them so the mapping does not have to be toggled by hand per export.
name_headers = [
    header
    for header in ('Legal Business Name', 'ï»¿Legal Business Name', '\ufeffLegal Business Name')
    if header in cult_df.columns
]
if name_headers:
    # Take the first non-missing value across the variants, so a mix of
    # spellings collapses into a single "name" column rather than duplicates.
    cult_df['name'] = cult_df[name_headers].bfill(axis=1).iloc[:, 0]
    cult_df = cult_df.drop(columns=name_headers)

cult_df.rename(columns={
'E-Mail': 'contact_email',
# 'Phone Number': 'contact_phone',
'License Type': 'license_category',
'Type of License': 'license_type',
'License Number': 'license_number',
'License Status': 'status',
'Issuance Date': 'date_issue',
'Expiration Date': 'date_expiration',
# 'Premise Address': 'contact_street',
# 'Premise City': 'contact_city',
# 'Premise County': 'contact_county',
# 'Premise Zip': 'contact_zip'
}, inplace=True)

get dummies for status. change about to expire to active. add back to main df

In [None]:
# One-hot encode the cultivation status; columns are "status_<raw status value>".
df1_dum = pd.get_dummies(cult_df['status'], prefix='status')

In [None]:
# Check which status columns this export produced.
df1_dum.head(1)

In [None]:
# Treat "About to Expire" as active. The merge also runs when the export has
# no "status_Active" column at all; otherwise the "About to Expire" rows would
# never be counted as active.
if 'status_About to Expire' in df1_dum.columns:
    active_sources = [
        col for col in ('status_About to Expire', 'status_Active') if col in df1_dum.columns
    ]
    # Dummies of a single column are mutually exclusive per row, so the
    # row-wise max is the same 0/1 value as adding the columns.
    df1_dum['status_active'] = df1_dum[active_sources].max(axis=1)
    df1_dum.drop(columns=active_sources, inplace=True)

In [None]:
# Treat "Expired - Pending Renewal" as expired. The merge also runs when the
# export has no plain "status_Expired" column; otherwise those rows would never
# be counted as expired.
if 'status_Expired - Pending Renewal' in df1_dum.columns:
    expired_sources = [
        col
        for col in ('status_Expired - Pending Renewal', 'status_Expired')
        if col in df1_dum.columns
    ]
    # Dummies of a single column are mutually exclusive per row, so the
    # row-wise max is the same 0/1 value as adding the columns.
    df1_dum['status_expired'] = df1_dum[expired_sources].max(axis=1)
    df1_dum.drop(columns=expired_sources, inplace=True)

In [None]:
# Lowercase every status column name.
df1_dum.columns = [col.lower() for col in df1_dum.columns]

In [None]:
# Confirm the merged, lowercased status columns.
df1_dum.head(1)

In [None]:
# Swap the raw status column for its dummy columns.
cult_df = pd.concat([cult_df, df1_dum], axis=1).drop(columns='status')

split license type then concat. add back license description. remove license type

In [None]:
# Split the license type on " - ": column 0 is the part before the separator,
# column 1 the part after it.
df_license = cult_df['license_type'].str.split(' - ', expand=True)
# One-hot encode the leading part; column names are its raw values.
df_adult = pd.get_dummies(df_license[0])

In [None]:
# Check which license-type prefixes this export produced.
df_adult.head(1)

In [None]:
# df_adult['license_adult'] = df_adult['Adult-Use'] + df_adult['Annual Adult-Use'] + df_adult['Provisional Adult-Use']
# Adult-use flag: 1 when the row's prefix is any of the adult-use variants.
# Only the variants present in this export are used, so a missing category no
# longer raises KeyError and the line need not be edited per export.
adult_sources = [
    col
    for col in ('Adult-Use', 'Annual Adult-Use', 'Provisional Adult-Use')
    if col in df_adult.columns
]
# Dummies are mutually exclusive per row, so max equals the sum of the columns.
df_adult['license_adult'] = df_adult[adult_sources].max(axis=1) if adult_sources else 0

In [None]:
# Medicinal flag: 1 when the row's prefix is any of the medicinal variants.
# Only the variants present in this export are used, so a missing category no
# longer raises KeyError and the line need not be edited per export.
medicinal_sources = [
    col
    for col in ('Medicinal', 'Annual Medicinal', 'Provisional Medicinal')
    if col in df_adult.columns
]
# Dummies are mutually exclusive per row, so max equals the sum of the columns.
df_adult['license_medicinal'] = (
    df_adult[medicinal_sources].max(axis=1) if medicinal_sources else 0
)

In [None]:
# Provisional flag: 1 when the row's prefix is one of the provisional variants.
# Only the variants present in this export are used, so a missing category no
# longer raises KeyError.
provisional_sources = [
    col
    for col in ('Provisional Adult-Use', 'Provisional Medicinal')
    if col in df_adult.columns
]
# Dummies are mutually exclusive per row, so max equals the sum of the columns.
df_adult['license_provisional'] = (
    df_adult[provisional_sources].max(axis=1) if provisional_sources else 0
)
# Keep just the three derived flags for merging back into cult_df.
df_ad_md_pr = df_adult[['license_adult', 'license_medicinal', 'license_provisional']]

In [None]:
# The text after " - " becomes the license description.
df_license.rename(columns={1: 'license_description'}, inplace=True)

# Attach the three flags and the description in one pass, then drop the raw
# license_type column they were derived from.
cult_df = pd.concat(
    [cult_df, df_ad_md_pr, df_license['license_description']],
    axis=1,
).drop(columns=['license_type'])

edit county and phone

In [None]:
# County and phone clean-up is disabled: the renames that would create
# contact_county / contact_phone are commented out in the rename cell.
# cult_df['contact_county'] = cult_df['contact_county'].str.replace("County", "")
# cult_df['contact_county'] = cult_df['contact_county'].str.lower()
# cult_df['contact_county'] = cult_df['contact_county'].str.strip()

# cult_df['contact_phone'] = cult_df['contact_phone'].str.replace("-", "")
# cult_df['contact_phone'] = cult_df['contact_phone'].str.replace(" ", "")
# cult_df['contact_phone'] = cult_df['contact_phone'].str.strip()

# Constant state column for this source.
cult_df['contact_state'] = 'CA'

In [None]:
#good way to check if numbers dont have any other characters, if the below works
# cult_df.astype({'contact_phone': float})

need to make name lowercase so that it catches all of the DBA

In [None]:
# Peek at one row before splitting the name into legal / DBA parts.
cult_df.head(1)

In [None]:
# Lowercase first so "DBA", "Dba" and "dba" are all caught.
cult_df['name'] = cult_df['name'].str.lower()

# Match "dba" only as a standalone word; a bare substring match also splits
# names that merely contain those letters (e.g. "sandbar").
dba_pattern = r"\bdba\b"

if cult_df['name'].str.contains(dba_pattern, na=False).any():
    # n=1 splits at the first "dba" only, so the result never has more than two
    # columns (legal name, DBA name).
    df_dba = cult_df['name'].str.split(dba_pattern, n=1, expand=True)
    df_dba[0] = df_dba[0].str.strip()
    df_dba[1] = df_dba[1].str.strip()
    df_dba.rename(columns={0: 'name_legal', 1: 'name_dba'}, inplace=True)
    cult_df = pd.concat([cult_df, df_dba], axis=1)
    cult_df.drop(columns=['name'], inplace=True)
else:
    # No DBA anywhere: the whole name is the legal name.
    cult_df.rename(columns={'name': 'name_legal'}, inplace=True)

# Combine All

combine all the dataframes.

In [None]:
# Stack the three cleaned sources and number the rows 0..n-1 in one step.
df_all = pd.concat(
    [df_control, df_file_m, cult_df],
    axis=0,
    sort=False,
    ignore_index=True,
)
len(df_all)

### make sure to edit. create date uploaded column. rearrange the columns to make it easier for editing.

In [None]:
# Upload date stamped on every row - edit for each run.
# pd.Timestamp replaces the pd.datetime alias, which pandas no longer provides.
df_all['date_uploaded'] = pd.Timestamp(2020, 10, 21)

In [None]:
# Sanity check: columns, dtypes and non-null counts of the combined frame.
df_all.info()

In [None]:
# Select and order the output columns. reindex (rather than df[[...]]) creates
# any column that none of this run's exports produced - e.g. a status that did
# not occur - as all-NaN instead of raising KeyError.
df_all = df_all.reindex(columns=[
    'license_number', 'license_category','license_description','license_adult','license_medicinal','license_annual','license_provisional',
    'name_legal', 'name_dba', 
    'date_issue', 'date_expiration', 
    'status_active','status_canceled', 'status_expired', 'status_inactive','status_revoked', 'status_surrendered', 'status_suspended',
    'business_description', 'business_company_type',
    'contact_email','contact_phone','contact_website', 'contact_owner_1','contact_owner_2', 
#     'contact_street',
    'contact_city','contact_county','contact_state', 
#     'contact_zip', 
    'date_uploaded',
    'in_db',
])

assign names to NaN / blank name_legal. not using contact information as it could get confusing. if the name from the csv is "N/A", it gets converted to NaN after it's brought in.

In [None]:
# Rows without a usable legal name: empty string, missing, or the placeholder text.
name_is_blank = df_all['name_legal'] == ""
name_is_missing = df_all['name_legal'].isnull()
name_is_placeholder = df_all['name_legal'] == 'no legal business name provided'
df_name_legal_null = df_all[name_is_blank | name_is_missing | name_is_placeholder]

In [None]:
# How many rows need a generated legal name.
len(df_name_legal_null)

### change date for date of file yyyymmdd

In [None]:
# Date of the source files (yyyymmdd); embedded in the generated placeholder names.
assign_date = '20210113'

In [None]:
pd.set_option('mode.chained_assignment', None)

# Give every row lacking a usable legal name a unique placeholder of the form
# name_legal_null_<yyyymmdd>_<n>. Work on an explicit copy and assign the whole
# column at once; the former per-row chained .iloc assignment depended on
# writing through an intermediate object.
df_name_legal_null = df_name_legal_null.copy()
df_name_legal_null['name_legal'] = [
    'name_legal_null_' + assign_date + "_" + str(i)
    for i in range(len(df_name_legal_null))
]

# Add the renamed copies after the originals and keep the last row per license
# number, so each renamed copy supersedes its nameless original.
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
# NOTE(review): keep='last' also collapses any other repeated license_number
# in df_all, including rows whose license_number is missing — confirm intended.
df_all = pd.concat([df_all, df_name_legal_null], sort=False)
df_all.drop_duplicates(subset='license_number', keep='last', inplace=True)
df_all.reset_index(drop=True, inplace=True)

## Roll up

In [None]:
# Open the SQL Server connection. The password is read from the
# CA_CANNABIS_DB_PWD environment variable, with an interactive prompt as the
# fallback, so no secret is stored in the notebook.
# NOTE(review): the password that used to be written here has been exposed in
# the notebook's history and should be rotated.
db_password = os.environ.get('CA_CANNABIS_DB_PWD') or getpass.getpass('ca_cannabis database password: ')

cnxn = pyodbc.connect(server = 'bespoke-database-1.cmevrozrcs7c.us-west-2.rds.amazonaws.com', 
                      driver = '{ODBC Driver 17 for SQL Server}',
                      database = 'ca_cannabis',
                      UID = 'admin',
                      PWD = db_password)

In [None]:
# Read the whole ca_roll table; later cells use its license_number, roll_up_id
# and company_roll_up columns.
df_roll = pd.read_sql("SELECT * FROM ca_roll", cnxn)
len(df_roll)

TODO: decide whether to replace dashes ("-") and colons (":") with a space, and whether doing so would leave an unwanted extra space.

In [None]:
# df_all[df_all['name_legal_clean'].str.contains("-")]

In [None]:
# Start the cleaned-name column as a copy of name_legal; the next cell normalises it.
df_all['name_legal_clean'] = df_all['name_legal']

In [None]:
pd.set_option('mode.chained_assignment', None)

# Company-suffix patterns, applied in this order. They are regular expressions,
# so regex=True is passed explicitly: where the pandas default is literal
# matching these would otherwise never match anything.
# NOTE(review): the unescaped "." matches any character, so e.g. r"\bco.\b" also
# removes any three-letter word starting with "co". Left unchanged because the
# result is matched against company_roll_up, which is cleaned with the same
# patterns — tighten both together if this is unintended.
suffix_patterns = [
    r"\b, inc.\b",
    r"\binc.\b",
    r"\binc\b",
    r"\b, llc.\b",
    r"\b, llc\b",
    r"\b, l.l.c.\b",
    r"\bllc\b",
    r"\bcorp.\b",
    r"\bcorp\b",
    r"\bcorporation\b",
    r"\bco.\b",
    r"\bco\b",
]

name_clean = df_all['name_legal_clean'].str.lower()
for pattern in suffix_patterns:
    name_clean = name_clean.str.replace(pattern, "", regex=True)

# Punctuation is removed literally; regex=False keeps "." from meaning "any character".
for punctuation in (",", ".", "'"):
    name_clean = name_clean.str.replace(punctuation, "", regex=False)

df_all['name_legal_clean'] = name_clean.str.strip()

In [None]:
pd.set_option('mode.chained_assignment', None)

# Company-suffix patterns, applied in this order. They are regular expressions,
# so regex=True is passed explicitly: where the pandas default is literal
# matching these would otherwise never match anything.
# NOTE(review): the unescaped "." matches any character, so e.g. r"\bco.\b" also
# removes any three-letter word starting with "co". Left unchanged because the
# result is matched against name_legal_clean, which is cleaned with the same
# patterns — tighten both together if this is unintended.
suffix_patterns = [
    r"\b, inc.\b",
    r"\binc.\b",
    r"\binc\b",
    r"\b, llc.\b",
    r"\b, llc\b",
    r"\b, l.l.c.\b",
    r"\bllc\b",
    r"\bcorp.\b",
    r"\bcorp\b",
    r"\bcorporation\b",
    r"\bco.\b",
    r"\bco\b",
]

roll_up_clean = df_roll['company_roll_up'].str.lower()
for pattern in suffix_patterns:
    roll_up_clean = roll_up_clean.str.replace(pattern, "", regex=True)

# Punctuation (including the apostrophe) is removed literally; regex=False
# keeps "." from meaning "any character".
for punctuation in (",", ".", "'"):
    roll_up_clean = roll_up_clean.str.replace(punctuation, "", regex=False)

df_roll['company_roll_up'] = roll_up_clean.str.strip()

merge with df roll to assign roll up id. df_all_fill is the df that does NOT have any missing roll up id. company_nan is the df that HAS missing roll up id.

In [None]:
# Look up each license's roll-up by license number, then split the result into
# rows that found a roll_up_id and rows that did not.
df_all_merge = df_all.merge(df_roll, on='license_number', how='left')
roll_up_missing = df_all_merge['roll_up_id'].isnull()
df_all_fill = df_all_merge[~roll_up_missing]
company_nan = df_all_merge[roll_up_missing]

In [None]:
# len(df_all_merge) == len(df_all_fill) + len(company_nan)

drop roll up id and company roll up as they are going to be added in the merge (company_nan). take only the roll up id and company roll up (take out license #) and then drop duplicates across both columns.

In [None]:
# Clear the empty roll-up columns so the name-based merge can supply them, and
# reduce df_roll to its distinct (roll_up_id, company_roll_up) pairs.
company_nan_drop = company_nan.drop(columns=['roll_up_id', 'company_roll_up'])
df_roll_name_only = df_roll[['roll_up_id', 'company_roll_up']].drop_duplicates()

combine based on the name in the company roll up.

In [None]:
# Second attempt for rows with no roll-up by license number: left-join on the
# cleaned legal name against the cleaned company_roll_up name. Rows with no
# name match keep a missing roll_up_id.
company_nan_merge = pd.merge(company_nan_drop, df_roll_name_only, left_on='name_legal_clean', 
                             right_on='company_roll_up', how='left', suffixes=('', '_y'))

combine the df that does not have missing items to the one that was just populated based on name.

In [None]:
# Recombine the rows matched by license number with the rows matched (or not)
# by name, renumbering the rows 0..n-1.
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
df_final = pd.concat([df_all_fill, company_nan_merge], sort=False, ignore_index=True)

In [None]:
# Order alphabetically by cleaned legal name.
df_final = df_final.sort_values('name_legal_clean')

don't change the info except for contact / roll up

add column changed contact info if changes need to be made while editing file. this only makes sense if i am editing the new items. need to change it so that i can view the already altered information with the new items.

In [None]:
# Flag column, initialised to 0, for marking rows whose contact info is changed
# during the manual edit.
df_final['changed_contact_info'] = 0

In [None]:
# Read the whole ca_contact table; merged on license_number further down.
df_contact = pd.read_sql("SELECT * FROM ca_contact", cnxn)

In [None]:
# Read the whole ca_main table; its license_number column identifies licenses
# already stored in SQL.
df_main = pd.read_sql("SELECT * FROM ca_main", cnxn)

In [None]:
# main0['license_number'].isin(sql_main['license_number']).astype(int)
# df_final[df_final['license_number'].isin(df_main['license_number'])]
# in_sql = 1 when the license number already exists in ca_main, else 0.
df_final['in_sql'] = df_final['license_number'].isin(df_main['license_number']).astype(int)

In [None]:
# Licenses not yet in ca_main (new rows to add).
df_final_add = df_final[df_final['in_sql'] == 0]
# Column count, for comparison with the existing-license frame.
len(df_final_add.columns)

In [None]:
# Licenses already present in ca_main.
df_final_exist = df_final[df_final['in_sql'] == 1]
# Column count, for comparison with the new-license frame.
len(df_final_exist.columns)

In [None]:
# For licenses already in SQL keep every non-contact column; the contact
# columns are left out here and taken from ca_contact in the following merge.
# The trailing numbers are a running column count.
df_exist_min = df_final_exist[['license_number',  # 1
                               'license_category', 
                               'license_description',
                               'license_adult', 
                                'license_medicinal', # 5
                               'license_annual',
                               'license_provisional',
                                'name_legal', 
                               'name_dba', 
                               'name_legal_clean', # 10
                                'date_issue',
                               'date_expiration', 
                                'status_active', 
                               'status_canceled', 
                               'status_expired',  # 15
                               'status_inactive', 
                                'status_revoked', 
                               'status_surrendered',
                               'status_suspended', 
                                'business_description',  # 20
                               'business_company_type',
                                'company_roll_up', 
                               'roll_up_id',
                                'changed_contact_info',
                               'date_uploaded',  # 25
                               'in_db', 
                               'in_sql']] # 27

In [None]:
# Attach the contact details stored in ca_contact to the existing licenses.
# merge defaults to an inner join, so a license with no ca_contact row is dropped here.
df_sql_merge = df_exist_min.merge(df_contact, on='license_number')

In [None]:
# Existing licenses (with their stored contact info) followed by the new licenses.
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
df_to_edit = pd.concat([df_sql_merge, df_final_add], sort=False)

In [None]:
# Order alphabetically by cleaned legal name for easier manual editing.
df_to_edit = df_to_edit.sort_values('name_legal_clean')

In [None]:
# Column order for the file that gets edited by hand: license fields, names and
# roll-up, dates, statuses, business info, contact info, bookkeeping flags.
edit_file_columns = [
    'license_number', 'license_category', 'license_description', 'license_adult',
    'license_medicinal', 'license_annual', 'license_provisional',
    'name_legal', 'name_dba', 'name_legal_clean', 'roll_up_id', 'company_roll_up',
    'date_issue', 'date_expiration',
    'status_active', 'status_canceled', 'status_expired', 'status_inactive',
    'status_revoked', 'status_surrendered', 'status_suspended',
    'business_description', 'business_company_type',
    'contact_email', 'contact_phone', 'contact_website', 'contact_owner_1',
    'contact_owner_2', 'contact_street', 'contact_city', 'contact_county',
    'contact_state', 'contact_zip', 'changed_contact_info',
    'date_uploaded', 'in_db', 'in_sql',
]
df_to_edit = df_to_edit[edit_file_columns]

In [None]:
# len(df_to_edit)

In [None]:
# df_to_edit[df_to_edit['roll_up_id'] != ""]

In [None]:
# Manual step: change the date (yyyymmdd) in the file name before each run.
# Writes the frame that will be edited by hand, without the row index.
df_to_edit.to_csv(path_or_buf='../edited_files/df_final_20210113_to_edit.csv', index=False)

# MAKE EDITS TO FILE BEFORE GOING FURTHER

## Read in Edited

Only update contact info and roll up from edited file.

In [None]:
# Manual step: change the date (yyyymmdd) so it points at the hand-edited file.
df_edited = pd.read_csv(filepath_or_buffer='../edited_files/df_final_20210113_to_edit.csv')
# Row count of the edited file.
df_edited.shape[0]

In [None]:
# Guard added by Nolan: every row of the edited file must have a company_roll_up
# before the notebook goes any further.
# The previous version called print(error) with an undefined name, so a failed check
# surfaced as an unrelated NameError; raise a descriptive error instead.
missing_roll_up = df_edited['company_roll_up'].isnull()
if missing_roll_up.any():
    raise ValueError(
        "{} row(s) in the edited file have no company_roll_up; "
        "fill them in and re-read the file before continuing.".format(int(missing_roll_up.sum()))
    )
print("Continue")

In [None]:
# df_edited = pd.read_csv('../edited_files/df_final_20200527_edited.csv')
# len(df_edited)

Split df_edited into the rows where roll_up_id has already been populated and the rows where it has not.

In [None]:
# Rows that already carry a roll_up_id ...
df_edited_fill = df_edited.dropna(axis=0, subset=['roll_up_id'])
# ... and the rows that still need one.
roll_up_id_missing = df_edited['roll_up_id'].isna()
df_edited_null = df_edited[roll_up_id_missing]

take all the unique company names and then give them an id

In [None]:
# Largest roll_up_id currently in df_roll; new ids are numbered after it.
df_roll.loc[:, 'roll_up_id'].max()

In [None]:
# Same display as the previous cell (largest roll_up_id in df_roll).
df_roll.loc[:, 'roll_up_id'].max()

In [None]:
# Distinct roll-up names that do not have an id yet.
names_missing = df_edited_null['company_roll_up'].unique()
num_vals = len(names_missing)

# New ids are consecutive and start right after the largest id in df_roll.
first_new_id = df_roll['roll_up_id'].max() + 1
array_list = [first_new_id + offset for offset in range(num_vals)]
# Next unused id (the value the running counter ends on).
max_val = first_new_id + num_vals

id_array = np.asarray(array_list)

Below I am creating a DataFrame of the unique names (names_missing) and the ids that were created for them.

In [None]:
# Lookup table: one row per new company name, paired with its generated id.
new_roll_up_lookup = {
    'company_roll_up': names_missing,
    'roll_up_id': id_array,
}
pd_miss_fill = pd.DataFrame(data=new_roll_up_lookup)

drop roll up id because then it will be added on merge from the loop created above.

In [None]:
# Remove the empty roll_up_id column and pull the newly generated ids in by company name.
# The drop is no longer done in place: df_edited_null is a filtered slice of df_edited,
# so an in-place drop raises SettingWithCopyWarning and makes the cell fail with a
# KeyError when it is run a second time.
df_edited_pop = (
    df_edited_null
    .drop(columns='roll_up_id')
    .merge(pd_miss_fill, on='company_roll_up', how='left')
)

adding back the previously populated with the new populated

In [None]:
# Recombine the rows that already had an id with the rows that just received one.
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
df_populated = pd.concat([df_edited_fill, df_edited_pop], sort=False)
len(df_populated)

change from df_all

In [None]:
# Columns of the populated frame that are carried into df_main.
main_columns = [
    'license_number', 'license_category', 'license_description',
    'license_adult', 'license_medicinal', 'license_annual', 'license_provisional',
    'name_legal', 'name_dba',
    'date_issue', 'date_expiration',
    'status_active', 'status_canceled', 'status_expired',
    'status_revoked', 'status_surrendered', 'status_suspended',
    'business_description', 'business_company_type',
    'date_uploaded', 'in_db',
]
df_main = df_populated[main_columns]

In [None]:
# NOTE(review): this rebinds df_main to the full edited file, discarding the column
# subset selected from df_populated in the previous cell -- confirm the override is intended.
# A copy is taken so the later in-place edits of df_main (date parsing, lower-casing)
# do not silently modify df_edited as well.
df_main = df_edited.copy()

In [None]:
# Parse both date columns as month/day/4-digit-year.
# infer_datetime_format=True was added to deal with a recurring error.
for date_col in ('date_issue', 'date_expiration'):
    df_main[date_col] = pd.to_datetime(df_main[date_col], infer_datetime_format=True, format="%m/%d/%Y")

In [None]:
# Contact columns (keyed by license_number) that are compared against ca_contact.
# .copy() makes df_contact an independent frame: a later cell adds an in_db column to it,
# which would otherwise raise SettingWithCopyWarning on a slice of df_populated.
df_contact = df_populated[['license_number','contact_email','contact_phone','contact_website','contact_owner_1',
                           'contact_owner_2','contact_street','contact_city','contact_county','contact_state','contact_zip',
                          'changed_contact_info']].copy()

In [None]:
# Roll-up columns (keyed by license_number) that are compared against ca_roll.
# .copy() makes df_roll an independent frame: a later cell adds an in_db column to it,
# which would otherwise raise SettingWithCopyWarning on a slice of df_populated.
df_roll = df_populated[['license_number', 'roll_up_id', 'company_roll_up']].copy()

In [None]:
# df_main[df_main.duplicated(subset='license_number')]

## Compare to SQL

### Main

In [None]:
# Open the pyodbc connection used by the pd.read_sql cells below.
# The password is no longer committed in the notebook: it is read from the
# CA_CANNABIS_DB_PWD environment variable, and the user is prompted for it when the
# variable is not set. The previously committed password should be rotated.
import os
from getpass import getpass

db_password = os.environ.get('CA_CANNABIS_DB_PWD') or getpass('ca_cannabis admin password: ')

cnxn = pyodbc.connect(server = 'bespoke-database-1.cmevrozrcs7c.us-west-2.rds.amazonaws.com', 
                      driver = '{ODBC Driver 17 for SQL Server}',
                      database = 'ca_cannabis',
                      UID = 'admin',
                      PWD = db_password)

In [None]:
# Load the whole ca_main table and flag every row as already present in the database.
sql_main = pd.read_sql(sql="SELECT * FROM ca_main", con=cnxn)
sql_main = sql_main.assign(in_db=1)
sql_main.shape[0]

change the names to all lowercase so that it will drop duplicates

In [None]:
# Lower-case both name columns of the SQL rows so duplicates compare equal regardless of case.
for name_col in ('name_legal', 'name_dba'):
    sql_main[name_col] = sql_main[name_col].str.lower()

In [None]:
# Same normalisation for the file rows: lower-case the legal and DBA names.
for name_col in ('name_legal', 'name_dba'):
    df_main[name_col] = df_main[name_col].str.lower()

In [None]:
# Stack the SQL rows first and the file rows second, with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
main_all = pd.concat([sql_main, df_main], ignore_index=True)
len(main_all)

In [None]:
# Replace every missing value with 0 so that empty cells compare equal when de-duplicating.
main_all = main_all.fillna(value=0)

Fill NaN with 0 (in the cell above) so missing values are not left as None; this helps when deleting duplicates.

In [None]:
# Columns that define "the same record" when comparing SQL rows with file rows.
main_dedup_columns = [
    'license_number', 'license_category', 'license_description',
    'license_adult', 'license_medicinal', 'license_annual', 'license_provisional',
    'name_legal', 'name_dba',
    'date_issue', 'date_expiration',
    'status_active', 'status_canceled', 'status_expired',
    'status_revoked', 'status_surrendered', 'status_suspended',
    'business_description', 'business_company_type',
]
# Keep the first occurrence of each record (SQL rows come first in main_all).
main_drop = main_all.drop_duplicates(subset=main_dedup_columns, keep='first')

In [None]:
# Renumber the surviving rows 0..n-1 and show how many are left.
main_drop = main_drop.reset_index(drop=True)
main_drop.shape[0]

In [None]:
# Some text columns hold the string '0' instead of the numeric 0 written by fillna(0);
# turn those into numeric 0 so the rows compare equal in the next drop_duplicates.
# .loc masks replace the row-by-row chained `.iloc[i] = 0` assignment, which is slow,
# raises SettingWithCopyWarning, and does not write through under copy-on-write.
for text_col in ['name_dba', 'business_description', 'business_company_type']:
    main_drop.loc[main_drop[text_col] == '0', text_col] = 0

In [None]:
# Second de-duplication pass on the same record-defining columns, then renumber the rows.
main_drop = (
    main_drop
    .drop_duplicates(
        subset=[
            'license_number', 'license_category', 'license_description',
            'license_adult', 'license_medicinal', 'license_annual', 'license_provisional',
            'name_legal', 'name_dba', 'date_issue', 'date_expiration',
            'status_active', 'status_canceled', 'status_expired',
            'status_revoked', 'status_surrendered', 'status_suspended',
            'business_description', 'business_company_type',
        ],
        keep='first',
    )
    .reset_index(drop=True)
)
main_drop.shape[0]

check duplicates

In [None]:
# df_main[df_main['name_legal'].str.contains("’")]
# df_main['name_legal'] = df_main['name_legal'].str.replace("’", "'")

In [None]:
# main_drop[main_drop.duplicated(subset='license_number')]

In [None]:
# main_drop[main_drop['license_number'] == 'C13-0000016-LIC']

In [None]:
# main_drop.iloc[320] == main_drop.iloc[9339]

In [None]:
# main_drop.iloc[9338]

Had some issues where some DBA names were stored as the string '0' rather than the numeric 0, so those rows were not dropped when removing duplicates. Unsure if this is an ongoing problem.

In [None]:
# main_drop[main_drop['name_dba'] == '0']

Need to do the same thing for business description and business company type.

separate out where in_db = 0. creating a column called exists_db. It looks to see if the license number exists in the sql database. main_add are the license numbers that do not exist in the database.

In [None]:
# main_drop[main_drop['in_db'] == 0]

In [None]:
# main0['license_number'].isin(sql_main['license_number']).astype(int)

In [None]:
# main0[main0['exists_db'] == 0]

In [None]:
# Rows that came from the file (in_db == 0) and survived de-duplication.
# .copy() gives independent frames, so the column assignments and drops that follow
# do not raise SettingWithCopyWarning on a filtered slice.
main0 = main_drop[main_drop['in_db'] == 0].copy()
# exists_db = 1 when the license number is already present in the SQL table.
main0['exists_db'] = main0['license_number'].isin(sql_main['license_number']).astype(int)
# License numbers that are not in the database yet: these are the rows to insert.
main_add = main0[main0['exists_db'] == 0].copy()
len(main_add)

In [None]:
# Renumber the rows 0..n-1. Rebinding instead of inplace=True avoids mutating a
# filtered slice of main0 in place.
main_add = main_add.reset_index(drop=True)

drop the exists column as it is not in the database table

In [None]:
# exists_db is a helper column only. Rebinding instead of inplace=True avoids the
# SettingWithCopyWarning raised by an in-place drop on a filtered slice.
main_add = main_add.drop(columns='exists_db')

note - need to go in and change all the ones in the table to in_db = 1. i think i did not change it when initially uploading

In [None]:
# Mark the rows as present in the database before they are uploaded.
# assign() returns a new frame, so no value is set on a slice of main0
# (which raised SettingWithCopyWarning).
main_add = main_add.assign(in_db=1)

In [None]:
# main_add.info()

## Adding line items to SQL table ca_main

In [None]:
# URL-encoded ODBC connection string for the SQLAlchemy engine.
# The password is no longer committed in the notebook: it is read from the
# CA_CANNABIS_DB_PWD environment variable, and the user is prompted for it when the
# variable is not set. The previously committed password should be rotated.
import os
import urllib.parse  # `import urllib` alone does not guarantee the parse submodule is loaded
from getpass import getpass

db_password = os.environ.get('CA_CANNABIS_DB_PWD') or getpass('ca_cannabis admin password: ')

params = urllib.parse.quote_plus("DRIVER={ODBC Driver 17 for SQL Server};"
                                 "SERVER=bespoke-database-1.cmevrozrcs7c.us-west-2.rds.amazonaws.com;"
                                 "DATABASE=ca_cannabis;"
                                 "UID=admin;"
                                 "PWD=" + db_password)

In [None]:
# SQLAlchemy engine built from the encoded ODBC parameters; used by the to_sql uploads.
connection_url = "mssql+pyodbc:///?odbc_connect={}".format(params)
engine = create_engine(connection_url)

In [None]:
# For Comparing

# main_add.tail()

In [None]:
# For Comparing

# pd.read_sql('ca_main', con=engine).tail()

In [None]:
# Use this only if there are extra columns again.

# Columns that are allowed to remain in main_add before it is appended to ca_main.
ca_main_columns = [
    'license_number', 'license_category', 'license_description',
    'license_adult', 'license_medicinal', 'license_annual', 'license_provisional',
    'name_legal', 'name_dba',
    'date_issue', 'date_expiration',
    'status_active', 'status_canceled', 'status_expired',
    'status_revoked', 'status_surrendered', 'status_suspended',
    'business_description', 'business_company_type',
    'date_uploaded', 'in_db',
]

# Drop every main_add column that is not on the list above.
extra_columns = [column for column in main_add.columns if column not in ca_main_columns]
main_add = main_add.drop(columns=extra_columns)

In [None]:
# Append the new rows to the existing ca_main table (the row index is not written).
main_add.to_sql(name='ca_main', con=engine, if_exists='append', index=False)

In [None]:
# File rows whose license number already exists in SQL but whose content differs:
# these are the rows to update. .copy() makes main_alter independent of main0, so the
# drop and assignment in the following cells do not raise SettingWithCopyWarning.
main_alter = main0[main0['exists_db'] == 1].copy()
len(main_alter)

In [None]:
# main_alter

In [None]:
# exists_db is a helper column only. Rebinding instead of inplace=True avoids the
# SettingWithCopyWarning raised by an in-place drop on a filtered slice.
main_alter = main_alter.drop(columns='exists_db')

In [None]:
# Mark the rows as present in the database.
# assign() returns a new frame, so no value is set on a slice of main0
# (which raised SettingWithCopyWarning).
main_alter = main_alter.assign(in_db=1)

### need to export to excel to manually update SQL through SQL server import / export to make changes

In [None]:
# Manual step: change the date (mm_dd_yy) in the file name before each run.
# Exports the rows to update, without the row index.
main_alter.to_excel(excel_writer='../edited_files/main_alter_01_13_21.xlsx', index=False)

would rather have a check internally if the name legal or name dba has changed

### Contact

Going to have to add a column 'edit_contact' of 1s and 0s that I mark while editing, to identify which rows then need to be updated in contact.

In [None]:
# Load the whole ca_contact table and show its row count.
sql_contact = pd.read_sql(sql="SELECT * FROM ca_contact", con=cnxn)
sql_contact.shape[0]

In [None]:
# Flag where each row came from: SQL rows get in_db = 1 (and changed_contact_info = 0),
# file rows get in_db = 0.
# assign() returns new frames, so no value is set on df_contact in place; df_contact is
# a column slice of df_populated and the in-place assignment raised SettingWithCopyWarning.
sql_contact = sql_contact.assign(in_db=1, changed_contact_info=0)

df_contact = df_contact.assign(in_db=0)

In [None]:
# Stack the SQL rows first and the file rows second, then replace missing values with 0.
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
contact_all = pd.concat([sql_contact, df_contact], ignore_index=True, sort=False).fillna(0)
len(contact_all)

In [None]:
# contact_drop.in_db.value_counts()

In [None]:
# One row per license number; the first occurrence wins (SQL rows come first in contact_all).
repeated_license = contact_all.duplicated(subset=['license_number'], keep='first')
contact_drop = contact_all[~repeated_license]
contact_drop.shape[0]

In [None]:
# How many of the remaining rows came from SQL (1) versus the file (0).
contact_drop['in_db'].value_counts()

In [None]:
# File rows whose license number is not in ca_contact yet: these are the rows to insert.
# .copy() makes contact0 independent of contact_drop, so the column drop in the
# following cell does not raise SettingWithCopyWarning.
contact0 = contact_drop[contact_drop['in_db'] == 0].copy()
len(contact0)

below is the one to add

In [None]:
# Remove the two bookkeeping columns before the upload. Rebinding instead of
# inplace=True avoids the SettingWithCopyWarning raised by an in-place drop on a
# filtered slice.
contact0 = contact0.drop(columns=['in_db', 'changed_contact_info'])

In [None]:
# Re-create the SQLAlchemy engine from the encoded ODBC parameters for the contact upload.
connection_url = "mssql+pyodbc:///?odbc_connect={}".format(params)
engine = create_engine(connection_url)

In [None]:
# Append the new contact rows to the existing ca_contact table (the row index is not written).
contact0.to_sql(name='ca_contact', con=engine, if_exists='append', index=False)

to alter

In [None]:
# Rows flagged as changed_contact_info == 1: contact rows to update.
# .copy() makes contact_add independent of contact_all, so the column drop in the
# following cell does not raise SettingWithCopyWarning.
contact_add = contact_all[contact_all['changed_contact_info'] == 1].copy()
len(contact_add)

In [None]:
# Remove the two bookkeeping columns. Rebinding instead of inplace=True avoids the
# SettingWithCopyWarning raised by an in-place drop on a filtered slice.
contact_add = contact_add.drop(columns=['in_db', 'changed_contact_info'])

### Roll Up

come up with way to alter company roll up ids 

In [None]:
# Load the whole ca_roll table.
sql_roll = pd.read_sql(sql="SELECT * FROM ca_roll", con=cnxn)

In [None]:
# Number of rows read from ca_roll.
sql_roll.shape[0]

In [None]:
# Flag where each row came from: SQL rows get in_db = 1, file rows get in_db = 0.
# assign() returns new frames, so no value is set on df_roll in place; df_roll is a
# column slice of df_populated and the in-place assignment raised SettingWithCopyWarning.
sql_roll = sql_roll.assign(in_db=1)
df_roll = df_roll.assign(in_db=0)

In [None]:
# Stack the SQL rows first and the file rows second, with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
roll_all = pd.concat([sql_roll, df_roll], ignore_index=True)
len(roll_all)

In [None]:
# One row per license number; the first occurrence wins (SQL rows come first in roll_all).
repeated_license = roll_all.duplicated(subset=['license_number'], keep='first')
roll_drop = roll_all[~repeated_license]
roll_drop.shape[0]

In [None]:
# File rows (in_db == 0) that survived de-duplication.
# .copy() makes roll0 independent of roll_drop, so adding the exists_db column in the
# following cell does not raise SettingWithCopyWarning.
roll0 = roll_drop[roll_drop['in_db'] == 0].copy()
len(roll0)

In [None]:
# exists_db = 1 when the license number is already present in ca_roll.
# assign() returns a new frame, so no value is set on a slice of roll_drop
# (which raised SettingWithCopyWarning).
roll0 = roll0.assign(
    exists_db=roll0['license_number'].isin(sql_roll['license_number']).astype(int)
)

In [None]:
# License numbers that are not in ca_roll yet: these are the rows to insert.
# .copy() makes roll_add independent of roll0, so the column drops in the following
# cells do not raise SettingWithCopyWarning.
roll_add = roll0[roll0['exists_db'] == 0].copy()
len(roll_add)

In [None]:
# exists_db is a helper column only. Rebinding instead of inplace=True avoids the
# SettingWithCopyWarning raised by an in-place drop on a filtered slice.
roll_add = roll_add.drop(columns='exists_db')

In [None]:
# roll_add['in_db'] = 1

In [None]:
# Remove the in_db bookkeeping column before the upload. Rebinding instead of
# inplace=True avoids the SettingWithCopyWarning raised by an in-place drop on a
# filtered slice.
roll_add = roll_add.drop(columns='in_db')

In [None]:
# Peek at the first row to check the columns before uploading.
roll_add.iloc[:1]

In [None]:
# Re-create the SQLAlchemy engine from the encoded ODBC parameters for the roll-up upload.
connection_url = "mssql+pyodbc:///?odbc_connect={}".format(params)
engine = create_engine(connection_url)

In [None]:
# Append the new roll-up rows to the existing ca_roll table (the row index is not written).
roll_add.to_sql(name='ca_roll', con=engine, if_exists='append', index=False)

In [None]:
# roll_alter = roll0[roll0['exists_db'] == 1]
# len(roll_alter)

In [None]:
# roll_alter.drop(columns='exists_db', inplace=True)

In [None]:
# roll_alter['in_db'] = 1

# Notes

In [None]:
#UPDATE SQL SERVER

# main_alter.to_sql('temp_table', con=engine, if_exists='append', index=False)
# main_alter.groupby('date_uploaded').sum()

# sql = """Update t1
# SET t1.[license_adult] = t2.[license_adult], 
#     t1.[license_medicinal] = t2.[license_medicinal],
#     t1.[license_annual] = t2.[license_annual],
#     t1.[license_provisional] = t2.[license_provisional],
#     t1.[date_issue] = t2.[date_issue],
#     t1.[date_expiration] = t2.[date_expiration],
#     t1.[status_active] = t2.[status_active],
#     t1.[status_canceled] = t2.[status_canceled],
#     t1.[status_expired] = t2.[status_expired],
#     t1.[status_revoked] = t2.[status_revoked],
#     t1.[status_surrendered] = t2.[status_surrendered],
#     t1.[status_suspended] = t2.[status_suspended]
# FROM [ca_cannabis].[dbo].[ca_main] as t1
# INNER JOIN [ca_cannabis].[dbo].[temp_table] as t2
# ON t1.[license_number] = t2.[license_number];"""

# (password redacted from this commented-out snippet; supply it via the environment)
# cnxn = pyodbc.connect(server = 'bespoke-database-1.cmevrozrcs7c.us-west-2.rds.amazonaws.com', 
#                       driver = '{ODBC Driver 17 for SQL Server}',
#                       database = 'ca_cannabis',
#                       UID = 'admin',
#                       PWD = os.environ['CA_CANNABIS_DB_PWD'])

# mycursor = cnxn.cursor()

# mycursor.execute(sql)

# sql = "DELETE FROM temp_table"

# mycursor.execute(sql)

In [None]:
#UPDATE SQL SERVER

# contact_add.to_sql('temp_table_contact', con=engine, if_exists='append', index=False)

# sql = """Update t1
# SET t1.[contact_email] = t2.[contact_email], 
# t1.[contact_phone] = t2.[contact_phone], 
# t1.[contact_website] = t2.[contact_website],
# t1.[contact_owner_1] = t2.[contact_owner_1], 
# t1.[contact_owner_2] = t2.[contact_owner_2], 
# t1.[contact_street] = t2.[contact_street], 
# t1.[contact_city] = t2.[contact_city],
# t1.[contact_county] = t2.[contact_county], 
# t1.[contact_state] = t2.[contact_state], 
# t1.[contact_zip] = t2.[contact_zip]
# FROM [ca_cannabis].[dbo].[ca_contact] as t1
# INNER JOIN [ca_cannabis].[dbo].[temp_table_contact] as t2
# ON t1.[license_number] = t2.[license_number];"""

# mycursor = cnxn.cursor()
# mycursor.execute(sql)
# sql = "DELETE FROM temp_table"
# mycursor.execute(sql)

In [None]:
#CA CONTROL - address

# df_add = df_file['premise_address'].str.split(',', expand=True)
# df_zip_county = df_add[1].str.split('County:', expand=True)
# df_street = df_add[0]
# df_zip_county[0] = df_zip_county[0].str.replace("CA", "")
# df_zip_county[0] = df_zip_county[0].str.strip()
# df_zip_county[1] = df_zip_county[1].str.strip()
# df_zip_county[1] = df_zip_county[1].str.lower()
# df_zip_county[0] = df_zip_county[0].str[:5]
# df_zip_county.rename(columns={0: 'contact_zip', 1: 'contact_county'}, inplace=True)
# df_zip_county['contact_state'] = 'CA'
# pd_zip_county = pd.concat([df_street, df_zip_county], axis=1)
# pd_zip_county.rename(columns={0:'contact_street'}, inplace=True)
# pd_zip_county['contact_street'] = pd_zip_county['contact_street'].str.lower()
# pd_zip_county['contact_street'] = pd_zip_county['contact_street'].str.strip()

In [None]:
# (password redacted from this commented-out snippet; supply it via the environment)
# cnxn = pyodbc.connect(server = 'bespoke-database-1.cmevrozrcs7c.us-west-2.rds.amazonaws.com', 
#                       driver = '{ODBC Driver 17 for SQL Server}',
#                       database = 'ca_cannabis',
#                       UID = 'admin',
#                       PWD = os.environ['CA_CANNABIS_DB_PWD'])

In [None]:
# cnxn = pyodbc.connect('Trusted_Connection=yes',
#                       server = 'DESKTOP-KA6KCMH\SQLEXPRESS', 
#                       driver = '{ODBC Driver 17 for SQL Server}',
#                       database = 'ca_cannabis_v3'
#                       )

In [None]:
#i think this is more useful when i was only doing a subset of the control file and not the entirety.
# df2[~df2[5].isnull()]
#I dont know what this does. renames the column headers I think if need to remove columns
# df2 = df2.reindex(columns=range(len(df_names_removed.columns)))
#again not sure the point
# df2.columns = df_names_removed.columns

In [None]:
# df_file[df_file.duplicated()]
#only need to do if there are duplicates

# df_file.drop_duplicates(subset='license_number', keep='first', inplace=True)
# df_file.reset_index(drop=True, inplace=True)

In [None]:
# df_file_m[df_file_m.duplicated()]

In [None]:
# cult_biz = pd.read_csv('./cultivation_files/Annual_ Provisional Business 2020_03_12.csv')
# cult_drp = pd.read_csv('./cultivation_files/Annual_Provisional DRP 2020_03_12.csv')
# cult_df = cult_biz.append(cult_drp, sort=False)
# cult_df[cult_df['License Number'] == 'License Number']
# cult_df.drop_duplicates(subset='License Number', keep='first', inplace=True)

In [None]:
# df_new_items.drop(columns=['status','in_db'], inplace=True)

In [None]:
# df_all = pd.read_csv('./df_all_v10.csv')

In [None]:
# df_all['date_issue'] = pd.to_datetime(df_all['date_issue'], format="%m/%d/%Y")
# df_all['date_expiration'] = pd.to_datetime(df_all['date_expiration'], format="%m/%d/%Y")
# df_all['date_uploaded'] = pd.to_datetime(df_all['date_uploaded'], format="%m/%d/%Y")

# df_all['status_inactive'] = 0

In [None]:
# df_name_legal_null = df_all[df_all['name_legal'].isnull()]

In [None]:
# check to see if df_roll has any blank name_legal. if so delete. 
# keep blanks to assign them a roll up id and company name if possible.

# df_roll[df_roll['company_roll_up'].isnull()]

In [None]:
# need to remove blanks or when you merge on name legal it creates a bunch of duplicates
# df_all.drop(df_all[df_all['name_legal'].isnull()].index, inplace=True)
# df_all.reset_index(drop=True, inplace=True)

In [None]:
# df_final.rename(columns={'changed_contact_info?': 'changed_contact_info'}, inplace=True)

In [None]:
# main_drop[main_drop.duplicated(subset='license_number')]
# main_drop[main_drop['license_number'] == 'CCL18-0001395']
# sql_dupe = sql_main[sql_main.duplicated(subset='license_number')]
# sql_main[sql_main['license_number'] == 'CCL18-0001395']
# sql_main[sql_main.duplicated(subset='license_number')]

In [None]:
# engine = create_engine('mssql+pyodbc://DESKTOP-KA6KCMH\SQLEXPRESS/ca_cannabis_v3?driver=ODBC Driver 17 for SQL Server')

In [None]:
# contact_drop_v1 = contact_all.drop_duplicates(subset=['license_number', 'contact_email', 'contact_phone', 'contact_website',
#        'contact_owner_1', 'contact_owner_2', 'contact_street', 'contact_city',
#        'contact_county', 'contact_state', 'contact_zip'], keep='first')

# contact_drop_v1[contact_drop_v1.duplicated(subset='license_number')]

In [None]:
# sql_main = pd.read_sql("SELECT * FROM ca_main", cnxn)
# sql_contact = pd.read_sql("SELECT * FROM ca_contact", cnxn)
# sql_roll = pd.read_sql("SELECT * FROM ca_roll", cnxn)
# sql_main.to_excel('./sql_main_20200331.xlsx', index=False)
# sql_contact.to_excel('./sql_contact_20200331.xlsx', index=False)
# sql_roll.to_excel('./sql_roll_20200331.xlsx', index=False)

In [None]:
# roll_all.isnull()
# roll_all.fillna(0, inplace=True)

In [None]:
# main_drop = main_drop.append(main_change_0)
# main_drop.drop_duplicates(subset='license_number', keep='last')
# main_drop[main_drop['license_number'] == 'CCL18-0000011']
# main_drop.iloc[463] == main_drop.iloc[9046]
# main_drop['name_dba'].iloc[9628] == 0
# main_drop['name_dba'].iloc[2146]
# main_drop[main_drop['name_dba'] == '0']

In [None]:
# main_drop[main_drop['name_dba'] == '0']
# main_drop['name_dba'].iloc[2] == '0'

# main_change_0 = main_drop[main_drop['name_dba'] == '0']
# main_drop['lic'] == 0
#122 = '0'
#7187 = 0
# main_change_0['name_dba'] = 0
# main_change_0['name_dba'].iloc[0]

In [None]:
# sql_main.fillna(0, inplace=True)
# df_main.fillna(0, inplace=True)

In [None]:
# df_name_legal_null = df_all[(df_all['name_legal'] == "") | (df_all['name_legal'].isnull())]
# len(df_name_legal_null)

In [None]:

# 'license_number', 
# 'license_category', 
# 'license_description',
# 'license_adult', 
# 'license_medicinal', 
# 'license_annual',
# 'license_provisional', 
# 'name_legal', 
# 'name_dba', 
# 'name_legal_clean',
# 'roll_up_id', 
# 'company_roll_up', 
# 'date_issue', 
# 'date_expiration',
# 'status_active', 
# 'status_canceled', 
# 'status_expired', 
# 'status_inactive',
# 'status_revoked', 
# 'status_surrendered', 
# 'status_suspended',
# 'business_description', 
# 'business_company_type', 
# 'contact_email', 
# 'contact_phone',
# 'contact_website',
# 'contact_owner_1', 
# 'contact_owner_2',
# 'contact_street',
# 'contact_city',
# 'contact_county', 
# 'contact_state',
# 'contact_zip',
# 'changed_contact_info',
# 'date_uploaded', 
# 'in_db' #36 total


In [None]:
# df_final_add['date_uploaded'] = pd.datetime(2020, 4,29)

# df_final_add['in_db'] = 0

# df_sql_merge.drop(columns=['roll_up_id', 'company_roll_up'])

In [None]:
# sql = """Update t1
# SET t1.[name_legal] = t2.[name_legal],
#     t1.[name_dba] = t2.[name_dba],
#     t1.[license_adult] = t2.[license_adult], 
#     t1.[license_medicinal] = t2.[license_medicinal],
#     t1.[license_annual] = t2.[license_annual],
#     t1.[license_provisional] = t2.[license_provisional],
#     t1.[date_issue] = t2.[date_issue],
#     t1.[date_expiration] = t2.[date_expiration],
#     t1.[status_active] = t2.[status_active],
#     t1.[status_canceled] = t2.[status_canceled],
#     t1.[status_expired] = t2.[status_expired],
#     t1.[status_revoked] = t2.[status_revoked],
#     t1.[status_surrendered] = t2.[status_surrendered],
#     t1.[status_suspended] = t2.[status_suspended]
# FROM [ca_cannabis].[dbo].[ca_main] as t1
# INNER JOIN [ca_cannabis].[dbo].[temp_table] as t2
# ON t1.[license_number] = t2.[license_number];"""

In [None]:
# main_alter['date_uploaded'] = pd.to_datetime(main_alter['date_uploaded'], format="%m/%d/%Y")
# main_alter.columns
# main_alter.iloc[:,[0,1,2,4,5,6,7,9,10,11,12,13,14,15,16]]
# main_drop[main_drop['license_number'] == 'CCL18-0001771'].iloc[:,[1,2,4,5,6,7,9,10,11,12,13,14,15,16]]
# main_drop.iloc[9138]
# main_drop.iloc[7278] == main_drop.iloc[10293]
# main_alter.iloc[0]
# main_drop[main_drop['license_number'] == 'CCL19-0000084']

In [None]:
# df_main[df_main['license_number'] == 'CCL19-0001283']
# sql_main[sql_main['license_number'] == 'CCL19-0001283']

In [None]:
# df_sql_merge[['license_number', 'license_category', 'license_description',
#        'license_adult', 'license_medicinal', 'license_annual',
#        'license_provisional', 
#         'name_legal', 'name_dba', 'name_legal_clean',
#        'roll_up_id', 'company_roll_up', 
#         'date_issue', 'date_expiration',
#        'status_active', 'status_canceled', 'status_expired', 'status_inactive',
#        'status_revoked', 'status_surrendered', 'status_suspended',    
#        'business_description', 'business_company_type', 
#         'company_roll_up','roll_up_id', 
#         'changed_contact_info', 'date_uploaded', 'in_db','in_sql', 
#         'contact_email', 'contact_phone', 'contact_website',
#        'contact_owner_1', 'contact_owner_2', 'contact_street', 'contact_city',
#        'contact_county', 'contact_state', 'contact_zip']]

In [None]:
# df_final_add[['license_number', 'license_category', 'license_description',
#        'license_adult', 'license_medicinal', 'license_annual',
#        'license_provisional', 
#         'name_legal', 'name_dba', 'name_legal_clean',
#         'roll_up_id', 'company_roll_up',      
#         'date_issue', 'date_expiration', 
#         'status_active', 'status_canceled', 'status_expired', 'status_inactive', 
#         'status_revoked', 'status_surrendered','status_suspended', 
#         'business_description', 'business_company_type',
#        'contact_email', 'contact_phone', 'contact_website', 'contact_owner_1',
#        'contact_owner_2', 'contact_street', 'contact_city', 'contact_county',
#        'contact_state', 'contact_zip', 'date_uploaded', 'in_db',
        
#        'changed_contact_info', 'in_sql']]

In [None]:
# df_edited['roll_up_id'].max()

In [None]:
# df_sql_merge.append(df_final_add, sort=False)
# df_sql_merge['in_sql'] = 1
# df_to_edit.columns