In [None]:
import datetime as dt
import os
import re
import urllib
import urllib.parse

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [None]:
# https://online-dfpr.micropact.com/
# Copy Paste from Adult Use and Medical

# Eventually, switch to bulk lookup
# https://www.idfpr.com/LicenseLookUp/BulkLookup.asp

In [None]:
# Show up to 250 rows when a frame is displayed (a later cell lowers this to 100)
pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 250)

In [None]:
# Single source of truth for every date-stamped value and file name in this notebook.
# Swap the two lines below to run against today's export instead of a fixed one.

# this_date = dt.date.today()
this_date = dt.date(2021, 8, 10)

# Zero-padded day and month plus four-digit year, all as strings
this_day, this_month, this_year = this_date.strftime('%d %m %Y').split()

In [None]:
# Display limits used for the rest of the notebook: 100 rows, 100 columns, untruncated cell text
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 100)
pd.set_option('display.max_columns', 100)
pd.options.display.max_colwidth = None

In [None]:
# Download this PDF https://www.idfpr.com/LicenseLookup/AdultUseDispensaries.pdf or check https://www.idfpr.com/profs/adultusecan.asp
# Use this to convert the PDF to Excel https://www.adobe.com/acrobat/online/pdf-to-excel.html

# The current link is only for Dispensaries, so this will all need new formatting if we get a new list

In [None]:
# These are the types of cannabis licenses Illinois provides, though I've only found licenses for the last one so far

# Cultivation centers
# Craft growers
# Processors
# Transporting organizations
# Dispensing organizations

### SQL Connection

In [None]:
# # Typing the driver directly into the create_engine kept not working, so I'm trying it this way
# driver="ODBC Driver 17 for SQL Server"
# engine = create_engine(f'mssql://LAPTOP-E6QKON1L/il_cannabis?driver={driver}')
# engine_con = engine.connect()

In [None]:
# This creates the connection to the il_cannabis (Illinois cannabis) database in SQL.
# The login is read from environment variables so that no secret lives in the notebook:
#   IL_CANNABIS_DB_PWD     required
#   IL_CANNABIS_DB_UID     optional, defaults to 'admin'
#   IL_CANNABIS_DB_SERVER  optional, defaults to the RDS host below
# NOTE(review): the password that used to be hardcoded here should be rotated.
db_server = os.environ.get('IL_CANNABIS_DB_SERVER',
                           'bespoke-database-1.cmevrozrcs7c.us-west-2.rds.amazonaws.com')
db_user = os.environ.get('IL_CANNABIS_DB_UID', 'admin')
db_password = os.environ.get('IL_CANNABIS_DB_PWD')
if not db_password:
    raise RuntimeError('Set the IL_CANNABIS_DB_PWD environment variable before connecting.')

# The whole ODBC string is URL-quoted because it is passed as a query parameter
params = urllib.parse.quote_plus("DRIVER={ODBC Driver 17 for SQL Server};"
                                 f"SERVER={db_server};"
                                 "DATABASE=il_cannabis;"
                                 f"UID={db_user};"
                                 f"PWD={db_password}")
engine = create_engine("mssql+pyodbc:///?odbc_connect={}".format(params))
engine_con = engine.connect()

### Read in the Data

In [None]:
# This reads in the date from https://www.idfpr.com/LicenseLookup/AdultUseDispensaries.pdf

# il_dispensaries = pd.read_excel(f'licenses/il_licenses_{this_year}_{this_month}_{this_day}.xlsx', header=5)
il_adult_use = pd.read_excel(f'licenses/il_licenses_{this_year}_{this_month}_{this_day}.xlsx')

In [None]:
il_adult_use = il_adult_use.rename(columns={'Original':'Original Issue Date'})
il_adult_use.head()

In [None]:
for col in il_adult_use.columns:
    if col[:7] == 'Unnamed':
        il_adult_use = il_adult_use.drop(columns={col})
        
il_adult_use.head()

In [None]:
il_adult_use['License Type'] = 'Adult Use'

In [None]:
il_medical = pd.read_excel(f'licenses/il_medical_{this_year}_{this_month}_{this_day}.xlsx')

In [None]:
il_medical = il_medical.rename(columns={'Original':'Original Issue Date'})
il_medical.head()

In [None]:
for col in il_medical.columns:
    if col[:7] == 'Unnamed':
        il_medical = il_medical.drop(columns={col})
        
il_medical.head()

In [None]:
il_medical['License Type'] = 'Medical'

In [None]:
il_cannabis = il_adult_use.append(il_medical)

### Data Cleaning

In [None]:
# Keep only the rows that actually carry a license number in 'Credential'
il_cannabis = il_cannabis.dropna(subset=['Credential'])

In [None]:
# This function will remove all of the \ns from each of the columns

def remove_slash_n(column, frame=None):
    """Replace every newline in ``column`` with a single space, in place.

    Each cell is passed through ``str()`` first, so non-string cells
    (including missing values) end up as their string form, e.g. ``'nan'``.

    Parameters
    ----------
    column : label of the column to clean.
    frame : DataFrame to modify. Defaults to the notebook-level ``il_cannabis``,
        which is what existing one-argument calls operate on.

    Returns
    -------
    None; the column is overwritten on the frame.
    """
    target = il_cannabis if frame is None else frame
    # A plain list is assigned so values are written back by position,
    # which stays correct even when the frame has repeated row labels.
    target[column] = [str(cell).replace('\n', ' ') for cell in target[column]]

In [None]:
il_cannabis.columns

In [None]:
# I think these are all of the columns that need this
remove_slash_n('DBA / AKA')
remove_slash_n('Name')
# remove_slash_n('Address & Phone Number')

In [None]:
# Force every DBA / AKA entry to text so the string clean-up that follows can treat all cells alike
str_list_dba = [str(dba) for dba in il_cannabis['DBA / AKA']]
il_cannabis['DBA / AKA'] = str_list_dba

In [None]:
# This should remove the unneeded spaces by removing spaces followed by lowercase letters
# All spaces followed by capital letters are supposed to be there, since all words start with capital letters in the names
# Hopefully this doesn't get screwed up by some words not starting with capital letters

def remove_spaces(column, frame=None):
    """Delete every space that sits directly before a lowercase letter, in place.

    Runs of spaces before a lowercase letter are removed entirely, which matches
    repeatedly deleting one such space until none is left. Spaces before capitals,
    digits or punctuation are kept. Cells that are not strings are left unchanged.

    Parameters
    ----------
    column : label of the column to clean.
    frame : DataFrame to modify. Defaults to the notebook-level ``il_cannabis``,
        which is what existing one-argument calls operate on.

    Returns
    -------
    None; the column is overwritten on the frame.
    """
    target = il_cannabis if frame is None else frame
    # ' +(?=[a-z])' = one or more spaces followed (lookahead, not consumed) by a-z.
    # One substitution pass replaces the old search-and-splice loop.
    target[column] = [
        re.sub(' +(?=[a-z])', '', cell) if isinstance(cell, str) else cell
        for cell in target[column]
    ]

In [None]:
il_cannabis['Name'][20:30]

In [None]:
remove_spaces('Name')

In [None]:
il_cannabis['Name'][20:30]

In [None]:
il_cannabis['DBA / AKA'][40:50]

In [None]:
remove_spaces('DBA / AKA')

In [None]:
il_cannabis['DBA / AKA'][40:50]

In [None]:
# Tidy company suffixes: drop the trailing period after LLC / Inc and the comma in front of them.
# The result goes into a new 'License Holder' column; 'DBA / AKA' itself is left as it was.
# NOTE(review): 'License Holder' is not read anywhere else in this notebook - confirm it is still wanted.
suffix_fixes = [
    ('LLC.', 'LLC'),
    ('Inc.', 'Inc'),
    (', LLC', ' LLC'),
    (', Inc', ' Inc'),
]
new_lic_holder_col = []
for dba_name in il_cannabis['DBA / AKA']:
    # Order matters: periods are removed before the comma forms are rewritten
    for old_text, new_text in suffix_fixes:
        dba_name = dba_name.replace(old_text, new_text)
    new_lic_holder_col.append(dba_name)
il_cannabis['License Holder'] = new_lic_holder_col

In [None]:
il_cannabis['DBA / AKA'].unique()

In [None]:
# This strips the ', IL' state suffix from City/State so only the contact city is left.
# Missing entries are first filled with ', IL', which therefore end up as an empty string.
il_cannabis['City/State'] = il_cannabis['City/State'].fillna(', IL')
city_list = [city_state.replace(', IL', '') for city_state in il_cannabis['City/State']]
il_cannabis['City/State'] = city_list

In [None]:
il_cannabis.head()

In [None]:
# These are the 
# il_cannabis[il_cannabis['Original'].isnull()]

### New Columns

In [None]:
# Address and Phone column not present currently

In [None]:
# # This extracts the phone number column from the Address & Phone Number column
# phone_list = []
# for info in il_cannabis['Address & Phone Number']:
#     info = info[-14:]
#     info = info.replace('(', '').replace(')', '').replace('-', '').replace(' ', '')
#     phone_list.append(info)
# il_cannabis['contact_phone'] = phone_list

In [None]:
# # This extracts the address column from the Address & Phone Number column
# address_list = []
# for info in il_cannabis['Address & Phone Number']:
#     info = info.split(',')[0]
#     address_list.append(info)
# il_cannabis['contact_address'] = address_list

In [None]:
# # This extracts the zip code column from the Address & Phone Number column
# zip_list = []
# for info in il_cannabis['Address & Phone Number']:
#     info = info.split(sep=',')
#     info = info[-1]
#     info = info.replace('Illinois ', '')
#     info = info.replace('IL ', '')
#     info = info.replace(' ', '')
#     info = info[:5]
#     zip_list.append(info)
# il_cannabis['contact_zip'] = zip_list

In [None]:
# # This extracts the zip code column from the Address & Phone Number column
# zip_list = []
# for info in il_cannabis['Address & Phone Number']:
#     info = info[:-14]
#     while info[-1] == ' ':
#         info = info[:-1]
#     while info[-1] == '.':
#         info = info[:-1]
#     if info[-5] == '-':
#         info = info[:-5]
#     zip_list.append(int(info[-5:]))
# il_cannabis['contact_zip'] = zip_list

In [None]:
# This just adds the state, which doesn't really need extraction
il_cannabis['contact_state'] = 'IL'

In [None]:
il_cannabis.head()

### Renaming and Reorganizing Columns

In [None]:
renamed_data = pd.DataFrame()

In [None]:
# Source column -> output column, listed in the order the output columns are created.
# Not mapped here (previously commented out): medical ('Serve medical patients?'),
# company_roll_up, roll_up_id, contact_phone, contact_address, contact_zip.
source_to_output = {
    'Credential': 'license_number',
    'License Type': 'license_description',
    'Name': 'name_legal',
    'DBA / AKA': 'name_dba',
    'License Status': 'license_status',
    'City/State': 'contact_city',
    'contact_state': 'contact_state',
    'Original Issue Date': 'date_issued',
    'Current Expiration Date': 'expiration_date',
}
for source_col, output_col in source_to_output.items():
    renamed_data[output_col] = il_cannabis[source_col]

# Every license in the fresh export is flagged as current
renamed_data['is_current'] = 1

In [None]:
# renamed_data['medical'] = renamed_data['medical'].replace('Yes', 1).replace('No', 0).astype(int)

In [None]:
renamed_data['date_uploaded'] = this_date

In [None]:
renamed_data.head()

### Add Info From SQL

In [None]:
old_rolls = pd.read_sql('il_roll', con=engine_con)

In [None]:
combined_data = renamed_data.merge(old_rolls, how='left', on='license_number')

In [None]:
# Licenses with no match in il_roll have no roll-up values after the left merge; blank those out
for roll_col in ('company_roll_up', 'roll_up_id'):
    combined_data[roll_col] = combined_data[roll_col].fillna('')

In [None]:
combined_data.columns

In [None]:
# Put the roll-up columns right after the name columns, which makes manual editing easier.
# reindex creates any listed column that is missing (filled with NaN) and drops every
# column that is not listed.
# NOTE(review): contact_city and expiration_date were set earlier but are not listed here,
# so they are dropped at this point - confirm that is intended.
output_columns = [
    'license_number', 'license_description', 'name_legal', 'name_dba',
    'company_roll_up', 'roll_up_id', 'medical',
    'contact_phone', 'contact_address', 'contact_zip', 'contact_state',
    'date_issued', 'license_status', 'is_current', 'date_uploaded',
]
combined_data = combined_data.reindex(columns=output_columns)

In [None]:
combined_data.head()

In [None]:
# Collect the license numbers that are in il_roll but no longer in the new export,
# so a later cell can add them back with is_current set to 0
new_lic = list(combined_data['license_number'].unique())
old_lic = list(old_rolls['license_number'].unique())
null_lic = [lic for lic in old_lic if lic not in new_lic]

In [None]:
# This reads in the data that will be merged into the null_df
old_main = pd.read_sql('il_main', con=engine_con)
old_contact = pd.read_sql('il_contact', con=engine_con)

In [None]:
# And this fully creates the data that will be merged into the end:
# one row per license that dropped off the list, with its stored main / roll / contact details.
# columns= takes a list; passing a set is rejected by pandas 2.0 and later.
null_df = pd.DataFrame(null_lic, columns=['license_number'])
# Each merge joins on whatever columns the two frames share
for stored_table in (old_main, old_rolls, old_contact):
    null_df = null_df.merge(stored_table)
null_df['is_current'] = 0
null_df.head()

In [None]:
# And this adds the no-longer-listed licenses to the end of combined_data, ordered by DBA name.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
combined_data = (
    pd.concat([combined_data, null_df])
    .sort_values(by='name_dba')
    .reset_index(drop=True)
)

In [None]:
# Last tidy-up before export: renumber rows, blank out literal 'nan' text, order by legal name
combined_data = (
    combined_data
    .reset_index(drop=True)
    .replace('nan', '')
    .sort_values('name_legal')
)
combined_data.head()

In [None]:
combined_data[['license_number', 'license_status']].to_csv('add_lic.csv')

In [None]:
combined_data[combined_data['license_number'].str.contains('284-')]

In [None]:
combined_data[combined_data['name_legal'].str.contains('Trini')]

### To Excel for Editing

In [None]:
combined_data.to_excel(f'edited_files/df_{this_year}_{this_month}_{this_day}_to_edit.xlsx', index=False)

#### Make Changes, Then Read Back In

In [None]:
# Read the hand-edited workbook back in, but only accept it once every row has a company_roll_up.
# The file is read once, and an unfinished edit stops the run with a message
# instead of a NameError from an undefined variable.
edited_path = f'edited_files/df_{this_year}_{this_month}_{this_day}_to_edit.xlsx'
df_reloaded = pd.read_excel(edited_path)
missing_roll_ups = int(df_reloaded['company_roll_up'].isnull().sum())
if missing_roll_ups:
    raise ValueError(
        f'{missing_roll_ups} row(s) in {edited_path} have no company_roll_up; '
        'fill them in and run this cell again.'
    )
df_edited = df_reloaded

In [None]:
df_edited.head()

### Roll Up Id

In [None]:
# Split the edited rows into those still lacking a roll_up_id and those that already have one
lacks_roll_id = df_edited['roll_up_id'].isnull()
df_edited_null = df_edited[lacks_roll_id]
df_edited_fill = df_edited[~lacks_roll_id]

In [None]:
# Renumber the rows 0..n-1; a later cell looks rows up by that running number
df_edited_null = df_edited_null.reset_index(drop=True)

In [None]:
# Unique company_roll_up names that still need an id, plus an empty list that will
# receive one new roll_up_id per name
rows_without_id = df_edited_null[df_edited_null['roll_up_id'].isnull()]
comp_roll_list = list(rows_without_id['company_roll_up'].unique())
roll_id_list = []

In [None]:
# Highest number already used in a roll_up_id (the digits after the 3-character prefix),
# or 0 when no ids exist yet; new ids count up from here
known_ids = df_edited_fill['roll_up_id']
max_roll = max(known_ids.str[3:].astype(int)) if len(known_ids) else 0

In [None]:
# Issue the next id for each new company_roll_up: 'IL-' plus the number left-padded with zeros to 5 digits
for comp in comp_roll_list:
    max_roll += 1
    roll_id_list.append('IL-' + str(max_roll).rjust(5, '0'))

In [None]:
# This creates a dictionary pairing each new company_roll_up with its freshly issued roll_up_id,
# matched by position in the two lists
names_needing_id = list(df_edited_null['company_roll_up'].unique())
roll_dict = {names_needing_id[n]: roll_id_list[n] for n in range(len(roll_id_list))}

In [None]:
list(df_edited_null['company_roll_up'].unique())

In [None]:
roll_id_list

In [None]:
roll_dict

In [None]:
# And then we just use the dictionary to add roll_up_ids to the DataFrame.
# The whole column is assigned at once; writing cell by cell through df[col][n] is chained
# indexing, which pandas does not guarantee to write back to the frame.
# A company_roll_up missing from roll_dict still raises KeyError, as before.
df_edited_null['roll_up_id'] = [
    roll_dict[company] for company in df_edited_null['company_roll_up']
]

In [None]:
df_edited_null

In [None]:
# And now let's toss these two DataFrames back together, rows renumbered from 0.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_edited_all = pd.concat([df_edited_fill, df_edited_null]).reset_index(drop=True)

In [None]:
df_edited_all.head()

### Create the main_alter DataFrame

In [None]:
# Let's narrow this to the licenses already in il_main, keeping only il_main's columns.
# columns= takes a list; passing a set is rejected by pandas 2.0 and later.
df_for_alter = pd.DataFrame(list(old_main.license_number.unique()), columns=['license_number'])
df_for_alter = df_for_alter.merge(df_edited_all[list(old_main.columns)])

In [None]:
# This narrows it down to all of the licenses with changes.
# A license is flagged as soon as one tracked column differs between il_main and the edited data.
# Two missing values count as "no change": with a plain != they never compare equal,
# which flagged every license that was empty on both sides.
tracked_cols = ['license_description', 'name_legal', 'name_dba', 'medical', 'is_current']
for_alter = []
for lic in old_main.license_number:
    # Look each side up once instead of re-filtering for every comparison
    old_row = old_main[old_main['license_number'] == lic]
    new_row = df_for_alter[df_for_alter['license_number'] == lic]
    for col in tracked_cols:
        # .item() raises unless exactly one row matched, same as before
        old_val = old_row[col].item()
        new_val = new_row[col].item()
        if pd.isna(old_val) and pd.isna(new_val):
            continue
        if old_val != new_val:
            for_alter.append(lic)
            break

In [None]:
# And this creates the final DataFrame: the changed licenses with their edited values.
# columns= takes a list; passing a set is rejected by pandas 2.0 and later.
df_main_alter = pd.DataFrame(for_alter, columns=['license_number'])
df_main_alter = df_main_alter.merge(df_for_alter)

In [None]:
# And now we send it to Excel (superseded by the SQL upload below)
# df_main_alter.to_excel(f'edited_files/main_alter_{this_year}_{this_month}_{this_day}.xlsx', sheet_name='il_main_alter', index=False)

# Trying to change the tactic here
df_main_alter.to_sql('il_main_alter', con=engine_con, if_exists='replace', index=False)

In [None]:
old_main.head()

In [None]:
# Deliberate stop so "Run All" does not continue past this point.
# Raised explicitly instead of relying on a NameError from an undefined variable.
raise RuntimeError('Manual checkpoint: review the results above, then run the cells below by hand.')

### Separate for SQL

In [None]:
# This separates out the new stuff: license numbers in the edited data that il_roll does not have yet
list_of_old_lic = list(old_rolls['license_number'].unique())
list_of_all_lic = list(df_edited_all['license_number'].unique())
list_of_new_lic = [lic for lic in list_of_all_lic if lic not in list_of_old_lic]

In [None]:
# Build a one-column frame of the new license numbers and pull in their full rows from the edited data
new_rolls_df = pd.DataFrame({'license_number': list_of_new_lic}).merge(df_edited_all)

In [None]:
new_rolls_df

In [None]:
new_rolls_df = new_rolls_df[new_rolls_df['license_number'] != 'DOPODOPO']
new_rolls_df

In [None]:
# This creates the three needed dataframes, one per SQL table (il_main, il_contact, il_roll),
# each holding license_number plus that table's own columns
main_cols = ['license_number', 'license_description', 'name_legal', 'name_dba',
             'medical', 'date_issued', 'is_current', 'date_uploaded']
contact_cols = ['license_number', 'contact_phone', 'contact_address', 'contact_zip', 'contact_state']
roll_cols = ['license_number', 'company_roll_up', 'roll_up_id']

df_il_main = new_rolls_df[main_cols]
df_il_contact = new_rolls_df[contact_cols]
df_il_roll = new_rolls_df[roll_cols]

In [None]:
# I'll move this, but this clears out the bad phone numbers: any phone containing 'TBD' becomes 0.
# Work on an independent copy so the write below cannot land on a view of new_rolls_df.
df_il_contact = df_il_contact.copy()
# A boolean mask with .loc works whatever the row labels are; the old df.col[n] lookup
# needed labels 0..n-1 and broke once a row had been filtered out.
has_tbd_phone = df_il_contact['contact_phone'].astype(str).str.contains('TBD', regex=False)
df_il_contact.loc[has_tbd_phone, 'contact_phone'] = 0

In [None]:
# If this won't run, double check both and fix them
# df_il_contact.contact_phone = df_il_contact.contact_phone.astype(np.int64)
# df_il_contact.contact_zip = df_il_contact.contact_zip.astype(np.int64)

In [None]:
df_il_main.sort_values(by='date_issued')

In [None]:
# Deliberate stop so "Run All" never reaches the SQL uploads below without a manual check.
# Raised explicitly instead of relying on a NameError from an undefined variable.
raise RuntimeError('Manual checkpoint: check the frames above, then run the upload cells by hand.')

In [None]:
df_il_roll.to_sql('il_roll', con=engine_con, if_exists='append', index=False)

In [None]:
df_il_main.to_sql('il_main', con=engine_con, if_exists='append', index=False)

In [None]:
df_il_contact.to_sql('il_contact', con=engine_con, if_exists='append', index=False)