In [None]:
import json
import os
import pandas
import pyarrow
import sys

from dotenv import load_dotenv
from sqlalchemy import create_engine
from os import path

# Load variables from a .env file (when one is found) into the process environment.
load_dotenv(verbose=True)

# Path to the BigQuery credentials file, read from the environment.
BIGQUERY_CREDENTIALS_PATH = os.environ.get('BIGQUERY_CREDENTIALS_PATH')
if BIGQUERY_CREDENTIALS_PATH is None:
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.expanduser(None) would raise further down.
    raise RuntimeError(
        'BIGQUERY_CREDENTIALS_PATH is not set; define it in the environment or in a .env file.'
    )

# '~' in the configured path is expanded to the user's home directory.
engine = create_engine(
    'bigquery://bespoke-financial/ProdMetrcData',
    credentials_path=os.path.expanduser(BIGQUERY_CREDENTIALS_PATH),
)

# Make the sibling "../core" directory importable (create_queries, prepare_data).
# Guarded so that re-running this cell does not append duplicate entries.
core_directory_path = path.realpath(path.join(os.getcwd(), "../core"))
if core_directory_path not in sys.path:
    sys.path.append(core_directory_path)

import create_queries
import prepare_data

%load_ext autoreload
%autoreload 2

In [None]:
# Identifier of the company to analyze; passed to every query builder in this notebook.
COMPANY_IDENTIFIER = 'IDC'
# Start date passed to the download-summary and transfer-package query builders.
TRANSFER_PACKAGES_START_DATE = '2020-01-01'
# Start date passed to the sales-receipt query builders; also embedded in export file names.
SALES_TRANSACTIONS_START_DATE = '2020-01-01'

In [None]:
def _read_query(query):
    """Run ``query`` through the notebook-level ``engine`` and return the resulting DataFrame."""
    return pandas.read_sql_query(query, engine)


# Build every query first. Licenses are filtered by company only; the rest also take a start date.
company_licenses_query = create_queries.create_company_licenses_query(COMPANY_IDENTIFIER)
company_download_summaries_query = create_queries.create_company_download_summaries_query(
    COMPANY_IDENTIFIER,
    TRANSFER_PACKAGES_START_DATE,
)
company_incoming_transfer_packages_query = create_queries.create_company_incoming_transfer_packages_query(
    COMPANY_IDENTIFIER,
    TRANSFER_PACKAGES_START_DATE,
)
company_unknown_transfer_packages_query = create_queries.create_company_unknown_transfer_packages_query(
    COMPANY_IDENTIFIER,
    TRANSFER_PACKAGES_START_DATE,
)
company_sales_receipts_query = create_queries.create_company_sales_receipts_query(
    COMPANY_IDENTIFIER,
    SALES_TRANSACTIONS_START_DATE,
)
company_sales_receipts_with_transactions_query = create_queries.create_company_sales_receipts_with_transactions_query(
    COMPANY_IDENTIFIER,
    SALES_TRANSACTIONS_START_DATE,
)

# Execute the queries, in the same order they were built, one DataFrame per query.
company_licenses_dataframe = _read_query(company_licenses_query)
company_download_summaries_dataframe = _read_query(company_download_summaries_query)
company_incoming_transfer_packages_dataframe = _read_query(company_incoming_transfer_packages_query)
company_unknown_transfer_packages_dataframe = _read_query(company_unknown_transfer_packages_query)
company_sales_receipts_dataframe = _read_query(company_sales_receipts_query)
company_sales_receipts_with_transactions_dataframe = _read_query(company_sales_receipts_with_transactions_query)

## Licenses

In [None]:
# Display the result of the company licenses query for COMPANY_IDENTIFIER.
company_licenses_dataframe

## Download summaries

In [None]:
license_numbers = company_download_summaries_dataframe['license_number'].unique()
download_summary_records = company_download_summaries_dataframe.to_dict('records')

# Group the records by license number in a single pass (instead of re-filtering the
# full record list once per license). Keys appear in order of first appearance and
# the records of each license keep their dataframe order.
license_number_to_download_summary_records = {}
for download_summary_record in download_summary_records:
    license_number_to_download_summary_records.setdefault(
        download_summary_record['license_number'],
        [],
    ).append(download_summary_record)

# Number of download summaries whose status is anything other than 'completed'.
bad_count = 0

# The per-license list gets its own name so that it no longer overwrites the
# full `download_summary_records` list defined above.
for license_number, license_download_summary_records in license_number_to_download_summary_records.items():
    print(f'Verifying download summaries for license {license_number}...')
    # NOTE(review): earliest/latest are read positionally, which assumes the query
    # returns records newest-first — confirm against create_queries.
    print(f'Earliest download summary: {license_download_summary_records[-1]["date"]}')
    print(f'Latest download summary: {license_download_summary_records[0]["date"]}')
    for download_summary_record in license_download_summary_records:
        if download_summary_record['status'] != 'completed':
            bad_count += 1
            print(f'Found bad download summary for license {license_number} on date {download_summary_record["date"]}')
    print('')

if bad_count > 0:
    print(f'[FAILURE] Found a total of {bad_count} bad download summaries')
else:
    print(f'[SUCCESS] All download summaries look good!')

## Transfer packages

In [None]:
# Materialize the unknown transfer packages as a list of row dicts and count them.
company_unknown_transfer_package_records = company_unknown_transfer_packages_dataframe.to_dict('records')
unknown_count = len(company_unknown_transfer_package_records)

# A single unknown transfer package is enough to report a failure.
print(
    f'[SUCCESS] No unknown transfer packages!'
    if unknown_count <= 0
    else f'[FAILURE] Found a total of {unknown_count} unknown transfer packages'
)

In [None]:
# Derive a 'YYYY-MM' label from created_date with a vectorized conversion.
# The former row-wise DataFrame.apply(axis=1) ran a Python lambda per row and, on an
# empty frame, returned a DataFrame instead of a Series, which broke the assign below.
# Null dates produce a missing label instead of raising or yielding 'nan-nan'.
created_dates = pandas.to_datetime(company_incoming_transfer_packages_dataframe['created_date'])
company_incoming_transfer_packages_dataframe = company_incoming_transfer_packages_dataframe.assign(
    # to_numpy() keeps the assignment positional, exactly like the previous `col.values`.
    created_month=created_dates.dt.strftime('%Y-%m').to_numpy()
)

In [None]:
# Total shipped quantity of incoming transfer packages per created_month, drawn as a bar chart.
shipped_quantity_by_created_month = (
    company_incoming_transfer_packages_dataframe
    .groupby(['created_month'])['shipped_quantity']
    .sum()
)
shipped_quantity_by_created_month.plot.bar(figsize=(24, 8), stacked=True)

## Sales GMV month-over-month from metrc_sales_receipts

In [None]:
# Derive a 'YYYY-MM' label from sales_datetime with a vectorized conversion.
# The former row-wise DataFrame.apply(axis=1) ran a Python lambda per row and, on an
# empty frame, returned a DataFrame instead of a Series, which broke the assign below.
# Null datetimes produce a missing label instead of the string 'nan-nan'.
receipt_sales_datetimes = pandas.to_datetime(company_sales_receipts_dataframe['sales_datetime'])
company_sales_receipts_dataframe = company_sales_receipts_dataframe.assign(
    # to_numpy() keeps the assignment positional, exactly like the previous `col.values`.
    sales_month=receipt_sales_datetimes.dt.strftime('%Y-%m').to_numpy()
)

In [None]:
# Sum of receipt total_price per sales_month, drawn as a bar chart.
receipts_total_price_by_sales_month = (
    company_sales_receipts_dataframe
    .groupby(['sales_month'])['total_price']
    .sum()
)
receipts_total_price_by_sales_month.plot.bar(figsize=(24, 8), stacked=True)

## Sales GMV month-over-month from metrc_sales_transactions

In [None]:
# Pass the joined receipts-with-transactions frame through the project's dedupe helper.
# The exact deduplication rule lives in prepare_data.dedupe_sales_transactions.
deduped_sales_receipts_with_transactions_dataframe = prepare_data.dedupe_sales_transactions(company_sales_receipts_with_transactions_dataframe)

In [None]:
# Derive a 'YYYY-MM' label from sales_datetime with a vectorized conversion.
# The former row-wise DataFrame.apply(axis=1) ran a Python lambda per row and, on an
# empty frame, returned a DataFrame instead of a Series, which broke the assign below.
# Null datetimes produce a missing label instead of the string 'nan-nan'.
transaction_sales_datetimes = pandas.to_datetime(deduped_sales_receipts_with_transactions_dataframe['sales_datetime'])
deduped_sales_receipts_with_transactions_dataframe = deduped_sales_receipts_with_transactions_dataframe.assign(
    # to_numpy() keeps the assignment positional, exactly like the previous `col.values`.
    sales_month=transaction_sales_datetimes.dt.strftime('%Y-%m').to_numpy()
)

In [None]:
# Sales GMV month-over-month by product category name:
# one stacked bar per sales_month, one segment per tx_product_category_name.
transactions_total_price_by_month_and_category = (
    deduped_sales_receipts_with_transactions_dataframe
    .groupby(['sales_month', 'tx_product_category_name'])['tx_total_price']
    .sum()
    .unstack()
)
transactions_total_price_by_month_and_category.plot.bar(figsize=(24, 8), stacked=True)

## Search for missing incoming transfer packages

In [None]:
# Package ids of every incoming transfer package loaded for the company.
incoming_transfer_package_records = company_incoming_transfer_packages_dataframe.to_dict('records')
incoming_transfer_package_ids_set = {
    incoming_transfer_package_record['package_id']
    for incoming_transfer_package_record in incoming_transfer_package_records
}

missing_count = 0 # Count of transaction rows whose package id has no incoming transfer package.
total_count = 0 # Count of all transaction rows inspected.

# Count, by sales month, of receipts that have at least one transaction whose package
# has no incoming transfer package. Each receipt is counted once, in its own sales month.
month_to_missing_count = {}

# Package ids of the missing transactions (one entry per transaction row, so ids may repeat).
example_missing_package_ids = []

# Receipts already counted in month_to_missing_count.
processed_receipt_numbers_set = set()
sales_receipt_with_transactions_records = deduped_sales_receipts_with_transactions_dataframe.to_dict('records')
for sales_receipt_with_transaction_record in sales_receipt_with_transactions_records:
    tx_package_id = sales_receipt_with_transaction_record['tx_package_id']
    receipt_number = sales_receipt_with_transaction_record['receipt_number']
    receipt_sales_month = sales_receipt_with_transaction_record['sales_month']

    total_count += 1

    if tx_package_id in incoming_transfer_package_ids_set:
        continue

    # NOTE(review): rows with a null tx_package_id (a later cell treats those as receipts
    # without transactions) also land here and are counted as missing — confirm intended.
    example_missing_package_ids.append(tx_package_id)
    missing_count += 1

    if receipt_number not in processed_receipt_numbers_set:
        processed_receipt_numbers_set.add(receipt_number)
        month_to_missing_count[receipt_sales_month] = month_to_missing_count.get(receipt_sales_month, 0) + 1

# Guard the percentage so an empty result set reports 0.0% instead of raising ZeroDivisionError.
missing_percent = missing_count / total_count * 100 if total_count else 0.0
print(f'# transactions missing incoming transfer package: {missing_count} ({missing_percent}%)')
print(f'# transactions total: {total_count}')

In [None]:
# Per sales month: number of receipts with at least one transaction whose package
# has no matching incoming transfer package.
month_to_missing_count

## Search for missing metrc_sales_transactions

In [None]:
# One dict per row of the deduped receipts-with-transactions frame.
sales_receipt_with_transactions_records = deduped_sales_receipts_with_transactions_dataframe.to_dict('records')

# Bucket the rows by receipt number; rows keep their original relative order within a bucket.
receipt_number_to_transactions = {}
for sales_receipt_with_transaction_record in sales_receipt_with_transactions_records:
    receipt_number = sales_receipt_with_transaction_record['receipt_number']
    receipt_number_to_transactions.setdefault(receipt_number, []).append(
        sales_receipt_with_transaction_record
    )

In [None]:
import math

def float_eq(
    receipt_total_price: float,
    transactions_total_price: float,
    num_transactions: int,
    tolerance_per_transaction: float = 0.1,
) -> bool:
    """Return True when a receipt total matches the sum of its transaction totals.

    The two amounts are compared with an absolute tolerance that grows linearly
    with the number of transactions on the receipt.

    Args:
        receipt_total_price: Total price recorded on the receipt.
        transactions_total_price: Sum of the receipt's transaction prices.
        num_transactions: Number of transactions on the receipt.
        tolerance_per_transaction: Allowed absolute difference contributed by
            each transaction. Defaults to 0.1, the value this function has
            always used.

    Returns:
        True if the amounts differ by at most
        ``num_transactions * tolerance_per_transaction``, else False.

    Raises:
        ValueError: If the resulting tolerance is negative (from math.isclose).
    """
    # NOTE(review): the original comment here described the allowance as 0.01 (a penny)
    # per transaction while the code used 0.1. The default keeps the code's behavior;
    # pass tolerance_per_transaction=0.01 if a penny was the intent.
    threshold = num_transactions * tolerance_per_transaction
    return math.isclose(receipt_total_price, transactions_total_price, abs_tol=threshold)

mismatch_count = 0 # Count of receipts where receipt total price does not match transactions total price.
missing_count = 0 # Count of receipts with no transactions.
total_count = 0 # Count of receipts (including those missing transactions).

mismatch_over_count = 0 # Mismatches where the transactions total exceeds the receipt total.
mismatch_under_count = 0 # All other mismatches (transactions total below the receipt total).

# Per sales month breakdowns of the counters above.
month_to_mismatch_count = {}
month_to_missing_count = {}

month_to_mismatch_over_count = {}
month_to_mismatch_under_count = {}

# (receipt_number, receipt_transactions) pairs kept for manual inspection.
example_mismatch_over_receipts = []
example_mismatch_under_receipts = []

for receipt_number, receipt_transactions in receipt_number_to_transactions.items():
    # Receipt-level fields are read from the first row of the receipt.
    receipt_total_price = receipt_transactions[0]['rt_total_price']
    receipt_sales_month = receipt_transactions[0]['sales_month']

    total_count += 1

    # A receipt with a single row and no package id is treated as having no transactions.
    if len(receipt_transactions) == 1 and receipt_transactions[0]['tx_package_id'] is None:
        missing_count += 1
        month_to_missing_count[receipt_sales_month] = month_to_missing_count.get(receipt_sales_month, 0) + 1
        continue

    transactions_total_price = sum(receipt_transaction['tx_total_price'] for receipt_transaction in receipt_transactions)
    if float_eq(receipt_total_price, transactions_total_price, len(receipt_transactions)):
        continue

    mismatch_count += 1
    month_to_mismatch_count[receipt_sales_month] = month_to_mismatch_count.get(receipt_sales_month, 0) + 1

    if receipt_total_price < transactions_total_price:
        mismatch_over_count += 1
        # Previously declared but never filled in; now tracked alongside the overall counter.
        month_to_mismatch_over_count[receipt_sales_month] = month_to_mismatch_over_count.get(receipt_sales_month, 0) + 1
        example_mismatch_over_receipts.append((receipt_number, receipt_transactions))
    else:
        mismatch_under_count += 1
        # Previously declared but never filled in; now tracked alongside the overall counter.
        month_to_mismatch_under_count[receipt_sales_month] = month_to_mismatch_under_count.get(receipt_sales_month, 0) + 1
        example_mismatch_under_receipts.append((receipt_number, receipt_transactions))

# Guard the percentages so an empty result set reports 0.0% instead of raising ZeroDivisionError.
mismatch_percent = mismatch_count / total_count * 100 if total_count else 0.0
missing_percent = missing_count / total_count * 100 if total_count else 0.0
print(f'# receipts with mismatching transactions: {mismatch_count} ({mismatch_percent}%)')
print(f'# receipts missing transactions: {missing_count} ({missing_percent}%)')
print(f'# receipts total: {total_count}')

if mismatch_count:
    print(f'# mismatch receipt vs transactions (transactions over): {mismatch_over_count} ({mismatch_over_count / mismatch_count * 100}%)')
    print(f'# mismatch receipt vs transactions (transactions under): {mismatch_under_count} ({mismatch_under_count / mismatch_count * 100}%)')

In [None]:
# Per sales month: number of receipts that have no transactions. The name was rebound
# by the reconciliation cell, so it no longer holds the missing-transfer-package counts.
month_to_missing_count

In [None]:
# Print up to ten receipts whose transactions total exceeds the receipt total:
# the receipt number, then each of its transaction rows, then a separator.
for receipt_number, receipt_transactions in example_mismatch_over_receipts[:10]:
    print(receipt_number)
    for receipt_transaction in receipt_transactions:
        print(receipt_transaction)
    print('---')

## Export data

In [None]:
# Export switch: set to True to have the export cell write the sales transactions
# to .xlsx and .csv files. While False, that cell writes no files.
is_export_enabled = False

In [None]:
import time
from datetime import date

# Both values are embedded in the export file names; NOW keeps repeated exports distinct.
TODAY_DATE = date.today().strftime('%m-%d-%Y')
NOW = int(time.time())

# Work on a copy: plain assignment only aliases the deduped frame, so the date
# conversion below would otherwise rewrite the columns of the frame that the
# analysis cells use.
export_sales_transactions_dataframe = deduped_sales_receipts_with_transactions_dataframe.copy()

# Replace timezone-aware UTC datetime columns with plain dates.
# NOTE(review): presumably needed because Excel output rejects tz-aware datetimes — confirm.
date_columns = export_sales_transactions_dataframe.select_dtypes(include=['datetime64[ns, UTC]']).columns
for date_column in date_columns:
    export_sales_transactions_dataframe[date_column] = export_sales_transactions_dataframe[date_column].dt.date

# Expand '~' explicitly instead of relying on the file writers to do it.
export_file_stem = os.path.expanduser(
    f'~/Downloads/{COMPANY_IDENTIFIER}_sales_transactions_{SALES_TRANSACTIONS_START_DATE}_{TODAY_DATE}_{NOW}'
)
sales_transactions_xlsx_file_name = f'{export_file_stem}.xlsx'
sales_transactions_csv_file_name = f'{export_file_stem}.csv'

if is_export_enabled:
    num_sales_transactions = len(export_sales_transactions_dataframe.index)
    # Announce the export before writing (this line used to say "Exported" too early).
    print(f'Exporting {num_sales_transactions} sales transactions to files...')
    # Reset the index once and reuse the result for both output formats.
    export_frame = export_sales_transactions_dataframe.reset_index()
    export_frame.to_excel(sales_transactions_xlsx_file_name)
    export_frame.to_csv(sales_transactions_csv_file_name)
    print(f'Exported {num_sales_transactions} sales transactions to files')