In [None]:
# import libraries

In [36]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
import json
import numpy
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  
import pyarrow
import sys

from datetime import date
from dotenv import load_dotenv
from sqlalchemy import create_engine
from os import path
from typing import List,Dict, Tuple
from collections import defaultdict
pd.set_option("display.max_columns", None)

# Load variables from a local .env file (if present) before reading the environment.
load_dotenv(verbose=True)
BIGQUERY_CREDENTIALS_PATH = os.environ.get('BIGQUERY_CREDENTIALS_PATH')
if not BIGQUERY_CREDENTIALS_PATH:
    # Fail early with an actionable message instead of the opaque TypeError
    # that os.path.expanduser(None) would raise.
    raise RuntimeError(
        'BIGQUERY_CREDENTIALS_PATH is not set; define it in the environment or in a .env file.'
    )
engine = create_engine(
    'bigquery://bespoke-financial/ProdMetrcData',
    credentials_path=os.path.expanduser(BIGQUERY_CREDENTIALS_PATH),
)

sys.path.append(path.realpath(path.join(os.getcwd(), "../core")))
sys.path.append(path.realpath(path.join(os.getcwd(), "../../src")))

import create_queries
import prepare_data

from bespoke.inventory.analysis.shared import download_util, inventory_types
from bespoke.inventory.analysis import active_inventory_util as util
from bespoke.inventory.analysis import inventory_valuations_util as valuations_util

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
# Define company name and date ranges

In [39]:
# CHANGE ME
COMPANY_IDENTIFIER = ['DL']
TRANSFER_PACKAGES_START_DATE = '2020-01-01'
SALES_TRANSACTIONS_START_DATE = '2020-01-01'

In [41]:
# Fetch download summaries and licenses
company_licenses_query = create_queries.create_company_licenses_query(COMPANY_IDENTIFIER)
company_download_summaries_query = create_queries.create_company_download_summaries_query(COMPANY_IDENTIFIER, TRANSFER_PACKAGES_START_DATE)

company_licenses_dataframe = pd.read_sql_query(company_licenses_query, engine)
company_download_summaries_dataframe = pd.read_sql_query(company_download_summaries_query, engine)

In [42]:
print(company_download_summaries_query)


		select
			companies.id as company_id,
			companies.identifier as company_identifier,
			metrc_download_summaries.license_number,
			metrc_download_summaries.date,
			metrc_download_summaries.status
		from
			metrc_download_summaries
			inner join companies on metrc_download_summaries.company_id = companies.id
		where
			True
			and companies.identifier in ("DL")
			and metrc_download_summaries.date >= "2020-01-01"
			
		order by
			date desc
	


# Download History Checks

## Check download status summaries

In [43]:
license_numbers = company_download_summaries_dataframe['license_number'].unique()
download_summary_records = company_download_summaries_dataframe.to_dict('records')

In [44]:
license_numbers

array(['MR283369'], dtype=object)

In [45]:
def check_company_license_download(license_numbers: List[str],download_summary_records: List[Dict[str,str]]) -> Dict[str, List]:
    """Report download summaries whose status is not 'completed', per license.

    Args:
        license_numbers: License numbers to verify.
        download_summary_records: Download summary records (dicts with at least
            'license_number', 'date' and 'status' keys). Records for licenses
            not listed in `license_numbers` are ignored.

    Returns:
        A defaultdict(list) mapping each license number that has at least one
        non-completed download summary to the list of affected dates.
    """
    # Group records by license in a single pass, keeping each license's
    # records in their incoming order.
    records_by_license = {license_number: [] for license_number in license_numbers}
    for record in download_summary_records:
        license_records = records_by_license.get(record['license_number'])
        if license_records is not None:
            license_records.append(record)

    bad_count = 0
    bad_history = defaultdict(list)
    for license_number, license_records in records_by_license.items():
        print(f'Verifying download summaries for license {license_number}...')
        if not license_records:
            # Nothing to index into; report it instead of raising IndexError.
            print(f'No download summaries found for license {license_number}')
            print('')
            continue
        # Earliest/latest are read from the last/first record, which assumes
        # the records arrive sorted by date descending (the download summaries
        # query orders by date desc).
        print(f'Earliest download summary: {license_records[-1]["date"]}')
        print(f'Latest download summary: {license_records[0]["date"]}')
        for record in license_records:
            if record['status'] != 'completed':
                bad_count += 1
                print(f'Found bad download summary for license {license_number} on date {record["date"]}') 
                bad_history[license_number].append(record['date'])

        print('')
    if bad_count > 0:
        print(f'[FAILURE] Found a total of {bad_count} bad download summaries')
    else:
        print(f'[SUCCESS] All download summaries look good!')
    return bad_history



In [46]:
bad_download_history = check_company_license_download(license_numbers,download_summary_records)

Verifying download summaries for license MR283369...
Earliest download summary: 2021-12-01
Latest download summary: 2022-02-07

[SUCCESS] All download summaries look good!


In [47]:
bad_download_history

defaultdict(list, {})

In [48]:
# TODO (Warren): add more checks that download summary dates are continuous.
# Raise an error if any date is missing (with a 5-day lag).

## Get list of retailer license numbers

In [49]:
# get list of retailer license numbers
license_numbers = list(company_licenses_dataframe[company_licenses_dataframe['license_category'].isin(['Retailer', 'Multiple'])]['license_number'].unique())

In [50]:
license_numbers

['MR283369']

In [64]:
company_licenses_dataframe

Unnamed: 0,us_state,license_number,license_category,legal_name,is_current,license_status,rollup_id,license_description,company_id,facility_row_id
0,MA,MR283369,Retailer,"Diem Lynn, LLC",True,,MA-00116,Marijuana Retailer,11695ca3-46bf-44b4-8ff3-c41eb6e0c620,


In [51]:
#license_numbers = [license_numbers[0]]

In [52]:
#company_sales_transactions_dataframe[company_sales_transactions_dataframe['license_number'] == 'C10-0000918-LIC']

In [53]:
# Build one SQL query per dataset via the project's create_queries helpers.
# Transfer-package queries start at TRANSFER_PACKAGES_START_DATE; sales queries
# start at SALES_TRANSACTIONS_START_DATE.
company_incoming_transfer_packages_query = create_queries.create_company_incoming_transfer_packages_query(
    COMPANY_IDENTIFIER,
    TRANSFER_PACKAGES_START_DATE,
    license_numbers=license_numbers,
)
company_outgoing_transfer_packages_query = create_queries.create_company_outgoing_transfer_packages_query(
    COMPANY_IDENTIFIER,
    TRANSFER_PACKAGES_START_DATE,
    license_numbers=license_numbers,
)
company_unknown_transfer_packages_query = create_queries.create_company_unknown_transfer_packages_query(
    COMPANY_IDENTIFIER,
    TRANSFER_PACKAGES_START_DATE,
    # NOTE(review): the license filter is disabled for this query only — confirm this is intentional.
#     license_numbers=license_numbers,
)
company_sales_transactions_query = create_queries.create_company_sales_transactions_query(
    COMPANY_IDENTIFIER,
    SALES_TRANSACTIONS_START_DATE,
    license_numbers=license_numbers,
)
company_sales_receipts_query = create_queries.create_company_sales_receipts_query(
    COMPANY_IDENTIFIER,
    SALES_TRANSACTIONS_START_DATE,
    license_numbers=license_numbers,
)
company_sales_receipts_with_transactions_query = create_queries.create_company_sales_receipts_with_transactions_query(
    COMPANY_IDENTIFIER,
    SALES_TRANSACTIONS_START_DATE,
    license_numbers=license_numbers,
)
company_inventory_packages_query = create_queries.create_company_inventory_packages_query(
    COMPANY_IDENTIFIER,
    include_quantity_zero=True,
    license_numbers=license_numbers,
)

# Execute each query against the BigQuery engine and load the result into a DataFrame.
company_incoming_transfer_packages_dataframe = pd.read_sql_query(company_incoming_transfer_packages_query, engine)
company_outgoing_transfer_packages_dataframe = pd.read_sql_query(company_outgoing_transfer_packages_query, engine)
company_unknown_transfer_packages_dataframe = pd.read_sql_query(company_unknown_transfer_packages_query, engine)
company_sales_transactions_dataframe = pd.read_sql_query(company_sales_transactions_query, engine)
company_sales_receipts_dataframe = pd.read_sql_query(company_sales_receipts_query, engine)
company_sales_receipts_with_transactions_dataframe = pd.read_sql_query(company_sales_receipts_with_transactions_query, engine)
company_inventory_packages_dataframe = pd.read_sql_query(company_inventory_packages_query, engine)

In [63]:
company_sales_receipts_with_transactions_dataframe

Unnamed: 0,license_number,receipt_number,rt_type,sales_customer_type,sales_datetime,sales_month,total_packages,rt_total_price,tx_type,tx_package_id,tx_package_label,tx_product_name,tx_product_category_name,tx_unit_of_measure,tx_quantity_sold,tx_total_price
0,MR283369,0035783204,active,Consumer,2022-02-06 23:00:22.907000+00:00,2022-02-01 00:00:00+00:00,1,10.00,active,3374775,1A40A0100000BBD000026211,M00000280601: .5g Pre-Roll-S-NF1,Raw Pre-Rolls,Grams,0.5,10.00
1,MR283369,0035783203,active,Consumer,2022-02-06 22:59:13.550000+00:00,2022-02-01 00:00:00+00:00,1,15.00,active,3248591,1A40A0300002261000000582,M00000855069: CUL - UX - King Pre Roll - Choco...,Raw Pre-Rolls,Grams,1.0,15.00
2,MR283369,0035783202,active,Consumer,2022-02-06 22:58:10.833000+00:00,2022-02-01 00:00:00+00:00,1,39.83,active,3030853,1A40A0300001A91000010698,M00001297402: 3.5g-Premium Small Bud- Purple S...,Buds,Grams,3.5,39.83
3,MR283369,0035783201,active,Consumer,2022-02-06 22:56:47.750000+00:00,2022-02-01 00:00:00+00:00,1,15.00,active,3248591,1A40A0300002261000000582,M00000855069: CUL - UX - King Pre Roll - Choco...,Raw Pre-Rolls,Grams,1.0,15.00
4,MR283369,0035783200,active,Consumer,2022-02-06 22:55:54.117000+00:00,2022-02-01 00:00:00+00:00,1,46.34,active,2963232,1A40A030000025A000058353,M00000939802: Sira Banana Kush Flower 3.5g,Buds,Grams,3.5,46.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48316,MR283369,0023639785,active,Consumer,2021-07-03 09:20:10.640000+00:00,2021-07-01 00:00:00+00:00,2,106.00,active,2085307,1A40A0300002261000000070,M00000850806: Hand Trim Pound Smalls Gelato #3...,Buds,Grams,7.0,90.00
48317,MR283369,0023639785,active,Consumer,2021-07-03 09:20:10.640000+00:00,2021-07-01 00:00:00+00:00,2,106.00,active,1932205,1A40A03000016AA000008750,"Garlic Breath Preroll, 1g",Raw Pre-Rolls,Grams,1.0,16.00
48318,MR283369,0023639784,active,Consumer,2021-07-03 09:10:28.277000+00:00,2021-07-01 00:00:00+00:00,1,9.00,active,1967702,1A40A03000002BF000020797,M00000891601: AU: Purple Sunset PRC 0.3g,Raw Pre-Rolls,Grams,0.3,9.00
48319,MR283369,0023639783,active,Consumer,2021-07-03 09:08:08.503000+00:00,2021-07-01 00:00:00+00:00,1,10.00,active,2194452,1A40A0300002261000000216,PEX (Pineapple Express) Flower - BULK,Buds,Grams,0.5,10.00


In [55]:
company_unknown_transfer_packages_dataframe

Unnamed: 0,delivery_type,license_number,manifest_number,created_date,received_datetime,shipper_facility_license_number,shipper_facility_name,recipient_facility_license_number,recipient_facility_name,shipment_type_name,shipment_transaction_type,package_id,package_label,type,shipment_package_state,is_testing_sample,is_trade_sample,product_category_name,product_name,package_lab_results_status,shipper_wholesale_price,shipped_quantity,shipped_unit_of_measure,received_quantity,received_unit_of_measure,item_unit_weight,item_unit_weight_unit_of_measure_name
0,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2891004,1A40A0100001135000024650,transfer,Accepted,False,False,Raw Pre-Rolls,PRE-ROLL - 0.75G - ACDC - CBD,passed,1.0,75.0,Grams,75.0,Grams,,
1,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2905116,1A40A0100001135000024730,transfer,Accepted,False,False,Raw Pre-Rolls,PRE-ROLL - 0.75G - DREAD BREAD - SATIVA,passed,1.0,75.0,Grams,75.0,Grams,,
2,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2929397,1A40A0100001135000024833,transfer,Accepted,False,False,Infused Pre-Rolls,M00000145340: WAX PRE-ROLL - 0.75G - BLEND - S...,passed,1.0,75.0,Grams,75.0,Grams,,
3,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2938803,1A40A0100001135000024892,transfer,Accepted,False,False,Infused Pre-Rolls,M00000151390: WAX PRE-ROLL - 0.75G - BLEND - I...,passed,1.0,75.0,Grams,75.0,Grams,,
4,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2837400,1A40A0100001135000024551,transfer,Accepted,False,False,Infused Pre-Rolls,WAX PRE-ROLL - 0.75G - BLEND - HYBRID,passed,1.0,37.5,Grams,37.5,Grams,,
5,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2897258,1A40A0100001135000024664,transfer,Accepted,False,False,Raw Pre-Rolls,M00000460926: PRE-ROLL - 0.75G - ORIGINAL GLUE...,passed,1.0,75.0,Grams,75.0,Grams,,
6,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2914175,1A40A0100001135000024813,transfer,Accepted,False,False,Infused Pre-Rolls,M00000151385: WAX PRE-ROLL - 0.75G - BLEND - H...,passed,1.0,75.0,Grams,75.0,Grams,,
7,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2914176,1A40A0100001135000024814,transfer,Accepted,False,False,Infused Pre-Rolls,M00000151385: WAX PRE-ROLL - 0.75G - BLEND - H...,passed,1.0,75.0,Grams,75.0,Grams,,
8,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2938813,1A40A0100001135000024893,transfer,Accepted,False,False,Infused Pre-Rolls,M00000145340: WAX PRE-ROLL - 0.75G - BLEND - S...,passed,1.0,150.0,Grams,150.0,Grams,,
9,UNKNOWN,MR283369,814105,2021-11-15,2021-11-16 19:52:30+00:00,RMD125-P,"New England Treatment Access, LLC",MR282376,TDMA LLC,Unaffiliated Transfer,Wholesale,2516318,1A40A0300000068000076675,transfer,Accepted,False,False,Buds,M00000448124: NST FP - 3.5G - DREAD BREAD - SA...,passed,1.0,350.0,Grams,350.0,Grams,,


In [54]:
company_incoming_transfer_packages_dataframe['delivery_type'].unique()

array(['INCOMING_FROM_VENDOR'], dtype=object)

## Unknown transfer packages

In [17]:
def check_unknown_transfer_packages(unknown_transfer_df: pd.DataFrame) -> int:
    """Print a pass/fail line for unknown transfer packages and return their count.

    Args:
        unknown_transfer_df: One row per unknown transfer package.

    Returns:
        The number of rows in `unknown_transfer_df`.
    """
    num_unknown = len(unknown_transfer_df.index)
    if num_unknown == 0:
        print(f'[SUCCESS] No unknown transfer packages!')
        return num_unknown
    print(f'[FAILURE] Found a total of {num_unknown} unknown transfer packages')
    return num_unknown

In [18]:
unknown_package_count = check_unknown_transfer_packages(company_unknown_transfer_packages_dataframe)
unknown_package_count

[SUCCESS] No unknown transfer packages!


0

In [None]:
# should be 0 in the future

In [None]:
# Plot incoming transfer packages

In [None]:
#company_incoming_transfer_packages_dataframe['created_month'] = pd.to_datetime(company_incoming_transfer_packages_dataframe['created_date']).dt.strftime('%Y-%m')

In [None]:
# Incoming transfer packages by shipment type name
#company_incoming_transfer_packages_dataframe.groupby(['created_month', 'shipment_type_name'])['package_id'].count().unstack().plot.bar(figsize=(24, 8), stacked=True)

In [None]:
# Incoming transfer packages by shipment_transaction_type
#company_incoming_transfer_packages_dataframe.groupby(['created_month', 'shipment_transaction_type'])['package_id'].count().unstack().plot.bar(figsize=(24, 8), stacked=True)

In [None]:
# Incoming transfer packages by product category names
#company_incoming_transfer_packages_dataframe.groupby(['created_month', 'product_category_name'])['package_id'].count().unstack().plot.bar(figsize=(24, 8), stacked=True,colormap='Paired')
#plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))


## Receiver wholesale price coverage

In [19]:
def check_receiver_wholesale_price_coverage(incoming_transfer_df: pd.DataFrame) -> float:
    """Return the percentage of incoming transfer packages that have a receiver wholesale price.

    Args:
        incoming_transfer_df: Incoming transfer packages; must contain a
            'receiver_wholesale_price' column when it is non-empty.

    Returns:
        Coverage as a whole-number percentage expressed as a float
        (e.g. 99.0). Returns 0.0 when the frame has no rows.
    """
    total_count = incoming_transfer_df.shape[0]
    if total_count == 0:
        # Avoid ZeroDivisionError when the company has no incoming transfers.
        print('No incoming transfer packages found; receiver wholesale price coverage is 0.0%')
        return 0.0
    rwp_exists_count = int(incoming_transfer_df['receiver_wholesale_price'].notnull().sum())
    # Round after scaling to percent: round(ratio, 2) * 100 can produce
    # artifacts such as 56.99999999999999 instead of 57.0.
    rwp_coverage = float(round(rwp_exists_count / total_count * 100))
    print(f'{rwp_coverage}% of incoming transfer packages have receiver wholesale price')
    return rwp_coverage

In [20]:
company_incoming_transfer_packages_dataframe[company_incoming_transfer_packages_dataframe['receiver_wholesale_price'].notnull()].shape

(18929, 32)

In [21]:
company_incoming_transfer_packages_dataframe.shape

(19168, 32)

In [22]:
rwp_coverage = check_receiver_wholesale_price_coverage(company_incoming_transfer_packages_dataframe)


99.0% of incoming transfer packages have receiver wholesale price


In [None]:
#next step
# run on a bunch of companies and see 
# right now we use 75%, but could improve

In [None]:
# Plot Sales GMV month-over-month 

In [None]:
#company_sales_receipts_dataframe['sales_month'] = company_sales_receipts_dataframe['sales_datetime'].dt.strftime('%Y-%m')

In [None]:
#company_sales_receipts_dataframe.groupby(['sales_month'])['receipt_number'].count().plot.bar(figsize=(24, 8))

In [None]:
#company_sales_receipts_dataframe.groupby(['sales_month'])['total_price'].sum().plot.bar(figsize=(24, 8), stacked=True)

In [None]:
## Sales GMV month-over-month from metrc_sales_transactions

In [23]:
deduped_sales_receipts_with_transactions_dataframe = prepare_data.dedupe_sales_transactions(company_sales_receipts_with_transactions_dataframe)

In [24]:
deduped_sales_receipts_with_transactions_dataframe['sales_month'] = deduped_sales_receipts_with_transactions_dataframe['sales_datetime'].dt.strftime('%Y-%m')

In [None]:
# Sales GMV month-over-month by product category name
#deduped_sales_receipts_with_transactions_dataframe.groupby(['sales_month', 'tx_product_category_name'])['tx_total_price'].sum().unstack().plot.bar(figsize=(24, 8), stacked=True,colormap = 'Paired')
#plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))



In [25]:
## View consumer vs medicinal breakdown by month.
# Set the flag to True to render a stacked bar chart of total_price per
# sales_month, split by sales_customer_type; when False, `result` stays None.
# NOTE(review): the plot groups company_sales_receipts_dataframe by 'sales_month';
# the only visible code that adds that column is commented out — confirm the query provides it.
is_sales_customer_type_breakdown_visible = False
result = None
if is_sales_customer_type_breakdown_visible:
    result = company_sales_receipts_dataframe.groupby(['sales_month', 'sales_customer_type'])['total_price'].sum().unstack().plot.bar(figsize=(24, 8), stacked=True)
result

## Trxns missing incoming transfer packages

In [26]:
def check_incoming_transfer_package_coverage(incoming_transfer_df: pd.DataFrame,sales_df: pd.DataFrame) -> Tuple[float,pd.DataFrame]:
    """Measure how many sales transactions have no matching incoming transfer package.

    Sales rows are left-joined to incoming transfer packages on
    sales `tx_package_id` == incoming `package_id`; rows whose `package_id`
    is null after the join are considered missing.

    Args:
        incoming_transfer_df: Incoming transfer packages (needs 'package_id').
        sales_df: Sales transactions (needs 'tx_package_id').

    Returns:
        A tuple of (missed_ratio, missed_rows): the fraction of `sales_df`
        rows without a matching package (0.0 when `sales_df` is empty) and
        the merged rows that had no match.
    """
    package_transfer_sales_merged = pd.merge(
        sales_df,
        incoming_transfer_df,
        left_on='tx_package_id',
        right_on='package_id',
        how='left',
    )
    package_transfer_sales_merged_missed = package_transfer_sales_merged[package_transfer_sales_merged['package_id'].isnull()]
    count_trxn_missed = package_transfer_sales_merged_missed.shape[0]
    count_total_trxn = sales_df.shape[0]
    # Guard against ZeroDivisionError when there are no sales transactions.
    missed_ratio = count_trxn_missed / count_total_trxn if count_total_trxn else 0.0
    print(f'# transactions missing incoming transfer package: {count_trxn_missed} ({missed_ratio * 100}%)')
    print(f'# transactions total: {count_total_trxn}')
    return missed_ratio,package_transfer_sales_merged_missed

In [27]:
deduped_sales_receipts_with_transactions_dataframe = prepare_data.dedupe_sales_transactions(company_sales_receipts_with_transactions_dataframe)
deduped_sales_receipts_with_transactions_dataframe['sales_month'] = deduped_sales_receipts_with_transactions_dataframe['sales_datetime'].dt.strftime('%Y-%m')

In [28]:
incoming_transfer_package_coverage,incoming_transfer_package_coverage_missing = check_incoming_transfer_package_coverage(company_incoming_transfer_packages_dataframe,deduped_sales_receipts_with_transactions_dataframe)

# transactions missing incoming transfer package: 30953 (6.7276113969253775%)
# transactions total: 460089


In [29]:
month_to_missing_count = incoming_transfer_package_coverage_missing[['sales_month','receipt_number']].groupby(['sales_month']).apply(lambda x: len(x['receipt_number'].unique()))
month_to_missing_count


sales_month
2020-01    8021
2020-02    3443
2020-03    1561
2020-04     788
2020-05     356
2020-06     131
2020-07     254
2020-08      37
2020-09      22
2020-10       5
2020-11      40
2020-12     121
2021-01     470
2021-02     317
2021-03     579
2021-04     291
2021-05     562
2021-06    1233
2021-07    1053
2021-08     970
2021-09     693
2021-10    1350
2021-11    1234
2021-12     913
2022-01     568
2022-02      66
dtype: int64

## Receipts missing metrc_sales_transactions

In [56]:
def check_receipts_missing_sales_trxns(sales_trxn_df: pd.DataFrame, price_tolerance_per_transaction: float = 0.1) -> float:
    """Return the fraction of receipts whose total price disagrees with their transactions.

    Rows are grouped by 'receipt_number'. A receipt is treated as *missing
    transactions* (and is not counted as a mismatch) when its single row has a
    null 'tx_package_id', or when its 'total_packages' differs from the number
    of rows for the receipt. The remaining receipts are mismatches when
    'rt_total_price' differs from the sum of 'tx_total_price' by more than
    `price_tolerance_per_transaction` times the number of transactions.

    Note that, despite the function name, the returned value is the mismatch
    ratio; missing receipts only contribute to the denominator.

    Args:
        sales_trxn_df: Receipt rows joined with their transactions; must
            provide 'receipt_number', 'rt_total_price', 'total_packages',
            'tx_package_id' and 'tx_total_price'.
        price_tolerance_per_transaction: Absolute price tolerance allowed per
            transaction on a receipt.

    Returns:
        mismatching receipts / total receipts, or 0.0 when there are no receipts.
    """
    # Local import so the function does not rely on cells defined later in the notebook.
    import math

    receipt_number_to_transactions = {}
    for record in sales_trxn_df.to_dict('records'):
        receipt_number_to_transactions.setdefault(record['receipt_number'], []).append(record)

    total_count = len(receipt_number_to_transactions)
    if total_count == 0:
        # No receipts at all: avoid ZeroDivisionError.
        return 0.0

    mismatch_count = 0
    for receipt_transactions in receipt_number_to_transactions.values():
        first_transaction = receipt_transactions[0]
        num_transactions = len(receipt_transactions)

        # pd.isnull catches both None and NaN, whichever the loader produced.
        if num_transactions == 1 and pd.isnull(first_transaction['tx_package_id']):
            continue

        # Check whether 'total_packages' field of sales receipt matches number of transactions related to receipt.
        if first_transaction['total_packages'] != num_transactions:
            continue

        transactions_total_price = sum(transaction['tx_total_price'] for transaction in receipt_transactions)
        prices_match = math.isclose(
            first_transaction['rt_total_price'],
            transactions_total_price,
            abs_tol=num_transactions * price_tolerance_per_transaction,
        )
        if not prices_match:
            mismatch_count += 1

    return mismatch_count / total_count
    

In [57]:
a = check_receipts_missing_sales_trxns(deduped_sales_receipts_with_transactions_dataframe)

In [58]:
a

0.0

In [30]:
# Flatten the deduped receipt/transaction frame into row dicts, then bucket
# the rows by receipt number (rows keep their original order within a bucket).
sales_receipt_with_transactions_records = deduped_sales_receipts_with_transactions_dataframe.to_dict('records')

receipt_number_to_transactions = {}
for sales_receipt_with_transaction_record in sales_receipt_with_transactions_records:
    receipt_number = sales_receipt_with_transaction_record['receipt_number']
    receipt_number_to_transactions.setdefault(receipt_number, []).append(sales_receipt_with_transaction_record)
    

In [31]:
import math

def float_eq(receipt_total_price: float, transactions_total_price: float, num_transactions: int) -> bool:
    """Tell whether a receipt total and its summed transaction total are close enough.

    The allowed absolute difference is 0.1 per transaction on the receipt.
    NOTE(review): an earlier comment described this as a penny (0.01) per
    transaction, but the code multiplies by 0.1 — confirm the intended tolerance.

    Args:
        receipt_total_price: Total price recorded on the receipt.
        transactions_total_price: Sum of the receipt's transaction prices.
        num_transactions: Number of transactions on the receipt.

    Returns:
        True when math.isclose considers the two totals equal within the tolerance.
    """
    allowed_difference = 0.1 * num_transactions
    prices_match = math.isclose(
        receipt_total_price,
        transactions_total_price,
        abs_tol=allowed_difference,
    )
    return prices_match

# Walk every receipt once and classify it as missing transactions, mismatching
# its transactions' total price, or fine; then print summary ratios.
mismatch_count = 0 # Count of receipts where receipt total price does not match transactions total price.
missing_count = 0 # Count of receipts with no transactions, or whose 'total_packages' differs from the number of transactions.
total_count = 0 # Count of receipts (including those missing transactions).

mismatch_over_count = 0
mismatch_under_count = 0

month_to_mismatch_count = {}
month_to_missing_count = {}

# Declared for per-month over/under breakdowns; this cell does not populate them.
month_to_mismatch_over_count = {}
month_to_mismatch_under_count = {}

example_mismatch_over_receipts = []
example_mismatch_under_receipts = []

for receipt_number, receipt_transactions in list(receipt_number_to_transactions.items()):
    receipt_total_price = receipt_transactions[0]['rt_total_price']
    receipt_sales_month = receipt_transactions[0]['sales_month']
    receipt_total_packages = receipt_transactions[0]['total_packages']

    total_count += 1

    # pd.isnull catches both None and NaN, whichever the loader produced for a missing package id.
    if len(receipt_transactions) == 1 and pd.isnull(receipt_transactions[0]['tx_package_id']):
        missing_count += 1
        month_to_missing_count[receipt_sales_month] = month_to_missing_count.get(receipt_sales_month, 0) + 1
        continue

    # Check whether 'total_packages' field of sales receipt matches number of transactions related to receipt.
    if receipt_total_packages != len(receipt_transactions):
        missing_count += 1
        month_to_missing_count[receipt_sales_month] = month_to_missing_count.get(receipt_sales_month, 0) + 1
        continue

    transactions_total_price = sum(receipt_transaction['tx_total_price'] for receipt_transaction in receipt_transactions)
    if not float_eq(receipt_total_price, transactions_total_price, len(receipt_transactions)):
        mismatch_count += 1
        if receipt_total_price < transactions_total_price:
            # Transactions add up to more than the receipt total.
            mismatch_over_count += 1
            example_mismatch_over_receipts.append((receipt_number, receipt_transactions))
        else:
            mismatch_under_count += 1
            example_mismatch_under_receipts.append((receipt_number, receipt_transactions))

        month_to_mismatch_count[receipt_sales_month] = month_to_mismatch_count.get(receipt_sales_month, 0) + 1
        continue

# Only print ratios when there is at least one receipt, to avoid ZeroDivisionError.
if total_count:
    print(f'# receipts with mismatching transactions: {mismatch_count} ({mismatch_count / total_count * 100}%)')
    print(f'# receipts missing transactions: {missing_count} ({missing_count / total_count * 100}%)')
print(f'# receipts total: {total_count}')

if mismatch_count:
    print(f'# mismatch receipt vs transactions (transactions over): {mismatch_over_count} ({mismatch_over_count / mismatch_count * 100}%)')
    print(f'# mismatch receipt vs transactions (transactions under): {mismatch_under_count} ({mismatch_under_count / mismatch_count * 100}%)')

# receipts with mismatching transactions: 0 (0.0%)
# receipts missing transactions: 0 (0.0%)
# receipts total: 251138


In [32]:
#month_to_missing_count

In [33]:
for example_mismatch_over_receipt in example_mismatch_over_receipts[:10]:
    receipt_number, receipt_transactions = example_mismatch_over_receipt
    first_receipt_transaction = receipt_transactions[0]
    print(receipt_number)
    for receipt_transaction in receipt_transactions:
        print(receipt_transaction)
    print('---')

# Vendor churn

In [None]:
window = 4
vc_start_date = '2020-01-01'
vc_end_date = '2022-02-01'

In [None]:
#df_vendor_churn = company_incoming_transfer_packages_dataframe

In [None]:
license_numbers

In [None]:
license_number = license_numbers[0]

In [None]:
# Restrict incoming transfers to the selected licenses. The explicit .copy()
# makes the column assignment below act on an independent frame rather than on
# a slice of the source DataFrame (chained assignment).
df_vendor_churn = company_incoming_transfer_packages_dataframe[
    company_incoming_transfer_packages_dataframe['license_number'].isin(license_numbers)
].copy()
df_vendor_churn['year_month'] = pd.to_datetime(df_vendor_churn['created_date']).dt.strftime("%Y-%m")
# Total shipper wholesale price per (month, vendor).
vc = df_vendor_churn[['year_month','shipper_facility_name','shipper_wholesale_price']].groupby(['year_month','shipper_facility_name']).sum().reset_index()
# Convert the 'YYYY-MM' strings back to month-start timestamps.
vc = vc.assign(year_month=lambda df: pd.to_datetime(df['year_month']))


In [None]:
# Build `vc_full`, a long-table version of the spreadsheet in which every company has a
# continuous month-start `year_month` series from vc_start_date through vc_end_date.
# Months without shipments for a company get a shipper_wholesale_price of 0.

vc_full = (
    vc
    .groupby('shipper_facility_name').apply(
        lambda df: df.merge(
        pd.Series(
            None, 
            # Use the configured window instead of a hard-coded start date.
            index = pd.date_range(start=vc_start_date, end= vc_end_date, freq = 'MS'), 
            name='__place_holder'
        ), 
        how ='right',
        left_on = 'year_month', 
        right_index=True,
    ).assign(
        **{
            # Rows added by the right join have no facility name; fill with the group's name.
            "shipper_facility_name": lambda df_: df_.shipper_facility_name.dropna().unique()[0],
            "shipper_wholesale_price": lambda df_: df_.shipper_wholesale_price.fillna(0),
        }
    )
    .drop('__place_holder', axis=1)
    )
).reset_index(drop=True)

In [None]:
#vc_full[vc_full['year_month'] == '2021-08-01' ]

In [None]:
# Rolling `window`-month sum of wholesale price per vendor. Select the numeric
# column explicitly before rolling: the frame also holds the string
# 'shipper_facility_name' column, which newer pandas versions refuse to
# aggregate instead of silently dropping.
rolling_4m_sum = vc_full.groupby('shipper_facility_name').apply(
    lambda df: df.set_index('year_month').sort_index()[['shipper_wholesale_price']].rolling(window).sum()
)
rolling_4m_sum.columns = ['rolling_4m_total_price']


# Cumulative wholesale price per vendor over time.
facility_monthly_running_total = vc_full.groupby('shipper_facility_name').apply(
    lambda df: df.set_index('year_month').sort_index()['shipper_wholesale_price'].cumsum().to_frame()
)
facility_monthly_running_total.columns = ['facility_running_total']

# Sum of all vendors' running totals for each month.
monthly_running_total = facility_monthly_running_total.reset_index().groupby('year_month')['facility_running_total'].sum().to_frame()
monthly_running_total.columns = ['monthly_running_total']


In [None]:
vc_result = rolling_4m_sum.merge(
    facility_monthly_running_total,
    how='inner',
    left_index=True,
    right_index=True
).reset_index().merge(
    monthly_running_total,
    how='left',
    on = 'year_month'
)

In [None]:
vc_result['%_total'] = vc_result['facility_running_total'] / vc_result['monthly_running_total']

In [None]:
vc_result['last_4m_active'] = vc_result['rolling_4m_total_price'] > 0

In [None]:
vc_result ['significant'] = vc_result['%_total'] > 0.001

In [None]:
vc_result['measure'] = vc_result.apply(
    lambda row: "Active"  if (row['last_4m_active'] & row['significant']) else ("Inactive" if row['significant'] else "Exclude"),
    axis=1
)

In [None]:
churn = vc_result.groupby(['year_month']).apply(lambda x: x[x['measure'] == 'Inactive']['%_total'].sum()).reset_index()
churn.columns = ['year_month','%_inactive']
churn.index = churn.year_month



In [None]:
# Plot the share of vendor spend classified as inactive, per month.
ax = churn['%_inactive'].plot(marker='o', figsize=(12, 8))
# Format ticks through a formatter rather than freezing the current tick
# labels, so labels stay correct if the tick locations change.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _position: '{:,.2%}'.format(value)))
# Set the title on the axes and keep `ax` bound to the Axes object.
ax.set_title('Rolling 4m vendor churn');

In [None]:
churn

In [None]:
#churn.to_csv('surveillance/'+COMPANY_IDENTIFIER[0]+'/VC/' + COMPANY_IDENTIFIER[0] + '_'+ vc_end_date + '_' +license_number +'_vc.csv')



In [None]:
VC_MONTH_LIST = ['2021-10-01','2021-11-01','2021-12-01','2022-01-01']
VC_MONTH_END = VC_MONTH_LIST[-1]

In [None]:
# output vendor churn matrix
# Keep months up to VC_MONTH_END and stringify year_month so it can serve as
# pivot column labels; .assign returns a new frame instead of writing into a
# filtered slice of vc_full.
vc_data = vc_full[vc_full['year_month'] <= VC_MONTH_END].assign(
    year_month=lambda df: df['year_month'].astype(str)
)
# One row per vendor, one column per month, missing combinations filled with 0.
vc_matrix = pd.pivot_table(vc_data, values='shipper_wholesale_price', index='shipper_facility_name',
                    columns='year_month', fill_value=0).reset_index()

In [None]:
# Sum only the month columns: the frame also contains the string
# 'shipper_facility_name' column (which a row-wise sum cannot add to numbers)
# and, if this cell is re-run, the derived columns created here and below.
non_month_columns = {
    'shipper_facility_name', 'facility_total', 'grand_total', 'perc_total',
    'last_4m_total', 'last_4m_active', 'significant', 'measure',
}
month_columns = [column for column in vc_matrix.columns if column not in non_month_columns]
vc_matrix['facility_total'] = vc_matrix[month_columns].sum(axis=1)
vc_matrix['grand_total'] = vc_matrix['facility_total'].sum()
vc_matrix['perc_total'] = vc_matrix['facility_total'] / vc_matrix['grand_total']
# Spend over the months listed in VC_MONTH_LIST.
vc_matrix['last_4m_total'] = vc_matrix[VC_MONTH_LIST].sum(axis = 1)

In [None]:
vc_matrix['last_4m_active'] = vc_matrix['last_4m_total'] > 0
vc_matrix ['significant'] = vc_matrix['perc_total'] > 0.001
vc_matrix['measure'] = vc_matrix.apply(
    lambda row: "Active"  if (row['last_4m_active'] & row['significant']) else ("Inactive" if row['significant'] else "Exclude"),
    axis=1
)

In [None]:
#vc_matrix.to_csv('surveillance/'+COMPANY_IDENTIFIER[0]+'/VC/' + COMPANY_IDENTIFIER[0] + '_'+ vc_end_date + '_' +license_number +'_vc_matrix.csv')



# Receiver wholesale price outlier check

In [None]:
def check_per_unit_incoming(incoming_transfer_df: pd.DataFrame) -> Tuple[pd.DataFrame,pd.DataFrame]:
    """Rank incoming packages and products by per-unit wholesale price.

    Side effect: adds a ``per_unit_incoming`` column
    (``shipper_wholesale_price / shipped_quantity``) to ``incoming_transfer_df``
    in place.

    Args:
        incoming_transfer_df: frame with ``package_id``, ``product_name``,
            ``shipper_wholesale_price`` and ``shipped_quantity`` columns.

    Returns:
        ``(by_package, by_product)``: the mean per-unit price per package id and
        the max per-unit price per product name, each sorted in descending
        order (missing values last) with a fresh integer index.
    """
    incoming_transfer_df['per_unit_incoming'] = incoming_transfer_df['shipper_wholesale_price'] / incoming_transfer_df['shipped_quantity']

    # by package ID
    per_unit_incoming_package_sort = (
        incoming_transfer_df[['package_id', 'per_unit_incoming']]
        .groupby(['package_id'])
        .mean()
        .sort_values(by='per_unit_incoming', ascending=False)
        .reset_index()
    )
    print('printing per unit incoming by package ID summary ...')
    # Only packages with a known price can be reported as max/min; without this
    # guard an empty or all-missing input fails on the positional look-ups below.
    priced_packages = per_unit_incoming_package_sort.dropna()
    if priced_packages.empty:
        print('no package has a per unit incoming price')
    else:
        print(f'max per unit incoming: {priced_packages["per_unit_incoming"].iloc[0]}' + f' from package ID: {priced_packages["package_id"].iloc[0]}')
        print(f'min per unit incoming: {priced_packages["per_unit_incoming"].iloc[-1]}' + f' from package ID: {priced_packages["package_id"].iloc[-1]}')
    print('')

    # by product name
    per_unit_incoming_product_sort = (
        incoming_transfer_df[['product_name', 'per_unit_incoming']]
        .groupby(['product_name'])
        .max()
        .sort_values(by='per_unit_incoming', ascending=False)
        .reset_index()
    )
    return per_unit_incoming_package_sort,per_unit_incoming_product_sort


In [None]:
# Rank packages and products by per-unit incoming price. The call also adds the
# 'per_unit_incoming' column to company_incoming_transfer_packages_dataframe in place.
per_unit_incoming_package_sort,per_unit_incoming_product_sort = check_per_unit_incoming(company_incoming_transfer_packages_dataframe)


In [None]:
# The frame is sorted in descending order, so the first five rows are the most expensive packages.
print('Top 5 per unit incoming packages')
per_unit_incoming_package_sort.iloc[:5]

In [None]:
# Drop packages without a price first; the last five remaining rows are the cheapest.
print('Bottom 5 per unit incoming packages')
per_unit_incoming_package_sort.dropna().iloc[-5:]


## Trim outliers in per unit incoming before COGS analysis
Trim packages from the incoming transfer data whose per-unit incoming price falls in the top or bottom 0.5%.

In [None]:
# Boolean mask of packages whose per-unit price lies between the 0.5% and 99.5%
# quantiles (both ends inclusive); rows without a per-unit price are not kept.
incoming_per_unit = company_incoming_transfer_packages_dataframe['per_unit_incoming']
lower_bound = incoming_per_unit.quantile(.005)
upper_bound = incoming_per_unit.quantile(.995)
keep = incoming_per_unit.between(lower_bound, upper_bound)



In [None]:
# date with trimmed packages
#company_incoming_transfer_packages_dataframe[['per_unit_incoming','created_date']][~keep].groupby(['created_date']).count()

In [None]:
# Incoming packages that survive the quantile trim.
company_incoming_transfer_packages_dataframe_trim = company_incoming_transfer_packages_dataframe.loc[keep]

In [None]:
# Report how many packages the mask removes (rows where `keep` is False).
total_packages = keep.shape[0]
trimmed_packages = total_packages - keep.sum()
trimmed_percent = round((1 - keep.sum() / total_packages) * 100, 2)
print(f'Trimmed {trimmed_percent}%, {trimmed_packages} of {total_packages} total packages')

In [None]:
# show break down by month the trimmed records
#display the last few weeks updated margin %

# COGS analysis

In [None]:
def cogs_analysis(df_in:pd.DataFrame,df_out:pd.DataFrame,df_inventory:pd.DataFrame,df_sales:pd.DataFrame,freq: str) -> Tuple[pd.DataFrame,pd.DataFrame]:
    """Summarise revenue, estimated COGS and margin per period.

    The cost of a sale is ``tx_quantity_sold`` times the mean per-unit incoming
    price of the package with the same id. Sales without such a price fall back
    to the mean per-unit incoming price of the same product name.

    Side effects (unchanged from the original implementation): adds
    ``per_unit_incoming`` to ``df_in`` and ``per_unit`` / ``date`` (plus ``week``
    for weekly) to ``df_sales`` in place.

    Args:
        df_in: incoming transfer packages (``package_id``, ``product_name``,
            ``shipper_wholesale_price``, ``shipped_quantity``).
        df_out: unused; kept for interface compatibility.
        df_inventory: unused; kept for interface compatibility.
        df_sales: sales transactions (``tx_package_id``, ``tx_product_name``,
            ``tx_total_price``, ``tx_quantity_sold``, ``sales_datetime``).
        freq: ``'monthly'`` (periods formatted ``%Y-%m``) or ``'weekly'`` (``%Y-%W``).

    Returns:
        ``(summary, sales_with_costs)``. ``summary`` has one row per period that
        has sales; ``cogs`` and the margins are NaN for a period in which no
        transaction could be costed. ``sales_with_costs`` is the sales frame
        merged with the package-level and product-level per-unit prices.

    Raises:
        ValueError: if ``freq`` is neither ``'monthly'`` nor ``'weekly'``.
    """
    period_formats = {'monthly': "%Y-%m", 'weekly': "%Y-%W"}
    if freq not in period_formats:
        # Previously an unknown freq silently reused whatever 'date' column was left on df_sales.
        raise ValueError(f"freq must be 'monthly' or 'weekly', got {freq!r}")

    df_sales['per_unit'] = df_sales['tx_total_price'] / df_sales['tx_quantity_sold']
    df_sales['date'] = df_sales['sales_datetime'].dt.strftime(period_formats[freq])
    if freq == 'weekly':
        df_sales['week'] = df_sales['sales_datetime'].dt.strftime("%W")

    # total # of trxns and revenue per period
    sales_by_period = df_sales.groupby('date')['tx_total_price']
    df_total_count = sales_by_period.count().rename('total_count').reset_index()
    df_revenue = sales_by_period.sum().rename('revenue').reset_index()

    df_in['per_unit_incoming'] = df_in['shipper_wholesale_price'] / df_in['shipped_quantity']

    # A zero shipped quantity yields +/-inf. Treat those prices as missing *before*
    # averaging, otherwise one such row turns the whole product average into inf.
    df_in_price = df_in[df_in['shipper_wholesale_price'].notnull()].copy()
    df_in_price['per_unit_incoming'] = df_in_price['per_unit_incoming'].replace([numpy.inf, -numpy.inf], numpy.nan)
    # per unit price by package id
    df_avg_incoming_price = df_in_price.groupby('package_id')['per_unit_incoming'].mean().reset_index()
    # per unit price by product name
    df_avg_product = df_in_price.groupby('product_name')['per_unit_incoming'].mean().rename('per_unit_product').reset_index()

    # merge with (cogs by package id)
    df_cogs_package_id = pd.merge(df_sales, df_avg_incoming_price, left_on='tx_package_id', right_on='package_id', how='left')
    df_cogs_package_id['total_incoming'] = df_cogs_package_id['per_unit_incoming'] * df_cogs_package_id['tx_quantity_sold']
    # Also clears infinite 'per_unit' values caused by a zero quantity sold.
    df_cogs_package_id = df_cogs_package_id.replace([numpy.inf, -numpy.inf], numpy.nan)
    package_costed = df_cogs_package_id[df_cogs_package_id['total_incoming'].notnull()].groupby('date')['total_incoming']
    # sum of cogs / count of trxns costed by package id
    df_cogs_id = package_costed.sum().reset_index()
    df_cogs_count = package_costed.count().rename('count_incoming').reset_index()

    # merge with (cogs by product name)
    df_cogs_average_product = pd.merge(df_cogs_package_id, df_avg_product, left_on='tx_product_name', right_on='product_name', how='left')
    df_cogs_average_product['total_product'] = df_cogs_average_product['tx_quantity_sold'] * df_cogs_average_product['per_unit_product']
    # Product-name fill applies only where the package-level price is missing.
    needs_product_fill = df_cogs_average_product['per_unit_incoming'].isnull() & df_cogs_average_product['per_unit_product'].notnull()
    product_costed = df_cogs_average_product[needs_product_fill].groupby('date')['total_product']
    df_product_sum = product_costed.sum().rename('product_sum').reset_index()
    df_product_count = product_costed.count().rename('product_count').reset_index()

    # prepare summary: left joins keep every period that has revenue, even when
    # nothing in it could be costed by package id.
    df_summary = df_revenue
    for piece in (df_product_sum, df_product_count, df_cogs_id, df_cogs_count, df_total_count):
        df_summary = pd.merge(df_summary, piece, on='date', how='left')
    for column in ('product_sum', 'product_count', 'total_incoming', 'count_incoming'):
        df_summary[column] = df_summary[column].fillna(0)

    # total count = by package id count + by product count
    df_summary['total_count_incoming'] = df_summary['count_incoming'] + df_summary['product_count']
    # total cogs = by package id cogs + by product name cogs; undefined (NaN)
    # rather than 0 when not a single transaction of the period could be costed.
    df_summary['cogs'] = (df_summary['total_incoming'] + df_summary['product_sum']).where(df_summary['total_count_incoming'] > 0)
    df_summary['margin_$'] = df_summary['revenue'] - df_summary['cogs']
    df_summary['margin_%'] = df_summary['margin_$'] / df_summary['revenue']
    df_summary['coverage'] = df_summary['total_count_incoming'] / df_summary['total_count']
    df_summary_simp = df_summary[['date', 'revenue', 'cogs', 'margin_$', 'margin_%', 'total_count_incoming','product_count','count_incoming', 'coverage']].copy()

    return df_summary_simp,df_cogs_average_product




In [None]:
# Inputs for the COGS analysis.
# NOTE(review): the untrimmed incoming frame is passed here, not
# company_incoming_transfer_packages_dataframe_trim -- confirm this is intended.
df_sales = deduped_sales_receipts_with_transactions_dataframe
df_inventory = company_inventory_packages_dataframe
df_out = company_outgoing_transfer_packages_dataframe
df_in = company_incoming_transfer_packages_dataframe

In [None]:
# Monthly COGS summary; index it by period while keeping 'date' as a column.
df_summary_simp, df_cogs_average_product = cogs_analysis(
    df_in, df_out, df_inventory, df_sales, 'monthly'
)
df_summary_simp = df_summary_simp.set_index('date', drop=False)


In [None]:
# rolling
# Four-period rolling mean of the monthly margin %. The "Monthly" chart below plots
# rolling_4m_margin, so it has to be defined for the notebook to run top to bottom.
rolling_4m_margin = df_summary_simp[['margin_%']].rolling(4).mean()
rolling_4m_margin.columns = ['rolling_4m_margin_%']

In [None]:
# Display the monthly revenue / COGS / margin summary.
df_summary_simp

In [None]:
#df_summary_simp.to_csv('surveillance/'+COMPANY_IDENTIFIER[0]+'/COGS/' + COMPANY_IDENTIFIER[0] + '_'+ vc_end_date + '_' +license_number +'_cogs.csv')



## Monthly 

In [None]:
# Two panels side by side: revenue vs. COGS (left) and margin % with its rolling mean (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
revenue_cogs_ax, margin_ax = axes

for column, line_style in (('cogs', 'r-'), ('revenue', 'g-')):
    df_summary_simp[column].plot(ax=revenue_cogs_ax, subplots=True, marker='o', style=line_style)
revenue_cogs_ax.set_title("Revenue & COGS")
revenue_cogs_ax.legend()

df_summary_simp['margin_%'].plot(ax=margin_ax, subplots=True, marker='o', style='b-')
rolling_4m_margin['rolling_4m_margin_%'].plot(ax=margin_ax, subplots=True, style='--')
margin_ax.legend()
# Title the panel through its own Axes; `axes[1] = plt.title(...)` replaced the
# Axes stored in `axes` with a Text object and relied on the implicit current axes.
margin_ax.set_title('Monthly Margin %');

## Monthly overlay

In [None]:
# One frame per calendar year; 'date' holds 'YYYY-MM' strings, so an inclusive
# string range selects the twelve months of a year.
df_summary_simp_2021 = df_summary_simp[df_summary_simp['date'].between('2021-01', '2021-12')]
df_summary_simp_2020 = df_summary_simp[df_summary_simp['date'].between('2020-01', '2020-12')]
df_summary_simp_2022 = df_summary_simp[df_summary_simp['date'].between('2022-01', '2022-12')]

In [None]:
# Two-digit month number, used as the join key when overlaying the years.
df_summary_simp_2020 = df_summary_simp_2020.assign(
    month=pd.to_datetime(df_summary_simp_2020['date']).dt.strftime("%m")
)
df_summary_simp_2021 = df_summary_simp_2021.assign(
    month=pd.to_datetime(df_summary_simp_2021['date']).dt.strftime("%m")
)
df_summary_simp_2022 = df_summary_simp_2022.assign(
    month=pd.to_datetime(df_summary_simp_2022['date']).dt.strftime("%m")
)

In [None]:
# Replace the period index of each yearly frame with a plain 0..n-1 index.
df_summary_simp_2020 = df_summary_simp_2020.reset_index(drop=True)
df_summary_simp_2021 = df_summary_simp_2021.reset_index(drop=True)
df_summary_simp_2022 = df_summary_simp_2022.reset_index(drop=True)

In [None]:
# Align 2020 with 2021 month by month; 2021 rows are the reference (left join).
df_summary_simp_overlay_temp = pd.merge(
    df_summary_simp_2021,
    df_summary_simp_2020,
    on='month',
    how='left',
    suffixes=('_2021', '_2020'),
)

In [None]:
# Add the 2022 months. After the previous merge every left-hand column except
# 'month' already carries a year suffix, so nothing overlaps and the 2022
# columns keep their plain names ('cogs', 'revenue', 'margin_%', ...), which the
# overlay plot reads. `suffixes` accepts exactly two entries; the former
# three-element list was invalid and would raise as soon as any column overlapped.
df_summary_simp_overlay = df_summary_simp_overlay_temp.merge(df_summary_simp_2022, on='month', how='left')

In [None]:
# Three stacked panels (COGS, revenue, margin %), each overlaying the years:
# 2020 in red, 2021 in green and the unsuffixed (2022) columns in blue.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(12, 10))
overlay_panels = (
    ('cogs', "COGS", (1.05, 0.6)),
    ('revenue', "Revenue", (1.25, 0.6)),
    ('margin_%', "% Margin", (1.05, 0.6)),
)
year_styles = (('_2020', 'r-'), ('_2021', 'g-'), ('', 'b-'))
for panel_ax, (metric, panel_title, legend_anchor) in zip(axes, overlay_panels):
    for year_suffix, line_style in year_styles:
        df_summary_simp_overlay[metric + year_suffix].plot(ax=panel_ax, subplots=True, marker='o', style=line_style)
    panel_ax.set_title(panel_title)
    # Anchor the legend outside the plotting area.
    panel_ax.legend(bbox_to_anchor=legend_anchor)

In [None]:
## Monthly since 2021

In [None]:
#df_summary_simp_2021.index = df_summary_simp_2021.date

In [None]:
#create subplot figure with having two side by side plots
#fig, axes = plt.subplots(nrows=1,ncols=2,figsize=(15,6))
# plot first pandas frame in subplot style
#df_summary_simp_2021['cogs'].plot(ax = axes[0],subplots=True,marker = 'o',style='r-') 
#df_summary_simp_2021['revenue'].plot(ax = axes[0],subplots=True,marker = 'o',style='g-',) 
#axes[0].set_title("Revenue & COGS")
#axes[0].legend()
# plot second pandas frame in subplot style
#df_summary_simp_2021['margin_%'].plot(ax = axes[1],subplots=True,marker = 'o',style='b-')
#axes[1].legend()
#axes[1] = plt.title('Monthly Margin %')

In [None]:
#df_summary_simp_2021

In [None]:
#df_summary_simp.to_csv('./dw_monthly.csv')

## Weekly

In [None]:
# Weekly COGS summary; index it by period while keeping 'date' as a column.
df_summary_simp_weekly, df_cogs_average_product_weekly = cogs_analysis(
    df_in, df_out, df_inventory, df_sales, 'weekly'
)
df_summary_simp_weekly = df_summary_simp_weekly.set_index('date', drop=False)


In [None]:
# Four-period rolling mean of the weekly margin %, indexed by period with
# 'date' also kept as a column.
rolling_4w_margin = (
    df_summary_simp_weekly['margin_%']
    .rolling(4)
    .mean()
    .rename('rolling_4w_margin_%')
    .reset_index()
    .set_index('date', drop=False)
)

In [None]:
# Two panels side by side: revenue vs. COGS (left) and margin % with its rolling mean (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
revenue_cogs_ax, margin_ax = axes

for column, line_style in (('cogs', 'r-'), ('revenue', 'g-')):
    df_summary_simp_weekly[column].plot(ax=revenue_cogs_ax, subplots=True, marker='o', style=line_style)
revenue_cogs_ax.set_title("Revenue & COGS")
revenue_cogs_ax.legend()

df_summary_simp_weekly['margin_%'].plot(ax=margin_ax, subplots=True, marker='o', style='b-')
rolling_4w_margin['rolling_4w_margin_%'].plot(ax=margin_ax, subplots=True, style='--')
margin_ax.legend()
# Title the panel through its own Axes; `axes[1] = plt.title(...)` replaced the
# Axes stored in `axes` with a Text object and relied on the implicit current axes.
margin_ax.set_title('Weekly Margin %');

In [None]:
#df_summary_simp_weekly_2021 = df_summary_simp_weekly[df_summary_simp_weekly['date'] >= '2021-01']

In [None]:
## Weekly since 2021

In [None]:
#create subplot figure with having two side by side plots
#fig, axes = plt.subplots(nrows=1,ncols=2,figsize=(15,6))
# plot first pandas frame in subplot style
#df_summary_simp_weekly_2021['cogs'].plot(ax = axes[0],subplots=True,marker = 'o',style='r-') 
#df_summary_simp_weekly_2021['revenue'].plot(ax = axes[0],subplots=True,marker = 'o',style='g-',) 
#axes[0].set_title("Revenue & COGS")
#axes[0].legend()
# plot second pandas frame in subplot style
#df_summary_simp_weekly_2021['margin_%'].plot(ax = axes[1],subplots=True,marker = 'o',style='b-')
#axes[1] = plt.title('Weekly Margin %')

## Recent weeks in 2022

In [None]:
# Weekly figures for periods labelled '2022-01' or later (string comparison on 'date').
df_summary_simp_weekly.loc[
    df_summary_simp_weekly['date'] >= '2022-01',
    ['date', 'revenue', 'cogs', 'margin_%', 'coverage'],
]

In [None]:
# Rolling weekly margin for periods labelled '2022-01' or later.
rolling_4w_margin.loc[rolling_4w_margin['date'] >= '2022-01']

# Items sold at discount (sales price < cost)

In [None]:
def calculate_discount(df_sales_with_incoming_filled:pd.DataFrame) -> pd.DataFrame:
    """Per period, measure how much was sold below its per-unit cost.

    A transaction counts as discounted when its ``per_unit`` price is lower than
    ``per_unit_incoming`` or lower than ``per_unit_product``.

    Args:
        df_sales_with_incoming_filled: transactions with ``date``,
            ``receipt_number``, ``tx_total_price``, ``per_unit``,
            ``per_unit_incoming`` and ``per_unit_product`` columns.

    Returns:
        Frame indexed by ``date`` (also kept as a column) with the receipt counts,
        ``discount_rate`` (discounted receipts / all receipts), the dollar totals
        and ``discount_rate_dollar``. Periods without discounted transactions
        hold NaN in the discounted columns.
    """
    sales = df_sales_with_incoming_filled
    sold_below_cost = (sales['per_unit_incoming'] > sales['per_unit']) | (sales['per_unit_product'] > sales['per_unit'])
    discounted = sales[sold_below_cost]

    # receipt count
    # nunique() always yields a Series, even when `discounted` is empty
    # (groupby.apply returns a DataFrame in that case); dropna=False keeps a
    # missing receipt number counted as one value, as len(unique()) did.
    discount_rate_df = sales.groupby('date')['receipt_number'].nunique(dropna=False).to_frame('total_receipt_count')
    discount_rate_df['total_discounted_receipts_count'] = discounted.groupby('date')['receipt_number'].nunique(dropna=False)
    discount_rate_df['discount_rate'] = discount_rate_df['total_discounted_receipts_count'] / discount_rate_df['total_receipt_count']
    discount_rate_df = discount_rate_df.rename_axis('date').reset_index()

    # dollar amount
    total_receipt_amount = sales.groupby('date')['tx_total_price'].sum().rename('total_receipt_amount').reset_index()
    total_discounted_receipt_amount = discounted.groupby('date')['tx_total_price'].sum().rename('total_discounted_receipt_amount').reset_index()
    discount_rate_dollar_df = total_receipt_amount.merge(total_discounted_receipt_amount, on='date', how='left')
    discount_rate_dollar_df['discount_rate_dollar'] = discount_rate_dollar_df['total_discounted_receipt_amount'] / discount_rate_dollar_df['total_receipt_amount']

    discount_df = discount_rate_df.merge(discount_rate_dollar_df, on='date', how='left')
    discount_df.index = discount_df.date
    return discount_df
                                                                                                                                           
                                                                                                                                           
                                                                                                                                           

In [None]:
# Discount statistics for the monthly and the weekly transaction frames.
discount_rate_monthly = calculate_discount(
    df_cogs_average_product
)
discount_rate_weekly = calculate_discount(
    df_cogs_average_product_weekly
)

In [None]:
from matplotlib.ticker import FuncFormatter

# Periods without discounted sales hold NaN; show them as 0.
ax = discount_rate_monthly[['discount_rate', 'discount_rate_dollar']].fillna(0).plot(marker='o', figsize=(12, 8))
# A formatter keeps the percent labels tied to the tick positions; calling
# set_yticklabels() on automatically located ticks freezes the label text instead.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _position: '{:,.2%}'.format(value)))
ax.set_title('Monthly Discount Rate');

In [None]:
from matplotlib.ticker import FuncFormatter

# Weekly discount rates for periods labelled '2021-01' or later; NaN is shown as 0.
ax = discount_rate_weekly[discount_rate_weekly['date'] >= '2021-01'].fillna(0)[['discount_rate', 'discount_rate_dollar']].plot(marker='o', figsize=(12, 8))
# A formatter keeps the percent labels tied to the tick positions; calling
# set_yticklabels() on automatically located ticks freezes the label text instead.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _position: '{:,.2%}'.format(value)))
ax.set_title('Weekly Discount Rate');

# Items refunded (transaction total < 0)

In [None]:
def calculate_refund(df_sales_with_incoming_filled:pd.DataFrame) -> pd.DataFrame:
    """Per period, measure the share of refunded receipts and refunded dollars.

    A transaction counts as refunded when ``tx_total_price`` is negative.

    Args:
        df_sales_with_incoming_filled: transactions with ``date``,
            ``receipt_number`` and ``tx_total_price`` columns.

    Returns:
        Frame indexed by ``date`` (also kept as a column) with the receipt counts,
        ``refund_rate`` (refunded receipts / all receipts), the dollar totals and
        ``refund_rate_dollar`` (refunded amount, sign flipped, over the period
        total). Periods without refunds hold NaN in the refunded columns.
    """
    sales = df_sales_with_incoming_filled
    refunded = sales[sales['tx_total_price'] < 0]

    # total receipt count
    # nunique() always yields a Series, even when `refunded` is empty
    # (groupby.apply returns a DataFrame in that case); dropna=False keeps a
    # missing receipt number counted as one value, as len(unique()) did.
    refund_rate_df = sales.groupby('date')['receipt_number'].nunique(dropna=False).to_frame('total_receipt_count')
    refund_rate_df['total_refunded_receipts_count'] = refunded.groupby('date')['receipt_number'].nunique(dropna=False)
    refund_rate_df['refund_rate'] = refund_rate_df['total_refunded_receipts_count'] / refund_rate_df['total_receipt_count']
    refund_rate_df = refund_rate_df.rename_axis('date').reset_index()

    # dollar amount
    total_receipt_amount = sales.groupby('date')['tx_total_price'].sum().rename('total_receipt_amount').reset_index()
    total_refunded_receipt_amount = refunded.groupby('date')['tx_total_price'].sum().rename('total_refunded_receipt_amount').reset_index()
    refund_rate_dollar_df = total_receipt_amount.merge(total_refunded_receipt_amount, on='date', how='left')
    # Refunded amounts are negative; flip the sign so the rate is positive.
    refund_rate_dollar_df['refund_rate_dollar'] = refund_rate_dollar_df['total_refunded_receipt_amount'] * -1.00 / refund_rate_dollar_df['total_receipt_amount']

    refund_df = refund_rate_df.merge(refund_rate_dollar_df, on='date', how='left')
    refund_df.index = refund_df.date
    return refund_df


In [None]:
# Refund statistics for the monthly and the weekly transaction frames.
refund_rate_monthly = calculate_refund(
    df_cogs_average_product
)
refund_rate_weekly = calculate_refund(
    df_cogs_average_product_weekly
)

In [None]:
from matplotlib.ticker import FuncFormatter

# Periods without refunds hold NaN; show them as 0.
ax = refund_rate_monthly[['refund_rate', 'refund_rate_dollar']].fillna(0).plot(marker='o', figsize=(12, 8))
# A formatter keeps the percent labels tied to the tick positions; calling
# set_yticklabels() on automatically located ticks freezes the label text instead.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _position: '{:,.2%}'.format(value)))
ax.set_title('Monthly Refund Rate');

In [None]:
from matplotlib.ticker import FuncFormatter

# Periods without refunds hold NaN; show them as 0.
ax = refund_rate_weekly[['refund_rate', 'refund_rate_dollar']].fillna(0).plot(marker='o', figsize=(12, 8))
# A formatter keeps the percent labels tied to the tick positions; calling
# set_yticklabels() on automatically located ticks freezes the label text instead.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _position: '{:,.2%}'.format(value)))
ax.set_title('Weekly Refund Rate');

# COGS analysis with further filldown using product category name

In [None]:
def cogs_analysis_fill_by_product_category(df_in:pd.DataFrame,df_out:pd.DataFrame,df_inventory:pd.DataFrame,df_sales:pd.DataFrame,freq: str) -> Tuple[pd.DataFrame,pd.DataFrame]:
    """Summarise revenue, estimated COGS and margin per period with a three-level cost fill.

    The cost of a sale is ``tx_quantity_sold`` times, in order of preference:
    the mean per-unit incoming price of the package with the same id, else the
    mean per-unit incoming price of the same product name, else the mean
    per-unit incoming price of the same product category name.

    Side effects (unchanged from the original implementation): adds
    ``per_unit_incoming`` to ``df_in`` and ``per_unit`` / ``date`` to
    ``df_sales`` in place.

    Args:
        df_in: incoming transfer packages (``package_id``, ``product_name``,
            ``product_category_name``, ``shipper_wholesale_price``,
            ``shipped_quantity``).
        df_out: unused; kept for interface compatibility.
        df_inventory: unused; kept for interface compatibility.
        df_sales: sales transactions (``tx_package_id``, ``tx_product_name``,
            ``tx_product_category_name``, ``tx_total_price``,
            ``tx_quantity_sold``, ``sales_datetime``).
        freq: ``'monthly'`` (periods formatted ``%Y-%m``) or ``'weekly'`` (``%Y-%W``).

    Returns:
        ``(summary, sales_with_costs)``. ``summary`` is indexed by ``date`` (also
        kept as a column) with one row per period that has sales; ``cogs`` and
        the margins are NaN for a period in which no transaction could be
        costed. ``sales_with_costs`` is the sales frame merged with the three
        per-unit price levels.

    Raises:
        ValueError: if ``freq`` is neither ``'monthly'`` nor ``'weekly'``.
    """
    period_formats = {'monthly': "%Y-%m", 'weekly': "%Y-%W"}
    if freq not in period_formats:
        # Previously an unknown freq silently reused whatever 'date' column was left on df_sales.
        raise ValueError(f"freq must be 'monthly' or 'weekly', got {freq!r}")

    df_sales['per_unit'] = df_sales['tx_total_price'] / df_sales['tx_quantity_sold']
    df_sales['date'] = df_sales['sales_datetime'].dt.strftime(period_formats[freq])

    # total # trxns and revenue per period
    sales_by_period = df_sales.groupby('date')['tx_total_price']
    df_total_count = sales_by_period.count().rename('total_count').reset_index()
    df_revenue = sales_by_period.sum().rename('revenue').reset_index()

    df_in['per_unit_incoming'] = df_in['shipper_wholesale_price'] / df_in['shipped_quantity']

    # A zero shipped quantity yields +/-inf. Treat those prices as missing *before*
    # averaging, otherwise one such row turns a whole product/category average into inf.
    df_in_price = df_in[df_in['shipper_wholesale_price'].notnull()].copy()
    df_in_price['per_unit_incoming'] = df_in_price['per_unit_incoming'].replace([numpy.inf, -numpy.inf], numpy.nan)
    # per unit price by package id
    df_avg_incoming_price = df_in_price.groupby('package_id')['per_unit_incoming'].mean().reset_index()
    # per unit price by product name
    df_avg_product = df_in_price.groupby('product_name')['per_unit_incoming'].mean().rename('per_unit_product').reset_index()
    # per unit price by product category name
    df_avg_product_cat = df_in_price.groupby('product_category_name')['per_unit_incoming'].mean().rename('per_unit_product_cat').reset_index()

    # merge with (cogs by package id)
    df_cogs_package_id = pd.merge(df_sales, df_avg_incoming_price, left_on='tx_package_id', right_on='package_id', how='left')
    df_cogs_package_id['total_incoming'] = df_cogs_package_id['per_unit_incoming'] * df_cogs_package_id['tx_quantity_sold']
    # Also clears infinite 'per_unit' values caused by a zero quantity sold.
    df_cogs_package_id = df_cogs_package_id.replace([numpy.inf, -numpy.inf], numpy.nan)
    package_costed = df_cogs_package_id[df_cogs_package_id['total_incoming'].notnull()].groupby('date')['total_incoming']
    # sum of cogs / count of trxns costed by package id
    df_cogs_id = package_costed.sum().reset_index()
    df_cogs_count = package_costed.count().rename('count_incoming').reset_index()

    # merge with (cogs by product name)
    df_cogs_average_product = pd.merge(df_cogs_package_id, df_avg_product, left_on='tx_product_name', right_on='product_name', how='left')
    df_cogs_average_product['total_product'] = df_cogs_average_product['tx_quantity_sold'] * df_cogs_average_product['per_unit_product']
    # merge with (cogs by product category name)
    df_cogs_average_product_cat = pd.merge(df_cogs_average_product, df_avg_product_cat, left_on='tx_product_category_name', right_on='product_category_name', how='left')
    df_cogs_average_product_cat['total_product_cat'] = df_cogs_average_product_cat['tx_quantity_sold'] * df_cogs_average_product_cat['per_unit_product_cat']

    # Each fill level applies only where every more specific price is missing.
    no_package_price = df_cogs_average_product_cat['per_unit_incoming'].isnull()
    no_product_price = df_cogs_average_product_cat['per_unit_product'].isnull()
    has_category_price = df_cogs_average_product_cat['per_unit_product_cat'].notnull()
    product_costed = df_cogs_average_product_cat[no_package_price & ~no_product_price].groupby('date')['total_product']
    category_costed = df_cogs_average_product_cat[no_package_price & no_product_price & has_category_price].groupby('date')['total_product_cat']

    # sum of cogs / count of trxns filled down by product name
    df_product_sum = product_costed.sum().rename('product_sum').reset_index()
    df_product_count = product_costed.count().rename('product_count').reset_index()
    # sum of cogs / count of trxns filled down by product category name
    df_product_cat_sum = category_costed.sum().rename('product_cat_sum').reset_index()
    df_product_cat_count = category_costed.count().rename('product_cat_count').reset_index()

    # prepare summary: left joins keep every period that has revenue, even when
    # nothing in it could be costed by package id.
    df_summary = df_revenue
    for piece in (df_product_sum, df_product_count, df_product_cat_sum, df_product_cat_count, df_cogs_id, df_cogs_count, df_total_count):
        df_summary = pd.merge(df_summary, piece, on='date', how='left')
    for column in ('product_sum', 'product_count', 'product_cat_sum', 'product_cat_count', 'total_incoming', 'count_incoming'):
        df_summary[column] = df_summary[column].fillna(0)

    # total count = by package id count + by product count + by product category count
    df_summary['total_count_incoming'] = df_summary['count_incoming'] + df_summary['product_count'] + df_summary['product_cat_count']
    # total cogs = package id cogs + product name cogs + product category cogs;
    # undefined (NaN) rather than 0 when not a single transaction could be costed.
    df_summary['cogs'] = (df_summary['total_incoming'] + df_summary['product_sum'] + df_summary['product_cat_sum']).where(df_summary['total_count_incoming'] > 0)
    df_summary['margin_$'] = df_summary['revenue'] - df_summary['cogs']
    df_summary['margin_%'] = df_summary['margin_$'] / df_summary['revenue']
    df_summary['coverage'] = df_summary['total_count_incoming'] / df_summary['total_count']
    # Copy so callers can add columns to the result without writing into a slice of df_summary.
    df_summary_simp = df_summary[['date', 'revenue', 'cogs', 'margin_$', 'margin_%', 'total_count_incoming','product_count','product_cat_count','total_count','count_incoming', 'coverage']].copy()
    df_summary_simp.index = df_summary_simp.date
    return df_summary_simp,df_cogs_average_product_cat




In [None]:
# Inputs for the COGS analysis with product-category fill.
# NOTE(review): the untrimmed incoming frame is passed here, not
# company_incoming_transfer_packages_dataframe_trim -- confirm this is intended.
df_sales = deduped_sales_receipts_with_transactions_dataframe
df_inventory = company_inventory_packages_dataframe
df_out = company_outgoing_transfer_packages_dataframe
df_in = company_incoming_transfer_packages_dataframe

In [None]:
# Monthly, then weekly, COGS summaries with the product-category fill.
df_summary_simp_cat_monthly, df_cogs_average_product_cat_monthly = cogs_analysis_fill_by_product_category(
    df_in, df_out, df_inventory, df_sales, 'monthly'
)
df_summary_simp_cat_weekly, df_cogs_average_product_cat_weekly = cogs_analysis_fill_by_product_category(
    df_in, df_out, df_inventory, df_sales, 'weekly'
)




In [None]:
# Share of costed transactions per costing source (package id, product name,
# product category); 'total' adds the three counts back up over the same denominator.
monthly_costed_count = df_summary_simp_cat_monthly['total_count_incoming']
for share_column, count_column in (
    ('package_id_perc', 'count_incoming'),
    ('product_perc', 'product_count'),
    ('product_cat_perc', 'product_cat_count'),
):
    df_summary_simp_cat_monthly[share_column] = df_summary_simp_cat_monthly[count_column] / monthly_costed_count
df_summary_simp_cat_monthly['total'] = (
    df_summary_simp_cat_monthly['product_cat_count']
    + df_summary_simp_cat_monthly['count_incoming']
    + df_summary_simp_cat_monthly['product_count']
) / monthly_costed_count



In [None]:
# Share of costed transactions per costing source (package id, product name,
# product category); 'total' adds the three counts back up over the same denominator.
weekly_costed_count = df_summary_simp_cat_weekly['total_count_incoming']
for share_column, count_column in (
    ('package_id_perc', 'count_incoming'),
    ('product_perc', 'product_count'),
    ('product_cat_perc', 'product_cat_count'),
):
    df_summary_simp_cat_weekly[share_column] = df_summary_simp_cat_weekly[count_column] / weekly_costed_count
df_summary_simp_cat_weekly['total'] = (
    df_summary_simp_cat_weekly['product_cat_count']
    + df_summary_simp_cat_weekly['count_incoming']
    + df_summary_simp_cat_weekly['product_count']
) / weekly_costed_count



In [None]:
# Latest periods of the monthly summary with product-category fill.
df_summary_simp_cat_monthly.tail()

In [None]:
# Latest periods of the monthly summary without the category fill, for comparison.
df_summary_simp.tail()

In [None]:
#df_summary_simp_cat_monthly.to_csv('surveillance/'+COMPANY_IDENTIFIER[0]+'/COGS/' + COMPANY_IDENTIFIER[0] + '_'+ vc_end_date + '_' +license_number +'_cogs_pc_filldown.csv')



## Monthly

In [None]:
# Two panels side by side: revenue vs. COGS (left) and margin % (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
revenue_cogs_ax, margin_ax = axes

for column, line_style in (('cogs', 'r-'), ('revenue', 'g-')):
    df_summary_simp_cat_monthly[column].plot(ax=revenue_cogs_ax, subplots=True, marker='o', style=line_style)
revenue_cogs_ax.set_title("Revenue & COGS")
revenue_cogs_ax.legend()

df_summary_simp_cat_monthly['margin_%'].plot(ax=margin_ax, subplots=True, marker='o', style='b-')
# Title the panel through its own Axes; `axes[1] = plt.title(...)` replaced the
# Axes stored in `axes` with a Text object and relied on the implicit current axes.
margin_ax.set_title('Monthly Margin %');

In [None]:
## Monthly since 2021

In [None]:
#df_summary_simp_cat_monthly_2021 = df_summary_simp_cat_monthly[df_summary_simp_cat_monthly['date'] >= '2021-01']

In [None]:
#create subplot figure with having two side by side plots
#fig, axes = plt.subplots(nrows=1,ncols=2,figsize=(15,6))
# plot first pandas frame in subplot style
#df_summary_simp_cat_monthly_2021['cogs'].plot(ax = axes[0],subplots=True,marker = 'o',style='r-') 
#df_summary_simp_cat_monthly_2021['revenue'].plot(ax = axes[0],subplots=True,marker = 'o',style='g-',) 
#axes[0].set_title("Revenue & COGS")
#axes[0].legend()
# plot second pandas frame in subplot style
#df_summary_simp_cat_monthly_2021['margin_%'].plot(ax = axes[1],subplots=True,marker = 'o',style='b-')
#axes[1] = plt.title('Monthly Margin %')

## Weekly

In [None]:
# Two side-by-side weekly charts: revenue vs. COGS (left) and margin % (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
# Left panel: COGS in red and revenue in green, drawn on the same Axes.
df_summary_simp_cat_weekly['cogs'].plot(ax=axes[0], subplots=True, marker='o', style='r-')
df_summary_simp_cat_weekly['revenue'].plot(ax=axes[0], subplots=True, marker='o', style='g-')
axes[0].set_title("Revenue & COGS")
axes[0].legend()
# Right panel: margin percentage in blue.
df_summary_simp_cat_weekly['margin_%'].plot(ax=axes[1], subplots=True, marker='o', style='b-')
# Title the right-hand Axes explicitly. The previous `axes[1] = plt.title(...)`
# relied on the implicit "current axes" and replaced the Axes entry in `axes`
# with a Text object. The trailing `;` suppresses the Text repr in the cell output.
axes[1].set_title('Weekly Margin %');

In [None]:
## Weekly since 2021

In [None]:
#df_summary_simp_cat_weekly_2021 = df_summary_simp_cat_weekly[df_summary_simp_cat_weekly['date'] >= '2021-01']

In [None]:
#create subplot figure with having two side by side plots
#fig, axes = plt.subplots(nrows=1,ncols=2,figsize=(15,6))
# plot first pandas frame in subplot style
#df_summary_simp_cat_weekly_2021['cogs'].plot(ax = axes[0],subplots=True,marker = 'o',style='r-') 
#df_summary_simp_cat_weekly_2021['revenue'].plot(ax = axes[0],subplots=True,marker = 'o',style='g-',) 
#axes[0].set_title("Revenue & COGS")
#axes[0].legend()
# plot second pandas frame in subplot style
#df_summary_simp_cat_weekly_2021['margin_%'].plot(ax = axes[1],subplots=True,marker = 'o',style='b-')
#axes[1] = plt.title('Weekly Margin %')

# Compare product name fill VS product category name fill

## Compare margin & cogs

In [None]:
# Overlay the weekly margin % of the two summary frames on a single chart.
fig, ax = plt.subplots(figsize=(15, 6))
# Both series are named 'margin_%', so give each an explicit label; otherwise
# the legend shows two identical entries that can only be told apart by colour.
df_summary_simp_cat_weekly['margin_%'].plot(
    ax=ax, marker='o', style='r-', label='margin_% (df_summary_simp_cat_weekly)')
df_summary_simp_weekly['margin_%'].plot(
    ax=ax, marker='o', style='g-', label='margin_% (df_summary_simp_weekly)')
ax.set_title('Weekly margin %')
ax.legend();

In [None]:
# Overlay the weekly COGS of the two summary frames on a single chart.
fig, ax = plt.subplots(figsize=(15, 6))
# Both series are named 'cogs', so give each an explicit label; otherwise
# the legend shows two identical entries that can only be told apart by colour.
df_summary_simp_cat_weekly['cogs'].plot(
    ax=ax, marker='o', style='r-', label='cogs (df_summary_simp_cat_weekly)')
df_summary_simp_weekly['cogs'].plot(
    ax=ax, marker='o', style='g-', label='cogs (df_summary_simp_weekly)')
ax.set_title('Weekly COGS')
ax.legend();

## Fill percentage breakdown

In [None]:
# Line chart of the four monthly fill-ratio columns.
ax = df_summary_simp_cat_monthly[
    ['package_id_perc', 'product_perc', 'product_cat_perc', 'total']
].plot(
    figsize=(12, 8),
    marker='o',
)


In [None]:
# Line chart of the four weekly fill-ratio columns.
ax = df_summary_simp_cat_weekly[
    ['package_id_perc', 'product_perc', 'product_cat_perc', 'total']
].plot(
    figsize=(12, 8),
    marker='o',
)


## Look at what product categories were filled

In [None]:
# Which product categories were filled at the category level (monthly)?
# Keep rows where both the product-level and incoming per-unit values are
# missing but the category-level value exists, count packages per
# (date, category), and draw the counts as stacked bars.
(
    df_cogs_average_product_cat_monthly[
        df_cogs_average_product_cat_monthly['per_unit_product'].isnull()
        & df_cogs_average_product_cat_monthly['per_unit_incoming'].isnull()
        & df_cogs_average_product_cat_monthly['per_unit_product_cat'].notnull()
    ]
    .groupby(['date', 'tx_product_category_name'])['tx_package_id']
    .count()
    .unstack()
    .plot.bar(figsize=(12, 8), stacked=True, colormap='Paired')
)
# Put the legend outside the plot area, to the right.
ax = plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))


In [None]:
# Which product categories were filled at the category level (weekly)?
# Keep rows where both the product-level and incoming per-unit values are
# missing but the category-level value exists, count packages per
# (date, category), and draw the counts as stacked bars.
(
    df_cogs_average_product_cat_weekly[
        df_cogs_average_product_cat_weekly['per_unit_product'].isnull()
        & df_cogs_average_product_cat_weekly['per_unit_incoming'].isnull()
        & df_cogs_average_product_cat_weekly['per_unit_product_cat'].notnull()
    ]
    .groupby(['date', 'tx_product_category_name'])['tx_package_id']
    .count()
    .unstack()
    .plot.bar(figsize=(12, 8), stacked=True, colormap='Paired')
)
# Put the legend outside the plot area, to the right.
ax = plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))


## Estimated price error using product name average filldown

In [None]:
# Error of each fill-down estimate relative to the incoming per-unit value:
# incoming minus the product-level estimate, and incoming minus the
# category-level estimate.
df_cogs_average_product_cat_weekly['product_price_error'] = (
    df_cogs_average_product_cat_weekly['per_unit_incoming']
    .sub(df_cogs_average_product_cat_weekly['per_unit_product'])
)
df_cogs_average_product_cat_weekly['product_cat_price_error'] = (
    df_cogs_average_product_cat_weekly['per_unit_incoming']
    .sub(df_cogs_average_product_cat_weekly['per_unit_product_cat'])
)


In [None]:
# Histogram (50 bins) of the product-level price error.
ax = df_cogs_average_product_cat_weekly[['product_price_error']].hist(bins = 50)

In [None]:
# Summary statistics of the product-level price error.
df_cogs_average_product_cat_weekly[['product_price_error']].describe()

## Estimated price error using product category name average filldown

In [None]:
# Histogram (50 bins) of the category-level price error.
ax = df_cogs_average_product_cat_weekly[['product_cat_price_error']].hist(bins = 50)

In [None]:
# Summary statistics of the category-level price error.
df_cogs_average_product_cat_weekly[['product_cat_price_error']].describe()

# ~ THE END ~

In [None]:
from typing import Optional

In [None]:
def parse_gram_numeric(name: str) -> Optional[float]:
    """Extract the gram quantity encoded in a product name.

    The name is lower-cased and split on whitespace. A token counts as a gram
    quantity when it ends in ``g`` and the text before the ``g`` is either a
    number (``0.5g``, ``3g``) or an integer fraction (``1/8g``).

    Args:
        name: Product name to scan.

    Returns:
        The quantity as a float when exactly one gram token is found;
        ``None`` when there is no such token or more than one (ambiguous).
    """
    weights = []
    for word in name.lower().split():
        # A bare `g` (or an empty token) carries no quantity.
        if len(word) <= 1 or not word.endswith('g'):
            continue
        quantity = word[:-1]
        try:
            weight = float(quantity)
        except ValueError:
            # Not a plain number; it may still be a fraction of the form `x/y`.
            parts = quantity.split('/')
            if len(parts) != 2:
                continue
            try:
                weight = int(parts[0]) / int(parts[1])
            except (ValueError, ZeroDivisionError):
                # Non-integer parts, or a zero denominator such as `1/0g`:
                # treat the token as unparseable instead of raising.
                continue
        weights.append(weight)

    # Only an unambiguous, single match is trusted.
    return weights[0] if len(weights) == 1 else None

In [None]:
def find_n_pack(name: str) -> Optional[int]:
    """Extract the pack count from a product name such as ``... 4 Pack ...``.

    The name is lower-cased, hyphens are treated as spaces (so ``10-pack``
    works), and the token immediately before the word ``pack`` is read as the
    count.

    Args:
        name: Product name to scan.

    Returns:
        The pack count as an ``int``, or ``None`` when ``pack`` is absent,
        appears more than once, is the first word, or is not preceded by an
        integer token.
    """
    words = name.lower().replace('-', ' ').split()
    pack_positions = [position for position, word in enumerate(words) if word == 'pack']
    if len(pack_positions) != 1 or pack_positions[0] == 0:
        return None
    try:
        # Return an int as the annotation promises (previously the raw string
        # token was returned, e.g. '4' or 'variety').
        return int(words[pack_positions[0] - 1])
    except ValueError:
        return None


In [None]:
#set(merged[merged['product_category_name'] == 'Raw Pre-Rolls']['product_name'].fillna('').apply(find_n_pack))

In [None]:
# Display the merged frame before the parsed columns are added.
merged

In [None]:
# Parse the gram weight and the pack count out of each transaction's product
# name. Missing names become '' so the parsers always receive a str.
for _target_column, _name_parser in (
    ('parsed_unit_weight', parse_gram_numeric),
    ('parsed_unit_pack', find_n_pack),
):
    merged[_target_column] = merged['tx_product_name'].fillna('').apply(_name_parser)

In [None]:
# Make the pack count numeric; anything that cannot be parsed becomes NaN.
merged['parsed_unit_pack'] = pd.to_numeric(merged['parsed_unit_pack'], errors='coerce')

In [None]:
# Product of parsed unit weight and parsed pack count
# (NaN when either factor is missing).
merged['test'] = merged['parsed_unit_weight'].mul(merged['parsed_unit_pack'])

In [None]:
# Rows for which both the unit weight and the pack count were parsed.
# `.copy()` makes `f` an independent frame: later cells add columns to it, and
# assigning into a filtered slice is chained assignment (SettingWithCopyWarning,
# which the notebook-wide warnings filter hides).
f = merged[merged['test'].notnull()].copy()

In [None]:
# Scale the category-level per-unit value by the parsed total ('test').
f['per_unit_product_cat_2'] = f['per_unit_product_cat'].mul(f['test'])

In [None]:
# Error of the scaled category estimate: incoming value minus per_unit_product_cat_2.
f['product_cat_price_error_2'] = f['per_unit_incoming'].sub(f['per_unit_product_cat_2'])

In [None]:
# Rows of df_in whose product category is 'Raw Pre-Rolls'.
pr = df_in.loc[df_in['product_category_name'].eq('Raw Pre-Rolls')]

In [None]:
# Boolean mask: per_unit_incoming lies between its 5th and 95th percentiles
# (both ends inclusive, the default of `between`).
a = pr['per_unit_incoming'].between(
    left=pr['per_unit_incoming'].quantile(.05),
    right=pr['per_unit_incoming'].quantile(.95),
)




In [None]:
# per_unit_incoming values that fall inside the 5th-95th percentile mask `a`.
pr.loc[a, 'per_unit_incoming']

In [None]:
# Display the 'Raw Pre-Rolls' subset.
pr

In [None]:
# Summary statistics of the unscaled category-level price error on `f`.
f[['product_cat_price_error']].describe()

In [None]:
# Summary statistics of the scaled category-level price error on `f`,
# for comparison with the unscaled product_cat_price_error.
f[['product_cat_price_error_2']].describe()

In [None]:
# Mean incoming per-unit value for each
# (product category, shipped unit of measure) pair.
(
    df_in
    .groupby(['product_category_name', 'shipped_unit_of_measure'])[['per_unit_incoming']]
    .mean()
)



In [None]:
# Inspect the incoming rows of one specific 'Raw Pre-Rolls' product.
df_in.loc[
    df_in['product_category_name'].eq('Raw Pre-Rolls')
    & df_in['product_name'].eq('M00000949931: LA Kush Cake 4 Pack 0.5g Blunts')
]

In [None]:
# Share of merged rows for which 'test' could be computed.
merged['test'].notna().sum() / len(merged)

In [None]:
#df_in[['product_category_name','received_unit_of_measure']].groupby(['product_category_name','received_unit_of_measure']).count()

In [None]:
## Compare computed inventory vs Metrc inventory (metrc_packages)

In [None]:
# Inventory is evaluated for today only; the date is formatted as MM/DD/YYYY.
TODAY_DATE = date.today().strftime('%m/%d/%Y')
INVENTORY_DATES = [TODAY_DATE]
# Parameters passed to the inventory analysis helpers.
ANALYSIS_PARAMS = dict(
    sold_threshold=1.0,
    find_parent_child_relationships=False,
)
print('Today is {}'.format(TODAY_DATE))

In [None]:
# Build the analysis context: output root is 'tmp', with cached-dataframe reads
# and saving of downloaded dataframes both switched off.
analysis_ctx = inventory_types.AnalysisContext(
    output_root_dir='tmp',
    read_params=inventory_types.ReadParams(
        use_cached_dataframes=False
    ),
    write_params=inventory_types.WriteOutputParams(
        save_download_dataframes=False
    )
)

d = util.Download()
sql_helper = util.BigQuerySQLHelper(
    ctx=analysis_ctx,
    engine=engine,
)
# Hand the company dataframes loaded earlier in the notebook to the Download object.
# NOTE(review): presumably sql_helper is only used for data not covered by these
# frames — confirm against util.Download.download_dataframes.
d.download_dataframes(
    all_dataframes_dict=download_util.AllDataframesDict(
        incoming_transfer_packages_dataframe=company_incoming_transfer_packages_dataframe,
        outgoing_transfer_packages_dataframe=company_outgoing_transfer_packages_dataframe,
        sales_transactions_dataframe=company_sales_transactions_dataframe,
        sales_receipts_dataframe=company_sales_receipts_dataframe,
        inventory_packages_dataframe=company_inventory_packages_dataframe,
    ),
    sql_helper=sql_helper
)

In [None]:
# NOTE(review): `q` is not referenced by any later cell visible here — confirm it is still needed.
# NOTE(review): COMPANY_IDENTIFIER is defined as a list at the top of the notebook and is
# passed as both company_name and company_identifier — confirm util.Query accepts a list.
q = util.Query(
    inventory_dates=[], # gets filled in once we have the dataframes
    transfer_packages_start_date=TRANSFER_PACKAGES_START_DATE,
    sales_transactions_start_date=SALES_TRANSACTIONS_START_DATE,
    company_name=COMPANY_IDENTIFIER,
    company_identifier=COMPANY_IDENTIFIER,
    license_numbers=[],
)

# Build the per-package histories from the downloaded data and print their counts.
id_to_history = util.get_histories(d, ANALYSIS_PARAMS)
util.print_counts(analysis_ctx, id_to_history)

In [None]:
# For every inventory date: rebuild the computed inventory from the package
# histories, keep the resulting frame keyed by date, and collect its valuation.
date_to_inventory_packages_dataframe = {}

id_to_history = util.get_histories(d, ANALYSIS_PARAMS)
inventory_valuations = []

for inventory_date in INVENTORY_DATES:
    computed_inventory_package_records = util.create_inventory_dataframe_by_date(
        id_to_history, inventory_date, params=ANALYSIS_PARAMS)
    # pandas is imported as `pd` in this notebook; the bare name `pandas` is not
    # bound by the import cell, so `pandas.DataFrame` raised NameError on a
    # fresh kernel.
    computed_inventory_packages_dataframe = pd.DataFrame(
        computed_inventory_package_records,
        columns=util.get_inventory_column_names(),
    )
    date_to_inventory_packages_dataframe[inventory_date] = computed_inventory_packages_dataframe
    inventory_valuations.append(valuations_util.get_total_valuation_for_date(
        computed_inventory_packages_dataframe=computed_inventory_packages_dataframe,
        company_incoming_transfer_packages_dataframe=company_incoming_transfer_packages_dataframe,
        inventory_date=inventory_date,
        using_nb=True,
    ))

In [None]:
# Metrc-reported inventory restricted to the columns used for the comparison,
# ordered by package id.
from_packages_inventory_dataframe = (
    company_inventory_packages_dataframe
    .loc[:, [
        'package_id',
        'packaged_date',
        'unit_of_measure',
        'product_category_name',
        'product_name',
        'quantity',
    ]]
    .sort_values('package_id')
)

# Lookup from package id (as a string) to its Metrc-reported row.
package_id_to_actual_row = {
    str(actual_row['package_id']): actual_row
    for _, actual_row in from_packages_inventory_dataframe.iterrows()
}

# Compare today's computed inventory against what Metrc reports.
comparison_options = {
    'num_errors_to_show': 10,
    'accept_computed_when_sold_out': True
}
res = util.compare_inventory_dataframes(
    ctx=analysis_ctx,
    computed=date_to_inventory_packages_dataframe[TODAY_DATE],
    actual=from_packages_inventory_dataframe,
    options=comparison_options,
)

In [None]:
# Value the Metrc-reported inventory packages using the incoming transfer
# packages, then print the result rounded to two decimals.
# NOTE(review): presumably incoming transfer prices serve as the cost basis —
# confirm in valuations_util.get_inventory_valuation.
inventory_cost_valuation = valuations_util.get_inventory_valuation(
    inventory_packages_dataframe=company_inventory_packages_dataframe,
    incoming_transfer_packages_dataframe=company_incoming_transfer_packages_dataframe,
)
print(f'Cost valuation of Metrc-reported inventory as of today: ${round(inventory_cost_valuation, 2)}')

In [None]:
## Export data

In [None]:
# Set to True to write the export dataframes to .xlsx and .csv files;
# while False, the export cell only prepares the frames and file names.
is_export_enabled = False

In [None]:
import time
from datetime import date

# NOTE(review): this rebinds TODAY_DATE with '-' separators (safe for file
# names). The inventory cells above key their results by the '/'-separated
# value, so re-running them after this cell needs TODAY_DATE redefined first.
TODAY_DATE = date.today().strftime('%m-%d-%Y')
NOW = int(time.time())


def _convert_utc_datetimes_to_dates(dataframe):
    """Replace every `datetime64[ns, UTC]` column of `dataframe` with plain dates.

    The frame is modified in place and also returned for convenience.
    """
    for date_column in dataframe.select_dtypes(include=['datetime64[ns, UTC]']).columns:
        dataframe[date_column] = dataframe[date_column].dt.date
    return dataframe


def _export_dataframe(dataframe, file_name, description):
    """Write `dataframe` to `<file_name>.xlsx` and `<file_name>.csv`, printing progress."""
    num_rows = len(dataframe.index)
    print(f'Exporting {num_rows} {description} to files...')
    dataframe.reset_index().to_excel(f'{file_name}.xlsx')
    dataframe.reset_index().to_csv(f'{file_name}.csv')
    print(f'Exported {num_rows} {description} to files')


# NOTE(review): the export_* names alias the source frames (no copy is made),
# so the date conversion below also changes the original dataframes.
# NOTE(review): COMPANY_IDENTIFIER is a list, so its repr (e.g. "['DL']")
# ends up in the file names — confirm that is intended.
export_incoming_transfer_packages_dataframe = _convert_utc_datetimes_to_dates(
    company_incoming_transfer_packages_dataframe)
incoming_transfer_packages_file_name = f'~/Downloads/{COMPANY_IDENTIFIER}_incoming_transfer_packages_{TODAY_DATE}_{NOW}'

export_inventory_packages_dataframe = _convert_utc_datetimes_to_dates(
    company_inventory_packages_dataframe)
inventory_packages_file_name = f'~/Downloads/{COMPANY_IDENTIFIER}_inventory_packages_{TODAY_DATE}_{NOW}'

export_sales_transactions_dataframe = _convert_utc_datetimes_to_dates(
    deduped_sales_receipts_with_transactions_dataframe)
sales_transactions_file_name = f'~/Downloads/{COMPANY_IDENTIFIER}_sales_transactions_{SALES_TRANSACTIONS_START_DATE}_{TODAY_DATE}_{NOW}'

if is_export_enabled:
    _export_dataframe(
        export_incoming_transfer_packages_dataframe,
        incoming_transfer_packages_file_name,
        'incoming transfer packages',
    )
    _export_dataframe(
        export_inventory_packages_dataframe,
        inventory_packages_file_name,
        'inventory packages',
    )
    # Fix: the sales files were previously written from
    # export_inventory_packages_dataframe (copy-paste slip), so they held
    # inventory packages instead of sales transactions.
    _export_dataframe(
        export_sales_transactions_dataframe,
        sales_transactions_file_name,
        'sales transactions',
    )