In [None]:
"""
  A manager that facilitates reading and writing files to GCP Storage
"""
import logging
import os
import subprocess
from io import BytesIO
from typing import List, Dict, Callable, Tuple, Union
from mypy_extensions import TypedDict

from google.cloud import storage # type: ignore

def get_path_prefix(root_dir: str, relative_path: str) -> str:
    """Build the object-name prefix for a directory below ``root_dir``.

    Each non-empty part contributes itself followed by a single '/'.
    ``root_dir`` only loses trailing slashes, ``relative_path`` loses slashes
    on both ends, and a ``relative_path`` of '' or '/' contributes nothing.
    """
    segments = []
    if root_dir:
        segments.append(root_dir.rstrip('/'))
    if relative_path and relative_path != '/':
        segments.append(relative_path.strip('/'))

    return ''.join(segment + '/' for segment in segments)

class PathNode(TypedDict, total=False):
    # One entry of a directory listing. total=False makes every key optional.
    name: str  # base name of the entry (no directory part)
    type: str  # 'folder' or 'file'
    size: float  # size of the entry (presumably bytes; confirm with producers)
        
class GCPStorageManager(object):
    """Read, write, list and sync files in one Google Cloud Storage bucket.

    Relative paths passed to the public methods are resolved against the
    optional ``root`` directory given in ``storage_details``.

    NOTE(review): ``get_storage_details`` and ``download_and_unzip`` use a
    ``file_utils`` module that is not imported in the visible part of this
    file; they raise ``NameError`` unless it is imported elsewhere.
    """

    def __init__(self, storage_details: Dict, verbose: bool) -> None:
        """Create a storage client for the bucket in ``storage_details``.

        Args:
            storage_details: mapping with a ``'bucket'`` entry and an optional
                ``'root'`` entry (directory inside the bucket).
            verbose: when true, transfers are logged at INFO level.
        """
        self._bucket_name = storage_details.get('bucket')
        self._root_dir = storage_details.get('root')
        self.client = storage.Client()
        self.verbose = verbose

    def _abs_path(self, rel_path: str) -> str:
        """Return the object name of ``rel_path`` inside the bucket.

        Without a root directory ``rel_path`` is returned untouched.
        """
        if not self._root_dir:
            return rel_path

        # Join with '/' explicitly. os.path.join would use the OS separator
        # and would silently drop the root when rel_path starts with '/'.
        return self._root_dir.rstrip('/') + '/' + rel_path.lstrip('/')

    def _build_current_url(self) -> str:
        """Return the cloud-console browser URL of the bucket (plus root)."""
        url = "https://console.cloud.google.com/storage/browser/" + self._bucket_name
        if self._root_dir:
            url = url + "/" + self._root_dir
        return url

    def _object_url(self, rel_path: str) -> str:
        """Return the console URL of one object, used only for log messages.

        Built from the absolute object name so the root directory appears
        exactly once in the URL.
        """
        return ("https://console.cloud.google.com/storage/browser/"
                + self._bucket_name + "/" + self._abs_path(rel_path))

    def get_root_dir(self) -> str:
        """Return the configured root directory (may be empty or None)."""
        return self._root_dir

    def get_storage_details(self) -> Dict:
        """Return provider, bucket and root describing this storage."""
        # NOTE(review): file_utils is not imported in the visible file.
        return {
            'provider': file_utils.ProviderList.GCP_STORAGE,
            'bucket': self._bucket_name,
            'root': self._root_dir
        }

    def get_sync_login_command(self, env_vars: Dict) -> Union[List[str], None]:
        """Return the login command needed before syncing; always None here."""
        return None

    def get_sync_url(self, path: str) -> str:
        """Return the ``gs://`` URL for ``path`` ('' or '/' means the root)."""
        if not path or path == '/':
            # A missing root must not be rendered as the literal text "None".
            abs_path = self._root_dir or ''
        else:
            abs_path = self._abs_path(path)
        return f'gs://{self._bucket_name}/{abs_path}'

    def get_sync_command(self, src_dir: str, remote_path: str) -> Callable:
        """Return a callable that runs ``gsutil rsync -r`` to ``remote_path``.

        The callable returns ``(exit_code, stderr_text)``; both are
        ``(0, '')`` when gsutil succeeds.
        """

        def sync_call() -> Tuple[int, str]:
            cmd = ['gsutil', 'rsync', '-r', src_dir, self.get_sync_url(remote_path)]
            completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if completed.returncode == 0:
                return 0, ''

            stderr_text = completed.stderr.decode('utf-8')
            logging.error('Copy response is: {}'.format(stderr_text))
            return completed.returncode, stderr_text

        return sync_call

    def rm_file(self, relative_path: str) -> None:
        """Delete the single object at ``relative_path``."""
        path = self._abs_path(relative_path)
        blob = self.client.bucket(self._bucket_name).blob(path)
        logging.info('Deleting file at ' + path)
        blob.delete()

    def rm_dir(self, relative_path: str) -> None:
        """Delete every object whose name starts with the directory prefix.

        Raises:
            ValueError: if the resulting prefix is empty, because that would
                delete the entire bucket.
        """
        prefix = get_path_prefix(self._root_dir, relative_path)
        if not prefix:
            raise ValueError('Refusing to delete with an empty prefix (whole bucket)')

        logging.info('Deleting files at ' + prefix)
        # A "directory" is only a shared name prefix, so each object under it
        # has to be deleted individually.
        for blob in self.client.list_blobs(self._bucket_name, prefix=prefix):
            blob.delete()

    def read_content(self, path: str, throw_exception: bool, read_range: str = None, streaming: bool = False) -> Union[bytes, None]:
        """Download one object and return its bytes.

        Args:
            path: object path relative to the root directory.
            throw_exception: when true, download errors propagate; when
                false they are logged and ``None`` is returned.
            read_range: accepted for interface compatibility; currently ignored.
            streaming: accepted for interface compatibility; currently ignored.
        """
        if self.verbose:
            logging.info(f"Downloading content from {self._object_url(path)}")

        try:
            blob = self.client.bucket(self._bucket_name).blob(self._abs_path(path))
            return blob.download_as_bytes()
        except Exception:
            if throw_exception:
                raise
            logging.exception('Could not read content at ' + self._abs_path(path))
            return None

    # TODO(review): a presigned-URL helper existed here only as commented-out
    # S3 code (it referenced self._s3); it has no GCS implementation yet.

    def check_dir_exists(self, path: str) -> bool:
        """Return True if at least one object exists under ``path``. No side effects."""
        prefix = get_path_prefix(self._root_dir, path)
        # One result is enough to decide; avoid listing the whole directory.
        blobs = self.client.list_blobs(self._bucket_name, prefix=prefix, max_results=1)
        return any(True for _ in blobs)

    def _download_content(self, remote_path: str) -> bytes:
        """Download the object at ``remote_path`` into memory."""
        if self.verbose:
            logging.info(f"Downloading content from {self._object_url(remote_path)}")

        blob = self.client.bucket(self._bucket_name).blob(self._abs_path(remote_path))
        return blob.download_as_bytes()

    def download_file(self, remote_path: str, file_name: str) -> None:
        """Download the object at ``remote_path`` to the local ``file_name``.

        Missing parent directories of ``file_name`` are created.
        """
        if self.verbose:
            logging.info(f"Downloading file {self._object_url(remote_path)} to {file_name}")

        blob = self.client.bucket(self._bucket_name).blob(self._abs_path(remote_path))

        dirname = os.path.dirname(file_name)
        if dirname:
            # Only create a directory if it's not ''
            os.makedirs(dirname, exist_ok=True)

        blob.download_to_filename(file_name)

    def download_and_unzip(self, remote_path: str, local_dir: str) -> None:
        """Download a zip archive and unpack it into ``local_dir``."""
        zip_bytes = self._download_content(remote_path)
        # NOTE(review): file_utils is not imported in the visible file.
        file_utils.unzip_into_dir(zip_bytes, local_dir)

    def download_dir(self, remote_path: str, local_path: str) -> int:
        """Download every object under ``remote_path`` into ``local_path``.

        Returns:
            The number of files downloaded (names ending in '/' are skipped).
        """
        if self.verbose:
            logging.info("Downloading folder: " + remote_path + " to " + local_path)

        prefix = get_path_prefix(self._root_dir, remote_path)
        downloaded = 0
        for blob in self.client.list_blobs(self._bucket_name, prefix=prefix):
            if blob.name.endswith('/'):
                continue

            # Every listed name starts with the prefix, so slicing gives the
            # path relative to the requested directory.
            rel_path = blob.name[len(prefix):]
            dest_pathname = os.path.join(local_path, rel_path)

            dest_dir = os.path.dirname(dest_pathname)
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)

            if self.verbose:
                logging.info(f"Downloading file {blob.name} to {dest_pathname}")

            blob.download_to_filename(dest_pathname)
            downloaded += 1

        return downloaded

    def upload_content(self, content: bytes, file_name: str) -> None:
        """Upload ``content`` to the object at ``file_name`` (relative to root)."""
        if self.verbose:
            logging.info(f"Uploading content to {self._object_url(file_name)}")

        blob = self.client.bucket(self._bucket_name).blob(self._abs_path(file_name))
        blob.upload_from_file(BytesIO(content))

    def list_directory(self, path: str, with_size: bool = False) -> Dict:
        """List the objects whose names start with the prefix for ``path``.

        The listing is requested with a prefix only (no delimiter), so objects
        in nested "sub-directories" are included and reported by base name.

        Args:
            path: directory relative to the root directory.
            with_size: when true, file nodes also carry a ``'size'`` entry
                taken from ``blob.size``.

        Returns:
            ``{'nodes': [PathNode, ...]}``.
        """
        prefix = get_path_prefix(self._root_dir, path)
        bucket = self.client.bucket(self._bucket_name)

        nodes = []
        for blob in bucket.list_blobs(prefix=prefix):
            if blob.name.endswith('/'):
                node = PathNode(name=os.path.basename(blob.name[:-1]), type='folder')
            else:
                node = PathNode(name=os.path.basename(blob.name), type='file')
                if with_size:
                    node['size'] = blob.size
            nodes.append(node)

        return {
            'nodes': nodes
        }


In [None]:
import os

from dotenv import load_dotenv
# Load environment variables via python-dotenv (presumably from a local .env
# file; confirm where it lives for this notebook).
load_dotenv(verbose=True)

# Show the configured credentials path; prints None when the variable is unset.
print(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS'))

In [None]:
# Storage manager for the reporting bucket. The empty 'root' means relative
# paths are used as object names as-is.
manager = GCPStorageManager({
    'bucket': 'partnerships-data-reporting',
    'root': ''
}, verbose=True)

In [None]:
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [None]:
import json
import pandas as pd
directory_contents = manager.list_directory('')['nodes']

In [None]:
len(directory_contents)

In [None]:
directory_contents[:10]

In [None]:
# Every "<company_id>_locations.csv" entry in the listing identifies one
# company; strip the suffix to recover the id.
locations_suffix = '_locations.csv'
company_ids_set = {
    node['name'].replace(locations_suffix, '')
    for node in directory_contents
    if locations_suffix in node['name']
}

company_ids = sorted(company_ids_set)
company_ids_count = len(company_ids)
print(f'There are {company_ids_count} companies in the dataset')

In [None]:
def _read_company_locations(csv_path):
    """Download one locations CSV and return its (locationId, state) pairs."""
    raw = manager.read_content(csv_path, throw_exception=True)
    frame = pd.read_csv(StringIO(raw.decode("utf-8")))
    return list(zip(frame['locationId'], frame['state']))


# Set of (company_id, location_id, location_state) tuples.
company_location_ids_set = set()

for index, company_id in enumerate(company_ids):
    try:
        locations = _read_company_locations(f'{company_id}/{company_id}_locations.csv')
    # "except Exception" rather than a bare except, so Ctrl-C still stops the loop.
    except Exception:
        print(f'[{index + 1}] An exception occurred for company ({company_id}) for default path, trying FirstRunWithColorado/ path...')
        try:
            locations = _read_company_locations(f'/FirstRunWithColorado/{company_id}/{company_id}_locations.csv')
        except Exception:
            print(f'[{index + 1}] An exception occurred for company ({company_id}) for both paths, skipping...')
            continue

    for location_id, location_state in locations:
        company_location_ids_set.add((company_id, location_id, location_state))

company_location_ids_count = len(company_location_ids_set)
print(f'There are {company_location_ids_count} company locations in the dataset')

In [None]:
company_location_ids_set

In [None]:
def _read_location_csv(csv_path):
    """Download one CSV object and parse it into a DataFrame."""
    raw = manager.read_content(csv_path, throw_exception=True)
    return pd.read_csv(StringIO(raw.decode("utf-8")))


def _load_location_frames(location_dir, location_id):
    """Return the (sales, inventory) DataFrames stored under ``location_dir``."""
    sales = _read_location_csv(f'{location_dir}/{location_id}_sales.csv')
    inventory = _read_location_csv(f'{location_dir}/{location_id}_inventory.csv')
    return sales, inventory


# (company_id, location_id, location_state, sales_dataframe, inventory_dataframe)
company_location_dataframes = []

# for index, company_location_tuple in enumerate(list([('4A5DE8Zj5gDtSmCbn', 'JYAZRDbztXRLwJ5r6', 'CO')])):
for index, company_location_tuple in enumerate(company_location_ids_set):
    company_id, location_id, location_state = company_location_tuple
    print(f'[{index + 1}] Downloading dataframes for location ({company_id}, {location_state})')

    try:
        location_sales_dataframe, location_inventory_dataframe = _load_location_frames(
            f'{company_id}/{location_id}', location_id)
    # "except Exception" rather than a bare except, so Ctrl-C still stops the loop.
    except Exception:
        print(f'[{index + 1}] An exception occurred for location ({company_id}, {location_id}, {location_state}) for default path, trying FirstRunWithColorado/ path...')
        try:
            location_sales_dataframe, location_inventory_dataframe = _load_location_frames(
                f'/FirstRunWithColorado/{company_id}/{location_id}', location_id)
        except Exception:
            print(f'[{index + 1}] An exception occurred for company ({company_id}, {location_id}, {location_state}) for both paths, skipping...')
            continue

    # The second slot must be location_id; it previously held location_state
    # twice, so every consumer unpacking "location_id" got the state instead.
    company_location_dataframes.append(
        (company_id, location_id, location_state, location_sales_dataframe, location_inventory_dataframe))

company_location_dataframes_count = len(company_location_dataframes)
print(f'There are {company_location_dataframes_count} company location dataframes in the dataset')

In [None]:
# Number of downloaded location records per state.
state_to_count = {}

for record in company_location_dataframes:
    # TODO: do data science stuff on these dataframes.
    _, _, state, _, _ = record
    state_to_count[state] = state_to_count.get(state, 0) + 1

state_to_count

In [None]:
# Print the shape of the sales and inventory frames of every record.
for record in company_location_dataframes:
    # TODO: do data science stuff on these dataframes.
    company_id, location_id, location_state, location_sales_dataframe, location_inventory_dataframe = record
    print(company_id, location_id, location_state, location_sales_dataframe.shape, location_inventory_dataframe.shape)

In [None]:
# company_location_dataframes[0][4]

# PROCESS DATA FRAMES

In [None]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns

In [None]:
def pre_process_df(df,loc_id,loc_state):
    """Aggregate one location's rows into monthly sales, cost and margin.

    Rows with ``costInDollars`` below 1.0 and rows whose ``category`` is
    'Accessory' are excluded before aggregating.

    Args:
        df: frame with 'SalesCreatedAt', 'costInDollars', 'category' and
            'NetSales' columns.
        loc_id: value written to the 'location_id' column of the result.
        loc_state: value written to the 'location_state' column of the result.

    Returns:
        ``(cogs, perc_low_cost, perc_accessory)`` where ``cogs`` has one row
        per 'year_month', ``perc_low_cost`` is the share of all rows dropped
        for low cost, and ``perc_accessory`` is the share of the remaining
        rows dropped as accessories. A share is NaN when its denominator is
        zero.
    """
    df = (
        df
        .assign(SalesCreatedAt=lambda df_: pd.to_datetime(df_['SalesCreatedAt']))
        .assign(year_month=lambda df_: df_['SalesCreatedAt'].dt.strftime('%Y-%m'))
        .replace('None', np.nan).fillna(0)
    )

    # exclude cost < $1.0
    df_low_cost = df[df['costInDollars'] < 1.0]
    df_non_low_cost = df[df['costInDollars'] >= 1.0]
    total_rows = df.shape[0]
    perc_low_cost = df_low_cost.shape[0] / total_rows if total_rows else np.nan

    # exclude category = Accessory
    df_accessory = df_non_low_cost[df_non_low_cost['category'] == 'Accessory']
    df_non_accessory = df_non_low_cost[df_non_low_cost['category'] != 'Accessory']
    # Guard the denominator: a location whose rows are all low-cost would
    # otherwise raise ZeroDivisionError here.
    kept_rows = df_non_low_cost.shape[0]
    perc_accessory = df_accessory.shape[0] / kept_rows if kept_rows else np.nan

    # cogs
    cogs = df_non_accessory[['year_month','NetSales','costInDollars']].groupby('year_month').sum()
    cogs['profit'] = cogs['NetSales'] - cogs['costInDollars']
    cogs['margin_perc'] = cogs['profit'] / cogs['NetSales']
    cogs['location_id'] = loc_id
    cogs['location_state'] = loc_state
    return cogs.reset_index(),perc_low_cost,perc_accessory
    

In [None]:
res = []
low_cost_trim_perc = []
accessory_trim_perc = []

for record in tqdm(company_location_dataframes):
    # Records are (company_id, location_id, location_state, sales_df,
    # inventory_df); the 4th item is the frame to process and *_ absorbs
    # anything after it, so 4-item records work as well.
    company_id, location_id, location_state, df, *_ = record
    if len(df) == 0:
        low_cost_trim_perc.append(np.nan)
        accessory_trim_perc.append(np.nan)
        continue

    # One call per frame; the result was previously recomputed three times.
    cogs, perc_low_cost, perc_accessory = pre_process_df(df, location_id, location_state)
    res.append(cogs)
    low_cost_trim_perc.append(perc_low_cost)
    accessory_trim_perc.append(perc_accessory)

In [None]:
# Name the column at construction time so an empty list yields an empty,
# correctly named frame instead of a length-mismatch error on rename.
low_cost_trim_perc_df = pd.DataFrame(low_cost_trim_perc, columns=['low_cost_trim_perc'])
accessory_trim_perc_df = pd.DataFrame(accessory_trim_perc, columns=['accessory_trim_perc'])

In [None]:
low_cost_trim_perc_df.describe()

In [None]:
accessory_trim_perc_df.describe()

In [None]:
# 178 company locations
# 1 failed download -> 177 locations
# 15 empty df
# 1 company TdyAhrZWq6MDeyNvu: trimmed by accessory and cost > 1

In [None]:
# Stack the per-location monthly frames under one fresh 0..n-1 index.
final_df = pd.concat(res, ignore_index=True)

In [None]:
# 178 company locations
# 1 failed download -> 177 locations
# 15 empty df
# 1 company TdyAhrZWq6MDeyNvu: trimmed by accessory and cost > 1

In [None]:
#final_df.to_csv('flowhub_sales_final_df_v0.csv')

# sanity checks

In [None]:
#t = company_location_dataframes[0][3]

In [None]:
#t = t.assign(**{"SalesCreatedAt": lambda df_: pd.to_datetime(df_['SalesCreatedAt'])}).assign(**{"year_month": lambda df_: df_['SalesCreatedAt'].dt.strftime('%Y-%m')})

In [None]:
#t[(t['costInDollars'] >= 1.0)&(t['category'] != 'Accessory')&(t['year_month'] == '2021-05')]['costInDollars'].sum()

## Count

In [None]:
# Count rows per (state, month), then pivot so each state becomes a column
# and each month a row, indexed by the month parsed as a timestamp.
rows_per_state_month = (
    final_df[['location_state','location_id','year_month']]
    .groupby(['location_state','year_month'])
    .count()
)
state_count = rows_per_state_month.unstack().T.reset_index()
state_count.index = pd.to_datetime(state_count['year_month'])


In [None]:
# Stacked bar chart of state_count, one bar per row (month).
ax = state_count.plot(kind='bar', stacked=True,figsize = (12,9))
# loc=2 puts the legend in the upper-left corner.
ax.legend(loc = 2)

# Calculate monthly averages by state

In [None]:
#simple_avg_margin_by_state = final_df[['year_month','margin_perc','location_state']].groupby(['location_state','year_month']).mean().reset_index()

In [None]:
#simple_avg_margin_by_state.head()

In [None]:
#simple_avg_margin_by_state['year_month'] = pd.to_datetime(simple_avg_margin_by_state['year_month'])

In [None]:
#line,ax = plt.subplots(figsize=(15,10))
#plt.ylim([0.7, 0.9])
#ax.set_title("monthly simple avg margin by state", fontsize=15)
#ax.set_xlabel ("year month")
#ax.set_ylabel ("margin %")
#ax.legend (loc="upper right")


#sns.lineplot(data=simple_avg_margin_by_state, x="year_month", y="margin_perc", hue="location_state",marker= 'o', markersize=9)

In [None]:
#state_count.to_csv('flowhub_sales_state_count_df_v0.csv')

In [None]:
#simple_avg_margin_by_state.to_csv('flowhub_sales_state_simple_avg_df_v0.csv')

In [None]:
#simple_avg_margin_by_state

## weighted average

In [None]:
# Assign a column that gives each location's GMV weight within its (month, state) group
#final_df = final_df.groupby(['year_month','location_state']).apply(lambda df: df.assign(weight=df['NetSales'] / df['NetSales'].sum())).reset_index(drop=True)


In [None]:
#weighted_avg_margin_by_state = pd.DataFrame(final_df[['year_month','margin_perc','location_state','weight']].groupby(['location_state','year_month']).apply(lambda x: sum(x['weight']*x['margin_perc'])).reset_index())



In [None]:
#weighted_avg_margin_by_state['year_month'] = pd.to_datetime(weighted_avg_margin_by_state['year_month'])
#weighted_avg_margin_by_state.columns = ['location_state','year_month','margin_perc']

In [None]:
#line,ax = plt.subplots(figsize=(15,10))
#plt.ylim([0.7, 0.9])
#ax.set_title("monthly weighted avg margin by state", fontsize=15)
#ax.set_xlabel ("year month")
#ax.set_ylabel ("margin %")
#ax.legend (loc="upper right")
#sns.lineplot(data=weighted_avg_margin_by_state, x="year_month", y="margin_perc", hue="location_state",marker= 'o', markersize=9)



In [None]:
#weighted_avg_margin_by_state.to_csv('flowhub_sales_state_wgt_avg_df_v0.csv')

In [None]:
#co.sort_values(by = 'margin_perc')

# Distribution of margin by state

In [None]:
final_df.head()

In [None]:
final_df.nunique()

In [None]:
final_df.describe()

In [None]:
#final_df.to_csv('flowhub_location_analysis.csv')

In [None]:
# Summary statistics of margin_perc per (state, month); year_month is then
# parsed to timestamps for plotting.
margin_distr = (
    final_df
    .groupby(['location_state','year_month'])['margin_perc']
    .describe()
    .reset_index()
    .assign(year_month=lambda frame: pd.to_datetime(frame['year_month']))
)

In [None]:
margin_distr

In [None]:
#margin_distr.to_csv('monthly_margin%_distribution_by_state.csv')

In [None]:

# Monthly min / max margin per state, one panel each.
# figsize is passed at creation time: the sns.set() call at the end of the
# cell only affects figures created afterwards.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
# (column, title, y-limits). Only the "max" panel is clipped, which is what
# the former plt.ylim() call did to the current (second) axes.
panels = [("min", "min", None), ("max", "max", (0.7, 1))]
for axis, (stat, title, ylim) in zip(ax, panels):
    sns.lineplot(data=margin_distr, x="year_month", y=stat, hue="location_state",marker= 'o', markersize=9,ax=axis)
    if ylim is not None:
        axis.set_ylim(*ylim)
    axis.legend(loc = 4)
    axis.set_title(title, fontsize=15)
    axis.set_xlabel ("year month")
    axis.set_ylabel ("margin %")
# Kept so later cells still inherit the seaborn theme and default figure size.
sns.set(rc={'figure.figsize':(20,6)})


In [None]:
# Monthly mean / median margin per state, one panel each.
# figsize is passed at creation time: the sns.set() call at the end of the
# cell only affects figures created afterwards.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
for axis, (stat, title) in zip(ax, [("mean", "mean"), ("50%", "median")]):
    sns.lineplot(data=margin_distr, x="year_month", y=stat, hue="location_state",marker= 'o', markersize=9,ax=axis)
    axis.set(ylim=(0.7, 1))
    axis.set_title(title, fontsize=15)
    axis.set_xlabel ("year month")
    axis.set_ylabel ("margin %")
# Kept so later cells still inherit the seaborn theme and default figure size.
sns.set(rc={'figure.figsize':(20,6)})


In [None]:
# Monthly 25th / 75th percentile of margin per state, one panel each.
# figsize is passed at creation time: the sns.set() call at the end of the
# cell only affects figures created afterwards.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
for axis, (stat, title) in zip(ax, [("25%", "25th"), ("75%", "75th")]):
    sns.lineplot(data=margin_distr, x="year_month", y=stat, hue="location_state",marker= 'o', markersize=9,ax=axis)
    axis.set(ylim=(0.6, 1))
    axis.set_title(title, fontsize=15)
    axis.set_xlabel ("year month")
    axis.set_ylabel ("margin %")
# Kept so later cells still inherit the seaborn theme and default figure size.
sns.set(rc={'figure.figsize':(20,6)})


# Distribution of revenue by state

In [None]:
# Summary statistics of NetSales per (state, month); year_month is then
# parsed to timestamps for plotting.
revenue_distr = (
    final_df
    .groupby(['location_state','year_month'])['NetSales']
    .describe()
    .reset_index()
    .assign(year_month=lambda frame: pd.to_datetime(frame['year_month']))
)


In [None]:
revenue_distr.head()

In [None]:

# Monthly min / max NetSales per state, one panel each.
# figsize is passed at creation time: the sns.set() call at the end of the
# cell only affects figures created afterwards.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
for axis, (stat, title) in zip(ax, [("min", "min"), ("max", "max")]):
    sns.lineplot(data=revenue_distr, x="year_month", y=stat, hue="location_state",marker= 'o', markersize=9,ax=axis)
    axis.legend(loc = 4)
    axis.set_title(title, fontsize=15)
    axis.set_xlabel ("year month")
    # The plotted values are NetSales statistics, not margins.
    axis.set_ylabel ("net sales")
# Kept so later cells still inherit the seaborn theme and default figure size.
sns.set(rc={'figure.figsize':(20,6)})


In [None]:
# Monthly mean / median NetSales per state, one panel each.
# figsize is passed at creation time: the sns.set() call at the end of the
# cell only affects figures created afterwards.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
for axis, (stat, title) in zip(ax, [("mean", "mean"), ("50%", "median")]):
    sns.lineplot(data=revenue_distr, x="year_month", y=stat, hue="location_state",marker= 'o', markersize=9,ax=axis)
    axis.set_title(title, fontsize=15)
    axis.set_xlabel ("year month")
    # The plotted values are NetSales statistics, not margins.
    axis.set_ylabel ("net sales")
# Kept so later cells still inherit the seaborn theme and default figure size.
sns.set(rc={'figure.figsize':(20,6)})


In [None]:
# Monthly 25th / 75th percentile of NetSales per state, one panel each.
# figsize is passed at creation time: the sns.set() call at the end of the
# cell only affects figures created afterwards.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
for axis, (stat, title) in zip(ax, [("25%", "25%"), ("75%", "75%")]):
    sns.lineplot(data=revenue_distr, x="year_month", y=stat, hue="location_state",marker= 'o', markersize=9,ax=axis)
    axis.set_title(title, fontsize=15)
    axis.set_xlabel ("year month")
    # The plotted values are NetSales statistics, not margins.
    axis.set_ylabel ("net sales")
# Kept so later cells still inherit the seaborn theme and default figure size.
sns.set(rc={'figure.figsize':(20,6)})



In [None]:
#revenue_distr.to_csv('monthly_revenue_distribution_by_state.csv')

# by product category

In [None]:
final_df.head()

In [None]:
def pre_process_df_by_cat(df,loc_id,loc_state):
    """Aggregate one location's rows into monthly per-category margin.

    Rows with ``costInDollars`` below 1.0 and rows whose ``category`` is
    'Accessory' are excluded before aggregating.

    Args:
        df: frame with 'SalesCreatedAt', 'costInDollars', 'category' and
            'NetSales' columns.
        loc_id: value written to the 'location_id' column of the result.
        loc_state: value written to the 'location_state' column of the result.

    Returns:
        ``(cogs, perc_low_cost, perc_accessory)`` where ``cogs`` has one row
        per ('year_month', 'category') and both shares are relative to the
        total number of rows in ``df``. The shares are NaN for an empty frame.
    """
    df = (
        df
        .assign(SalesCreatedAt=lambda df_: pd.to_datetime(df_['SalesCreatedAt']))
        .assign(year_month=lambda df_: df_['SalesCreatedAt'].dt.strftime('%Y-%m'))
        .replace('None', np.nan).fillna(0)
    )
    total_rows = df.shape[0]

    # exclude cost < $1.0
    df_low_cost = df[df['costInDollars'] < 1.0]
    df_non_low_cost = df[df['costInDollars'] >= 1.0]
    # Guard the denominator so an empty frame yields NaN, not ZeroDivisionError.
    perc_low_cost = df_low_cost.shape[0] / total_rows if total_rows else np.nan

    # exclude category = Accessory
    df_accessory = df_non_low_cost[df_non_low_cost['category'] == 'Accessory']
    df_non_accessory = df_non_low_cost[df_non_low_cost['category'] != 'Accessory']
    perc_accessory = df_accessory.shape[0] / total_rows if total_rows else np.nan

    # cogs
    cogs = df_non_accessory[['year_month','category','NetSales','costInDollars']].groupby(['year_month','category']).sum()
    cogs['profit'] = cogs['NetSales'] - cogs['costInDollars']
    cogs['margin_perc'] = cogs['profit'] / cogs['NetSales']
    cogs['location_id'] = loc_id
    cogs['location_state'] = loc_state
    return cogs.reset_index(),perc_low_cost,perc_accessory
    

In [None]:
res_cat = []

for record in tqdm(company_location_dataframes):
    # Records are (company_id, location_id, location_state, sales_df,
    # inventory_df); the 4th item is the frame to process and *_ absorbs
    # anything after it, so 4-item records work as well.
    company_id, location_id, location_state, df, *_ = record
    if len(df) == 0:
        continue
    res_cat.append(pre_process_df_by_cat(df, location_id, location_state)[0])

In [None]:
# Stack the per-location category frames under one fresh 0..n-1 index.
final_df_cat = pd.concat(res_cat, ignore_index=True)

In [None]:
# Summary statistics of margin_perc per (category, month); year_month is then
# parsed to timestamps for plotting.
cat_margin_distr = (
    final_df_cat
    .groupby(['category','year_month'])['margin_perc']
    .describe()
    .reset_index()
    .assign(year_month=lambda frame: pd.to_datetime(frame['year_month']))
)

In [None]:
cat_margin_distr

In [None]:
# Median monthly margin per product category.
sns.set(rc={'figure.figsize':(20,9)})
# Size the palette from the data instead of a fixed 15, so the number of
# colours always matches the number of categories being drawn.
n_categories = cat_margin_distr['category'].nunique()
palette = sns.color_palette("cubehelix", n_categories)
sns.lineplot(data=cat_margin_distr, x="year_month", y="50%", hue="category",marker= 'o', markersize=9,palette=palette)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Median margin_perc for every (state, category) pair.
cat_by_state = (
    final_df_cat
    .groupby(['location_state','category'])['margin_perc']
    .median()
    .reset_index()
)

In [None]:
# Horizontal grouped bars: one group per state, one bar per category.
sns.set(rc={'figure.figsize':(12,10)})
palette = sns.color_palette("Paired_r",15)
ax = sns.barplot(
    data=cat_by_state,
    x="margin_perc",
    y="location_state",
    hue="category",
    palette=palette,
)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_xlabel ("median margin %")
ax.set_ylabel ("location state")

In [None]:
cat_by_state[cat_by_state['location_state'] == 'CA'].sort_values(by = 'margin_perc', ascending = False)