# Pricing Project
## Identify Exclusions
Need to identify exclusions for:
* Outliers
* Customers
* Items

In [1]:
import pandas as pd
from pathlib import Path

# Let wide frames show up to 100 columns before pandas truncates the display
pd.options.display.max_columns = 100

In [2]:
# Load the raw invoice extract from the sibling data/generated folder.
# head() shows priceorigcd holding both numeric codes and the letter 'O'; reading it
# as text keeps the column a single type. Presumably this is the column behind the
# mixed-type warning echoed under this cell -- TODO confirm.
loc = Path.cwd().parent / 'data' / 'generated' / 'invoice_data_0_raw.csv'
inv_df = pd.read_csv(loc, dtype={'priceorigcd': str})

  inv_df = pd.read_csv(loc)


In [3]:
# Peek at the first five rows to sanity-check the load
inv_df.head(n=5)

Unnamed: 0,whse,orderno,ordersuf,lineno,invoicedt,custno,custname,item,itemdesc,unit,unitconv,units,unitcost,replcost,unitprice,netamt,returnfl,transtype,prod_type,prodcat,cat_descrip,rowpointer,priceorigcd,pdrecno,vendno,xcost_adj,GP$,Margin,month
0,101,933,0,1,2022-02-09,107113,K & S,PAICADS2,CADS-2 DUCT SEALANT GRAY 2 GAL,EA,1.0,1.0,20.20459,21.7,26.8,26.8,0,CS,stocked,6600,ADHESIVES/SEALANTS/TAPES,d41fa94f-2390-e6b4-9014-1a70708d5123,2,57983,16632.0,20.20459,6.59541,0.246097,2022-02
1,101,933,0,2,2022-02-09,107113,K & S,DIVB503,"B-503 2"" CHIP BRUSH",PC,1.0,1.0,0.25046,0.26,1.01,1.01,0,CS,stocked,6850,EQUIPMENT AND ACC,e5691b06-5a13-6685-9014-1a70d0a6f5a0,7,3285,12773.0,0.25046,0.75954,0.75202,2022-02
2,101,911000109,0,1,2022-01-27,100151,BD OF ED GARFIELD,GSS62,16GA. 48X96 GALV STEEL,SH,1.0,24.0,98.09914,117.26,148.57,3565.68,0,SO,stocked,5300,"SHEETS, GALVANIZED STEEL",b12fdaef-1b54-6791-8514-397f4855a7a5,O,0,15105.0,2354.37936,1211.30064,0.339711,2022-01
3,101,911000109,0,2,2022-01-27,100151,BD OF ED GARFIELD,GSS62,16GA. 48X96 GALV STEEL,SH,1.0,10.0,98.09914,96.824,148.57,1485.7,0,SO,stocked,5300,"SHEETS, GALVANIZED STEEL",b12fdaef-1b54-6791-8514-397f98c915aa,O,0,15105.0,980.9914,504.7086,0.339711,2022-01
4,101,911000527,0,2,2022-03-15,109762,DAP HVAC - 154 HAMILTON ST,ICP9511,N92ESN0601412A 92% 60M FURNACE,EA,1.0,5.0,759.02801,819.54,710.0,3550.0,0,SO,stocked,4060,ICP 92%+ GAS FURNACES,8687f4d8-b12e-279a-8514-25b170dfd11e,O,0,14204.0,3795.14005,-245.14005,-0.069054,2022-03


In [4]:
# Start every invoice line with an empty Exclusion label; the cells below
# fill it in, and each one only touches rows that are still unlabeled.
inv_df = inv_df.assign(Exclusion=None)

## Items
* Nonstock
* Labor Codes

In [5]:
# Flag nonstock items (rows are labeled, not dropped).
# Rows that already carry an exclusion label are left untouched.
is_unlabeled = inv_df['Exclusion'].isna()
is_nonstock = inv_df['prod_type'].eq('nonstock')
inv_df.loc[is_unlabeled & is_nonstock, 'Exclusion'] = 'Item - nonstock'

In [6]:
# Flag labor-code items: product category 8888 (rows are labeled, not dropped).
# Rows that already carry an exclusion label are left untouched.
is_unlabeled = inv_df['Exclusion'].isna()
is_labor = inv_df['prodcat'].eq(8888)
inv_df.loc[is_unlabeled & is_labor, 'Exclusion'] = 'Item - labor'

In [7]:
# List the distinct product types present in the invoice lines
pd.unique(inv_df['prod_type'])

array(['stocked', 'nonstock', 'special order'], dtype=object)

## Customers
* Any customers that appear in the `../data/inputs/customer_exclusions (reduced).xlsx` file

In [8]:
# Label customers listed in the exclusion workbook
# (data/inputs/customer_exclusions (reduced).xlsx under the parent folder).
# The workbook's first column is the lookup key matched against custno,
# and its second column supplies the exclusion label.
cust_ex_path = Path.cwd().parent.joinpath('data', 'inputs', 'customer_exclusions (reduced).xlsx')
cust_ex_df = pd.read_excel(cust_ex_path, engine='openpyxl')
cust_exclusion_map = dict(zip(cust_ex_df.iloc[:, 0], cust_ex_df.iloc[:, 1]))
# combine_first keeps any label already present and only fills the gaps
customer_labels = inv_df['custno'].map(cust_exclusion_map)
inv_df['Exclusion'] = inv_df['Exclusion'].combine_first(customer_labels)

## Outliers
* Quantity Shipped < 0
* Unit Price = 0
* Net Amount <= 0

In [9]:
# Flag lines with a negative shipped quantity (rows are labeled, not dropped).
# Rows that already carry an exclusion label are left untouched.
is_unlabeled = inv_df['Exclusion'].isna()
is_negative_qty = inv_df['units'].lt(0)
inv_df.loc[is_unlabeled & is_negative_qty, 'Exclusion'] = 'Outlier - negative quantity'

In [10]:
# Flag lines whose unit price is exactly zero (rows are labeled, not dropped).
# Rows that already carry an exclusion label are left untouched.
is_unlabeled = inv_df['Exclusion'].isna()
is_zero_price = inv_df['unitprice'].eq(0)
inv_df.loc[is_unlabeled & is_zero_price, 'Exclusion'] = 'Outlier - zero price'

In [11]:
# Flag lines whose net amount is zero or negative (rows are labeled, not dropped).
# Only rows that are still unlabeled are touched, so earlier exclusions keep precedence.
# The label text says "zero", but the test is <= 0, so negative amounts are covered too.
null_criteria = inv_df['Exclusion'].isnull()
# Dedicated name for this mask: it was previously stored in unit_price_criteria,
# which overwrote the zero-price mask built by the preceding cell.
net_amt_criteria = inv_df['netamt'] <= 0
inv_df.loc[null_criteria & net_amt_criteria, 'Exclusion'] = 'Outlier - zero net amount'

## Summary

In [12]:
# Total nonstock net amount per vendor, copied to the system clipboard
# (for pasting elsewhere; this cell displays nothing).
nonstock_rows = inv_df[inv_df['prod_type'] == 'nonstock']
nonstock_by_vendor = nonstock_rows.groupby('vendno', as_index=False).agg({'netamt': 'sum'})
nonstock_by_vendor.to_clipboard(index=False)

In [13]:
# Number of invoice lines per exclusion label (unlabeled rows are not counted)
inv_df['Exclusion'].value_counts(dropna=True)

Outlier - negative quantity    14085
Item - nonstock                10713
Outlier - zero net amount       8208
Item - labor                    1969
Customer - fake acct            1284
Outlier - zero price             725
Customer - delete                 10
Name: Exclusion, dtype: int64

In [14]:
# Net amount totalled per exclusion label (groupby drops unlabeled rows by default).
# Select the column before aggregating: sum() takes no column argument, so the
# previous sum('netamt') was passing the string as its first parameter, numeric_only.
inv_df.groupby('Exclusion')[['netamt']].sum()

Unnamed: 0_level_0,netamt
Exclusion,Unnamed: 1_level_1
Customer - delete,1638.62
Customer - fake acct,78256.22
Item - labor,148806.88
Item - nonstock,16693520.92
Outlier - negative quantity,-5602357.24
Outlier - zero net amount,0.0
Outlier - zero price,0.0


In [15]:
# Count the invoice lines that received no exclusion label
inv_df['Exclusion'].isna().sum()

295879

## Save to csv

In [16]:
# Write the labeled invoice lines next to the raw extract, without the frame's index
labeled_csv_path = Path.cwd().parent.joinpath('data', 'generated', 'invoice_data_1_exclusions_labeled.csv')
inv_df.to_csv(labeled_csv_path, index=False)

In [17]:
# Net amount carried by the 'Customer - inactive acct' label (0.0 when no row has it)
is_inactive_acct = inv_df['Exclusion'].eq('Customer - inactive acct')
inv_df.loc[is_inactive_acct, 'netamt'].sum()

0.0

In [18]:
# Net amount carried by the 'Warehouse - 602' label (0.0 when no row has it)
is_whse_602 = inv_df['Exclusion'].eq('Warehouse - 602')
inv_df.loc[is_whse_602, 'netamt'].sum()

0.0

In [19]:
# Grand total of net amount across every invoice line, labeled or not.
# Uses the vectorized Series.sum(), like the per-label checks in the cells above,
# instead of the Python built-in sum(), which iterates the column value by value.
# Note: Series.sum() skips NaN by default, whereas built-in sum() would propagate it.
inv_df['netamt'].sum()

141621252.46999747