### Pandas filtering is very powerful for wrangling and analyzing

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
csv_url = 'https://talkpython.fm/2019_customer_transactions.csv'
df = pd.read_csv(csv_url, parse_dates=['invoice_date_time'])

In [3]:
df.head()

Unnamed: 0,cust_num,sku,category,qty,list_price,discount_rate,invoice_price,invoice_num,invoice_date_time,invoice_total
0,Z00562,PB200,pens,1,3.0,0.27,2.19,7612,2019-08-04 20:41:56.521097,2.19
1,H79033,PB24,pens,5,3.0,0.24,2.28,86854,2019-01-17 03:17:41.873383,11.4
2,U23721,PB24,pens,14,3.0,0.13,2.61,21950,2019-03-03 21:22:11.135135,36.54
3,N80809,PE11,pencils,14,2.0,0.04,1.92,23805,2019-12-19 19:54:38.391874,26.88
4,K89557,PG22,pens,10,3.0,0.06,2.82,35154,2019-02-27 20:19:52.681473,28.2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100000 entries, 0 to 1099999
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   cust_num           1100000 non-null  object        
 1   sku                1100000 non-null  object        
 2   category           1100000 non-null  object        
 3   qty                1100000 non-null  int64         
 4   list_price         1100000 non-null  float64       
 5   discount_rate      1100000 non-null  float64       
 6   invoice_price      1100000 non-null  float64       
 7   invoice_num        1100000 non-null  int64         
 8   invoice_date_time  1100000 non-null  datetime64[ns]
 9   invoice_total      1100000 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(3)
memory usage: 83.9+ MB


In [5]:
# Find all pens sold in Jan, Feb and March with a quantity >= 5
date_filter = df['invoice_date_time'].between('1-1-2019', '3-31-2019')
qty_filter = df['qty'] >= 5
product_filter = df['category'].isin(['pens'])

In [6]:
date_filter

0          False
1           True
2           True
3          False
4           True
           ...  
1099995     True
1099996     True
1099997    False
1099998    False
1099999     True
Name: invoice_date_time, Length: 1100000, dtype: bool

In [7]:
df.loc[date_filter & qty_filter & product_filter]

Unnamed: 0,cust_num,sku,category,qty,list_price,discount_rate,invoice_price,invoice_num,invoice_date_time,invoice_total
1,H79033,PB24,pens,5,3.0,0.24,2.28,86854,2019-01-17 03:17:41.873383,11.40
2,U23721,PB24,pens,14,3.0,0.13,2.61,21950,2019-03-03 21:22:11.135135,36.54
4,K89557,PG22,pens,10,3.0,0.06,2.82,35154,2019-02-27 20:19:52.681473,28.20
35,N33530,PB24,pens,63,3.0,0.08,2.76,74659,2019-03-30 07:45:45.818329,173.88
49,G05537,PB200,pens,8,3.0,0.09,2.73,83053,2019-03-03 18:35:46.064656,21.84
...,...,...,...,...,...,...,...,...,...,...
1099952,F21295,PB24,pens,14,3.0,0.32,2.04,33548,2019-03-05 01:30:39.349739,28.56
1099965,S44897,PG22,pens,7,3.0,0.26,2.22,94625,2019-01-19 02:56:49.115738,15.54
1099979,Y33924,PB21,pens,40,3.0,0.10,2.70,82434,2019-02-09 14:43:58.807008,108.00
1099982,X59522,PB24,pens,7,3.0,0.15,2.55,955,2019-02-14 13:34:12.734684,17.85


In [8]:
df['special_pen_sale'] = False
df.loc[date_filter & qty_filter & product_filter, 'special_pen_sale'] = True

In [9]:
df.head()

Unnamed: 0,cust_num,sku,category,qty,list_price,discount_rate,invoice_price,invoice_num,invoice_date_time,invoice_total,special_pen_sale
0,Z00562,PB200,pens,1,3.0,0.27,2.19,7612,2019-08-04 20:41:56.521097,2.19,False
1,H79033,PB24,pens,5,3.0,0.24,2.28,86854,2019-01-17 03:17:41.873383,11.4,True
2,U23721,PB24,pens,14,3.0,0.13,2.61,21950,2019-03-03 21:22:11.135135,36.54,True
3,N80809,PE11,pencils,14,2.0,0.04,1.92,23805,2019-12-19 19:54:38.391874,26.88,False
4,K89557,PG22,pens,10,3.0,0.06,2.82,35154,2019-02-27 20:19:52.681473,28.2,True


In [10]:
pd.pivot_table(data=df,
               index=['category'],
               columns=['special_pen_sale'],
               aggfunc=['sum'],
               values=['invoice_total'],
               fill_value=0, margins=True).style.format('${0:,.0f}')

Unnamed: 0_level_0,sum,sum,sum
Unnamed: 0_level_1,invoice_total,invoice_total,invoice_total
special_pen_sale,False,True,All
category,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
books,"$52,478,337",$0,"$52,478,337"
pencils,"$13,964,096",$0,"$13,964,096"
pens,"$15,919,855","$5,055,839","$20,975,695"
All,"$82,362,288","$5,055,839","$87,418,127"


In [11]:
pd.crosstab(df['category'],
            df['special_pen_sale'],
            values=df['invoice_total'],
            aggfunc='sum',
            normalize='index').style.format('{:.2%}')

special_pen_sale,False,True
category,Unnamed: 1_level_1,Unnamed: 2_level_1
books,100.00%,0.00%
pencils,100.00%,0.00%
pens,75.90%,24.10%


In [12]:
pd.crosstab(df['category'],
            df['special_pen_sale'],
            values=df['invoice_total'],
            aggfunc='sum',
            normalize=True).style.format('{:.2%}')

special_pen_sale,False,True
category,Unnamed: 1_level_1,Unnamed: 2_level_1
books,60.03%,0.00%
pencils,15.97%,0.00%
pens,18.21%,5.78%
