In [1]:
# Import packages
import pandas as pd
import numpy as np

## 1. Clean duplicate and null value

In [2]:
# Import data
retail = pd.read_csv('online_retail2.csv')
print(retail.head())

# Check shape of data
print(retail.shape)

  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

           InvoiceDate  Price  Customer ID         Country  
0  2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1  2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2  2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3  2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4  2009-12-01 07:45:00   1.25      13085.0  United Kingdom  
(1067371, 8)


In [3]:
# Remove duplicate
retail = retail.drop_duplicates()

# Check shape again
print(retail.shape)

(1033036, 8)


In [4]:
# Check null value
print(retail['Description'].isnull())
print(retail['Description'].isnull().value_counts())

0          False
1          False
2          False
3          False
4          False
           ...  
1067366    False
1067367    False
1067368    False
1067369    False
1067370    False
Name: Description, Length: 1033036, dtype: bool
Description
False    1028761
True        4275
Name: count, dtype: int64


In [5]:
# Check null value of the whole data
print(f'''Null value of each column: {retail.isnull().sum()}''')
print(f'''Total null value: {retail.isnull().sum().sum()}''')

Null value of each column: Invoice             0
StockCode           0
Description      4275
Quantity            0
InvoiceDate         0
Price               0
Customer ID    235151
Country             0
dtype: int64
Total null value: 239426


In [6]:
# Drop null value
retail = retail.dropna()

# Check shape again
retail.shape

(797885, 8)

## 2. Convert data type

In [7]:
# Check data type of each column
retail.info()

<class 'pandas.core.frame.DataFrame'>
Index: 797885 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      797885 non-null  object 
 1   StockCode    797885 non-null  object 
 2   Description  797885 non-null  object 
 3   Quantity     797885 non-null  int64  
 4   InvoiceDate  797885 non-null  object 
 5   Price        797885 non-null  float64
 6   Customer ID  797885 non-null  float64
 7   Country      797885 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 54.8+ MB


In [8]:
# Convert date to datetime
retail['date'] = pd.to_datetime(retail.InvoiceDate)

In [9]:
# Describe data
retail.describe()

Unnamed: 0,Quantity,Price,Customer ID,date
count,797885.0,797885.0,797885.0,797885
mean,12.60298,3.702732,15313.062777,2011-01-02 13:17:34.141160704
min,-80995.0,0.0,12346.0,2009-12-01 07:45:00
25%,2.0,1.25,13964.0,2010-07-02 09:47:00
50%,5.0,1.95,15228.0,2010-12-02 12:33:00
75%,12.0,3.75,16788.0,2011-07-31 15:50:00
max,80995.0,38970.0,18287.0,2011-12-09 12:50:00
std,191.670371,71.392549,1696.466663,


## 3. Filtering

In [10]:
# Select column 'Country'
retail['Country']

0          United Kingdom
1          United Kingdom
2          United Kingdom
3          United Kingdom
4          United Kingdom
                ...      
1067366            France
1067367            France
1067368            France
1067369            France
1067370            France
Name: Country, Length: 797885, dtype: object

In [11]:
# Count value of column 'Country' where value is 'France'
print(f'''Count value of column 'Country' where value is 'France': 
{retail[retail['Country']=='France']['Country'].value_counts()}''')

# Column 'Country' where value is 'France' or 'EIRE'
print(f'''Column 'Country' where value is 'France' or 'EIRE': 
{retail[(retail['Country']=='France') | (retail['Country']=='EIRE')]['Country']}''')

Count value of column 'Country' where value is 'France': 
Country
France    13897
Name: count, dtype: int64
Column 'Country' where value is 'France' or 'EIRE': 
71         France
72         France
73         France
74         France
75         France
            ...  
1067366    France
1067367    France
1067368    France
1067369    France
1067370    France
Name: Country, Length: 29911, dtype: object


In [12]:
# List of countries
countries = ['France','EIRE','Spain']

# Count value of column 'Country' where value is in countries list
retail[retail['Country'].isin(countries)]['Country'].value_counts()

Country
EIRE      16014
France    13897
Spain      3754
Name: count, dtype: int64

In [13]:
# Filter data where date is greater than or equal to '2011-08'
retail[retail['date'] >= '2011-08']

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date
810882,561904,22075,6 RIBBONS ELEGANT CHRISTMAS,96,2011-08-01 08:30:00,1.45,17941.0,United Kingdom,2011-08-01 08:30:00
810883,561904,85049E,SCANDINAVIAN REDS RIBBONS,156,2011-08-01 08:30:00,1.06,17941.0,United Kingdom,2011-08-01 08:30:00
810884,561905,21385,IVORY HANGING DECORATION HEART,24,2011-08-01 09:31:00,0.85,14947.0,United Kingdom,2011-08-01 09:31:00
810885,561905,84970L,SINGLE HEART ZINC T-LIGHT HOLDER,12,2011-08-01 09:31:00,0.95,14947.0,United Kingdom,2011-08-01 09:31:00
810886,561905,84970S,HANGING HEART ZINC T-LIGHT HOLDER,12,2011-08-01 09:31:00,0.85,14947.0,United Kingdom,2011-08-01 09:31:00
...,...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,2011-12-09 12:50:00
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,2011-12-09 12:50:00
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,2011-12-09 12:50:00


In [14]:
# Filter data where date is greater than or equal to '2011-08-01' and less than or equal to '2011-08-31'
retail[(retail['date'] >= '2011-08-01') & (retail['date'] <= '2011-08-31')]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date
810882,561904,22075,6 RIBBONS ELEGANT CHRISTMAS,96,2011-08-01 08:30:00,1.45,17941.0,United Kingdom,2011-08-01 08:30:00
810883,561904,85049E,SCANDINAVIAN REDS RIBBONS,156,2011-08-01 08:30:00,1.06,17941.0,United Kingdom,2011-08-01 08:30:00
810884,561905,21385,IVORY HANGING DECORATION HEART,24,2011-08-01 09:31:00,0.85,14947.0,United Kingdom,2011-08-01 09:31:00
810885,561905,84970L,SINGLE HEART ZINC T-LIGHT HOLDER,12,2011-08-01 09:31:00,0.95,14947.0,United Kingdom,2011-08-01 09:31:00
810886,561905,84970S,HANGING HEART ZINC T-LIGHT HOLDER,12,2011-08-01 09:31:00,0.85,14947.0,United Kingdom,2011-08-01 09:31:00
...,...,...,...,...,...,...,...,...,...
844856,564852,82552,WASHROOM METAL SIGN,10,2011-08-30 17:23:00,1.45,14976.0,United Kingdom,2011-08-30 17:23:00
844857,564852,21756,BATH BUILDING BLOCK WORD,3,2011-08-30 17:23:00,5.95,14976.0,United Kingdom,2011-08-30 17:23:00
844858,564852,21908,CHOCOLATE THIS WAY METAL SIGN,7,2011-08-30 17:23:00,2.10,14976.0,United Kingdom,2011-08-30 17:23:00
844859,564852,22116,METAL SIGN HIS DINNER IS SERVED,10,2011-08-30 17:23:00,0.79,14976.0,United Kingdom,2011-08-30 17:23:00


In [15]:
# Change the value of column 'Country' where value is 'EIRE' to 'Eastern Ireland'
retail['Country'] = retail[retail['Country']=='EIRE']['Country']='Eastern Ireland'

# Change the value of column 'StockCode' where value is 'POST' to 'post'
retail['StockCode'] = retail[retail['StockCode']=='POST']['StockCode']='post'

# Filter data where column 'StockCode' is 'post'
retail[retail['StockCode']=='post']

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date
0,489434,post,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,Eastern Ireland,2009-12-01 07:45:00
1,489434,post,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,Eastern Ireland,2009-12-01 07:45:00
2,489434,post,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,Eastern Ireland,2009-12-01 07:45:00
3,489434,post,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,Eastern Ireland,2009-12-01 07:45:00
4,489434,post,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,Eastern Ireland,2009-12-01 07:45:00
...,...,...,...,...,...,...,...,...,...
1067366,581587,post,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,Eastern Ireland,2011-12-09 12:50:00
1067367,581587,post,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,Eastern Ireland,2011-12-09 12:50:00
1067368,581587,post,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,Eastern Ireland,2011-12-09 12:50:00
1067369,581587,post,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,Eastern Ireland,2011-12-09 12:50:00


## 4. Indexing

In [16]:
# Set 'date' as index
retail = retail.set_index('date')
retail.head()

Unnamed: 0_level_0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009-12-01 07:45:00,489434,post,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,Eastern Ireland
2009-12-01 07:45:00,489434,post,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,Eastern Ireland
2009-12-01 07:45:00,489434,post,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,Eastern Ireland
2009-12-01 07:45:00,489434,post,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,Eastern Ireland
2009-12-01 07:45:00,489434,post,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,Eastern Ireland


In [17]:
# Filter data where index is '2011-Jul-04'
retail.loc['2011-Jul-04']

Unnamed: 0_level_0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-07-04 10:20:00,558775,post,DOORMAT BLACK FLOCK,300,2011-07-04 10:20:00,4.58,18102.0,Eastern Ireland
2011-07-04 10:20:00,558775,post,DOORMAT HEARTS,300,2011-07-04 10:20:00,4.58,18102.0,Eastern Ireland
2011-07-04 10:20:00,558775,post,DOORMAT WELCOME PUPPIES,300,2011-07-04 10:20:00,4.58,18102.0,Eastern Ireland
2011-07-04 10:20:00,558775,post,DOORMAT NEW ENGLAND,300,2011-07-04 10:20:00,4.58,18102.0,Eastern Ireland
2011-07-04 10:20:00,558775,post,DOORMAT FANCY FONT HOME SWEET HOME,300,2011-07-04 10:20:00,4.58,18102.0,Eastern Ireland
...,...,...,...,...,...,...,...,...
2011-07-04 16:35:00,558906,post,LUNCH BAG SPACEBOY DESIGN,6,2011-07-04 16:35:00,1.65,15555.0,Eastern Ireland
2011-07-04 16:35:00,558906,post,LUNCH BAG SUKI DESIGN,4,2011-07-04 16:35:00,1.65,15555.0,Eastern Ireland
2011-07-04 16:35:00,558906,post,REX CASH+CARRY JUMBO SHOPPER,4,2011-07-04 16:35:00,0.95,15555.0,Eastern Ireland
2011-07-04 16:35:00,558906,post,BROCADE RING PURSE,36,2011-07-04 16:35:00,0.29,15555.0,Eastern Ireland


In [18]:
# Reset index
retail = retail.reset_index()
retail.head()

Unnamed: 0,date,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,2009-12-01 07:45:00,489434,post,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,Eastern Ireland
1,2009-12-01 07:45:00,489434,post,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,Eastern Ireland
2,2009-12-01 07:45:00,489434,post,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,Eastern Ireland
3,2009-12-01 07:45:00,489434,post,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,Eastern Ireland
4,2009-12-01 07:45:00,489434,post,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,Eastern Ireland


In [19]:
# Set multi index 'Country' and 'date'
retail = retail.set_index(['Country','date'])
retail.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID
Country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Eastern Ireland,2009-12-01 07:45:00,489434,post,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0


In [20]:
# Sort index by 'Country' and 'date'
retail_sorted = retail.sort_index(axis=0, ascending=True)
retail_sorted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID
Country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Eastern Ireland,2009-12-01 07:45:00,489434,post,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0
Eastern Ireland,2009-12-01 07:45:00,489434,post,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0


In [21]:
# Filter data where 'Country' is 'Eastern Ireland' and 'date' is '2009-12-01 07:45:00' and select column 'Customer ID' and 'Description'
retail_sorted.loc[[('Eastern Ireland','2009-12-01 07:45:00')],['Customer ID','Description']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Customer ID,Description
Country,date,Unnamed: 2_level_1,Unnamed: 3_level_1
Eastern Ireland,2009-12-01 07:45:00,13085.0,15CM CHRISTMAS GLASS BALL 20 LIGHTS
Eastern Ireland,2009-12-01 07:45:00,13085.0,PINK CHERRY LIGHTS
Eastern Ireland,2009-12-01 07:45:00,13085.0,WHITE CHERRY LIGHTS
Eastern Ireland,2009-12-01 07:45:00,13085.0,"RECORD FRAME 7"" SINGLE SIZE"
Eastern Ireland,2009-12-01 07:45:00,13085.0,STRAWBERRY CERAMIC TRINKET BOX
Eastern Ireland,2009-12-01 07:45:00,13085.0,PINK DOUGHNUT TRINKET POT
Eastern Ireland,2009-12-01 07:45:00,13085.0,SAVE THE PLANET MUG
Eastern Ireland,2009-12-01 07:45:00,13085.0,FANCY FONT HOME SWEET HOME DOORMAT


## 5. Group by

In [22]:
# Import data
retail = pd.read_csv('online_retail2.csv')
retail.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [23]:
# Group data by 'Country' and sum the 'Quantity'
retail.groupby(by=['Country'])['Quantity'].sum()

Country
Australia                103706
Austria                   11306
Bahrain                    1275
Belgium                   35132
Bermuda                    2798
Brazil                      545
Canada                     3657
Channel Islands           20473
Cyprus                    10688
Czech Republic              592
Denmark                  235218
EIRE                     331341
European Community          497
Finland                   14317
France                   184952
Germany                  224581
Greece                     7707
Hong Kong                  7075
Iceland                    3286
Israel                     5485
Italy                     15309
Japan                     31822
Korea                       598
Lebanon                     457
Lithuania                  2958
Malta                      2491
Netherlands              381951
Nigeria                      56
Norway                    27110
Poland                     5644
Portugal                  28058


In [24]:
# Group data by 'Country' and 'Description' and calculate the mean of 'Quantity' and 'Price'
retail.groupby(by=['Country','Description'])[['Quantity','Price']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Price
Country,Description,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,DOLLY GIRL BEAKER,200.0,1.08
Australia,I LOVE LONDON MINI BACKPACK,4.0,4.15
Australia,10 COLOUR SPACEBOY PEN,48.0,0.85
Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,0.55
Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,0.65
...,...,...,...
West Indies,VINTAGE BEAD PINK SCARF,3.0,7.95
West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,1.25
West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.65
West Indies,WOVEN BERRIES CUSHION COVER,2.0,4.95


In [25]:
# Create a dataframe 'a' that contains the group data by 'Country' and 'Description' and calculate the mean and median of 'Quantity' and 'Price'
a = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median])
a

  a = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median])
  a = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median])
  a = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median])


Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Quantity,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median
Country,Description,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Australia,DOLLY GIRL BEAKER,200.0,200.0,1.08,1.08
Australia,I LOVE LONDON MINI BACKPACK,4.0,4.0,4.15,4.15
Australia,10 COLOUR SPACEBOY PEN,48.0,48.0,0.85,0.85
Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,384.0,0.55,0.55
Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,24.0,0.65,0.65
...,...,...,...,...,...
West Indies,VINTAGE BEAD PINK SCARF,3.0,3.0,7.95,7.95
West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,6.0,1.25,1.25
West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.0,1.65,1.65
West Indies,WOVEN BERRIES CUSHION COVER,2.0,2.0,4.95,4.95


In [26]:
# Sort index of 'a'
a = a.sort_index()

# Filter data where 'Country' is 'Australia' and 'Description' is '10 COLOUR SPACEBOY PEN' and select the median of 'Price'
a.loc[('Australia','10 COLOUR SPACEBOY PEN'), ('Price','median')]

np.float64(0.85)

In [27]:
# Create dataframe 'b' that contains the group data by 'Country' and 'Description' and calculate the mean and median of 'Quantity' and 'Price'
b = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median]).reset_index()
b

  b = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median]).reset_index()
  b = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median]).reset_index()
  b = retail.groupby(by=['Country','Description'])[['Quantity','Price']].agg([np.mean, np.median]).reset_index()


Unnamed: 0_level_0,Country,Description,Quantity,Quantity,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,median,mean,median
0,Australia,DOLLY GIRL BEAKER,200.0,200.0,1.08,1.08
1,Australia,I LOVE LONDON MINI BACKPACK,4.0,4.0,4.15,4.15
2,Australia,10 COLOUR SPACEBOY PEN,48.0,48.0,0.85,0.85
3,Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,384.0,0.55,0.55
4,Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,24.0,0.65,0.65
...,...,...,...,...,...,...
30691,West Indies,VINTAGE BEAD PINK SCARF,3.0,3.0,7.95,7.95
30692,West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,6.0,1.25,1.25
30693,West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.0,1.65,1.65
30694,West Indies,WOVEN BERRIES CUSHION COVER,2.0,2.0,4.95,4.95


In [28]:
# Drop level 0 of column
b.droplevel(level=0, axis=1)

Unnamed: 0,Unnamed: 1,Unnamed: 2,mean,median,mean.1,median.1
0,Australia,DOLLY GIRL BEAKER,200.0,200.0,1.08,1.08
1,Australia,I LOVE LONDON MINI BACKPACK,4.0,4.0,4.15,4.15
2,Australia,10 COLOUR SPACEBOY PEN,48.0,48.0,0.85,0.85
3,Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,384.0,0.55,0.55
4,Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,24.0,0.65,0.65
...,...,...,...,...,...,...
30691,West Indies,VINTAGE BEAD PINK SCARF,3.0,3.0,7.95,7.95
30692,West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,6.0,1.25,1.25
30693,West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.0,1.65,1.65
30694,West Indies,WOVEN BERRIES CUSHION COVER,2.0,2.0,4.95,4.95


In [29]:
# Drop level 1 of column
b.droplevel(level=1, axis=1)

Unnamed: 0,Country,Description,Quantity,Quantity.1,Price,Price.1
0,Australia,DOLLY GIRL BEAKER,200.0,200.0,1.08,1.08
1,Australia,I LOVE LONDON MINI BACKPACK,4.0,4.0,4.15,4.15
2,Australia,10 COLOUR SPACEBOY PEN,48.0,48.0,0.85,0.85
3,Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,384.0,0.55,0.55
4,Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,24.0,0.65,0.65
...,...,...,...,...,...,...
30691,West Indies,VINTAGE BEAD PINK SCARF,3.0,3.0,7.95,7.95
30692,West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,6.0,1.25,1.25
30693,West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.0,1.65,1.65
30694,West Indies,WOVEN BERRIES CUSHION COVER,2.0,2.0,4.95,4.95


In [30]:
# Group data by 'Country' and 'Description' and calculate the mean and median of 'Quantity' and 'Price'
retail.groupby(['Country','Description']).agg(mean_qty = ('Quantity', np.mean), mean_price = ('Price', np.mean), 
                                              median_qty = ('Quantity', np.median), median_price = ('Price', np.median)).reset_index()

  retail.groupby(['Country','Description']).agg(mean_qty = ('Quantity', np.mean), mean_price = ('Price', np.mean),
  retail.groupby(['Country','Description']).agg(mean_qty = ('Quantity', np.mean), mean_price = ('Price', np.mean),
  retail.groupby(['Country','Description']).agg(mean_qty = ('Quantity', np.mean), mean_price = ('Price', np.mean),


Unnamed: 0,Country,Description,mean_qty,mean_price,median_qty,median_price
0,Australia,DOLLY GIRL BEAKER,200.0,1.08,200.0,1.08
1,Australia,I LOVE LONDON MINI BACKPACK,4.0,4.15,4.0,4.15
2,Australia,10 COLOUR SPACEBOY PEN,48.0,0.85,48.0,0.85
3,Australia,12 PENCIL SMALL TUBE WOODLAND,384.0,0.55,384.0,0.55
4,Australia,12 PENCILS SMALL TUBE RED SPOTTY,24.0,0.65,24.0,0.65
...,...,...,...,...,...,...
30691,West Indies,VINTAGE BEAD PINK SCARF,3.0,7.95,3.0,7.95
30692,West Indies,WHITE AND BLUE CERAMIC OIL BURNER,6.0,1.25,6.0,1.25
30693,West Indies,WOODLAND PARTY BAG + STICKER SET,1.0,1.65,1.0,1.65
30694,West Indies,WOVEN BERRIES CUSHION COVER,2.0,4.95,2.0,4.95


## 6. Pivot table

In [31]:
# Rename column 'InvoiceDate' to 'date'
retail = retail.rename(columns={'InvoiceDate':'date'})

# Filter columns 'Country', 'date', 'Quantity', and 'Price'
country_date_qty = retail[['Country','date','Quantity','Price']]
country_date_qty.head()

Unnamed: 0,Country,date,Quantity,Price
0,United Kingdom,2009-12-01 07:45:00,12,6.95
1,United Kingdom,2009-12-01 07:45:00,12,6.75
2,United Kingdom,2009-12-01 07:45:00,12,6.75
3,United Kingdom,2009-12-01 07:45:00,48,2.1
4,United Kingdom,2009-12-01 07:45:00,24,1.25


In [32]:
# Pivot the data where index is 'date', columns is 'Country', values is 'Quantity', fill value is 0, and aggregate function is sum
country_pivoted_mean = country_date_qty.pivot_table(index='date', 
                                               columns='Country', 
                                               values='Quantity', 
                                               fill_value=0, 
                                               aggfunc='mean').reset_index()

# Filter data by by 'date' and 'United Kingdom' then sort
country_pivoted_mean[['date','United Kingdom']].sort_values(by=['date','United Kingdom'])

Country,date,United Kingdom
0,2009-12-01 07:45:00,20.750000
1,2009-12-01 07:46:00,15.000000
2,2009-12-01 09:06:00,10.157895
3,2009-12-01 09:08:00,6.304348
4,2009-12-01 09:24:00,48.588235
...,...,...
47630,2011-12-09 12:23:00,38.000000
47631,2011-12-09 12:25:00,60.000000
47632,2011-12-09 12:31:00,13.238095
47633,2011-12-09 12:49:00,16.500000


In [33]:
# Pivot the data where index is 'date', columns is 'Country', values is 'Quantity', fill value is 0, and aggregate function is sum
country_pivoted_sum = country_date_qty.pivot_table(index='date',
                                               columns='Country',
                                               values='Quantity',
                                               fill_value=0,
                                               aggfunc='sum').reset_index()

# Filter data by by 'date' and 'United Kingdom' then sort
country_pivoted_sum[['date','United Kingdom']].sort_values(by=['date','United Kingdom'])

Country,date,United Kingdom
0,2009-12-01 07:45:00,166
1,2009-12-01 07:46:00,60
2,2009-12-01 09:06:00,193
3,2009-12-01 09:08:00,145
4,2009-12-01 09:24:00,826
...,...,...
47630,2011-12-09 12:23:00,76
47631,2011-12-09 12:25:00,120
47632,2011-12-09 12:31:00,278
47633,2011-12-09 12:49:00,66


In [34]:
# Pivot the data where index is 'date', columns is 'Country', values is 'Quantity' and 'Price', fill value is 0, and aggregate function is sum
country_pivoted_price_qty_sum = country_date_qty.pivot_table(index='date',
                                             columns='Country',
                                             values=['Quantity','Price'],
                                             fill_value=0,
                                             aggfunc='sum').reset_index()
country_pivoted_price_qty_sum.head()

Unnamed: 0_level_0,date,Price,Price,Price,Price,Price,Price,Price,Price,Price,...,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Country,Unnamed: 1_level_1,Australia,Austria,Bahrain,Belgium,Bermuda,Brazil,Canada,Channel Islands,Cyprus,...,Singapore,Spain,Sweden,Switzerland,Thailand,USA,United Arab Emirates,United Kingdom,Unspecified,West Indies
0,2009-12-01 07:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,166,0,0
1,2009-12-01 07:46:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,60,0,0
2,2009-12-01 09:06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,193,0,0
3,2009-12-01 09:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,145,0,0
4,2009-12-01 09:24:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,826,0,0


In [35]:
# Pivot the data where index is 'date', columns is 'Country', values is 'Quantity' and 'Price', fill value is 0, and aggregate function is sum and mean
country_pivoted_quantitysum_pricemean = country_date_qty.pivot_table(index='date',
                                               columns='Country',
                                               values=['Quantity','Price'],
                                               fill_value=0,
                                               aggfunc={'Quantity': 'sum', 'Price': 'mean'}).reset_index()
country_pivoted_quantitysum_pricemean.head()

Unnamed: 0_level_0,date,Price,Price,Price,Price,Price,Price,Price,Price,Price,...,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Country,Unnamed: 1_level_1,Australia,Austria,Bahrain,Belgium,Bermuda,Brazil,Canada,Channel Islands,Cyprus,...,Singapore,Spain,Sweden,Switzerland,Thailand,USA,United Arab Emirates,United Kingdom,Unspecified,West Indies
0,2009-12-01 07:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,166,0,0
1,2009-12-01 07:46:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,60,0,0
2,2009-12-01 09:06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,193,0,0
3,2009-12-01 09:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,145,0,0
4,2009-12-01 09:24:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,826,0,0


## 7. Melting

In [36]:
# Rename columns with '_'
country_pivoted_quantitysum_pricemean.columns = country_pivoted_quantitysum_pricemean.columns.map('_'.join)
country_pivoted_quantitysum_pricemean.head()

Unnamed: 0,date_,Price_Australia,Price_Austria,Price_Bahrain,Price_Belgium,Price_Bermuda,Price_Brazil,Price_Canada,Price_Channel Islands,Price_Cyprus,...,Quantity_Singapore,Quantity_Spain,Quantity_Sweden,Quantity_Switzerland,Quantity_Thailand,Quantity_USA,Quantity_United Arab Emirates,Quantity_United Kingdom,Quantity_Unspecified,Quantity_West Indies
0,2009-12-01 07:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,166,0,0
1,2009-12-01 07:46:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,60,0,0
2,2009-12-01 09:06:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,193,0,0
3,2009-12-01 09:08:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,145,0,0
4,2009-12-01 09:24:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,826,0,0


In [37]:
country_pivoted_quantitysum_pricemean = country_pivoted_quantitysum_pricemean.melt(id_vars='date_', var_name='Measure')
country_pivoted_quantitysum_pricemean.head()

Unnamed: 0,date_,Measure,value
0,2009-12-01 07:45:00,Price_Australia,0.0
1,2009-12-01 07:46:00,Price_Australia,0.0
2,2009-12-01 09:06:00,Price_Australia,0.0
3,2009-12-01 09:08:00,Price_Australia,0.0
4,2009-12-01 09:24:00,Price_Australia,0.0


## 8. Joining

In [38]:
# Create dataframes of designation and age
designation_data = pd.DataFrame(dict({'name':['mike','jonathan','mo','lisa','raj','teo'], 
                                     'title':['manager','supervisor','director','associate','assistant','sectionhead']}))

age_data = pd.DataFrame(dict({'name':['mike','jonathan','lee','lisa','tom','teo'], 
                                     'age':['40','50','34','25','60','29']}))

# Merge dataframes with left join
pd.merge(designation_data,age_data,how='left')

Unnamed: 0,name,title,age
0,mike,manager,40.0
1,jonathan,supervisor,50.0
2,mo,director,
3,lisa,associate,25.0
4,raj,assistant,
5,teo,sectionhead,29.0


In [39]:
# Merge dataframes with right join
pd.merge(designation_data,age_data,how='right')

Unnamed: 0,name,title,age
0,mike,manager,40
1,jonathan,supervisor,50
2,lee,,34
3,lisa,associate,25
4,tom,,60
5,teo,sectionhead,29


In [40]:
# Merge dataframes with outer join
pd.merge(designation_data,age_data,how='outer')

Unnamed: 0,name,title,age
0,jonathan,supervisor,50.0
1,lee,,34.0
2,lisa,associate,25.0
3,mike,manager,40.0
4,mo,director,
5,raj,assistant,
6,teo,sectionhead,29.0
7,tom,,60.0


In [41]:
# Merge dataframes with inner join
pd.merge(designation_data,age_data,how='inner')

Unnamed: 0,name,title,age
0,mike,manager,40
1,jonathan,supervisor,50
2,lisa,associate,25
3,teo,sectionhead,29


In [42]:
# Merge dataframes with cross join
pd.merge(designation_data,age_data,how='cross')

Unnamed: 0,name_x,title,name_y,age
0,mike,manager,mike,40
1,mike,manager,jonathan,50
2,mike,manager,lee,34
3,mike,manager,lisa,25
4,mike,manager,tom,60
5,mike,manager,teo,29
6,jonathan,supervisor,mike,40
7,jonathan,supervisor,jonathan,50
8,jonathan,supervisor,lee,34
9,jonathan,supervisor,lisa,25
