In [1]:
from IPython.core.display import HTML
HTML("""
<style>
    h1{background-color:black; color:white; padding: 10px 10px 10px 10px}
    h2{background-color:green; color:white;  padding: 5px 5px 5px 5px}
</style>
""")

In [2]:
import pandas as pd
import numpy as np
import json
import datetime
import itertools
import re

# Config Parameters

In [3]:
## Data on Exchanges and Returns
FILEPATH_DATA_EXCHANGES_RETURNS = '../data/data_science_exchanges_returns.file'

## Data Conversions
FILEPATH_DATA_CONVERSIONS = '../data/data_science_test_conversions.file'

## Data Events
## Note : had to download in 2 parts and join together due to network timeout issues faced 
## when downloading whole dataset directly.
FILEPATH_DATA_EVENTS = '../data/data_science_test_events_FULL.json'

## Period Range for analysis
ORDER_START_DATE = datetime.date(2019,2,1)
ORDER_END_DATE = datetime.date(2019,2,28)

# Utility Functions

In [4]:
## Function to normalize character string
## - replace NaN with '<NA>'
## - convert to lowercase
## - remove white space
def normalize_string(string):
    return '<NA>' if type(string) is not str else ''.join(string.lower().strip().split())

# Data on Exchanges / Returns

## Read in dataset

In [5]:
DATA_EXCHANGE_RETURNS = pd.read_table(
    FILEPATH_DATA_EXCHANGES_RETURNS,
    delimiter='\t',
    dtype = {'Order nr':'object'}
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
DATA_EXCHANGE_RETURNS.head()

Unnamed: 0,Country,Order nr,SKU ID,Reason Name,original_size_system_name,original_size_name,exchanged_size_system_name,exchanged_size_name
0,ph,239748222,EV032US96WVNPH,1._size_does_not_fit_-_too_large,International,One Size,,
1,ph,251645722,CH672SH32LPNPH,2._size_does_not_fit_-_too_small,US,7,,
2,ph,235646762,EM520AC06VDDPH,1._size_does_not_fit_-_too_large,International,One Size,,
3,ph,275962462,RU506SH93IHUMY,1._size_does_not_fit_-_too_large,EU,36,,
4,ph,259521762,MC141AC39HVKPH,2._size_does_not_fit_-_too_small,US,M,,


In [7]:
DATA_EXCHANGE_RETURNS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387055 entries, 0 to 387054
Data columns (total 8 columns):
Country                       387055 non-null object
Order nr                      386344 non-null object
SKU ID                        387055 non-null object
Reason Name                   91242 non-null object
original_size_system_name     385998 non-null object
original_size_name            386344 non-null object
exchanged_size_system_name    295813 non-null object
exchanged_size_name           295813 non-null object
dtypes: object(8)
memory usage: 23.6+ MB


## Filter for 'size_does_not_fit' reasons

In [8]:
DATA_EXCHANGE_RETURNS['Reason Name'].unique()

array(['1._size_does_not_fit_-_too_large',
       '2._size_does_not_fit_-_too_small', nan], dtype=object)

In [9]:
selected_reasons = {
    '1._size_does_not_fit_-_too_large':'too_large',
    '2._size_does_not_fit_-_too_small':'too_small'
}

In [10]:
DATA_EXCHANGE_RETURNS = DATA_EXCHANGE_RETURNS[
    DATA_EXCHANGE_RETURNS['Reason Name'].map(lambda x: x in list(selected_reasons.keys()))
]

In [11]:
DATA_EXCHANGE_RETURNS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91242 entries, 0 to 98282
Data columns (total 8 columns):
Country                       91242 non-null object
Order nr                      91242 non-null object
SKU ID                        91242 non-null object
Reason Name                   91242 non-null object
original_size_system_name     90896 non-null object
original_size_name            91242 non-null object
exchanged_size_system_name    0 non-null object
exchanged_size_name           0 non-null object
dtypes: object(8)
memory usage: 6.3+ MB


## Simplify Reason Names

In [12]:
DATA_EXCHANGE_RETURNS['Reason Name'] = DATA_EXCHANGE_RETURNS['Reason Name'].map(selected_reasons)

## Normalize strings for size system names and size names

In [13]:
for sizetype in ['original','exchanged']:
    for sizenametype in ['size_system_name','size_name']:
        DATA_EXCHANGE_RETURNS[f'{sizetype}_{sizenametype}'] = \
          DATA_EXCHANGE_RETURNS[f'{sizetype}_{sizenametype}'].apply(normalize_string)

## Get original and exchanged sizes by joining size system name with size name

In [14]:
for sizetype in ['original','exchanged']:
    DATA_EXCHANGE_RETURNS[f'{sizetype}_size'] = [
        str(sys_name) + ' ' + str(name) 
        for sys_name,name in zip(
            DATA_EXCHANGE_RETURNS[f'{sizetype}_size_system_name'],
            DATA_EXCHANGE_RETURNS[f'{sizetype}_size_name']
        )
    ]
    
    DATA_EXCHANGE_RETURNS[f'{sizetype}_size'] = DATA_EXCHANGE_RETURNS[f'{sizetype}_size'].apply(normalize_string)

In [15]:
DATA_EXCHANGE_RETURNS.shape

(91242, 10)

In [16]:
DATA_EXCHANGE_RETURNS.head()

Unnamed: 0,Country,Order nr,SKU ID,Reason Name,original_size_system_name,original_size_name,exchanged_size_system_name,exchanged_size_name,original_size,exchanged_size
0,ph,239748222,EV032US96WVNPH,too_large,international,onesize,,,internationalonesize,<na><na>
1,ph,251645722,CH672SH32LPNPH,too_small,us,7,,,us7,<na><na>
2,ph,235646762,EM520AC06VDDPH,too_large,international,onesize,,,internationalonesize,<na><na>
3,ph,275962462,RU506SH93IHUMY,too_large,eu,36,,,eu36,<na><na>
4,ph,259521762,MC141AC39HVKPH,too_small,us,m,,,usm,<na><na>


## Columns of interest

In [17]:
DATA_EXCHANGE_RETURNS = DATA_EXCHANGE_RETURNS[[
    'Order nr',
    'SKU ID',
    'Reason Name',
    'original_size',
    'exchanged_size'
]]

## Consolidating unique records

In [18]:
DATA_EXCHANGE_RETURNS.head(10)

Unnamed: 0,Order nr,SKU ID,Reason Name,original_size,exchanged_size
0,239748222,EV032US96WVNPH,too_large,internationalonesize,<na><na>
1,251645722,CH672SH32LPNPH,too_small,us7,<na><na>
2,235646762,EM520AC06VDDPH,too_large,internationalonesize,<na><na>
3,275962462,RU506SH93IHUMY,too_large,eu36,<na><na>
4,259521762,MC141AC39HVKPH,too_small,usm,<na><na>
5,231545122,SP132SH0IJXLPH,too_small,us9,<na><na>
6,228331822,CH672SH99QBCPH,too_small,us8,<na><na>
7,276191162,LO601AA67KQUPH,too_small,internationalxl,<na><na>
8,282982362,F1525AA74BFHPH,too_large,internationalonesize,<na><na>
9,282982362,F1525AA74BFHPH,too_large,internationalonesize,<na><na>


Each exchange/return record shall be uniquely identified by

* **Transaction ID** -- *Order nr*
* **Product ID purchased in that Transaction** -- *SKU ID*
* **Size of Product Purchased for that Product ID** -- *original_size*

### Remove duplicate records

In [19]:
print('{0} out of {1} rows are duplicate records ({2}%)'.format(
    DATA_EXCHANGE_RETURNS.duplicated(keep='first').sum(),
    DATA_EXCHANGE_RETURNS.shape[0],
    round(100*DATA_EXCHANGE_RETURNS.duplicated(keep='first').sum()/DATA_EXCHANGE_RETURNS.shape[0],2)
))

467 out of 91242 rows are duplicate records (0.51%)


In [20]:
DATA_EXCHANGE_RETURNS = DATA_EXCHANGE_RETURNS[~DATA_EXCHANGE_RETURNS.duplicated(keep='first')]

In [21]:
DATA_EXCHANGE_RETURNS.shape

(90775, 5)

### For each unique exchange/return record, concatenate all the associated **Reason Names** and **Exchanged Sizes** if there are more than 1 

In [22]:
DATA_EXCHANGE_RETURNS = DATA_EXCHANGE_RETURNS\
  .groupby(['Order nr','SKU ID','original_size'])\
  .agg({
    'Reason Name' : lambda x: '__AND__'.join(sorted(list(set(x)))),
    'exchanged_size' : lambda x: '__AND__'.join(sorted(list(set(x))))
  })\
  .reset_index()

In [23]:
DATA_EXCHANGE_RETURNS.shape

(90740, 5)

In [24]:
DATA_EXCHANGE_RETURNS.head()

Unnamed: 0,Order nr,SKU ID,original_size,Reason Name,exchanged_size
0,211113522,CBF67AA8DA5D5EGS,internationals,too_large,<na><na>
1,211113662,AE351AA0039298GS,internationall,too_small,<na><na>
2,211113822,BI090SH55HNOMY,eu39,too_large,<na><na>
3,211114862,C8695AAD6F487CGS,eu38,too_small,<na><na>
4,211114922,5E1B9AA609044BGS,internationalm,too_large,<na><na>


In [25]:
## 35 records with more than 1 Reason Names (includes both too large and too small)
DATA_EXCHANGE_RETURNS[DATA_EXCHANGE_RETURNS['Reason Name'].apply(
    lambda x: True if re.search(pattern='_AND_',string=x) else False
)].shape

(35, 5)

## Summary of data processing steps

1. Filter for 'size_does_not_fit' related reason names
  * Original row count : 387055 ==> New row count : 91242


2. Get original and exchanged sizes (system_name concat with size_name)


3. Normalize original / exchanged size strings
  * strip whitespace
  * lowercasing


4. Removing duplicates
  * Original row count : 91242 ==> New row count : 90775
  
  
5. Consolidate Reason Names and Exchanged sizes for each unique exchange/return record
  * Original row count : 90775 ==> New row count : 90740

# Data on Conversions

## Read in Conversions data (data dump from json stream, 1 line per record)

In [26]:
JSON_DUMP = []

In [27]:
with open(FILEPATH_DATA_CONVERSIONS) as f:
    for line in f.readlines():
        JSON_DUMP.append(json.loads(line))

In [28]:
len(JSON_DUMP)

62482

In [29]:
JSON_DUMP[:1]

[{'_index': 'tracking_prod',
  '_type': 'conversion',
  '_id': '8a8b6404-e312-4bef-98cd-df3637d09b2f',
  '_score': 7.5778923,
  '_source': {'uid': 'unknown',
   'amount': '1394',
   'clientId': 'ubu8zhxrh8hg',
   'currency': 'PHP',
   'transaction': '257871622',
   'products': [{'quantity': 1,
     'size': 'International L',
     'price': '413.59',
     'sku': '3AE72AA36B07C6GS'},
    {'quantity': 1,
     'size': 'International M',
     'price': '465.36',
     'sku': 'B9586AA24B4E70GS'},
    {'quantity': 1,
     'size': 'International M',
     'price': '515.05',
     'sku': '079A3AAEAB518DGS'}],
   'timestamp': '2018-11-09T05:59:08.254Z'}}]

The data we are interested in is nested inside the **_source** key in the record

In [30]:
DATA_CONVERSIONS = pd.DataFrame.from_records([
    record['_source'] for record in JSON_DUMP
])

In [31]:
DATA_CONVERSIONS.head()

Unnamed: 0,amount,clientId,currency,products,timestamp,transaction,uid
0,1394.0,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'International L', 'p...",2018-11-09T05:59:08.254Z,257871622,unknown
1,299.0,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'International One Si...",2018-11-09T06:32:29.749Z,265371622,0fc636a4-a03f-493d-8122-64f98c12e6df
2,24000.0,ubu8zhxrh8hg,PHP,"[{'quantity': 12, 'size': 'International One S...",2018-11-09T08:40:44.673Z,231351622,3c3c50b0-99a2-494e-8228-b559eafccc02
3,1299.0,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'Brazil 37/38', 'pric...",2018-11-09T11:52:16.426Z,294431622,unknown
4,2091.56,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'International L', 'p...",2018-11-09T12:55:39.748Z,218931622,unknown


## Parse Date Time

In [32]:
DATA_CONVERSIONS['timestamp_parsed'] = DATA_CONVERSIONS['timestamp'].map(
    lambda x: datetime.datetime.strptime(x,'%Y-%m-%dT%H:%M:%S.%fz')
)

## Filter for transactions falling within date range of interest

In [33]:
DATA_CONVERSIONS = DATA_CONVERSIONS[DATA_CONVERSIONS['timestamp_parsed'].map(lambda x: x.date() >= ORDER_START_DATE)]
DATA_CONVERSIONS = DATA_CONVERSIONS[DATA_CONVERSIONS['timestamp_parsed'].map(lambda x: x.date() <= ORDER_END_DATE)]

In [34]:
DATA_CONVERSIONS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4705 entries, 5185 to 56886
Data columns (total 8 columns):
amount              4705 non-null object
clientId            4705 non-null object
currency            4705 non-null object
products            4705 non-null object
timestamp           4705 non-null object
transaction         4705 non-null object
uid                 4705 non-null object
timestamp_parsed    4705 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(7)
memory usage: 330.8+ KB


In [35]:
print(np.min(DATA_CONVERSIONS['timestamp_parsed']))
print(np.max(DATA_CONVERSIONS['timestamp_parsed']))

2019-02-01 00:03:44.419000
2019-02-28 23:51:17.679000


In [36]:
DATA_CONVERSIONS.head()

Unnamed: 0,amount,clientId,currency,products,timestamp,transaction,uid,timestamp_parsed
5185,1143.12,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'International S', 'p...",2019-02-07T08:29:55.536Z,215125962,unknown,2019-02-07 08:29:55.536
5186,1150.0,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'UK 10', 'price': '11...",2019-02-07T08:49:49.141Z,215325962,f6fb8346-1b77-4e7f-a631-cfb4b571ceb6,2019-02-07 08:49:49.141
5187,1996.0,ubu8zhxrh8hg,PHP,"[{'quantity': 2, 'size': 'International XXXL',...",2019-02-07T10:04:50.854Z,279965962,c5c23898-6998-4ae3-b4e9-9f4477ecd6bf,2019-02-07 10:04:50.854
5188,3650.0,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'International One Si...",2019-02-07T17:41:34.990Z,267385962,a695c21f-e7ea-4a80-8719-a658a9af045e,2019-02-07 17:41:34.990
5189,1750.0,ubu8zhxrh8hg,PHP,"[{'quantity': 1, 'size': 'International One Si...",2019-02-08T01:31:22.243Z,273455962,unknown,2019-02-08 01:31:22.243


## Extract Products in Transactions

In [37]:
# Number of unique transaction IDs
len(DATA_CONVERSIONS.transaction.unique())

4705

In [38]:
# Number of rows
DATA_CONVERSIONS.shape[0]

4705

Each data conversion record corresponds to 1 unique transaction ID

In [39]:
DATA_CONVERSION_PRODUCTS = DATA_CONVERSIONS['products'].tolist()

In [40]:
# For each product object in each transaction record, append the transaction id
# This is to facilitate conversion to dataframe later on
for i in range(DATA_CONVERSIONS.shape[0]):
    for obj in DATA_CONVERSION_PRODUCTS[i]:
        obj['transaction'] = DATA_CONVERSIONS['transaction'].tolist()[i]

In [41]:
# flatten / unnest list of records
DATA_CONVERSION_PRODUCTS = list(itertools.chain.from_iterable(DATA_CONVERSION_PRODUCTS))

In [42]:
# format data into pandas dataframe
DATA_CONVERSION_PRODUCTS = pd.DataFrame.from_records(DATA_CONVERSION_PRODUCTS)

In [43]:
# remove records where sku is undefined
DATA_CONVERSION_PRODUCTS = DATA_CONVERSION_PRODUCTS[~(DATA_CONVERSION_PRODUCTS['sku']=='undefined')]

In [44]:
DATA_CONVERSION_PRODUCTS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8578 entries, 0 to 8599
Data columns (total 5 columns):
price          8578 non-null object
quantity       8578 non-null float64
size           8578 non-null object
sku            8578 non-null object
transaction    8578 non-null object
dtypes: float64(1), object(4)
memory usage: 402.1+ KB


In [45]:
DATA_CONVERSION_PRODUCTS.head()

Unnamed: 0,price,quantity,size,sku,transaction
0,1143.12,1.0,International S,04BF2AA2B31011GS,215125962
1,1150.0,1.0,UK 10,79CE8US7578ECEGS,215325962
2,998.0,2.0,International XXXL,6C34EAAE0207FDGS,279965962
3,3650.0,1.0,International One Size,4524FAC279C76EGS,267385962
4,1750.0,1.0,International One Size,DB581ACD3B11F1GS,273455962


## Check for duplicates

In [46]:
DATA_CONVERSION_PRODUCTS.duplicated().sum() ## No duplicates

0

## Standardize size names

In [47]:
DATA_CONVERSION_PRODUCTS['size_name'] = DATA_CONVERSION_PRODUCTS['size'].apply(normalize_string)

In [48]:
DATA_CONVERSION_PRODUCTS.head()

Unnamed: 0,price,quantity,size,sku,transaction,size_name
0,1143.12,1.0,International S,04BF2AA2B31011GS,215125962,internationals
1,1150.0,1.0,UK 10,79CE8US7578ECEGS,215325962,uk10
2,998.0,2.0,International XXXL,6C34EAAE0207FDGS,279965962,internationalxxxl
3,3650.0,1.0,International One Size,4524FAC279C76EGS,267385962,internationalonesize
4,1750.0,1.0,International One Size,DB581ACD3B11F1GS,273455962,internationalonesize


# Merge Exchange/Return Data with Conversion Data

## Step 1 : Find SKU IDs of Transaction IDs that were returned, regardless of size

In [49]:
DATA_CONVERSIONS_EXCHANGE_RETURNS = DATA_CONVERSION_PRODUCTS.merge(
  right = DATA_EXCHANGE_RETURNS[['Order nr','SKU ID']].drop_duplicates(keep='first'),
  how = 'left',
  left_on = ['transaction','sku'],
  right_on = ['Order nr','SKU ID'],
  indicator = True
)

In [50]:
# ind_exchangereturn is an indicator variable of whether the product (SKUID) in the transaction (transaction)
# was exchanged/returned with a stated size related reason (too_large/too_small)
DATA_CONVERSIONS_EXCHANGE_RETURNS = DATA_CONVERSIONS_EXCHANGE_RETURNS.rename(
    columns={'_merge':'ind_exchangereturn'}
)

In [51]:
DATA_CONVERSIONS_EXCHANGE_RETURNS['ind_exchangereturn'] = \
    DATA_CONVERSIONS_EXCHANGE_RETURNS['ind_exchangereturn'].map({
        'both':1,
        'left_only':0
    })

In [52]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.sort_values('ind_exchangereturn').tail()

Unnamed: 0,price,quantity,size,sku,transaction,size_name,Order nr,SKU ID,ind_exchangereturn
8208,831.2,1.0,International XS,68F75AA80E37FCGS,221563862,internationalxs,221563862,68F75AA80E37FCGS,1
5979,704.25,1.0,International XL,4ABCEAAD9A3FDBGS,262794862,internationalxl,262794862,4ABCEAAD9A3FDBGS,1
5978,974.25,1.0,International XL,C7503AA6E8554CGS,262794862,internationalxl,262794862,C7503AA6E8554CGS,1
8222,1295.0,1.0,International XL,E9468AA1B4AEB1GS,244553862,internationalxl,244553862,E9468AA1B4AEB1GS,1
2196,1049.44,1.0,International XS,F0615AA3F9EFE0GS,223818962,internationalxs,223818962,F0615AA3F9EFE0GS,1


In [53]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.groupby('ind_exchangereturn').size()

ind_exchangereturn
0    8099
1     479
dtype: int64

In [54]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.drop(columns=['Order nr','SKU ID'],inplace=True)

## Step 2 : Append Return Reason if available by matching based on transaction id, sku id and size

In [55]:
DATA_CONVERSIONS_EXCHANGE_RETURNS = DATA_CONVERSIONS_EXCHANGE_RETURNS.merge(
  right = DATA_EXCHANGE_RETURNS,
  how = 'left',
  left_on = ['transaction','sku','size_name'],
  right_on = ['Order nr','SKU ID','original_size']
)

In [56]:
DATA_CONVERSIONS_EXCHANGE_RETURNS = DATA_CONVERSIONS_EXCHANGE_RETURNS.drop(columns=[
    'Order nr',
    'SKU ID',
    'original_size'
])

In [57]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.sort_values('exchanged_size').head()

Unnamed: 0,price,quantity,size,sku,transaction,size_name,ind_exchangereturn,Reason Name,exchanged_size
30,438.0,1.0,International L,4F639AA908FCCAGS,299595962,internationall,1,too_large,<na><na>
6146,0.0,1.0,International M,ED28EAADEF2243GS,292799862,internationalm,1,too_large,<na><na>
6066,1959.2,1.0,International L,EF930AA374ED11GS,231936862,internationall,1,too_large,<na><na>
6057,831.36,1.0,EU 38,5E3E4SH1951D33GS,223128862,eu38,1,too_small,<na><na>
6033,898.5,1.0,International L,9E0D0AAD9B0E23GS,235329862,internationall,1,too_large,<na><na>


In [58]:
DATA_CONVERSIONS_EXCHANGE_RETURNS['Reason Name'].fillna('<No size match>',inplace=True)

In [59]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.groupby(['ind_exchangereturn','Reason Name']).size().reset_index()

Unnamed: 0,ind_exchangereturn,Reason Name,0
0,0,<No size match>,8099
1,1,<No size match>,86
2,1,too_large,222
3,1,too_small,171


86 out of the 479 product-transaction records that had product exchanges/returns have no size match 

## Examine unmatched size names

In [60]:
### Records with exchange/returns, but cannot match by size
unmatched_purchased_sizenames = DATA_CONVERSIONS_EXCHANGE_RETURNS[
    (DATA_CONVERSIONS_EXCHANGE_RETURNS.ind_exchangereturn==1) &
    (DATA_CONVERSIONS_EXCHANGE_RETURNS['Reason Name']=='<No size match>')
][['transaction','sku','size','size_name']]

### Get all size records from DATA_EXCHANGE_RETURNS for these unmatched TransactionID-SKUID pairs
unmatched_purchased_sizenames = unmatched_purchased_sizenames.merge(
    ### Concatenate all orginal_sizes in DATA_EXCHANGE_RETURNS by transaction and sku
    right = DATA_EXCHANGE_RETURNS \
        .groupby(['Order nr','SKU ID']) \
        .agg({'original_size': lambda x: '__AND__'.join(sorted(list(set(x))))}) \
        .reset_index(),
    how = 'left',
    left_on = ['transaction','sku'],
    right_on = ['Order nr','SKU ID']
)

unmatched_purchased_sizenames = unmatched_purchased_sizenames.rename(columns={
    'size_name':'size_from_conversions_data',
    'original_size':'all_sizes_from_exchangereturn_data'
})

unmatched_purchased_sizenames \
    .groupby(['size_from_conversions_data','all_sizes_from_exchangereturn_data']) \
    .size() \
    .reset_index() \
    .rename(columns={0:'count'}) \
    .sort_values(by='count',ascending=False)

Unnamed: 0,size_from_conversions_data,all_sizes_from_exchangereturn_data,count
48,us6,eu36,5
41,us4,internationalxs,5
51,us6.5,eu37,3
55,us7.5,eu38,3
13,internationall,uk18,3
34,us12,internationalxl,2
20,internationals,uk10,2
7,eu42,uk14,2
58,us8,internationalm,2
9,eu44,internationalxl,2


3 types of mismatches :

1) Both Size System and Size are different (eg: us10 and aus14). May be mitigated using size system mapping tables
2) Same size, different system : international10 vs uk10
3) Same system, different size : us34 vs eu34

For 2) and 3), most likely due to entry errors (customer forgot purchase size?).

# Obtain UID of transcations of interest

This is to match with the events dataset, in order to determine whether a Pixibo size recommendation was associated with the purchase, and if so, what was the size recommedation (to compare with actual purchased size)

In [61]:
DATA_CONVERSIONS[['uid','transaction']].shape

(4705, 2)

In [62]:
len(DATA_CONVERSIONS['transaction'].unique())

4705

Each transaction is associated with 1 unique UID and 1 unique client ID. We can just merge with DATA_CONVERSIONS_EXCHANGE_RETURNS

In [63]:
DATA_CONVERSIONS_EXCHANGE_RETURNS = DATA_CONVERSIONS_EXCHANGE_RETURNS.merge(
    right = DATA_CONVERSIONS[['uid','transaction']],
    left_on = 'transaction',
    right_on = 'transaction'
)

In [64]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.shape

(8578, 10)

In [65]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.sort_values('Reason Name',ascending=False).head()

Unnamed: 0,price,quantity,size,sku,transaction,size_name,ind_exchangereturn,Reason Name,exchanged_size,uid
4719,1042.53,1.0,International M,A818FAA6DEC855GS,251119762,internationalm,1,too_small,<na><na>,7ffcd6f4-400d-4c91-a295-fd2e765f3672
8223,476.0,1.0,International XL,E6ABAAA9AA7A5AGS,244553862,internationalxl,1,too_small,<na><na>,unknown
2580,0.0,1.0,International M,CECC0AA5189CFDGS,232182862,internationalm,1,too_small,<na><na>,unknown
2579,0.0,1.0,International M,7B30FAA04AD0FFGS,232182862,internationalm,1,too_small,<na><na>,unknown
2578,0.0,1.0,International M,609BCAA0ACA06CGS,232182862,internationalm,1,too_small,<na><na>,unknown


In [66]:
sum(DATA_CONVERSIONS_EXCHANGE_RETURNS['uid'].fillna('unknown') != 'unknown')

4128

Only 4128 out of the 8578 records have known UIDs to merge to events dataset, to get information on whether there was a Pixibo size recommendation.

# Data from events

We only focus on events whereby Event Type = "size", which is when a size recommendation was made by Pixibo

In [67]:
EVENTS = []
with open(FILEPATH_DATA_EVENTS,'r') as f:
    for line in f:
        if not line:
            break
        try:
            line = line.strip()
            obj = json.loads(line)
            if obj['_source']['eventType'] == 'size':
                EVENTS.append(obj)
        except:
            pass

In [68]:
len(EVENTS)

321848

In [69]:
## Events are not arranged in order of timestamp in data file

print('First record timestamp: ',EVENTS[0]['_source']['timestamp'])
print('Last record timestamp : ',EVENTS[-1]['_source']['timestamp'])

event_timestamps = [obj['_source']['timestamp'] for obj in EVENTS]
print('Earliest timestamp: ',min(event_timestamps))
print('Latest timestamp: ',max(event_timestamps))

First record timestamp:  2019-03-17T23:23:10.060Z
Last record timestamp :  2019-04-17T06:18:14.895Z
Earliest timestamp:  2018-10-31T01:19:47.318Z
Latest timestamp:  2019-04-17T06:20:58.757Z


In [70]:
EVENTS[0]

{'_index': 'eventlogs_prod',
 '_type': 'log',
 '_id': '0779078b-ee55-418d-a13b-f7388850067c',
 '_score': 3.412435,
 '_source': {'uid': 'efb9cd27-28c9-46c8-bb23-174570004680',
  'took': 13,
  'clientId': 'ubu8zhxrh8hg',
  'response': {'fys': [{'size': 'INT M',
     'confidence': 3,
     'bust': 0,
     'waist': 2,
     'hip8Bit': 2,
     'hip': 2,
     'bust8Bit': 0,
     'waist8Bit': 2,
     'recommended': False},
    {'size': 'INT L',
     'confidence': 5,
     'bust': 0,
     'waist': 3,
     'hip8Bit': 5,
     'hip': 3,
     'bust8Bit': 0,
     'waist8Bit': 4,
     'recommended': True},
    {'size': 'INT XL',
     'confidence': 4,
     'bust': 0,
     'waist': 4,
     'hip8Bit': 7,
     'hip': 4,
     'bust8Bit': 0,
     'waist8Bit': 7,
     'recommended': False}],
   'gcount': 5,
   'gender': 'female',
   'type': 'Skirts',
   'skuId': 'EC633AAA9A0CA3GS'},
  'eventType': 'size',
  'page': 'fyf',
  'queryString': {'mode': 'showmore',
   'bs': '34',
   'uid': 'efb9cd27-28c9-46c8-bb23-

In [71]:
# only interested in payload in _source
EVENTS = [obj['_source'] for obj in EVENTS]

In [72]:
EVENTS_UIDs = [obj['uid'] for obj in EVENTS]

## Obtain actual product recommendations by Pixibo

In [73]:
RECOMMENDATIONS = []
for obj in EVENTS:
    tmpuid = obj['uid']
    tmptimestamp = obj['timestamp']
    tmpskuid = obj['skuId'].upper()
    try:
        if 'response' in obj.keys():
            if 'fys' in obj['response'].keys():
                for choice_obj in obj['response']['fys']:
                    if str(choice_obj['recommended']) == 'True':
                        recommendation_obj = choice_obj.copy()
                        recommendation_obj['uid'] = tmpuid
                        recommendation_obj['timestamp'] = tmptimestamp
                        recommendation_obj['sku'] = tmpskuid
                        RECOMMENDATIONS.append(recommendation_obj)
    except:
        pass

In [74]:
DATA_RECOMMENDATIONS = pd.DataFrame.from_records(RECOMMENDATIONS)

In [75]:
# Remove duplicates
DATA_RECOMMENDATIONS = DATA_RECOMMENDATIONS.drop_duplicates(keep='first')

In [76]:
DATA_RECOMMENDATIONS.shape

(178488, 12)

## Filter for Recommendations associated with transactions using UID

In [77]:
UIDS_OF_INTEREST = set(DATA_RECOMMENDATIONS['uid']).intersection(set(DATA_CONVERSIONS_EXCHANGE_RETURNS['uid']))

In [78]:
DATA_RECOMMENDATIONS = DATA_RECOMMENDATIONS[DATA_RECOMMENDATIONS['uid'].map(lambda x: x in UIDS_OF_INTEREST)]

In [79]:
DATA_RECOMMENDATIONS.shape

(16317, 12)

In [80]:
DATA_RECOMMENDATIONS.head()

Unnamed: 0,bust,bust8Bit,confidence,hip,hip8Bit,recommended,size,sku,timestamp,uid,waist,waist8Bit
6,3,4.0,5.0,0,0.0,True,INT M,A5A16AAA726439GS,2019-03-18T01:18:46.250Z,2b24e92b-1bb0-4930-b0e2-f12ee1dd6d57,3,4.0
19,3,4.0,5.0,0,0.0,True,EU 38,43EBFAACF616F2GS,2019-03-18T02:07:09.361Z,0470b342-d648-4c4a-b948-cfc6a02f31cb,3,4.0
30,0,0.0,5.0,3,5.0,True,INT L,6448EAA7FC013DGS,2019-03-18T02:18:52.463Z,31fcec31-b9d5-4b84-ba8a-53625d799a2d,3,5.0
45,3,4.0,5.0,0,0.0,True,INT M,EDC4EAA78E3CFDGS,2019-03-18T01:41:17.999Z,2b24e92b-1bb0-4930-b0e2-f12ee1dd6d57,3,4.0
46,3,5.0,5.0,0,0.0,True,INT M,14341AA8254DCDGS,2019-03-18T01:44:01.180Z,0470b342-d648-4c4a-b948-cfc6a02f31cb,3,4.0


In [81]:
# Select fields of interest
DATA_RECOMMENDATIONS = DATA_RECOMMENDATIONS[[
    'uid',
    'sku',
    'size'
]]

In [82]:
DATA_RECOMMENDATIONS.rename(columns={'size':'recommended_size'},inplace=True)

In [83]:
DATA_RECOMMENDATIONS.shape

(16317, 3)

# Merging Pixibo Recommendation Info to main dataset

In [84]:
# create tmp index before joining
DATA_CONVERSIONS_EXCHANGE_RETURNS['tmpindex'] = [i for i in range(DATA_CONVERSIONS_EXCHANGE_RETURNS.shape[0])]

In [85]:
# ensure all SKUs are uppercase
DATA_CONVERSIONS_EXCHANGE_RETURNS['sku'] = DATA_CONVERSIONS_EXCHANGE_RETURNS['sku'].apply(str.upper)
DATA_RECOMMENDATIONS['sku'] = DATA_RECOMMENDATIONS['sku'].apply(str.upper)

# ensure all SKUs have no spaces
DATA_CONVERSIONS_EXCHANGE_RETURNS['sku'] = DATA_CONVERSIONS_EXCHANGE_RETURNS['sku'].apply(str.strip)
DATA_RECOMMENDATIONS['sku'] = DATA_RECOMMENDATIONS['sku'].apply(str.strip)

In [86]:
# No. of rows in DATA_CONVERSIONS_EXCHANGE_RETURNS that have matching UID
sum(DATA_CONVERSIONS_EXCHANGE_RETURNS['uid'].map(lambda x: x in UIDS_OF_INTEREST))

663

In [87]:
# No. of rows in DATA_CONVERSIONS_EXCHANGE_RETURNS that have matching SKU
sum(DATA_CONVERSIONS_EXCHANGE_RETURNS['sku'].map(lambda x: x in DATA_RECOMMENDATIONS['sku'].tolist()))

993

In [88]:
DATA_CONVERSIONS_EXCHANGE_RETURNS = DATA_CONVERSIONS_EXCHANGE_RETURNS.merge(
    right = DATA_RECOMMENDATIONS,
    how = 'left',
    left_on = ['uid','sku'],
    right_on = ['uid','sku'],
    indicator = True
)

In [89]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.rename(columns={'_merge':'ind_pixibo_recommendation'},inplace=True)
DATA_CONVERSIONS_EXCHANGE_RETURNS['ind_pixibo_recommendation'] = \
    DATA_CONVERSIONS_EXCHANGE_RETURNS['ind_pixibo_recommendation'].map({
        'both':1,
        'left_only':0
    })

In [90]:
DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'].fillna('<No Recommendation>',inplace=True)

In [91]:
## No. of rows in DATA_CONVERSIONS_EXCHANGE_RETURNS that have both matching UID and SKU
sum(DATA_CONVERSIONS_EXCHANGE_RETURNS['ind_pixibo_recommendation'])

352

# Main Analysis of purchased products influenced by Pixibo recommendations

In [92]:
DATA_CONVERSIONS_EXCHANGE_RETURNS = DATA_CONVERSIONS_EXCHANGE_RETURNS[
    DATA_CONVERSIONS_EXCHANGE_RETURNS['ind_pixibo_recommendation'] == 1
]

In [93]:
DATA_CONVERSIONS_EXCHANGE_RETURNS.shape # number of records involving pixibo recommendations

(352, 13)

## Normalizing size strings

In [94]:
set(DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'])

{'29',
 'AU 8',
 'EU 34',
 'EU 36',
 'EU 38',
 'INT L',
 'INT M',
 'INT S',
 'INT XL',
 'INT XS',
 'UK 10',
 'UK 12',
 'UK 14',
 'UK 8'}

In [95]:
set(DATA_CONVERSIONS_EXCHANGE_RETURNS['size_name'])

{'aus8',
 'eu34',
 'eu36',
 'eu38',
 'internationall',
 'internationalm',
 'internationals',
 'internationalxl',
 'internationalxs',
 'internationalxs/s',
 'internationalxxs',
 'uk10',
 'uk12',
 'uk14',
 'uk8',
 'us4',
 'waistxlengthw2926'}

### Standardizing system names

In [96]:
DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'] = \
DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'].apply(normalize_string)

In [97]:
## (Very) rudimentary string substitution to standardize system names
DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'] = DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'].map(
    lambda x: re.sub(string=x,pattern='^int',repl='international')
)
DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'] = DATA_CONVERSIONS_EXCHANGE_RETURNS['recommended_size'].map(
    lambda x: re.sub(string=x,pattern='^au',repl='aus')
)

## Check whether recommendation size matches purchase size

In [98]:
DATA_CONVERSIONS_EXCHANGE_RETURNS['ind_recommendation_matches_purchase_size'] = [
    int(r['size_name'] == r['recommended_size']) for r in DATA_CONVERSIONS_EXCHANGE_RETURNS.to_records()
]

In [99]:
# sanity check
DATA_CONVERSIONS_EXCHANGE_RETURNS[[
    'size_name',
    'recommended_size',
    'ind_recommendation_matches_purchase_size'
]].drop_duplicates(keep='first').sort_values('ind_recommendation_matches_purchase_size')

Unnamed: 0,size_name,recommended_size,ind_recommendation_matches_purchase_size
8645,waistxlengthw2926,29,0
3675,internationalxs,internationals,0
2982,internationalm,internationals,0
6502,internationalxxs,internationalxs,0
2905,internationals,internationalxs,0
2666,internationall,internationalm,0
2621,internationals,internationalm,0
2494,internationals,internationall,0
4929,internationalxs/s,uk12,0
6550,internationalm,internationalxs,0


## Save merged dataset on Pixibo influenced purchases to csv 

In [100]:
## save to csv file
DATA_CONVERSIONS_EXCHANGE_RETURNS.to_csv('../data/merged_data.csv')

## Statistics to compute


We want to find out, for Pixbo influenced purchases *(ie. transaction-SKU pairs that are associated with size recommendations)*, 

1. Total products sold
2. Total products returned
3. Total prodicts returns / Total products sold

For each of the 3 statistics, we want to compute for 3 differnt subsets:

a. All Pixibo influenced product purchases

b. All Pixibo influenced product purchases whereby purchase size = recommended size

c. All Pixibo influenced product purchases whereby purchase size =/= recommended size


For each of the 3 count metrics, we can choose to count in 2 different ways:
    
1. Count Number of unique SKUs (regardless of transaction ID) **with at least 1 exchange/return record**
2. Count Number of unique Transaction-SKU pairs (in layman terms : *product purchases*) **with at least 1 exchange/return record**




The first metric would make sense if we are more focused on the performance over different unique products regardless of who purchases it, but the second metric would make more sense if we also want to take into account real-world business costs due to transaction volumes (eg. more transactions ==> more different customers to attend to for exchanging/returning ==> more transporation / inventory costs) 

In [101]:
# Necessary fields for computations
MAIN_DATA = DATA_CONVERSIONS_EXCHANGE_RETURNS[[
    'transaction','sku',
    'ind_pixibo_recommendation',
    'ind_recommendation_matches_purchase_size',
    'ind_exchangereturn'
]]

MAIN_DATA['transaction_sku'] = [
    row['transaction'] + '__' + row['sku'] for row in MAIN_DATA.to_records()
]

MAIN_DATA.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,transaction,sku,ind_pixibo_recommendation,ind_recommendation_matches_purchase_size,ind_exchangereturn,transaction_sku
228,236742962,A00F7AA5E23E13GS,1,1,0,236742962__A00F7AA5E23E13GS
399,273448962,625B4AA0F3327BGS,1,0,0,273448962__625B4AA0F3327BGS
400,273448962,625B4AA0F3327BGS,1,0,0,273448962__625B4AA0F3327BGS
686,291477962,B0420AA4264AEBGS,1,1,0,291477962__B0420AA4264AEBGS
687,291477962,B0420AA4264AEBGS,1,1,0,291477962__B0420AA4264AEBGS


## All Pixibo purchases

In [102]:
DATA_ALL = MAIN_DATA

In [103]:
## Find SKUs and Transaction-SKU pairs with at least 1 exchange/return event
DATA_ALL_BY_SKU = DATA_ALL \
  .groupby(['sku','ind_pixibo_recommendation'])['ind_exchangereturn'] \
  .apply(lambda x: int(sum(x)>0)) \
  .reset_index()

DATA_ALL_BY_TRANSACTION_SKU = DATA_ALL \
    .groupby(['transaction_sku','ind_pixibo_recommendation'])['ind_exchangereturn'] \
    .apply(lambda x: int(sum(x)>0)) \
    .reset_index()

In [104]:
DATA_ALL_BY_SKU.head()

Unnamed: 0,sku,ind_pixibo_recommendation,ind_exchangereturn
0,01C90AA9A078D8GS,1,0
1,029C3AACAE8040GS,1,0
2,0C686AA2A4C1EAGS,1,0
3,0EFA9AA957D91EGS,1,0
4,0FD43AAD4BB401GS,1,0


In [105]:
DATA_ALL_BY_TRANSACTION_SKU.head()

Unnamed: 0,transaction_sku,ind_pixibo_recommendation,ind_exchangereturn
0,212349862__E309EAAEBDD62AGS,1,0
1,213119762__0EFA9AA957D91EGS,1,0
2,213119762__3A584AAAB10867GS,1,0
3,213741862__5428CAAA8F40EFGS,1,0
4,213741862__9373FAA84D76F4GS,1,0


### By unique SKUs

In [106]:
DATA_ALL_BY_SKU = DATA_ALL_BY_SKU \
.groupby('ind_pixibo_recommendation') \
.agg({
    'sku': {
        'total_products_sold':lambda x: len(x)
    },
    'ind_exchangereturn' : {
        'total_returns':lambda x:sum(x),
        'return_rate':lambda x:sum(x)/len(x)
    }
}) \
.reset_index(drop=True) \
.rename(columns={
    'sku':'total_products_sold'
})

DATA_ALL_BY_SKU.columns = DATA_ALL_BY_SKU.columns.droplevel()
DATA_ALL_BY_SKU['count_type'] = 'Count By Unique SKU'
DATA_ALL_BY_SKU['subset'] = 'All Pixibo Influenced Transactions'
DATA_ALL_BY_SKU = DATA_ALL_BY_SKU[[
    'subset',
    'count_type',
    'total_products_sold',
    'total_returns',
    'return_rate'
]]

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [107]:
DATA_ALL_BY_SKU

Unnamed: 0,subset,count_type,total_products_sold,total_returns,return_rate
0,All Pixibo Influenced Transactions,Count By Unique SKU,102,12,0.117647


In [108]:
print('### By Unique SKU Count ###:')
print()
print('Total Products sold : ',DATA_ALL_BY_SKU['total_products_sold'][0])
print('Total Products sold with at least 1 exchange/return : ',DATA_ALL_BY_SKU['total_returns'][0])
print('Return Rate : ',DATA_ALL_BY_SKU['return_rate'][0])

### By Unique SKU Count ###:

Total Products sold :  102
Total Products sold with at least 1 exchange/return :  12
Return Rate :  0.11764705882352941


### By unique Transaction-SKUs

In [109]:
DATA_ALL_BY_TRANSACTION_SKU = DATA_ALL_BY_TRANSACTION_SKU \
.groupby('ind_pixibo_recommendation') \
.agg({
    'transaction_sku': {
        'total_products_sold':lambda x: len(x)
    },
    'ind_exchangereturn' : {
        'total_returns':lambda x:sum(x),
        'return_rate':lambda x:sum(x)/len(x)
    }
}) \
.reset_index(drop=True) \
.rename(columns={
    'sku':'total_products_sold'
})

DATA_ALL_BY_TRANSACTION_SKU.columns = DATA_ALL_BY_TRANSACTION_SKU.columns.droplevel()
DATA_ALL_BY_TRANSACTION_SKU['count_type'] = 'Count By Unique Transaction-SKU pairs'
DATA_ALL_BY_TRANSACTION_SKU['subset'] = 'All Pixibo Influenced Transactions'
DATA_ALL_BY_TRANSACTION_SKU = DATA_ALL_BY_TRANSACTION_SKU[[
    'subset',
    'count_type',
    'total_products_sold',
    'total_returns',
    'return_rate'
]]

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [110]:
DATA_ALL_BY_TRANSACTION_SKU

Unnamed: 0,subset,count_type,total_products_sold,total_returns,return_rate
0,All Pixibo Influenced Transactions,Count By Unique Transaction-SKU pairs,105,12,0.114286


In [111]:
print('### By Unique Transaction-SKU Count ###:')
print()
print('Total Products sold : ',DATA_ALL_BY_TRANSACTION_SKU['total_products_sold'][0])
print('Total Products sold with at least 1 exchange/return : ',DATA_ALL_BY_TRANSACTION_SKU['total_returns'][0])
print('Return Rate : ',DATA_ALL_BY_TRANSACTION_SKU['return_rate'][0])

### By Unique Transaction-SKU Count ###:

Total Products sold :  105
Total Products sold with at least 1 exchange/return :  12
Return Rate :  0.11428571428571428


## Pixibo Purchases where recommendation size = purchased size

In [112]:
DATA_PURCHASED_RECOMMENDED = MAIN_DATA[MAIN_DATA['ind_recommendation_matches_purchase_size']==1]

In [113]:
## Find SKUs and Transaction-SKU pairs with at least 1 exchange/return event
DATA_PURCHASED_RECOMMENDED_BY_SKU = DATA_PURCHASED_RECOMMENDED \
  .groupby(['sku','ind_pixibo_recommendation'])['ind_exchangereturn'] \
  .apply(lambda x: int(sum(x)>0)) \
  .reset_index()

DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU = DATA_PURCHASED_RECOMMENDED \
    .groupby(['transaction_sku','ind_pixibo_recommendation'])['ind_exchangereturn'] \
    .apply(lambda x: int(sum(x)>0)) \
    .reset_index()

In [114]:
## By Unique SKUs
DATA_PURCHASED_RECOMMENDED_BY_SKU = DATA_PURCHASED_RECOMMENDED_BY_SKU \
.groupby('ind_pixibo_recommendation') \
.agg({
    'sku': {
        'total_products_sold':lambda x: len(x)
    },
    'ind_exchangereturn' : {
        'total_returns':lambda x:sum(x),
        'return_rate':lambda x:sum(x)/len(x)
    }
}) \
.reset_index(drop=True) \
.rename(columns={
    'sku':'total_products_sold'
})

DATA_PURCHASED_RECOMMENDED_BY_SKU.columns = DATA_PURCHASED_RECOMMENDED_BY_SKU.columns.droplevel()
DATA_PURCHASED_RECOMMENDED_BY_SKU['count_type'] = 'Count By Unique SKU'
DATA_PURCHASED_RECOMMENDED_BY_SKU['subset'] = 'Purchased Recommended Size'
DATA_PURCHASED_RECOMMENDED_BY_SKU = DATA_PURCHASED_RECOMMENDED_BY_SKU[[
    'subset',
    'count_type',
    'total_products_sold',
    'total_returns',
    'return_rate'
]]

# By Unique Transaction-SKU pairs
DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU = DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU \
.groupby('ind_pixibo_recommendation') \
.agg({
    'transaction_sku': {
        'total_products_sold':lambda x: len(x)
    },
    'ind_exchangereturn' : {
        'total_returns':lambda x:sum(x),
        'return_rate':lambda x:sum(x)/len(x)
    }
}) \
.reset_index(drop=True) \
.rename(columns={
    'sku':'total_products_sold'
})

DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU.columns = DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU.columns.droplevel()
DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['count_type'] = 'Count By Unique Transaction-SKU pairs'
DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['subset'] = 'Purchased Recommended Size'
DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU = DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU[[
    'subset',
    'count_type',
    'total_products_sold',
    'total_returns',
    'return_rate'
]]

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [115]:
print('### By Unique SKU Count ###:')
print()
print('Total Products sold : ',DATA_PURCHASED_RECOMMENDED_BY_SKU['total_products_sold'][0])
print('Total Products sold with at least 1 exchange/return : ',DATA_PURCHASED_RECOMMENDED_BY_SKU['total_returns'][0])
print('Return Rate : ',DATA_PURCHASED_RECOMMENDED_BY_SKU['return_rate'][0])
print()
print('### By Unique Transaction-SKU Count ###:')
print()
print('Total Products sold : ',DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['total_products_sold'][0])
print('Total Products sold with at least 1 exchange/return : ',DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['total_returns'][0])
print('Return Rate : ',DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['return_rate'][0])

### By Unique SKU Count ###:

Total Products sold :  60
Total Products sold with at least 1 exchange/return :  4
Return Rate :  0.06666666666666667

### By Unique Transaction-SKU Count ###:

Total Products sold :  61
Total Products sold with at least 1 exchange/return :  4
Return Rate :  0.06557377049180328


## Pixibo Purchases where recommendation size =/= purchased size

In [116]:
DATA_DIDNOT_PURCHASED_RECOMMENDED = MAIN_DATA[MAIN_DATA['ind_recommendation_matches_purchase_size']==0]

In [117]:
## Find SKUs and Transaction-SKU pairs with at least 1 exchange/return event
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU = DATA_DIDNOT_PURCHASED_RECOMMENDED \
  .groupby(['sku','ind_pixibo_recommendation'])['ind_exchangereturn'] \
  .apply(lambda x: int(sum(x)>0)) \
  .reset_index()

DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU = DATA_DIDNOT_PURCHASED_RECOMMENDED \
    .groupby(['transaction_sku','ind_pixibo_recommendation'])['ind_exchangereturn'] \
    .apply(lambda x: int(sum(x)>0)) \
    .reset_index()

In [118]:
## By Unique SKUs
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU = DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU \
.groupby('ind_pixibo_recommendation') \
.agg({
    'sku': {
        'total_products_sold':lambda x: len(x)
    },
    'ind_exchangereturn' : {
        'total_returns':lambda x:sum(x),
        'return_rate':lambda x:sum(x)/len(x)
    }
}) \
.reset_index(drop=True) \
.rename(columns={
    'sku':'total_products_sold'
})

DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU.columns = DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU.columns.droplevel()
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU['count_type'] = 'Count By Unique SKU'
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU['subset'] = 'Did Not Purchased Recommended Size'
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU = DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU[[
    'subset',
    'count_type',
    'total_products_sold',
    'total_returns',
    'return_rate'
]]

# By Unique Transaction-SKU pairs
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU = DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU \
.groupby('ind_pixibo_recommendation') \
.agg({
    'transaction_sku': {
        'total_products_sold':lambda x: len(x)
    },
    'ind_exchangereturn' : {
        'total_returns':lambda x:sum(x),
        'return_rate':lambda x:sum(x)/len(x)
    }
}) \
.reset_index(drop=True) \
.rename(columns={
    'sku':'total_products_sold'
})

DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU.columns = DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU.columns.droplevel()
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['count_type'] = 'Count By Unique Transaction-SKU pairs'
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['subset'] = 'Did Not Purchased Recommended Size'
DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU = DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU[[
    'subset',
    'count_type',
    'total_products_sold',
    'total_returns',
    'return_rate'
]]

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [119]:
print('### By Unique SKU Count ###:')
print()
print('Total Products sold : ',DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU['total_products_sold'][0])
print('Total Products sold with at least 1 exchange/return : ',DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU['total_returns'][0])
print('Return Rate : ',DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU['return_rate'][0])
print()
print('### By Unique Transaction-SKU Count ###:')
print()
print('Total Products sold : ',DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['total_products_sold'][0])
print('Total Products sold with at least 1 exchange/return : ',DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['total_returns'][0])
print('Return Rate : ',DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU['return_rate'][0])

### By Unique SKU Count ###:

Total Products sold :  47
Total Products sold with at least 1 exchange/return :  8
Return Rate :  0.1702127659574468

### By Unique Transaction-SKU Count ###:

Total Products sold :  48
Total Products sold with at least 1 exchange/return :  8
Return Rate :  0.16666666666666666


In [120]:
OVERALL_STATS = pd.DataFrame()
OVERALL_STATS = pd.concat([OVERALL_STATS,DATA_ALL_BY_SKU])
OVERALL_STATS = pd.concat([OVERALL_STATS,DATA_ALL_BY_TRANSACTION_SKU])
OVERALL_STATS = pd.concat([OVERALL_STATS,DATA_PURCHASED_RECOMMENDED_BY_SKU])
OVERALL_STATS = pd.concat([OVERALL_STATS,DATA_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU])
OVERALL_STATS = pd.concat([OVERALL_STATS,DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_SKU])
OVERALL_STATS = pd.concat([OVERALL_STATS,DATA_DIDNOT_PURCHASED_RECOMMENDED_BY_TRANSACTION_SKU])

In [121]:
OVERALL_STATS

Unnamed: 0,subset,count_type,total_products_sold,total_returns,return_rate
0,All Pixibo Influenced Transactions,Count By Unique SKU,102,12,0.117647
0,All Pixibo Influenced Transactions,Count By Unique Transaction-SKU pairs,105,12,0.114286
0,Purchased Recommended Size,Count By Unique SKU,60,4,0.066667
0,Purchased Recommended Size,Count By Unique Transaction-SKU pairs,61,4,0.065574
0,Did Not Purchased Recommended Size,Count By Unique SKU,47,8,0.170213
0,Did Not Purchased Recommended Size,Count By Unique Transaction-SKU pairs,48,8,0.166667
