In [27]:
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data Inspection

In [28]:
file_path = 'online_retail_II.xlsx'

df_1 = pd.read_excel(file_path, sheet_name='Year 2009-2010')
df_2 = pd.read_excel(file_path, sheet_name='Year 2010-2011')

df = pd.concat([df_1, df_2])

In [29]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [30]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 1067371 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Invoice      1067371 non-null  object        
 1   StockCode    1067371 non-null  object        
 2   Description  1062989 non-null  object        
 3   Quantity     1067371 non-null  int64         
 4   InvoiceDate  1067371 non-null  datetime64[ns]
 5   Price        1067371 non-null  float64       
 6   Customer ID  824364 non-null   float64       
 7   Country      1067371 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 73.3+ MB


Unnamed: 0,Quantity,InvoiceDate,Price,Customer ID
count,1067371.0,1067371,1067371.0,824364.0
mean,9.938898,2011-01-02 21:13:55.394028544,4.649388,15324.638504
min,-80995.0,2009-12-01 07:45:00,-53594.36,12346.0
25%,1.0,2010-07-09 09:46:00,1.25,13975.0
50%,3.0,2010-12-07 15:28:00,2.1,15255.0
75%,10.0,2011-07-22 10:23:00,4.15,16797.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,172.7058,,123.5531,1697.46445


## Data Cleaning

In [31]:
df1 = df[(df['Price'] > 0) & (df['Quantity'] > 0)]
df1['Date'] = pd.to_datetime(df1['InvoiceDate']).dt.date
df1.drop(['InvoiceDate'], axis=1, inplace=True)

df1['Invoice'] = df1['Invoice'].astype(str)
df1 = df1[df1['Invoice'].str.startswith('C') == False]

df1 = df1.dropna(subset=['Description'])
df1['Description'] = df1['Description'].astype(str)

df1['StockCode'] = df1['StockCode'].astype(str)
df1['StockCode'] = df1['StockCode'].apply(str.upper)

df1 = df1[(df1['StockCode'].str.len() == 5) | (df1['StockCode'].str.len() == 6)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Date'] = pd.to_datetime(df1['InvoiceDate']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(['InvoiceDate'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Invoice'] = df1['Invoice'].astype(str)


In [32]:
missing_values = df1.isna().sum()
print(missing_values)

print(df1.describe())

Invoice             0
StockCode           0
Description         0
Quantity            0
Price               0
Customer ID    234065
Country             0
Date                0
dtype: int64
           Quantity         Price    Customer ID
count  1.035658e+06  1.035658e+06  801593.000000
mean   1.099850e+01  3.343390e+00   15338.095812
std    1.268526e+02  6.952777e+00    1693.881217
min    1.000000e+00  3.000000e-02   12346.000000
25%    1.000000e+00  1.250000e+00   13988.000000
50%    3.000000e+00  2.100000e+00   15276.000000
75%    1.100000e+01  4.130000e+00   16809.000000
max    8.099500e+04  5.117030e+03   18287.000000


In [33]:
def remove_symbols(row):
    row = re.sub(r'[^a-zA-Z0-9\s]', ' ', row)
    row = ' '.join(row.split())
    return row

df1['Description'] = df1['Description'].apply(remove_symbols)

In [34]:
df1['Description'] = df1['Description'].str.lower()

In [35]:
df1.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,Price,Customer ID,Country,Date
0,489434,85048,15cm christmas glass ball 20 lights,12,6.95,13085.0,United Kingdom,2009-12-01
1,489434,79323P,pink cherry lights,12,6.75,13085.0,United Kingdom,2009-12-01
2,489434,79323W,white cherry lights,12,6.75,13085.0,United Kingdom,2009-12-01
3,489434,22041,record frame 7 single size,48,2.1,13085.0,United Kingdom,2009-12-01
4,489434,21232,strawberry ceramic trinket box,24,1.25,13085.0,United Kingdom,2009-12-01


In [36]:
df1 = df1.drop_duplicates()
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1002040 entries, 0 to 532617
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1002040 non-null  object 
 1   StockCode    1002040 non-null  object 
 2   Description  1002040 non-null  object 
 3   Quantity     1002040 non-null  int64  
 4   Price        1002040 non-null  float64
 5   Customer ID  775575 non-null   float64
 6   Country      1002040 non-null  object 
 7   Date         1002040 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 68.8+ MB


## Feature Extraction for Customisable Products

In [37]:
products = df1[['StockCode', 'Description']].drop_duplicates()
products = products.reset_index(drop=True)
print(len(products))
products.head()

5288


Unnamed: 0,StockCode,Description
0,85048,15cm christmas glass ball 20 lights
1,79323P,pink cherry lights
2,79323W,white cherry lights
3,22041,record frame 7 single size
4,21232,strawberry ceramic trinket box


In [38]:
vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = vectorizer.fit_transform(products['Description'])

cosine_sim = cosine_similarity(tfidf_matrix)
threshold = 0.75

products['Customisable'] = False
products['Variations'] = ''

In [39]:
for i, description in enumerate(products['Description']):
    similar_indices = [j for j in range(len(products)) if (cosine_sim[i, j] > threshold and i!=j)]
    variations = []
    for j in similar_indices:
        temp = products['Description'].iloc[j]
        if (temp not in variations) and description != temp:
            variations.append(temp)
    if variations:
        products.at[i, 'Customisable'] = True
        products.at[i, 'Variations'] = variations

In [40]:
customisable = products[products['Customisable']].reset_index(drop=True)
print(len(customisable))
customisable.head()

2468


Unnamed: 0,StockCode,Description,Customisable,Variations
0,79323P,pink cherry lights,True,"[white cherry lights, silver cherry lights, bl..."
1,79323W,white cherry lights,True,"[pink cherry lights, silver cherry lights, bla..."
2,21232,strawberry ceramic trinket box,True,"[ceramic strawberry trinket tray, strawberry c..."
3,21871,save the planet mug,True,[paperweight save the planet]
4,21523,fancy font home sweet home doormat,True,"[spotty home sweet home doormat, door mat fanc..."


In [41]:
customisable['Base Product'] = ''

for i, description in enumerate(customisable['Description']):
    variations = customisable.at[i, 'Variations']
    description_words = description.split()
    common = set(description_words)
    for variation in variations:
        variation_words = variation.split()
        common = common.intersection(set(variation_words))
    customisable.at[i, 'Base Product'] = ' '.join([word for word in description_words if word in common])

In [42]:
customisable.head()

Unnamed: 0,StockCode,Description,Customisable,Variations,Base Product
0,79323P,pink cherry lights,True,"[white cherry lights, silver cherry lights, bl...",cherry lights
1,79323W,white cherry lights,True,"[pink cherry lights, silver cherry lights, bla...",cherry lights
2,21232,strawberry ceramic trinket box,True,"[ceramic strawberry trinket tray, strawberry c...",strawberry ceramic trinket
3,21871,save the planet mug,True,[paperweight save the planet],save the planet
4,21523,fancy font home sweet home doormat,True,"[spotty home sweet home doormat, door mat fanc...",home sweet home


## Variation Extraction

In [43]:
nlp = spacy.load("en_core_web_sm")

In [44]:
colors = [
    "red", "blue", "green", "yellow", "black", "white", "pink", "purple", "orange", "brown", "gray", "grey", 
    "beige", "tan", "maroon", "cyan", "magenta", "crimson", "scarlet", "burgundy", "ruby", "rose", "wine", 
    "cherry", "coral", "navy", "sky", "teal", "cobalt", "turquoise", "azure", "sapphire", "baby blue", 
    "lime", "olive", "emerald", "mint", "sea green", "forest", "jade", "chartreuse", "golden", "lemon", 
    "mustard", "amber", "sunflower", "canary", "jet", "charcoal", "ebony", "onyx", "matte black", "ink", 
    "ivory", "snow", "cream", "pearl", "alabaster", "off-white", "bone", "blush", "fuchsia", "hot pink", 
    "peach", "salmon", "bubblegum", "lavender", "lilac", "violet", "plum", "amethyst", "mauve", "orchid", 
    "tangerine", "apricot", "rust", "chocolate", "coffee", "mahogany", "chestnut", "walnut", "bronze", 
    "cocoa", "slate", "ash", "dove", "silver", "steel", "graphite", "sand", "khaki", "taupe", "buff", 
    "camel", "almond", "gold", "silver", "copper", "brass", "rose gold", "platinum", "metallic", 
    "pastel pink", "pastel blue", "pastel green", "pastel yellow", "pastel purple", "pastel orange", 
    "neon pink", "neon green", "neon yellow", "neon blue", "neon orange", "terracotta", "sage", "moss", 
    "umber", "ochre", "clay"
]

materials = [
    "metal", "wood", "plastic", "ceramic", "glass", "leather", "fabric", "stone", "marble", "wool", 
    "cotton", "silk", "linen", "polyester", "nylon", "acrylic", "brass", "steel", "iron", "aluminum", 
    "copper", "bronze", "bamboo", "rubber", "porcelain", "velvet", "suede", "lace", "canvas", "foam", 
    "cardboard", "paper", "resin", "fiber", "synthetic", "polyurethane", "faux leather", "stainless steel",
    "plexiglass", "carbon fiber", "jute", "rattan", "hemp", "wicker", "spandex", "microfiber", "cashmere",
    "denim", "tweed", "charcoal", "glass fiber", "kevlar", "epoxy", "latex", "mesh", "plastic", "zinc"
]

sizes = [
    "extra small", "small", "medium", "large", "extra large", "extra extra large", "xx-small", 
    "x-small", "small", "medium", "large", "x-large", "xx-large", "xxx-large",
    "xs", "s", "m", "l", "xl", "xxl", "xxxl", "xxxxl", "one size", "plus size", "petite", 
    "tall", "regular", "slim fit", "relaxed fit", "oversized", "junior", "youth",
]

styles = [
    "vintage", "bohemian", "modern", "minimalist", "rustic", "gothic", "retro", 
    "industrial", "mid-century modern", "art deco", "shabby chic", "farmhouse", 
    "coastal", "nautical", "scandinavian", "victorian", "steampunk", "futuristic",
    "baroque", "asian-inspired", "mediterranean", "southwestern", "tropical", 
    "contemporary", "urban", "eclectic", "colonial", "beach", "tribal", 
    "organic", "romantic", "classical", "luxury", "glam", "zen", "avant-garde", 
    "cyberpunk", "art nouveau", "country"
]

In [45]:
def extract_variation(description, base_product):
    variation_type = set()
    variation_detail = ""
    
    description_set = set(description.lower().split())
    base_product_set = set(base_product.lower().split())
    
    difference_set = description_set - base_product_set
    
    if not difference_set:
        return None, None, 0

    difference_text = ' '.join(difference_set)
    doc = nlp(difference_text)  
    
    for token in doc:
        if token.text in colors:
            variation_type.add("colour")
            variation_detail += f"colour: {token.text}, "
        elif token.text in sizes:
            variation_type.add("size")
            variation_detail += f"size: {token.text}, "
        elif token.text in materials:
            variation_type.add("material")
            variation_detail += f"material: {token.text}, "
        elif token.text in styles:
            variation_type.add("style")
            variation_detail += f"style: {token.text}, "

    bundle_match = re.search(r'\b(set of|pack of)\s?\d+', description, re.IGNORECASE)
    if bundle_match:
        variation_type.add("bundle")
        variation_detail += f"bundle: {bundle_match.group()}, "

    remaining_difference = difference_set - set(colors) - set(sizes) - set(materials) - set(styles)
    if bundle_match:
        remaining_difference -= set(bundle_match.group().split())
    if remaining_difference:
        variation_type.add("miscellaneous")
        variation_detail += f"miscellaneous: {' '.join(remaining_difference)}, "
    
    customisation_complexity = len(variation_type)
    

    variation_type = ', '.join(variation_type) if variation_type else None
    variation_detail = variation_detail.strip(', ') if variation_detail else None

    return variation_type,  variation_detail, customisation_complexity

def extract_material(description): 
    doc = nlp(description)
    material = None

    for token in doc:
        if token.text.lower() in materials:
            material = token.text
            break
        
    if not material:
        material = "polymer"

    return material

In [46]:
customisable[['Variation Type', 'Variation Detail', 'Customisation Complexity']] = customisable.apply(lambda x: pd.Series(extract_variation(x['Description'], x['Base Product'])), axis=1)
customisable['Material'] = customisable['Description'].apply(extract_material)
customisable['Customisation Complexity'] = customisable['Customisation Complexity'].astype(int)
customisable = customisable[customisable['Customisation Complexity'] > 0]

In [47]:
customisable.head(5)

Unnamed: 0,StockCode,Description,Customisable,Variations,Base Product,Variation Type,Variation Detail,Customisation Complexity,Material
0,79323P,pink cherry lights,True,"[white cherry lights, silver cherry lights, bl...",cherry lights,colour,colour: pink,1,polymer
1,79323W,white cherry lights,True,"[pink cherry lights, silver cherry lights, bla...",cherry lights,colour,colour: white,1,polymer
2,21232,strawberry ceramic trinket box,True,"[ceramic strawberry trinket tray, strawberry c...",strawberry ceramic trinket,miscellaneous,miscellaneous: box,1,ceramic
3,21871,save the planet mug,True,[paperweight save the planet],save the planet,miscellaneous,miscellaneous: mug,1,polymer
4,21523,fancy font home sweet home doormat,True,"[spotty home sweet home doormat, door mat fanc...",home sweet home,miscellaneous,miscellaneous: doormat font fancy,1,polymer


In [48]:
df1.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,Price,Customer ID,Country,Date
0,489434,85048,15cm christmas glass ball 20 lights,12,6.95,13085.0,United Kingdom,2009-12-01
1,489434,79323P,pink cherry lights,12,6.75,13085.0,United Kingdom,2009-12-01
2,489434,79323W,white cherry lights,12,6.75,13085.0,United Kingdom,2009-12-01
3,489434,22041,record frame 7 single size,48,2.1,13085.0,United Kingdom,2009-12-01
4,489434,21232,strawberry ceramic trinket box,24,1.25,13085.0,United Kingdom,2009-12-01


In [49]:
retail = pd.merge(df1, customisable, on=['StockCode', 'Description'], how='left')
retail = retail[retail['Customisable'] == True]
retail = retail.drop(columns=['Customisable'])
retail.info()

<class 'pandas.core.frame.DataFrame'>
Index: 393517 entries, 1 to 1002038
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Invoice                   393517 non-null  object 
 1   StockCode                 393517 non-null  object 
 2   Description               393517 non-null  object 
 3   Quantity                  393517 non-null  int64  
 4   Price                     393517 non-null  float64
 5   Customer ID               302078 non-null  float64
 6   Country                   393517 non-null  object 
 7   Date                      393517 non-null  object 
 8   Variations                393517 non-null  object 
 9   Base Product              393517 non-null  object 
 10  Variation Type            393517 non-null  object 
 11  Variation Detail          393517 non-null  object 
 12  Customisation Complexity  393517 non-null  float64
 13  Material                  393517 non-null  objec

In [50]:
retail.head(5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,Price,Customer ID,Country,Date,Variations,Base Product,Variation Type,Variation Detail,Customisation Complexity,Material
1,489434,79323P,pink cherry lights,12,6.75,13085.0,United Kingdom,2009-12-01,"[white cherry lights, silver cherry lights, bl...",cherry lights,colour,colour: pink,1.0,polymer
2,489434,79323W,white cherry lights,12,6.75,13085.0,United Kingdom,2009-12-01,"[pink cherry lights, silver cherry lights, bla...",cherry lights,colour,colour: white,1.0,polymer
4,489434,21232,strawberry ceramic trinket box,24,1.25,13085.0,United Kingdom,2009-12-01,"[ceramic strawberry trinket tray, strawberry c...",strawberry ceramic trinket,miscellaneous,miscellaneous: box,1.0,ceramic
6,489434,21871,save the planet mug,24,1.25,13085.0,United Kingdom,2009-12-01,[paperweight save the planet],save the planet,miscellaneous,miscellaneous: mug,1.0,polymer
7,489434,21523,fancy font home sweet home doormat,10,5.95,13085.0,United Kingdom,2009-12-01,"[spotty home sweet home doormat, door mat fanc...",home sweet home,miscellaneous,miscellaneous: doormat font fancy,1.0,polymer


In [51]:
retail.to_excel('cleaned.xlsx', index=False)