# Create Anonymised Sample Dataset

This notebook creates a privacy-protected sample of the eBay sales data for portfolio sharing.

**Purpose:**
- Protect business-sensitive information
- Create reproducible sample for public GitHub repository
- Maintain data distributions for meaningful analysis


## 1. Import Libraries

In [29]:
import pandas as pd
import numpy as np
import hashlib
import re
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split

## 2. Configuration

In [30]:
# File paths
DATA_PATH = '../data/raw/'
OUTPUT_PATH = '../data/'

# Sampling parameters
SAMPLE_SIZE = 1000
RANDOM_STATE = 42

# Anonymisation settings
PRICE_NOISE_RANGE = (0.95, 1.05)  # ±5% price variation
DATE_OFFSET_DAYS = 30  # Max days to shift dates

# Columns to drop (based on your original analysis)
DROP_COLUMNS = ['Feedback left', 'Feedback received', 
                'Global Shipping Programme', 'Click and Collect']

## 3. Load Original Data

In [31]:
df = pd.read_csv(f'{DATA_PATH}ebay_march2023_feb2025_less-cols.csv')
print(f"Original dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Basic info
print("\nData types:")
print(df.dtypes)

Original dataset shape: (8778, 14)
Columns: ['Sales record number', 'Order number', 'Buyer username', 'Buyer postcode', 'Item number', 'Item title', 'Sold via Promoted listings', 'Quantity', 'Sold for', 'Sale date', 'Feedback left', 'Feedback received', 'Global Shipping Programme', 'Click and Collect']

Data types:
Sales record number           float64
Order number                   object
Buyer username                 object
Buyer postcode                 object
Item number                   float64
Item title                     object
Sold via Promoted listings     object
Quantity                      float64
Sold for                       object
Sale date                      object
Feedback left                  object
Feedback received              object
Global Shipping Programme      object
Click and Collect              object
dtype: object


## 4. Extract Brand Information (for stratified sampling)

In [32]:
# Brand list
BRANDS = ["Kojie San", "Extract", "Gluta-C", "Silka", "Belo", 
          "Maxi-Peel", "Likas", "GlutaMax", "SkinWhite", "Glupa"]

In [33]:
def extract_brand(title):
    """Extract brand from item title."""
    if pd.isna(title):
        return "Other"
    
    title_lower = str(title).lower()
    for brand in BRANDS:
        if brand.lower() in title_lower:
            return brand
    return "Other"


In [34]:
# Extract brand for stratified sampling
df['Brand'] = df['Item title'].apply(extract_brand)
print("\nBrand distribution:")
print(df['Brand'].value_counts())


Brand distribution:
Brand
Kojie San    3559
Other        1699
Extract      1340
Silka         705
Gluta-C       645
Belo          300
SkinWhite     161
GlutaMax      125
Likas         111
Maxi-Peel     107
Glupa          26
Name: count, dtype: int64


## 5. Create Stratified Sample

In [35]:
# Stratified sampling to maintain brand distribution
sample_df = df.groupby('Brand', group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), max(10, int(SAMPLE_SIZE * len(x) / len(df)))), 
                       random_state=RANDOM_STATE)
).reset_index(drop=True)

# Ensure we have roughly SAMPLE_SIZE rows
if len(sample_df) > SAMPLE_SIZE:
    sample_df = sample_df.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)

print(f"\nSample size: {len(sample_df)}")
print(f"Sample brand distribution:\n{sample_df['Brand'].value_counts()}")


Sample size: 1000
Sample brand distribution:
Brand
Kojie San    404
Other        192
Extract      151
Silka         80
Gluta-C       73
Belo          34
SkinWhite     18
GlutaMax      14
Maxi-Peel     12
Likas         12
Glupa         10
Name: count, dtype: int64


  sample_df = df.groupby('Brand', group_keys=False).apply(


## 6. Anonimise Sensitive Data

In [36]:
# Create working copy
anon_df = sample_df.copy()

# 1. Anonymise buyer usernames
anon_df['Buyer username'] = anon_df['Buyer username'].apply(
    lambda x: 'user_' + hashlib.md5(str(x).encode()).hexdigest()[:8] 
    if pd.notna(x) else 'anonymous'
)

In [37]:
# 2. Generalise postcodes (keep only area)
anon_df['Buyer postcode'] = anon_df['Buyer postcode'].apply(
    lambda x: str(x).split()[0] if pd.notna(x) and str(x) != 'nan' else 'XX1'
)

In [38]:
# 3. Anonymise item numbers
anon_df['Item number'] = anon_df['Item number'].apply(
    lambda x: 'item_' + hashlib.md5(str(x).encode()).hexdigest()[:10]
    if pd.notna(x) else 'item_unknown'
)

In [39]:
# 4. Add noise to prices while preserving discount structure
def calculate_unit_price(sold_for, quantity):
    """Calculate original unit price from total after discount."""
    if quantity == 1:
        return sold_for
    elif quantity == 2:
        return sold_for / (2 * 0.9)
    elif quantity == 3:
        return sold_for / (3 * 0.85)
    else:  # 4+
        return sold_for / (quantity * 0.8)
    
# Extract prices and quantities
anon_df['Price_Numeric'] = anon_df['Sold for'].str.replace('£', '').astype(float)

# Calculate unit prices
anon_df['Unit_Price'] = anon_df.apply(
    lambda row: calculate_unit_price(row['Price_Numeric'], row['Quantity']), 
    axis=1
)

# Add noise to UNIT price (not total)
noise = np.random.uniform(0.95, 1.05, len(anon_df))
anon_df['Unit_Price_Noisy'] = anon_df['Unit_Price'] * noise

# Recalculate total with discount structure
def calculate_total_price(unit_price, quantity):
    """Calculate total price with quantity discounts."""
    if quantity == 1:
        return unit_price
    elif quantity == 2:
        return unit_price * 2 * 0.9
    elif quantity == 3:
        return unit_price * 3 * 0.85
    else:  # 4+
        return unit_price * quantity * 0.8

anon_df['Sold for'] = anon_df.apply(
    lambda row: f"£{calculate_total_price(row['Unit_Price_Noisy'], row['Quantity']):.2f}",
    axis=1
)
    
# Clean up temporary columns
anon_df = anon_df.drop(columns=['Price_Numeric', 'Unit_Price', 'Unit_Price_Noisy'])

In [40]:
# 5. Shift dates (maintain relative ordering)
anon_df['Sale date'] = pd.to_datetime(anon_df['Sale date'])
date_offset = np.random.randint(-DATE_OFFSET_DAYS, DATE_OFFSET_DAYS)
anon_df['Sale date'] = anon_df['Sale date'] + pd.Timedelta(days=date_offset)
anon_df['Sale date'] = anon_df['Sale date'].dt.strftime('%d-%b-%y')

print("Anonymisation complete!")

Anonymisation complete!


  anon_df['Sale date'] = pd.to_datetime(anon_df['Sale date'])


In [41]:
# Add after calculating unit prices
def calculate_discount_percentage(quantity):
    """Return discount percentage based on quantity."""
    if quantity == 1:
        return 0
    elif quantity == 2:
        return 10
    elif quantity == 3:
        return 15
    else:  # 4+
        return 20

anon_df['Discount_Percentage'] = anon_df['Quantity'].apply(calculate_discount_percentage)

In [42]:
categories = {
    'Face Cream': r'face.*cream|Silka Papaya Day|Gluta-C Facial Face Night',
    'Face Wash': r'facial|wash|face',
    'Lotion': r'lotion',
    'Soap': r'soap',
    'Body Washes and Scrubs': r'body wash|scrub|body scrub',
    'Toner/Cleanser': r'toner|cleanser|Maxi-Peel Zero',
    'Serum': r'serum',
    'Shampoo': r'shampoo',
    'Conditioner': r'conditioner',
    'Powder': r'powder',
    'Sensitive Area Products': r'underarm|bikini|gel|roll on|roll-on|deodorant|feminine wash',
    'Cologne': r'cologne'
}

def extract_product_category(title):
    """Extract general product category from title."""
    title_lower = str(title).lower()
    
    # Your existing logic
    for category, pattern in categories.items():
        if re.search(pattern, title_lower):
            return category
    return "Other"

def extract_product_size(title):
    """Extract size information if available."""
    title_lower = str(title).lower()
    
    # Common size patterns
    size_patterns = {
        r'(\d+)\s*g\b': 'grams',
        r'(\d+)\s*ml\b': 'ml',
        r'(\d+)\s*oz\b': 'oz',
        r'(\d+)\s*pcs?\b': 'pieces'
    }
    
    for pattern, unit in size_patterns.items():
        match = re.search(pattern, title_lower)
        if match:
            return f"{match.group(1)}{unit}"
    return "standard"

# Add to your anonymization
anon_df['Product_Category'] = anon_df['Item title'].apply(extract_product_category)
anon_df['Product_Size'] = anon_df['Item title'].apply(extract_product_size)
anon_df['Discount_Percentage'] = anon_df['Quantity'].apply(calculate_discount_percentage)

In [43]:
def extract_product_details(title):
    """Extract pack size, unit size, and total units from product title."""
    title_lower = str(title).lower()
    
    # Pattern for multi-packs: "135g x 2", "65g x 3", etc.
    multipack_pattern = r'(\d+)\s*(g|ml|oz)\s*x\s*(\d+)'
    match = re.search(multipack_pattern, title_lower)
    
    if match:
        unit_size = f"{match.group(1)}{match.group(2)}"
        pack_count = int(match.group(3))
        total_size = int(match.group(1)) * pack_count
        return {
            'Unit_Size': unit_size,
            'Pack_Count': pack_count,
            'Total_Size': f"{total_size}{match.group(2)}",
            'Is_Multipack': True
        }
    
    # Single item pattern: "135g", "100ml", etc.
    single_pattern = r'(\d+)\s*(g|ml|oz)\b'
    match = re.search(single_pattern, title_lower)
    
    if match:
        size = f"{match.group(1)}{match.group(2)}"
        return {
            'Unit_Size': size,
            'Pack_Count': 1,
            'Total_Size': size,
            'Is_Multipack': False
        }
    
    return {
        'Unit_Size': 'unknown',
        'Pack_Count': 1,
        'Total_Size': 'unknown',
        'Is_Multipack': False
    }

# Apply to your data
product_details = anon_df['Item title'].apply(extract_product_details)
anon_df['Unit_Size'] = product_details.apply(lambda x: x['Unit_Size'])
anon_df['Pack_Count'] = product_details.apply(lambda x: x['Pack_Count'])
anon_df['Total_Size'] = product_details.apply(lambda x: x['Total_Size'])
anon_df['Is_Multipack'] = product_details.apply(lambda x: x['Is_Multipack'])

# Calculate actual unit count (important for analysis!)
anon_df['Total_Units'] = anon_df['Pack_Count'] * anon_df['Quantity']

In [44]:
anon_df.head()

Unnamed: 0,Sales record number,Order number,Buyer username,Buyer postcode,Item number,Item title,Sold via Promoted listings,Quantity,Sold for,Sale date,...,Click and Collect,Brand,Discount_Percentage,Product_Category,Product_Size,Unit_Size,Pack_Count,Total_Size,Is_Multipack,Total_Units
626,6062.0,02-11697-85186,user_fb7071bb,DY10,item_c5c0adf470,Kojie San Soap 135g x 2 (Double Pack) - Skin B...,No,1.0,£7.59,09-Jul-24,...,No,Kojie San,0,Soap,135grams,135g,2,270g,True,2.0
629,6095.0,21-11688-35962,user_cfb173e5,DE23,item_eacd302425,Kojie San Soap 100g x 3 (Large Trio Pack) - Sk...,Yes,1.0,£8.17,11-Jul-24,...,No,Kojie San,0,Soap,100grams,100g,3,300g,True,3.0
847,4504.0,22-11277-32971,user_711e970d,DL9,item_0a0d6e1866,Johnson's SUMMER SWING Baby Cologne 125ml,Yes,1.0,£7.49,03-Apr-24,...,No,Other,0,Cologne,125ml,125ml,1,125ml,False,1.0
514,9312.0,25-12714-86351,user_5d503f1f,RM3,item_e66dec75b7,Kojie San Dream White Kojic Acid Soap 135g x 2...,No,2.0,£7.78,18-Mar-25,...,No,Kojie San,10,Soap,135grams,135g,2,270g,True,4.0
365,9317.0,13-12730-17041,user_ac605fd5,NE6,item_812f83cac0,Kojie San Soap 65g x 3 (Triple Pack) - Skin Br...,No,3.0,£6.02,18-Mar-25,...,No,Kojie San,15,Soap,65grams,65g,3,195g,True,9.0


In [45]:
# Drop unnecessary columns
columns_to_drop = ['Feedback left', 'Feedback received', 
                   'Global Shipping Programme', 'Click and Collect']
final_df = anon_df.drop(columns=columns_to_drop, errors='ignore')

# Save the enhanced sample dataset
final_df.to_csv('../data/sample_data.csv', index=False)
print(f"Enhanced sample dataset saved with {len(final_df)} rows and {len(final_df.columns)} columns")
print(f"Columns: {final_df.columns.tolist()}")

# Quick validation
print("\nSample of anonymised data:")
print(final_df[['Buyer username', 'Item title', 'Quantity', 'Total_Units', 'Sold for', 'Discount_Percentage']].head())

Enhanced sample dataset saved with 1000 rows and 19 columns
Columns: ['Sales record number', 'Order number', 'Buyer username', 'Buyer postcode', 'Item number', 'Item title', 'Sold via Promoted listings', 'Quantity', 'Sold for', 'Sale date', 'Brand', 'Discount_Percentage', 'Product_Category', 'Product_Size', 'Unit_Size', 'Pack_Count', 'Total_Size', 'Is_Multipack', 'Total_Units']

Sample of anonymised data:
    Buyer username                                         Item title  \
626  user_fb7071bb  Kojie San Soap 135g x 2 (Double Pack) - Skin B...   
629  user_cfb173e5  Kojie San Soap 100g x 3 (Large Trio Pack) - Sk...   
847  user_711e970d          Johnson's SUMMER SWING Baby Cologne 125ml   
514  user_5d503f1f  Kojie San Dream White Kojic Acid Soap 135g x 2...   
365  user_ac605fd5  Kojie San Soap 65g x 3 (Triple Pack) - Skin Br...   

     Quantity  Total_Units Sold for  Discount_Percentage  
626       1.0          2.0    £7.59                    0  
629       1.0          3.0    £8.1