In [1]:
import pandas as pd

In [None]:
import re

def remove_duplicate_product_name(product_name: str) -> str:
    """
    Collapse a product name that consists of the same word sequence written twice.

    The name is lowercased, trimmed, and has runs of whitespace squeezed to a
    single space before the comparison, and the returned value is always in
    that normalized form.

    Parameters:
    product_name (str): The raw product name.

    Returns:
    str: The first half of the name when both halves are identical word for
         word, otherwise the normalized name. Empty string for falsy input.
    """
    if not product_name:
        return ""

    normalized = re.sub(r'\s+', ' ', product_name.lower().strip())
    words = normalized.split()
    midpoint, leftover = divmod(len(words), 2)

    # An odd number of words can never be two identical halves.
    if leftover:
        return normalized

    first_half, second_half = words[:midpoint], words[midpoint:]
    if first_half != second_half:
        return normalized

    return ' '.join(first_half)


def util_clean_product_name(product_name: str) -> str:
    """
    Clean a product name: lowercase it, strip escape/HTML-style artifacts and
    special characters, normalize whitespace, and collapse a name that is
    written twice in a row.

    Parameters:
    product_name (str): The original product name.

    Returns:
    str: Cleaned product name containing only a-z, 0-9 and single spaces.
         Empty string for empty or missing (None / NaN) input.
    """
    # Missing values arrive from pandas as float NaN, which is truthy and has
    # no .lower(); NaN is the only float that is not equal to itself.
    if not product_name or (
        isinstance(product_name, float) and product_name != product_name
    ):
        return ""

    cleaned_name = product_name.lower().strip()

    # Handle common artifacts like 'x000d' and literal escape sequences
    cleaned_name = re.sub(r'(\\x[0-9a-fA-F]{2,4}|x000d|\\r|\\n)', ' ', cleaned_name)

    # Remove any special characters except alphanumerics and space
    cleaned_name = re.sub(r'[^a-z0-9\s]', ' ', cleaned_name)

    # Remove extra whitespace
    cleaned_name = re.sub(r'\s+', ' ', cleaned_name).strip()

    # Deduplicate last: the two copies of a repeated name often differ only by
    # an artifact or trailing punctuation attached to one of them, so the
    # halves only compare equal once that noise has been stripped.
    return remove_duplicate_product_name(cleaned_name)


def util_clean_category(category: str) -> str:
    """
    Normalize a raw comma-separated category string.

    The value is lowercased and trimmed, store/brand entries are removed from
    the keyword up to the next comma (or end of string), and whitespace is
    collapsed. Special characters are left in place.

    Parameters:
    category (str): The original category name(s), comma separated.

    Returns:
    str: Normalized category string. Empty string for missing (None / NaN)
         input.
    """
    # Without this guard str() turns missing values into the literal
    # categories "none" / "nan". NaN is the only float not equal to itself.
    if category is None or (isinstance(category, float) and category != category):
        return ""

    cleaned_category = str(category).lower().strip()

    # \b keeps the keywords from matching inside other words (e.g. the "all"
    # in "small appliances"), which used to truncate the entry and fuse it
    # with the following one.
    # NOTE(review): "mazon.co.uk" is intentionally left unanchored so it still
    # matches inside a longer domain-like token - confirm the intended keyword.
    brand_patterns = (
        r"(?:\b(?:all|walmart|target|retail brand|xyz|brand name|see more).*?"
        r"|mazon\.co\.uk)(?:,|$)"
    )
    cleaned_category = re.sub(brand_patterns, '', cleaned_category)

    # Collapse every run of whitespace to a single space
    cleaned_category = re.sub(r'\s+', ' ', cleaned_category)

    # Removing the last entry leaves its preceding comma dangling; trim it so
    # a later split(',') does not produce an empty category.
    return cleaned_category.strip(' ,')

# Lookup table from raw (lowercased, trimmed) category names to their canonical
# group. Built once at module level instead of on every call.
_CATEGORY_NORMALIZATION_MAP = {
    'ipad & tablets': 'tablets',
    'ipads tablets': 'tablets',
    'all tablets': 'tablets',
    'xyz brand tablets': 'tablets',
    'android tablets': 'tablets',
    'windows tablets': 'tablets',
    'tablets & ebook readers': 'tablets',
    'tablets & ereaders': 'tablets',
    'kids\' tablets': 'tablets',
    'wi-fi 3g (unlocked...': 'tablets',

    'e-readers': 'ereaders',
    'ebook readers': 'ereaders',
    'ebook readers & accessories': 'ereaders',
    'e-readers & accessories': 'ereaders',
    'ereaders & accessories': 'ereaders',
    'brand name e-readers': 'ereaders',

    'computers/tablets & networking': 'computers & tablets',
    'computers & laptops': 'computers & tablets',

    'audio docks & mini speakers': 'audio',
    'speaker systems': 'audio',
    'portable audio & headphones': 'audio',
    'audio player accessories': 'audio',

    'smart home & connected living': 'smart home',
    'smart home & home automation devices': 'smart home',
    'home safety & security': 'smart home',
    'smart hubs & wireless routers': 'smart home',
    'voice-enabled smart assistants': 'smart home',
    'voice assistants': 'smart home',
    'virtual assistant speakers': 'smart home',
    'alarms & sensors': 'smart home',

    'cases & bags': 'accessories',
    'tablet accessories': 'accessories',
    'tablet cases covers': 'accessories',
    'power adapters': 'accessories',
    'power adapters & cables': 'accessories',
    'computer accessories': 'accessories',
    'covers': 'accessories',

    'brand name e-reader accessories': 'accessories',
    'brand name paperwhite accessories': 'accessories',
    'brand name touch (4th generation) accessories': 'accessories',
    'brand name touch (4th generation) covers': 'accessories',
    'brand name (5th generation) accessories': 'accessories',
    'brand name (5th generation) covers': 'accessories',
    'pocketbook touch hd 3 e-reader accessories': 'accessories',
    'pocketbook touch hd 3 paperwhite accessories': 'accessories',
    'pocketbook touch hd 3 store': 'accessories',

    'books & magazines': 'media',
    'software & books': 'media',
    'book accessories': 'media',
    'movies': 'media',
    'music': 'media',

    'walmart for business': 'retail',
    'top rated': 'retail',
    'clearance': 'retail',
    'frys': 'retail',
    'categories': 'retail',
    'featured brands': 'retail',

    'tvs entertainment': 'tvs & electronics',
}


def util_normalize_category_list(category_list: list) -> list:
    """
    Map each category name to its canonical group and drop duplicates.

    Every entry is trimmed and lowercased, then looked up in the normalization
    map; names without a mapping are kept as-is (in their trimmed, lowercased
    form). Blank entries are skipped.

    Parameters:
    category_list (list): List of category names (strings).

    Returns:
    list: Normalized category names, de-duplicated, in first-seen order.
    """
    normalized = []
    seen = set()
    for cat in category_list:
        key = cat.strip().lower()
        # Splitting a string on ',' yields '' for empty input or a dangling
        # comma; an empty string is not a category.
        if not key:
            continue
        mapped = _CATEGORY_NORMALIZATION_MAP.get(key, key)
        if mapped not in seen:
            seen.add(mapped)
            normalized.append(mapped)
    return normalized


In [3]:
def clean_product_name(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean every value of the 'product' column with util_clean_product_name.

    The column is overwritten on the DataFrame that was passed in; the same
    object is returned so calls can be chained.

    Parameters:
    df (pd.DataFrame): Input DataFrame with a 'product' column.

    Returns:
    pd.DataFrame: The same DataFrame with cleaned product names.
    """

    df['product'] = df['product'].apply(util_clean_product_name)

    return df

In [4]:
def normalize_category(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the raw 'categories' column into lists of normalized category names.

    Each value is cleaned with util_clean_category, split on commas, and the
    resulting list is passed through util_normalize_category_list. The column
    is overwritten on the DataFrame that was passed in.

    Parameters:
    df (pd.DataFrame): Input DataFrame with a 'categories' column.

    Returns:
    pd.DataFrame: The same DataFrame with 'categories' holding lists of
                  normalized category names.
    """
    cleaned = df['categories'].apply(util_clean_category)
    as_lists = cleaned.str.split(',')
    df['categories'] = as_lists.apply(util_normalize_category_list)

    # NOTE: the same product can appear with different category lists. Merging
    # them per product (groupby('product') + union of tags, merged back on
    # 'product') was tried and is disabled because it was losing data.

    return df

In [5]:
def create_feedback_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a 'feedback' column by joining 'title' and 'reviews' with a space.

    A missing title contributes nothing (instead of the literal text "nan").
    A missing review leaves the feedback missing, so such rows can still be
    removed by a later dropna(). The column is added to the DataFrame that was
    passed in.

    Parameters:
    df (pd.DataFrame): Input DataFrame with 'title' and 'reviews' columns.

    Returns:
    pd.DataFrame: The same DataFrame with a new 'feedback' column.
    """
    # fillna before astype(str): otherwise NaN titles become the string "nan".
    title = df['title'].fillna('').astype(str).str.strip()
    reviews = df['reviews']

    # The separator keeps the last word of the title from fusing with the
    # first word of the review; strip() removes it again when the title is empty.
    combined = (title + ' ' + reviews.astype(str)).str.strip()

    # Keep feedback missing wherever the review itself is missing.
    df['feedback'] = combined.where(reviews.notna())

    return df

In [6]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full feature-engineering pipeline on the input DataFrame.

    Steps, in order: clean product names, normalize categories, build the
    'feedback' column, then drop the 'title' and 'reviews' columns if present.

    Parameters:
    df (pd.DataFrame): Input DataFrame with product, category, title and
                       review information.

    Returns:
    pd.DataFrame: DataFrame with cleaned product names, normalized categories
                  and a 'feedback' column, without 'title' and 'reviews'.
    """
    steps = (clean_product_name, normalize_category, create_feedback_column)
    for step in steps:
        df = step(df)

    # todo: check if column categories must be dropped for duplicates
    # todo: handle metrics of the duplicate product that will be dropped
    #       (de-duplicating on 'product' via drop_duplicates is disabled until then)

    # errors='ignore' tolerates frames where either column is already absent.
    return df.drop(columns=['title', 'reviews'], errors='ignore')

In [7]:
# Load the preprocessed data, run feature engineering, drop rows that still
# contain missing values, and write the result without the index column.
data = feature_engineering(pd.read_csv("../data/preprocessed_data.csv")).dropna()
data.to_csv("../data/feature_data.csv", index=False)