In [8]:
#importing the necessary libraries

import re
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Module-level NLTK helpers: stem_text() uses `stemmer`, lemmatize_text() uses `lemmatizer`.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Suppresses every warning for the rest of the session, not only the ones raised by this notebook.
warnings.filterwarnings("ignore")

In [9]:
'''steps for loading the dataset into a dataframe
preprocessing and cleaning the dataset'''

def load_data(file_path):
    """Read the CSV file at ``file_path`` into a DataFrame.

    The file is decoded with Python's ``unicode_escape`` codec.
    """
    frame = pd.read_csv(file_path, encoding='unicode_escape')
    return frame

def preprocess_numeric(df, columns=['retail_price', 'discounted_price']):
    """Convert the price columns to numbers and drop incomplete rows.

    The conversion is done on a copy, so the frame passed in by the caller is
    left untouched (previously the caller's columns were overwritten in place
    even though a different frame was returned).

    Args:
        df: DataFrame containing every column named in ``columns``.
        columns: Names of the columns passed through ``pd.to_numeric``.
            The default list is only read, never modified.

    Returns:
        A new DataFrame with ``columns`` converted and every row that has a
        missing value in *any* column removed. Surviving rows keep their
        original index labels.

    Raises:
        ValueError: If a value in ``columns`` cannot be parsed as a number.
    """
    converted = df.copy()
    converted[columns] = converted[columns].apply(pd.to_numeric)
    # dropna() without arguments inspects all columns, not just the prices.
    return converted.dropna()

def clean_text(text):
    """Normalise a string for matching.

    Lower-cases ``text``, deletes any ``product_specification`` / ``key`` /
    ``value`` keyword together with the bracketed ``[...]`` group that
    directly follows it, removes every character other than word characters,
    digits, ``-``, ``+``, whitespace and square brackets, and finally trims
    surrounding whitespace.
    """
    lowered = text.lower()
    without_markers = re.sub(
        r"\b(product_specification|key|value)\b\s*\[.*?\]", "", lowered
    )
    kept_chars = re.sub(r"[^\w\d\-+\s\[\]]", "", without_markers)
    return kept_chars.strip()

def preprocess_text(df, columns=['product_name', 'description', 'brand']):
    """Run ``clean_text`` over each listed column, casting values to ``str`` first.

    The columns are overwritten on ``df`` itself and that same frame is returned.
    """
    for name in columns:
        as_strings = df[name].astype(str)
        df[name] = as_strings.apply(clean_text)
    return df

def process_entry(entry):
    """Flatten one ``product_specifications`` cell into a cleaned string.

    Args:
        entry: The raw cell value. Handled shapes:
            * list  - the ``"value"`` of every dict item is cleaned and the
              results are joined with single spaces; non-dict items and items
              whose ``"key"`` is ``"product_specification"`` are skipped.
            * dict  - its ``"value"`` is cleaned, unless its ``"key"`` is
              ``"product_specification"`` (then ``""``).
            * str   - cleaned directly.
            * float/int - converted to text and cleaned; a missing value
              (NaN) yields ``""`` instead of the literal token ``"nan"``.
            * anything else - ``""``.

    Returns:
        The cleaned text (possibly empty).
    """
    def _clean_value(item):
        # A spec "value" is not guaranteed to be a string; clean_text() calls
        # .lower(), so coerce first and treat None as "no text".
        value = item.get("value", "")
        if value is None:
            return ""
        return clean_text(str(value))

    if isinstance(entry, list):
        return " ".join(
            _clean_value(item)
            for item in entry
            if isinstance(item, dict) and item.get("key") != "product_specification"
        )
    if isinstance(entry, dict):
        if entry.get("key") == "product_specification":
            return ""
        return _clean_value(entry)
    if isinstance(entry, (float, int)):
        # Missing cells arrive as float NaN; do not let them become the word "nan".
        if pd.isna(entry):
            return ""
        return clean_text(str(entry))
    if isinstance(entry, str):
        return clean_text(entry)
    return ""

def preprocess_specifications(df):
    """Replace ``df['product_specifications']`` with ``process_entry`` applied to each cell.

    The column is overwritten on ``df`` itself and that same frame is returned.
    """
    spec_column = 'product_specifications'
    df[spec_column] = df[spec_column].apply(process_entry)
    return df

def preprocess_categories(text):
    """Normalise a category-tree string.

    ``" >> "`` separators become ``/``, every character other than
    alphanumerics, spaces, ``/``, ``[`` and ``]`` is removed, and the result
    is lower-cased and stripped of leading/trailing whitespace.

    Args:
        text: Category path such as ``"Clothing >> Women's Clothing"``.

    Returns:
        The normalised path, e.g. ``"clothing/womens clothing"``.
    """
    # Convert the separator first: the character filter below drops '>', so
    # doing it afterwards (as before) meant the replacement never matched.
    text = text.replace(" >> ", "/")
    text = ''.join(ch for ch in text if ch.isalnum() or ch in ' /[]')
    # Remove leading and trailing spaces
    return text.lower().strip()

def stem_text(text):
    """Tokenise ``text`` with NLTK and return the stems of its tokens joined by single spaces.

    Uses the module-level ``stemmer`` instance.
    """
    return ' '.join(stemmer.stem(word) for word in nltk.word_tokenize(text))

def lemmatize_text(text):
    """Tokenise ``text`` with NLTK and return the lemmas of its tokens joined by single spaces.

    Uses the module-level ``lemmatizer`` instance with its default arguments.
    """
    words = nltk.word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmas)

#a comprehensive function that calls all other functions at one place to clean the dataset
def clean_and_preprocess(df):
    """Run the full cleaning pipeline on one product frame.

    Steps, in order:
      1. ``preprocess_numeric`` - numeric prices, rows with missing values dropped.
      2. ``preprocess_text`` - clean ``product_name``, ``description``, ``brand``.
      3. ``preprocess_specifications`` - clean ``product_specifications``.
      4. Stem and then lemmatise the four text columns.
      5. Normalise ``product_category_tree``.
      6. Drop the columns that are not used for matching.

    Args:
        df: DataFrame with the price, text, ``product_category_tree`` and
            to-be-dropped columns referenced below.

    Returns:
        The cleaned DataFrame. ``product_category_tree`` holds a list of
        normalised category strings per row.
    """
    df = preprocess_numeric(df)
    df = preprocess_text(df)
    # Clean the specifications *before* stemming, exactly like the other text
    # columns; previously they were stemmed raw and only cleaned afterwards.
    df = preprocess_specifications(df)

    text_columns = ['product_name', 'description', 'brand', 'product_specifications']
    for col in text_columns:
        df[col] = df[col].apply(lambda text: lemmatize_text(stem_text(text)))

    def _clean_category_tree(tree):
        # A cell read from CSV is a single string; iterating it directly would
        # feed preprocess_categories() one character at a time. Wrap anything
        # that is not already a list/tuple so it is handled as one item.
        items = tree if isinstance(tree, (list, tuple)) else [str(tree)]
        return [preprocess_categories(item) for item in items]

    df['product_category_tree'] = df['product_category_tree'].apply(_clean_category_tree)

    #dropping the columns which seem to be less useful for matching
    df = df.drop(['crawl_timestamp', 'product_url', 'product_rating', 'overall_rating', 'is_FK_Advantage_product'], axis=1)
    return df

In [10]:
'''Implemented parallel application and batch processing
to speed up the TF-IDF implementation.'''

def parallel_apply(df, func, column, max_workers=8):
    """Apply ``func`` to every value of ``df[column]`` using a thread pool.

    Args:
        df: DataFrame holding ``column``.
        func: Callable taking one cell value and returning a result.
        column: Name of the column whose values are processed.
        max_workers: Size of the thread pool (defaults to the previous
            hard-coded value of 8).

    Returns:
        A list with one result per row, in the same order as the rows of
        ``df[column]``.

    Raises:
        Whatever ``func`` raises for any value.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map() yields results in input order. The earlier approach
        # sorted by index label, which reorders rows whenever the index is
        # not already ascending.
        return list(executor.map(func, df[column]))

def batch_process(df, func, batch_size=100):
    """Apply ``func`` to consecutive row slices of ``df`` and stitch the results together.

    ``df`` is cut into positional slices of at most ``batch_size`` rows,
    ``func`` is called on each slice in order, and the returned frames are
    concatenated with a fresh 0..n-1 index.
    """
    n_rows = len(df)
    chunks = [
        func(df.iloc[offset:min(offset + batch_size, n_rows)])
        for offset in range(0, n_rows, batch_size)
    ]
    return pd.concat(chunks, ignore_index=True)

def calculate_tfidf_cosine_similarity(df1, df2):
    """Return the TF-IDF cosine similarity between every row of ``df1`` and every row of ``df2``.

    The four text columns of both frames are joined into one document per row,
    a single ``TfidfVectorizer`` is fitted on all documents together, and the
    ``df1`` rows are compared against the ``df2`` rows. The result has one row
    per ``df1`` row and one column per ``df2`` row.
    """
    text_columns = ['product_name', 'description', 'brand', 'product_specifications']
    n_first = len(df1)

    # Stack both catalogues so they share one vocabulary.
    stacked = pd.concat([df1, df2], ignore_index=True)
    stacked[text_columns] = stacked[text_columns].fillna('')
    stacked['combined_text'] = stacked[text_columns].apply(lambda row: ' '.join(row), axis=1)

    tfidf = TfidfVectorizer().fit_transform(stacked['combined_text'])

    # The first n_first rows belong to df1, the remainder to df2.
    return cosine_similarity(tfidf[:n_first], tfidf[n_first:])

def find_matching_products(cosine_similarities, amazon_df, flipkart_df, threshold=1):
    """Collect every Amazon/Flipkart pair whose similarity is above ``threshold``.

    Product ids and names are read from the ``amazon_df`` / ``flipkart_df``
    arguments. (They used to be read from the notebook globals ``amazon_odf``
    / ``flipkart_odf``, so the arguments were ignored and the function failed
    with NameError anywhere those globals did not exist.)

    Args:
        cosine_similarities: 2-D array-like; entry ``[i][j]`` is the score for
            the ``i``-th row of ``amazon_df`` and the ``j``-th row of
            ``flipkart_df`` (rows are addressed by position, not index label).
        amazon_df: DataFrame with ``uniq_id`` and ``product_name`` columns.
        flipkart_df: DataFrame with ``uniq_id`` and ``product_name`` columns.
        threshold: Strict lower bound for a pair to be reported.
            NOTE(review): a cosine similarity cannot exceed 1 except through
            floating-point rounding, so the default of 1 reports almost
            nothing - confirm the intended default with the author.

    Returns:
        A list of dicts (one per matching pair, in row-major order) with the
        keys ``Amazon Unique id``, ``Amazon Product name``,
        ``Flipkart Unique id``, ``Flipkart Product name`` and ``Similarity``.
    """
    similarities = np.asarray(cosine_similarities)

    # Pull the needed columns out once instead of building a row per match.
    amazon_ids = amazon_df['uniq_id'].to_numpy()
    amazon_names = amazon_df['product_name'].to_numpy()
    flipkart_ids = flipkart_df['uniq_id'].to_numpy()
    flipkart_names = flipkart_df['product_name'].to_numpy()

    matches = []
    # np.argwhere returns the qualifying (i, j) positions in row-major order,
    # i.e. the same order the nested loops over rows and columns produced.
    for i, j in np.argwhere(similarities > threshold):
        matches.append({
            'Amazon Unique id': amazon_ids[i],
            'Amazon Product name': amazon_names[i],
            'Flipkart Unique id': flipkart_ids[j],
            'Flipkart Product name': flipkart_names[j],
            'Similarity': similarities[i, j]
        })
    return matches

#saving the matched sets as a csv file
def save_matches_to_csv(matches, file_path):
    """Write ``matches`` to ``file_path`` as CSV (without the index column) and print a confirmation."""
    pd.DataFrame(matches).to_csv(file_path, index=False)
    print(f"Matches saved to {file_path}")

In [11]:
#call the main function to execute the tasks

if __name__ == "__main__":
    # Raw listings are kept so the report can show the original product names.
    amazon_raw = load_data('Dataset/amz_com-ecommerce_sample.csv')
    flipkart_raw = load_data('Dataset/flipkart_com-ecommerce_sample.csv')

    # Clean copies: a plain assignment would only alias the raw frame, and
    # preprocessing writes to the frame it is given.
    amazon_df = clean_and_preprocess(amazon_raw.copy())
    flipkart_df = clean_and_preprocess(flipkart_raw.copy())

    # Cleaning drops rows with missing values, so row i of the cleaned frame is
    # not necessarily row i of the raw frame. Re-select the raw rows by index
    # label so that positions line up with the similarity matrix.
    # The names amazon_odf / flipkart_odf are kept because
    # find_matching_products has referenced them as globals.
    amazon_odf = amazon_raw.loc[amazon_df.index]
    flipkart_odf = flipkart_raw.loc[flipkart_df.index]

    cosine_similarities = calculate_tfidf_cosine_similarity(amazon_df, flipkart_df)
    matches = find_matching_products(cosine_similarities, amazon_odf, flipkart_odf)
    save_matches_to_csv(matches, 'Matched Products.csv')

Matches saved to Matched Products.csv
