## 4.0 Article & Product Matching Model

This notebook builds a Gensim TF-IDF model over the scraped article data and the DigiKey product list to perform semantic and keyword matching. It outputs at most 3 relevant articles, with scores, for each product category.

In [56]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
from collections import defaultdict
import json
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
import random
import pandas as pd

#### Load Dataset

In [None]:
# Scraped articles; later cells read each entry's 'text', 'title' and 'link'.
with open("./intermediate_data/Cleaned_Article_Data.json", mode="r", encoding="utf-8") as articles_file:
    articles = json.load(articles_file)

# Product list; the product categories are pulled out of it in the next cell.
with open("./intermediate_data/Products_List_Clean.json", mode="r", encoding="utf-8") as products_file:
    raw_data = json.load(products_file)

# Per-category descriptions, used for the 'Description' column of the output.
with open("./intermediate_data/product_description.json", mode="r", encoding="utf-8") as descriptions_file:
    raw_descriptions = json.load(descriptions_file)

#### Input required data from datasets

In [None]:
# Product categories live under the "categories" key; default to an empty list
# when the key is absent so downstream loops simply do nothing.
products = raw_data["categories"] if "categories" in raw_data else []

# Descriptions are looked up later by the stripped product-category name.
category_summaries = raw_descriptions

#### Preprocess data

In [None]:
def preprocess(text):
    """Tokenise ``text`` with gensim's ``simple_preprocess`` and drop stopwords.

    Returns the list of remaining tokens, in their original order.
    """
    tokens = simple_preprocess(text)
    return [tok for tok in tokens if tok not in STOPWORDS]
# Tokenise every article, keeping exactly one entry per article so that row i
# of the similarity index always corresponds to articles[i]. Articles without
# text become empty documents; they score 0 against every query instead of
# being dropped (dropping them shifted every later index and let product
# vectors leak into the article index when split at len(articles)).
article_texts = []
for article in articles:
    text = article.get("text")
    article_texts.append(preprocess(text) if text else [])

# NOTE(review): this reads the 'categories' key, while the recommendation cell
# queries with product.get('category', '') — confirm which key the data uses.
product_texts = [preprocess(f"{p['name']} {p.get('categories', '')}") for p in products]
all_texts = article_texts + product_texts

# Create dictionary and corpus
dictionary = Dictionary(all_texts)
corpus = [dictionary.doc2bow(text) for text in all_texts]

# Build TF-IDF model
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Split article and product vectors at the number of article documents that
# were actually placed in the corpus.
num_articles = len(article_texts)
article_tfidf_vectors = corpus_tfidf[:num_articles]
product_tfidf_vectors = corpus_tfidf[num_articles:]

# Articles Similarity Index
article_index = SparseMatrixSimilarity(article_tfidf_vectors, num_features=len(dictionary))

#### Keyword matching to find relevant articles

In [None]:
def find_articles_mentioning_product(product_name):
    """Return ``(index, article)`` pairs for articles whose text contains the name.

    The comparison is a case-insensitive substring match against each
    article's 'text' field; articles with missing or empty text are skipped.
    Reads the module-level ``articles`` list.
    """
    needle = product_name.lower()
    hits = []
    for idx, entry in enumerate(articles):
        body = entry.get('text')
        if not body:
            continue
        if needle in body.lower():
            hits.append((idx, entry))
    return hits

#### Semantic matching to find similar articles

In [None]:
def recommend_similar_articles(product_name, product_category, top_n=5):
    """Rank articles by TF-IDF similarity to a product and return the best ones.

    The query text is ``"{product_name} {product_category}"``; it is tokenised
    with ``preprocess``, converted with the module-level ``dictionary`` and
    ``tfidf`` model, and compared against ``article_index``.

    Args:
        product_name: Name of the product category.
        product_category: Extra category text appended to the query.
        top_n: Maximum number of articles to return. Values <= 0 yield ``[]``.

    Returns:
        A list of ``(article, score)`` tuples in descending score order, where
        ``score`` is the similarity multiplied by 100. Articles sharing a title
        with an already selected article are skipped.
    """
    # Previously the limit was only checked after the first append, so a
    # non-positive top_n still returned one article.
    if top_n <= 0:
        return []

    query_tokens = preprocess(f"{product_name} {product_category}")
    query_vec = tfidf[dictionary.doc2bow(query_tokens)]
    sims = article_index[query_vec]
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)

    seen_titles = set()  # Titles already emitted, used to drop duplicates
    unique_results = []
    for idx, score in ranked:
        article = articles[idx]
        # Titles are optional (the export cell reads them with .get), so a
        # missing title must not raise; untitled articles cannot be
        # de-duplicated by title and are therefore all kept.
        title = article.get('title')
        if title is not None:
            if title in seen_titles:
                continue
            seen_titles.add(title)
        unique_results.append((article, (score * 100)))
        if len(unique_results) >= top_n:
            break
    return unique_results

#### Random sampling of product categories

In [None]:
# Number of product categories to sample; also reused as the RNG seed so the
# same sample is drawn on every run.
noOfProducts = 70
random.seed(noOfProducts)

# Never request more items than exist, otherwise random.sample raises.
random_products = random.sample(products, k=min(noOfProducts, len(products)))
all_results = []

#### Recommend articles for each product category with scores

In [57]:
top_n = 3  # Maximum number of articles to display per product category
final_rows = []
for product in random_products:
    # Descriptions are keyed by the stripped category name.
    category_key = product['name'].strip()
    summary = category_summaries.get(category_key, {}).get("summary", "No summary available")

    # NOTE(review): the corpus cell builds product text from the 'categories'
    # key, whereas this query uses 'category' — confirm which key the data has.
    results = recommend_similar_articles(product['name'], product.get('category', ''), top_n=top_n)

    # Keep only articles with a positive similarity; skip categories with none.
    valid_results = [(article, score) for article, score in results if score > 0]
    if not valid_results:
        continue

    row = {
        'Product Category': product['name'],
        'Products': product.get('Products', 0),
        'Product url': product.get('url', ''),
        'Description': summary
    }
    # One Score/Title/Link column triple per retained article, numbered from 1.
    for article_idx, (article, score) in enumerate(valid_results, start=1):
        row[f'Article_{article_idx}_Score'] = round(score, 3)
        row[f'Article_{article_idx}_Title'] = article.get('title', '')
        row[f'Article_{article_idx}_Link'] = article.get('link', '')
    final_rows.append(row)

## Save Matching Results

Saves the result for further analysis

In [58]:
# One row per product category; categories with fewer matched articles leave
# the remaining Article_* columns empty.
df_wide = pd.DataFrame(data=final_rows)

# Persist without the positional index so the CSV holds only the result columns.
output_csv = './intermediate_data/Product_Article_Matching.csv'
df_wide.to_csv(output_csv, index=False)