In [1]:
import json
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_search_results(api_key, search_engine_id, query):
    """
    Retrieves search results based on the provided API key, search engine ID, and query.
    Returns the decoded JSON body of the response.

    The body is returned whatever the HTTP status is; no status check is
    performed here. Network failures and timeouts propagate as exceptions
    raised by `requests`, and a non-JSON body raises from `response.json()`.
    """
    # Pass the values through `params` so requests percent-encodes them;
    # interpolating the raw query into the URL breaks on '&', '#', '+', etc.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"key": api_key, "cx": search_engine_id, "q": query},
        timeout=10,  # requests never times out unless told to
    )
    return response.json()

def extract_information(search_results):
    """
    Pulls the title and link of every entry under the "items" key of the
    search results.
    Returns a list of {"title", "url"} dictionaries; a missing "items" key
    yields an empty list and missing fields default to an empty string.
    """
    return [
        {"title": item.get("title", ""), "url": item.get("link", "")}
        for item in search_results.get("items", [])
    ]

def scrape_description(url):
    """
    Scrapes the meta description from the given URL.
    Returns the scraped description, or an empty string if the page has no
    <meta name="description"> tag or could not be fetched.
    """
    try:
        # Bound the wait: without a timeout a stalled server blocks forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Best effort: an unreachable page simply has no description, so one
        # bad URL does not abort a whole batch of scrapes.
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    meta_tag = soup.find("meta", {"name": "description"})
    if meta_tag is None:
        return ""
    return meta_tag.get("content", "")

def compute_similarity(input_text, item_descriptions):
    """
    Computes similarity scores between the input text and a list of item descriptions using TF-IDF vectorization and cosine similarity.
    Returns a list of similarity scores, one per description and in the same
    order. An empty collection of descriptions yields an empty list.

    `item_descriptions` may be any iterable of strings.
    """
    descriptions = list(item_descriptions)
    if not descriptions:
        # Nothing to compare against; without this guard scikit-learn is
        # handed a zero-row matrix and raises.
        return []

    vectorizer = TfidfVectorizer()
    # Row 0 is the query text, rows 1..n are the descriptions.
    tfidf_matrix = vectorizer.fit_transform([input_text] + descriptions)
    similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return similarity_matrix[0].tolist()

def rank_results(similarity_scores, item_urls, top_n, item_descriptions=None):
    """
    Ranks the search results based on similarity scores and returns the top N results along with their descriptions.
    Returns a list of dictionaries with the keys "rank" (1-based), "url" and
    "description", ordered from highest to lowest score. Results with equal
    scores keep the order in which they were passed in.

    `item_descriptions` is optional. When given, it is assumed to be aligned
    with `item_urls`, and its entries are reused instead of fetching each page
    again; an entry of None (or omitting the argument altogether) falls back
    to `scrape_description(url)` for that result.
    """
    if item_descriptions is None:
        candidates = [(score, url, None) for score, url in zip(similarity_scores, item_urls)]
    else:
        candidates = list(zip(similarity_scores, item_urls, item_descriptions))

    # Sort on the score alone: comparing whole tuples would break ties by
    # URL string comparison, which has nothing to do with relevance.
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    ranked_results = []
    for rank, (_, url, description) in enumerate(candidates[:top_n], start=1):
        if description is None:
            description = scrape_description(url)
        ranked_results.append({
            "rank": rank,
            "url": url,
            "description": description
        })
    return ranked_results

def is_dress_website(url):
    """
    Checks if a given URL belongs to a dress-related website.
    Returns True if the URL's host is one of the known retailer domains or a
    subdomain of one, False otherwise (including for empty or unparsable URLs).
    """
    dress_websites = ("target.com", "lulus.com", "princesspolly.com", "petitestudionyc.com", "nordstromrack.com", "asos.com")
    if not url:
        return False

    try:
        host = urlparse(url).hostname
        if host is None:
            # Scheme-less input such as "www.asos.com/dress" is parsed as a
            # bare path; retry it as a network-location reference.
            host = urlparse("//" + url).hostname
    except ValueError:
        return False
    if not host:
        return False

    # Compare against the host only. A plain substring test on the whole URL
    # also matches look-alike domains ("nottarget.com") and retailer names
    # that merely appear in the path or query string.
    return any(host == site or host.endswith("." + site) for site in dress_websites)

# Set your API key and search engine ID. The GOOGLE_API_KEY and GOOGLE_CSE_ID
# environment variables take precedence so credentials can be supplied
# without editing the source.
# NOTE(review): the fallback literals below are credentials committed to the
# file; consider revoking the key and removing the fallbacks.
api_key = os.environ.get('GOOGLE_API_KEY', 'AIzaSyBeFOb7TUpJQuR_kksVYVJcwEYyBWp4wGM')
search_engine_id = os.environ.get('GOOGLE_CSE_ID', '101c39968761f4b5b')

input_text = input("Enter the clothing item description: ")
top_n = 5

try:
    # Retrieve search results
    search_results = get_search_results(api_key, search_engine_id, input_text)

    # Surface a failure reported in the response body instead of treating it
    # as "no results". (Presumably the API reports problems under an "error"
    # key - confirm against the Custom Search API documentation.)
    api_error = search_results.get("error")
    if api_error:
        message = api_error.get("message", api_error) if isinstance(api_error, dict) else api_error
        raise RuntimeError(f"search API error: {message}")

    # Extract information
    extracted_data = extract_information(search_results)

    # Filter and extract dress-related URLs
    item_urls = [data["url"] for data in extracted_data if is_dress_website(data["url"])]

    if item_urls:
        # Scrape descriptions in parallel
        with ThreadPoolExecutor() as executor:
            item_descriptions = list(executor.map(scrape_description, item_urls))

        # Compute similarity scores
        similarity_scores = compute_similarity(input_text, item_descriptions)

        # Rank the results
        ranked_results = rank_results(similarity_scores, item_urls, top_n)
    else:
        # No matching retailer URLs: report an empty result rather than
        # pushing an empty corpus through the similarity step.
        item_descriptions = []
        similarity_scores = []
        ranked_results = []

    # Print the results as JSON
    json_results = [
        {
            "rank": result['rank'],
            "url": result['url'],
            "description": result['description']
        } for result in ranked_results
    ]
    json_str = json.dumps(json_results)
    print(json_str)

except Exception as e:
    # Top-level boundary of the script: report the failure to the user.
    print(f"An error occurred: {e}")

Enter the clothing item description: red floral dress
[{"rank": 1, "url": "https://www.target.com/s/red+floral+dress", "description": "Shop Target for red floral dress you will love at great low prices. Choose from Same Day Delivery, Drive Up or Order Pickup plus free shipping on orders $35+."}, {"rank": 2, "url": "https://www.petitestudionyc.com/products/carly-dress-red-floral", "description": "Petite Studio creates high fashion clothing specifically for petite women. The perfect red maxi dress for summer. Features light and drapey fabric with a slip on the side adds a slightly sexy touch. High Fashion, trendy looks with sustainable Slow Fashion principles."}, {"rank": 3, "url": "https://us.princesspolly.com/products/nellie-mini-dress-red-floral", "description": "Add this fun, feminine Mini Dress to your collection, shop online at Princess Polly & make it happen! Buy now pay later with Afterpay."}, {"rank": 4, "url": "https://www.nordstromrack.com/c/women/clothing/dresses/floral?filte