In [1]:
import pandas as pd
import numpy as np
import os
from utils import (
    get_exhibitor_category_info,
    calculate_match_score,
    clean_text,
    get_lemmatized_words
)
from tqdm import tqdm

In [2]:
# Load processed data 
base_path = '../../source/'
exhibitors_df = pd.read_csv(os.path.join(base_path, 'processed_exhibitors_categories.csv'))
# Get original exhibitor names map
exhibitor_names = exhibitors_df[['exhibitorid', 'exhibitorName']].drop_duplicates().set_index('exhibitorid')['exhibitorName'].to_dict()

In [3]:
exhibitors_df

Unnamed: 0,exhibitorid,exhibitorName,categoryId,categoryName,parentCategory
0,90556,Turkey Travels,52276,1.5 Resort hotel,1. Accomodation providers
1,90556,Turkey Travels,52280,2.1 Inbound tour operator,2. Tour operators
2,90556,Turkey Travels,52281,2.2 Outbound tour operator,2. Tour operators
3,92462,Russian Travel Company,52273,1.2 Apartments / Residential hotel,1. Accomodation providers
4,92462,Russian Travel Company,52283,2.4 Mass market tour operators,2. Tour operators
...,...,...,...,...,...
178,17729,Sunny Travel Journeys,52282,2.3 Receptive / ground operator,2. Tour operators
179,17729,Sunny Travel Journeys,52272,1.1 Hotel / Hotel chain / Inn,1. Accomodation providers
180,17729,Sunny Travel Journeys,52329,10.5 Zoo,10. Museums & parks
181,17729,Sunny Travel Journeys,52366,"17.3 Banking, investments",17. Insurance companies & banking


In [4]:
exhibitor_names

{90556: 'Turkey Travels',
 92462: 'Russian Travel Company',
 92491: 'Indian Travel Company',
 92492: 'Asia Tourism',
 92493: 'SriLanka Adventures',
 92494: 'Exotic Europe Travels',
 92495: 'Luxury Europe Company',
 92496: 'DreamTravel Company ',
 92497: 'Happy Company 2',
 68142: 'Global Tours Voyages',
 27827: 'Exotic Tours Holidays',
 74870: 'Exotic Tours Journeys',
 56923: 'Royal Tours Expeditions',
 80700: 'Exotic Holidays Services',
 46350: 'Prime Travel Journeys',
 55311: 'Sunny Adventures Journeys',
 31331: 'Exotic Adventures Journeys',
 97818: 'Sunny Trips Adventures',
 13033: 'Prime Vacations Expeditions',
 15881: 'Royal Trips Tours',
 10846: 'Royal Adventures Adventures',
 98889: 'Exotic Tours Tours',
 18004: 'Exotic Getaways Journeys',
 44942: 'Royal Holidays Adventures',
 48028: 'Prime Adventures Journeys',
 72153: 'Elite Adventures Holidays',
 32278: 'Exotic Holidays Holidays',
 51839: 'Dream Getaways Expeditions',
 30134: 'Global Holidays Expeditions',
 54579: 'Prime Vaca

In [5]:
exhibitor_categories_map, exhibitor_category_counts = get_exhibitor_category_info(exhibitors_df)

In [6]:

PENALTY_ALPHA = 0.5 # Penalty strength
PENALTY_THRESHOLD = 6 # Threshold for penalty

def recommend_exhibitors_for_interests(
    interest_list, # List of strings representing potential answers/interests
    exhibitor_categories_map, 
    exhibitor_category_counts, 
    exhibitor_names_map, 
    penalty_alpha=PENALTY_ALPHA,
    penalty_threshold=PENALTY_THRESHOLD
):
    """
    Recommends top N exhibitors based on a provided list of interest terms.

    Args:
        interest_list (list[str]): A list of strings representing visitor interests.
        exhibitor_categories_map (dict): Map of exhibitor IDs to their set of cleaned category names.
        exhibitor_category_counts (dict): Map of exhibitor IDs to their total unique category count.
        exhibitor_names_map (dict): Map of exhibitor IDs to their names.
        penalty_alpha (float): Penalty strength parameter.
        penalty_threshold (int): Threshold for penalty calculation.

    Returns:
        pd.DataFrame: Top N recommended exhibitors with scores and matching details.
    """
    # 1. Clean the input interest list
    provided_interests_set = set()

    for interest in interest_list:
        cleaned_text = clean_text(interest)
        
        # Add the cleaned full text
        provided_interests_set.add(cleaned_text)
        
        # Add individual words from the cleaned text
        words = cleaned_text.split()
        provided_interests_set.update(words)
        
        # Add lemmatized versions of each word
        provided_interests_set.update(get_lemmatized_words(word) for word in words)
    print(f"\nProvided Interests (Cleaned + Lemmatized): {provided_interests_set}")

    scores = []

    # 2. Iterate through all exhibitors
    print(f"Calculating scores against {len(exhibitor_categories_map)} exhibitors...")
    for exhibitor_id, exhibitor_cats_set in tqdm(exhibitor_categories_map.items(), desc="Matching Exhibitors"):

        total_categories = exhibitor_category_counts[exhibitor_id]

        # 3. Calculate score
        score, num_matches, matched_categories = calculate_match_score(
            provided_interests_set,
            exhibitor_cats_set,
            total_categories,
            penalty_alpha, 
            penalty_threshold
        )

        # 4. Store results if score > 0
        if score > 0:
            scores.append({
                'exhibitorid': exhibitor_id,
                'exhibitorName': exhibitor_names_map.get(exhibitor_id, "Unknown Name"),
                'score': score,
                'numMatches': num_matches,
                'penalty_categories': total_categories,
                'matchedCategories': ', '.join(sorted(list(matched_categories)))
            })

    # 5. Sort and return top N
    if not scores:
        print("No matching exhibitors found for the provided interests.")
        return pd.DataFrame(columns=['exhibitorid', 'exhibitorName', 'score', 'numMatches', 'matchedCategories'])

    recommendations_df = pd.DataFrame(scores)
    recommendations_df = recommendations_df.sort_values(by='score', ascending=False)

    return recommendations_df


In [7]:
interests = ["IT solutions for travel industry", "Online booking system", "Adventure tourism", "Gastro tourism"]

recommendations = recommend_exhibitors_for_interests(
    interests,
    exhibitor_categories_map,
    exhibitor_category_counts,
    exhibitor_names
)

recommendations


Provided Interests (Cleaned + Lemmatized): {'solutions', 'adventure tourism', 'travel', 'gastro tourism', 'industry', 'online', 'online booking system', 'system', 'solutions travel industry', 'solution', 'booking', 'adventure', 'gastro', 'tourism'}
Calculating scores against 35 exhibitors...


Matching Exhibitors: 100%|██████████| 35/35 [00:00<?, ?it/s]


Unnamed: 0,exhibitorid,exhibitorName,score,numMatches,penalty_categories,matchedCategories
0,10846,Royal Adventures Adventures,4.0,4,6,"gastro, gastro tourism, tourism, travel"
10,55311,Sunny Adventures Journeys,4.0,10,9,"adventure, adventure tourism, gastro, gastro t..."
7,44942,Royal Holidays Adventures,3.0,3,4,"online, tourism, travel"
21,97920,Dream Tours Tours,3.0,3,6,"gastro, gastro tourism, tourism"
14,80700,Exotic Holidays Services,3.0,3,1,"gastro, gastro tourism, tourism"
2,15881,Royal Trips Tours,2.666667,4,7,"booking, online, system, tourism"
4,27827,Exotic Tours Holidays,2.0,2,2,"tourism, travel"
16,92492,Asia Tourism,2.0,2,3,"online, tourism"
8,48028,Prime Adventures Journeys,2.0,4,8,"gastro, gastro tourism, industry, tourism"
12,72153,Elite Adventures Holidays,1.666667,5,10,"booking, online, system, tourism, travel"
