In [13]:
import pytest
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.data import find
from matching_model.utils import (
        clean_text,
        get_visitor_interests,
        get_exhibitor_category_info,
        calculate_match_score
    )
import ipytest 
# Configure ipytest to discover tests and manage fixtures in the notebook
ipytest.autoconfig()

In [14]:
def ensure_nltk_resources():
    """Make sure the NLTK corpora used by this notebook are installed.

    Looks up the WordNet and stopwords corpora with ``nltk.data.find`` and
    downloads whichever ones are missing. Progress and download failures are
    reported with ``print``; download problems are never raised, so the
    notebook keeps running even if a corpus could not be fetched.

    Returns:
        None.
    """
    resources = ['corpora/wordnet', 'corpora/stopwords']
    downloaded = False
    for resource in resources:
        try:
            find(resource)
            continue  # Already installed; nothing to do for this corpus.
        except LookupError:
            pass

        # The downloader is addressed by package id, i.e. the last path part.
        resource_name = resource.split('/')[-1]
        print(f"NLTK resource '{resource_name}' not found. Downloading...")
        try:
            succeeded = nltk.download(resource_name, quiet=True)
        except Exception as e:
            print(f"Error downloading NLTK resource '{resource_name}': {e}")
            continue

        # nltk.download() reports failures (unknown package id, no network, ...)
        # by returning False rather than raising, so the result has to be
        # checked before claiming success.
        if succeeded:
            print(f"Successfully downloaded '{resource_name}'.")
            downloaded = True
        else:
            print(f"Error downloading NLTK resource '{resource_name}': download reported failure")
    if downloaded:
        print("NLTK setup complete.")

# Run the check before the following cells, which create a WordNetLemmatizer
# and load stopwords.words('english') and therefore need these corpora.
ensure_nltk_resources()

In [15]:
# --- Global Variables / Constants ---
# Shared lemmatizer: the tests below call lemmatizer.lemmatize(...) to build
# the lemma forms they add to their expected token sets.
lemmatizer = WordNetLemmatizer()
# English stop words as a set.
# NOTE(review): not referenced by any test cell visible in this notebook —
# confirm whether it is still needed.
stop_words = set(stopwords.words('english'))
# Question texts handed to get_visitor_interests as its third argument; the
# visitor fixture reuses these exact strings for the rows it marks "Relevant".
RELEVANT_QUESTIONS_TEST = [
    "Please indicate your company's main area of business",
    "Which of the following best describes your job function?"
]
# Penalty parameters passed to calculate_match_score. The penalty tests expect
# a factor of 1 / (1 + alpha * max(0, total_categories - threshold)).
PENALTY_ALPHA_TEST = 0.5
PENALTY_THRESHOLD_TEST = 6

In [None]:
# --- Fixtures for Mock Data ---

@pytest.fixture(scope="module")
def mock_visitor_data():
    """Module-scoped DataFrame of mock visitor questionnaire answers.

    Each row is one answer given by one visitor. Rows whose question text is
    taken from RELEVANT_QUESTIONS_TEST are the "relevant" ones; the remaining
    rows use question texts that are not in that list.
    """
    business_question = RELEVANT_QUESTIONS_TEST[0]
    job_question = RELEVANT_QUESTIONS_TEST[1]

    column_names = ['visitor_id', 'visitor_email', 'questionText', 'answerText']
    answer_rows = [
        ('v1', 'v1@test.com', business_question, 'Travel Agent'),         # Relevant
        ('v1', 'v1@test.com', job_question, 'Marketing and Sales'),       # Relevant
        ('v1', 'v1@test.com', 'Irrelevant Question', 'Blue'),             # Irrelevant
        ('v2', 'v2@test.com', business_question, 'Tour Operator'),        # Relevant
        ('v2', 'v2@test.com', 'Another Irrelevant', 'Yes'),               # Irrelevant
        ('v3', 'v3@test.com', 'Another1 Irrelevant', 'Maybe'),            # Irrelevant
        ('v4', 'v4@test.com', job_question, 'Media'),                     # Relevant
    ]
    return pd.DataFrame(answer_rows, columns=column_names)

@pytest.fixture(scope="module")
def mock_exhibitor_data():
    """Module-scoped DataFrame of mock exhibitor category assignments.

    Each row links one exhibitor to one category and that category's parent.
    Exhibitor 101 has three rows, 102 has two and 104 has seven.
    """
    column_names = ['exhibitorid', 'exhibitorName', 'categoryId', 'categoryName', 'parentCategory']
    category_rows = [
        (101, 'Exhibitor A', 500, '3.1 Travel Agency', '3. Travel Agencies'),
        (101, 'Exhibitor A', 501, '4.1 Online Marketing', '4. Marketing'),
        (101, 'Exhibitor A', 502, '5.1 Sales Support', '5. Sales'),
        (102, 'Exhibitor B', 503, '2.1 Tour Operators', '2. Tour Operators'),
        (102, 'Exhibitor B', 505, '1.1 Hotels and Resorts', '1. Accommodation'),
        (104, 'Exhibitor D', 506, '1.1 Hotels', '1. Accommodation'),
        (104, 'Exhibitor D', 507, '2.1 Tour Op', '2. Tour Operators'),
        (104, 'Exhibitor D', 508, '3.1 Agency', '3. Travel Agencies'),
        (104, 'Exhibitor D', 509, '4.1 Online Ads', '4. Marketing'),
        (104, 'Exhibitor D', 510, '5.1 Support', '5. Sales'),
        (104, 'Exhibitor D', 511, '6.1 Cruises', '6. Cruises'),
        (104, 'Exhibitor D', 512, '7.1 Niche Travel', '7. Niche'),
    ]
    return pd.DataFrame(category_rows, columns=column_names)

@pytest.fixture(scope="module")
def processed_exhibitor_data(mock_exhibitor_data):
    """Result of get_exhibitor_category_info for the mock exhibitors.

    Module scope means the processing runs only once for all tests that
    request this fixture.
    """
    category_info = get_exhibitor_category_info(mock_exhibitor_data)
    return category_info


In [17]:
def test_clean_text():
    """Check clean_text against a table of raw inputs and expected outputs."""
    cases = [
        # Numbering prefixes, surrounding whitespace and stop words.
        (" 1.1 Hotels and Resorts ", "hotels resorts"),
        # Punctuation.
        ("Travel & Agency!", "travel agency"),
        ("Marketing and Sales Support", "marketing sales support"),
        ("This is a stop word test", "stop word test"),
        ("10.5 Zoo", "zoo"),
        ("17.3 Banking, investments", "banking investments"),
        # Blank and non-string inputs are expected to give an empty string.
        ("  ", ""),
        (None, ""),
        (123, ""),
        # Repeated inner whitespace.
        ("3. Travel  agencies", "travel agencies"),
    ]
    for raw_text, expected_text in cases:
        assert clean_text(raw_text) == expected_text

In [18]:
def test_get_visitor_interests(mock_visitor_data):
    """Interests for v1 and v2 must equal the tokens of their relevant answers.

    The expected sets hold the answer words, the whole cleaned phrase and the
    lemmatized form of selected words.
    """
    def with_lemmas(tokens, words):
        # Expected tokens plus the lemma of each listed word.
        return tokens | {lemmatizer.lemmatize(word) for word in words}

    # Visitor 1: Travel Agent, Marketing and Sales
    v1_expected = with_lemmas(
        {'travel', 'agent', 'travel agent', 'marketing', 'sales', 'marketing sales'},
        ('agent', 'marketing', 'sales'),
    )
    v1_actual = get_visitor_interests('v1@test.com', mock_visitor_data, RELEVANT_QUESTIONS_TEST)
    assert v1_actual == v1_expected

    # Visitor 2: Tour Operator
    v2_expected = with_lemmas({'tour', 'operator', 'tour operator'}, ('operator',))
    v2_actual = get_visitor_interests('v2@test.com', mock_visitor_data, RELEVANT_QUESTIONS_TEST)
    assert v2_actual == v2_expected

In [19]:
def test_get_exhibitor_category_info(processed_exhibitor_data):
    """Check the category token sets and category counts for exhibitors 101 and 102."""
    categories_by_exhibitor, counts_by_exhibitor = processed_exhibitor_data

    # Exhibitor 101: Travel Agency, Online Marketing, Sales Support
    tokens_101 = {
        'travel', 'agency', 'travel agency',
        'online', 'marketing', 'online marketing',
        'sales', 'support', 'sales support',
    }
    lemmas_101 = {
        lemmatizer.lemmatize(word)
        for word in ('agency', 'marketing', 'sales', 'support')
    }
    assert categories_by_exhibitor[101] == tokens_101 | lemmas_101
    assert counts_by_exhibitor[101] == 3  # 3 unique parent categories

    # Exhibitor 102: Tour Operators, Hotels and Resorts
    tokens_102 = {'tour', 'operators', 'tour operators', 'hotels', 'resorts', 'hotels resorts'}
    lemmas_102 = {
        lemmatizer.lemmatize(word)
        # Both the plural and the singular spellings are lemmatized.
        for word in ('operators', 'hotels', 'resorts', 'hotel', 'resort', 'operator')
    }
    assert categories_by_exhibitor[102] == tokens_102 | lemmas_102
    assert counts_by_exhibitor[102] == 2  # 2 unique parent categories


In [20]:
def test_calculate_match_score_no_overlap():
    """Disjoint interest and category sets: expect score 0.0 and no matches."""
    visitor_interests = {'interest1', 'interest2'}
    exhibitor_categories = {'catA', 'catB'}
    result = calculate_match_score(
        visitor_interests,
        exhibitor_categories,
        2,
        PENALTY_ALPHA_TEST,
        PENALTY_THRESHOLD_TEST,
    )
    match_score, match_count, matched_terms = result
    assert match_score == 0.0
    assert match_count == 0
    assert matched_terms == set()

def test_calculate_match_score_partial_overlap_no_penalty():
    """One shared term with a category count below the threshold: expect score 1.0."""
    visitor_interests = {'travel', 'agent', 'hotel'}
    exhibitor_categories = {'travel', 'agency', 'tour'}
    category_count = 3  # Below threshold
    result = calculate_match_score(
        visitor_interests,
        exhibitor_categories,
        category_count,
        PENALTY_ALPHA_TEST,
        PENALTY_THRESHOLD_TEST,
    )
    match_score, match_count, matched_terms = result
    assert match_score == 1.0  # Score = num_matches (1) * penalty_factor (1)
    assert match_count == 1
    assert matched_terms == {'travel'}

def test_calculate_match_score_multiple_overlap_no_penalty():
    """Two shared terms with a category count below the threshold: expect score 2.0."""
    visitor_interests = {'travel', 'agent', 'hotel', 'resort'}
    exhibitor_categories = {'travel', 'agency', 'hotel', 'booking'}
    category_count = 4  # Below threshold
    result = calculate_match_score(
        visitor_interests,
        exhibitor_categories,
        category_count,
        PENALTY_ALPHA_TEST,
        PENALTY_THRESHOLD_TEST,
    )
    match_score, match_count, matched_terms = result
    assert match_score == 2.0  # Score = num_matches (2) * penalty_factor (1)
    assert match_count == 2
    assert matched_terms == {'travel', 'hotel'}

def test_calculate_match_score_with_penalty():
    """Three shared terms with a category count above the threshold.

    The expected score is the match count scaled by
    1 / (1 + alpha * (categories over the threshold)).
    """
    visitor_interests = {'travel', 'agent', 'hotel', 'resort', 'op'}
    exhibitor_categories = {'travel', 'agency', 'hotel', 'booking', 'op'}
    category_count = 8  # Above threshold of 6
    threshold = PENALTY_THRESHOLD_TEST  # 6
    alpha = PENALTY_ALPHA_TEST          # 0.5

    # 1.0 / (1.0 + 0.5 * (8 - 6)) = 1.0 / 2.0 = 0.5
    excess_categories = max(0, category_count - threshold)
    penalty_factor = 1.0 / (1.0 + alpha * excess_categories)
    expected_score = 3 * penalty_factor  # 3 matches * 0.5

    match_score, match_count, matched_terms = calculate_match_score(
        visitor_interests, exhibitor_categories, category_count, alpha, threshold
    )
    assert match_score == pytest.approx(expected_score)
    assert match_count == 3
    assert matched_terms == {'travel', 'hotel', 'op'}

def test_calculate_match_score_at_penalty_threshold():
    """Two shared terms with a category count exactly at the threshold.

    No categories exceed the threshold, so the expected penalty factor is 1.0.
    """
    visitor_interests = {'travel', 'agent', 'hotel'}
    exhibitor_categories = {'travel', 'agency', 'hotel', 'booking'}
    category_count = 6  # At threshold
    threshold = PENALTY_THRESHOLD_TEST  # 6
    alpha = PENALTY_ALPHA_TEST          # 0.5

    # 1.0 / (1.0 + 0.5 * (6 - 6)) = 1.0 / 1.0 = 1.0
    excess_categories = max(0, category_count - threshold)
    penalty_factor = 1.0 / (1.0 + alpha * excess_categories)
    expected_score = 2 * penalty_factor  # 2 matches * 1.0

    match_score, match_count, matched_terms = calculate_match_score(
        visitor_interests, exhibitor_categories, category_count, alpha, threshold
    )
    assert match_score == pytest.approx(expected_score)
    assert match_count == 2
    assert matched_terms == {'travel', 'hotel'}

In [21]:
# Run the notebook's tests through pytest; '-v' is forwarded as pytest's
# verbose flag. As the last expression in the cell, the returned exit code is
# also shown as the cell's output.
ipytest.run('-v') 

platform win32 -- Python 3.11.5, pytest-8.3.5, pluggy-1.5.0
rootdir: d:\ITEProject\ITE-data-sciene-assignment
configfile: pyproject.toml
plugins: anyio-4.9.0
collected 8 items

t_68ab2931365e4ffca6bfa73c3a33b819.py [32m.[0m[32m.[0m[31mE[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[31m                                               [100%][0m

[31m[1m_______________________ ERROR at setup of test_get_exhibitor_category_info ________________________[0m

    [0m[37m@pytest[39;49;00m.fixture(scope=[33m"[39;49;00m[33mmodule[39;49;00m[33m"[39;49;00m)[90m[39;49;00m
    [94mdef[39;49;00m[90m [39;49;00m[92mmock_exhibitor_data[39;49;00m():[90m[39;49;00m
    [90m    [39;49;00m[33m"""Provides the mock exhibitor DataFrame."""[39;49;00m[90m[39;49;00m
        exhibitor_data = {[90m[39;49;00m
            [33m'[39;49;00m[33mexhibitorid[39;49;00m[33m'[39;49;00m: [[94m101[39;49;00m, [94m101[39;49;00m, [94m101[39;49;00m, [94m102[39;49;00m, [94m102

<ExitCode.TESTS_FAILED: 1>