In [1]:
import pandas as pd
import csv
import random
from tqdm import tqdm

In [2]:
# Destination catalogue: the pipe-delimited text columns are turned into lists
destinations = pd.read_csv('destinations.csv')
for list_column in ('activities', 'best_months'):
    destinations[list_column] = destinations[list_column].str.split('|')

# Synthetic user profiles: secondary interests are stored comma-delimited
users = pd.read_csv('synthetic_users.csv')
users['secondary_interests'] = users['secondary_interests'].str.split(',')

# Size summary of both tables and of their full cross product
print(f"Loaded {len(users)} users and {len(destinations)} destinations")
print(f"Total possible user-destination pairs: {len(users) * len(destinations):,}")

# Show the first record of each table
print("\nSample user:", users.iloc[0], sep="\n")

print("\nSample destination:", destinations.iloc[0], sep="\n")

Loaded 200 users and 103 destinations
Total possible user-destination pairs: 20,600

Sample user:
age_group                            51+
primary_style                  adventure
secondary_interests      [culture, food]
group_type                          solo
budget                              high
climate_preference              tropical
safety_importance               0.581678
popularity_preference           0.875692
Name: 0, dtype: object

Sample destination:
city                                                      Paris
country                                                  France
continent                                                Europe
avg_cost                                                   1500
climate                                                    mild
activities        [culture, food, nightlife, museums, shopping]
best_months                           [Apr, May, Jun, Sep, Oct]
avg_temp_range                                           5-25°C
popularity        

In [4]:
# Preview the first ten rows of the destinations table
destinations.head(10)

Unnamed: 0,city,country,continent,avg_cost,climate,activities,best_months,avg_temp_range,popularity,safety_rating
0,Paris,France,Europe,1500,mild,"[culture, food, nightlife, museums, shopping]","[Apr, May, Jun, Sep, Oct]",5-25°C,9,8
1,Reykjavik,Iceland,Europe,2000,cold,"[nature, hiking, adventure, northern_lights, h...","[Jun, Jul, Aug]",-2-15°C,7,9
2,Rome,Italy,Europe,1200,warm,"[culture, history, food, museums, architecture]","[Apr, May, Sep, Oct]",8-30°C,9,7
3,Barcelona,Spain,Europe,1100,warm,"[beach, nightlife, culture, food, architecture]","[May, Jun, Jul, Aug, Sep]",10-28°C,9,7
4,Innsbruck,Austria,Europe,1300,cold,"[skiing, hiking, nature, mountains]","[Dec, Jan, Feb, Jul, Aug]",-5-20°C,6,9
5,Amsterdam,Netherlands,Europe,1400,mild,"[culture, museums, nightlife, biking, canals]","[Apr, May, Jun, Sep]",3-22°C,8,8
6,Prague,Czech Republic,Europe,900,mild,"[culture, history, nightlife, architecture, food]","[Apr, May, Jun, Sep]",-2-24°C,8,8
7,Santorini,Greece,Europe,1600,warm,"[beach, relaxation, food, photography, romance]","[May, Jun, Sep, Oct]",12-29°C,8,8
8,Dubrovnik,Croatia,Europe,1000,warm,"[beach, history, culture, sailing]","[May, Jun, Sep]",8-28°C,7,8
9,Edinburgh,Scotland,Europe,1300,cool,"[culture, history, festivals, hiking]","[Jun, Jul, Aug]",4-19°C,7,9


In [5]:
# (rows, columns) of the destinations table
destinations.shape

(103, 10)

In [6]:
# Vocabularies that the synthetic user generator draws its values from
age_groups = ["18-25", "26-35", "36-50", "51+"]
travel_styles = [
    "adventure", "relaxation", "culture", "nightlife",
    "food", "nature", "beach",
]
group_types = ["solo", "couple", "friends", "family"]
budget_levels = ["low", "medium", "high", "luxury"]
climate_preferences = [
    "hot", "warm", "mild", "cool", "cold", "tropical", "subtropical",
]

# Multipliers per age group. score_city looks up each of a city's activities
# here and adds 0.05 * (multiplier - 1) for every one it finds, so entries
# whose key is not an activity name only matter if a city lists that name.
age_preferences = {
    "18-25": {
        "nightlife": 1.3,
        "adventure": 1.2,
        "culture": 0.9,
        "budget_sensitivity": 1.4,
    },
    "26-35": {
        "food": 1.2,
        "culture": 1.1,
        "nightlife": 1.1,
        "budget_sensitivity": 1.1,
    },
    "36-50": {
        "culture": 1.3,
        "relaxation": 1.2,
        "food": 1.2,
        "budget_sensitivity": 0.9,
    },
    "51+": {
        "culture": 1.4,
        "relaxation": 1.3,
        "history": 1.3,
        "budget_sensitivity": 0.8,
    },
}

# Multipliers per travel-group type. In the visible notebook score_city reads
# only the "safety_weight" entry (defaulting to 1.0 when it is absent).
group_preferences = {
    "solo": {
        "culture": 1.2,
        "adventure": 1.1,
        "safety_weight": 1.3,
    },
    "couple": {
        "relaxation": 1.2,
        "food": 1.2,
        "romance": 1.4,
    },
    "friends": {
        "nightlife": 1.3,
        "adventure": 1.2,
        "beach": 1.1,
    },
    "family": {
        "safety_weight": 1.5,
        "relaxation": 1.2,
        "nature": 1.1,
    },
}

# Budget level -> maximum cost; compared against a destination's avg_cost
budget_map = {
    "low": 1000,
    "medium": 1500,
    "high": 2200,
    "luxury": 5000,
}

In [7]:
def get_adjacent_months(month):
    """Return the months immediately before and after `month`.

    Args:
        month: Three-letter month abbreviation such as "Jan" or "Dec".

    Returns:
        A two-element list ``[previous, next]``. The calendar wraps around,
        so "Jan" yields ``["Dec", "Feb"]``.

    Raises:
        ValueError: If `month` is not one of the twelve abbreviations.
    """
    calendar = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ]
    position = calendar.index(month)
    # Modulo 12 makes December and January neighbours
    previous_month = calendar[(position - 1) % 12]
    next_month = calendar[(position + 1) % 12]
    return [previous_month, next_month]

def create_user_filters(user_row):
    """Build the filter dict used when scoring destinations for one user.

    Args:
        user_row: Mapping or pandas row providing 'primary_style',
            'secondary_interests' (a list), 'budget' (a key of `budget_map`)
            and 'climate_preference'.

    Returns:
        A dict with keys 'budget_max', 'activities', 'preferred_month',
        'climate_preference' and 'continent_preference'. The preferred month
        is drawn at random on every call, so repeated calls for the same
        user can differ.
    """
    month_names = ["Jan","Feb","Mar","Apr","May","Jun",
                   "Jul","Aug","Sep","Oct","Nov","Dec"]

    # Primary style first, followed by the secondary interests (new list)
    interests = [user_row['primary_style']] + user_row['secondary_interests']

    filters = {}
    filters["budget_max"] = budget_map[user_row['budget']]
    filters["activities"] = interests
    filters["preferred_month"] = random.choice(month_names)
    filters["climate_preference"] = user_row['climate_preference']
    # No continent preference for training data
    filters["continent_preference"] = None
    return filters


In [8]:
def score_city(user, filters, city, add_noise=False):
    """Score how well one destination fits one user.

    The score is a running sum of weighted components:

    * budget      0.25
    * activities  0.30 (60% primary style, 40% share of secondary interests),
                  plus a small age-group adjustment per matching city activity
    * season      0.15 (half credit when only an adjacent month matches)
    * climate     0.10 (70% credit for a related climate)
    * safety      0.10 * user["safety_importance"], scaled by the group's
                  "safety_weight" multiplier
    * popularity  0.05
    * continent   0.05

    The sum is NOT rescaled by the total weight; it is only clamped to
    [0, 1]. The over-budget branch can contribute a negative amount, and the
    age and safety multipliers can push the raw sum above the nominal weights.

    Args:
        user: Mapping or pandas row with 'primary_style',
            'secondary_interests' (list), 'age_group', 'safety_importance',
            'group_type' and 'popularity_preference'.
        filters: Dict with 'budget_max' (must be non-zero, it is used as a
            divisor), 'preferred_month', 'continent_preference' and,
            optionally, 'climate_preference' (treated as "any" when missing).
        city: Mapping or pandas row with 'avg_cost', 'activities' (list),
            'best_months' (list), 'climate', 'safety_rating', 'popularity'
            and 'continent'.
        add_noise: When truthy, add uniform noise in [-0.05, 0.05] before
            clamping (used to vary generated training data).

    Returns:
        The clamped score, rounded to three decimals.
    """
    score = 0

    # Budget score (25% weight)
    budget_weight = 0.25

    if city["avg_cost"] <= filters["budget_max"]:
        # Within budget: 70% base credit, up to 30% more the cheaper the city is
        budget_efficiency = 1 - (city["avg_cost"] / filters["budget_max"])
        score += budget_weight * (0.7 + 0.3 * budget_efficiency)
    else:
        # Over budget: the relative overshoot is capped at 0.5, so this term
        # ranges from just under +0.3 down to -0.2 of the budget weight
        over_budget_penalty = min(0.5, (city["avg_cost"] - filters["budget_max"]) / filters["budget_max"])
        score += budget_weight * (0.3 - over_budget_penalty)

    # Activity match (30% weight)
    activity_weight = 0.30

    city_activities = city["activities"]

    # Primary interest match
    if user["primary_style"] in city_activities:
        score += activity_weight * 0.6

    # Secondary interests match (fraction of the user's secondary interests offered)
    secondary_matches = sum(1 for interest in user["secondary_interests"] if interest in city_activities)
    if len(user["secondary_interests"]) > 0:
        score += activity_weight * 0.4 * (secondary_matches / len(user["secondary_interests"]))

    # Apply age-based activity preferences: multipliers above 1 add to the
    # score, multipliers below 1 subtract from it
    age_prefs = age_preferences.get(user["age_group"], {})
    for activity in city_activities:
        if activity in age_prefs:
            score += 0.05 * (age_prefs[activity] - 1)

    # Season match (15% weight)
    season_weight = 0.15

    if filters["preferred_month"] in city["best_months"]:
        score += season_weight
    elif len(set(city["best_months"]).intersection(get_adjacent_months(filters["preferred_month"]))) > 0:
        # Preferred month is next to one of the city's best months
        score += season_weight * 0.5

    # Climate match (10% weight)
    climate_weight = 0.10

    # Handle 'any' climate and similar climates
    user_climate = filters.get("climate_preference", "any")
    if user_climate == "any" or user_climate == city["climate"]:
        score += climate_weight
    elif user_climate in ['warm', 'hot'] and city["climate"] in ['warm', 'hot', 'tropical']:
        score += climate_weight * 0.7
    elif user_climate in ['cold', 'cool'] and city["climate"] in ['cold', 'cool']:
        score += climate_weight * 0.7

    # Safety (10% weight, scaled by user's safety importance)
    safety_weight = 0.10 * user["safety_importance"]

    # Apply group-specific safety preferences
    group_prefs = group_preferences.get(user["group_type"], {})
    safety_multiplier = group_prefs.get("safety_weight", 1.0)

    score += safety_weight * (city["safety_rating"] / 10) * safety_multiplier

    # Popularity (5% weight): users above the 0.7 threshold are rewarded for
    # popular cities, everyone else for less popular ones
    popularity_weight = 0.05

    if user["popularity_preference"] > 0.7:
        score += popularity_weight * (city["popularity"] / 10)
    else:
        score += popularity_weight * (1 - city["popularity"] / 10)

    # Continent preference (5% weight); no preference counts as a match
    continent_weight = 0.05

    if filters["continent_preference"] is None or filters["continent_preference"] == city["continent"]:
        score += continent_weight

    # Add realistic noise (optional, for training data variation)
    if add_noise:
        noise = random.uniform(-0.05, 0.05)
        score += noise

    # Clamp to the 0-1 range (no rescaling) and round
    return round(max(0, min(1, score)), 3)


In [18]:
def get_recommendations(user, filters, cities_df, top_n=5):
    """Score every destination for a user and return the best `top_n`.

    Args:
        user: User profile passed through to `score_city`.
        filters: Filter dict passed through to `score_city`.
        cities_df: DataFrame of destinations; each row must provide 'city',
            'country', 'continent', 'avg_cost', 'climate', 'safety_rating'
            and 'popularity' in addition to whatever `score_city` reads.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of dicts (one per destination) sorted by descending 'score'
        and truncated to `top_n` entries. The sort is stable, so ties keep
        the order of `cities_df`.
    """
    recommendations = []

    for _, row in cities_df.iterrows():
        # Score without noise. The growing `recommendations` list used to be
        # passed as the fourth positional argument (add_noise): empty and
        # therefore falsy for the first city, truthy for every later one, so
        # all but the first destination received random noise.
        score = score_city(user, filters, row, add_noise=False)
        recommendations.append({
            "city": row["city"],
            "country": row["country"],
            "continent": row["continent"],
            "score": score,
            "avg_cost": row["avg_cost"],
            "climate": row["climate"],
            "safety_rating": row["safety_rating"],
            "popularity": row["popularity"]
        })

    # Sort by score, best first
    recommendations.sort(key=lambda rec: rec["score"], reverse=True)
    return recommendations[:top_n]

In [14]:
# Write the generated examples to disk without the index column.
# NOTE: df_sampled is created by the sampling cell (In [13]) that appears
# further down in this notebook, so that cell has to run before this one.
df_sampled.to_csv('training_data.csv', index=False)

In [17]:
# Inspect the first 50 generated training examples
df_sampled.head(50)


Unnamed: 0,user_id,destination_id,user_age_group,user_primary_style,user_group_type,user_budget,user_climate_pref,user_safety_importance,user_popularity_pref,user_preferred_month,...,user_likes_relaxation,dest_has_adventure,dest_has_beach,dest_has_culture,dest_has_food,dest_has_nature,dest_has_nightlife,dest_has_relaxation,compatibility_score,label
0,0,14,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,0,0,1,1,0,1,0,0.535,1
1,0,80,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,1,0,1,0,1,0,0,0.611,0
2,0,46,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,0,0,1,1,0,0,0,0.525,1
3,0,24,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,0,0,1,0,0,0,0,0.617,1
4,0,42,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,0,0,1,1,0,1,0,0.506,0
5,0,91,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,1,1,1,0,0,0,1,0.746,1
6,0,48,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,0,0,0,0,1,0,0,0.358,0
7,0,5,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,0,0,1,0,0,1,0,0.526,1
8,0,27,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,1,0,0,0,1,0,0,0.472,0
9,0,82,51+,adventure,solo,high,tropical,0.581678,0.875692,Mar,...,0,0,1,1,0,0,0,1,0.622,0


In [19]:
# (rows, columns) of the generated training set
df_sampled.shape

(5000, 33)

In [13]:
interactions_per_user = 25  # Each user rates 25 random destinations

# Seed Python's RNG so that the preferred months, the score noise, the label
# draws and (via random_state below) the destination sampling are reproducible.
SEED = 42
random.seed(SEED)

# Activities that get a one-hot indicator column for both user and destination
feature_activities = ['adventure', 'beach', 'culture', 'food',
                      'nature', 'nightlife', 'relaxation']

training_data_sampled = []

for user_idx, user_row in tqdm(users.iterrows(), total=len(users), desc="Processing users"):
    # Create filters for this user (draws a random preferred month)
    filters = create_user_filters(user_row)

    # Sample random destinations without replacement; the sampling seed is
    # derived from the seeded `random` module so one seed controls everything
    sampled_dests = destinations.sample(
        n=min(interactions_per_user, len(destinations)),
        replace=False,
        random_state=random.randrange(2**32),
    )

    for dest_idx, dest_row in sampled_dests.iterrows():
        # Calculate compatibility score
        score = score_city(user_row, filters, dest_row, add_noise=True)

        # Create binary label: score > 0.7 = like (1), score < 0.4 = dislike (0).
        # In between, the label is drawn at random with P(like) = score.
        if score > 0.7:
            label = 1
        elif score < 0.4:
            label = 0
        else:
            # Probabilistic in uncertain zone
            label = 1 if random.random() < score else 0

        # Create training example
        example = {
            'user_id': user_idx,
            'destination_id': dest_idx,

            # User features
            'user_age_group': user_row['age_group'],
            'user_primary_style': user_row['primary_style'],
            'user_group_type': user_row['group_type'],
            'user_budget': user_row['budget'],
            'user_climate_pref': user_row['climate_preference'],
            'user_safety_importance': user_row['safety_importance'],
            'user_popularity_pref': user_row['popularity_preference'],
            'user_preferred_month': filters['preferred_month'],

            # Destination features
            'dest_city': dest_row['city'],
            'dest_country': dest_row['country'],
            'dest_continent': dest_row['continent'],
            'dest_cost': dest_row['avg_cost'],
            'dest_climate': dest_row['climate'],
            'dest_popularity': dest_row['popularity'],
            'dest_safety': dest_row['safety_rating'],
        }

        # Activity matching features: all user_likes_* columns first, then all
        # dest_has_* columns, each in feature_activities order
        example.update({
            f'user_likes_{activity}': int(activity in filters['activities'])
            for activity in feature_activities
        })
        example.update({
            f'dest_has_{activity}': int(activity in dest_row['activities'])
            for activity in feature_activities
        })

        # Labels (targets)
        example['compatibility_score'] = score  # Regression target (0-1)
        example['label'] = label  # Classification target (0 or 1)

        training_data_sampled.append(example)

df_sampled = pd.DataFrame(training_data_sampled)

print(f"\n✅ Generated {len(df_sampled)} training examples (sampled)")
print(f"Label distribution:")
print(df_sampled['label'].value_counts())
print(f"Like rate: {df_sampled['label'].mean():.2%}")
print(f"\nScore statistics:")
print(df_sampled['compatibility_score'].describe())

Processing users: 100%|██████████████████████| 200/200 [00:00<00:00, 338.69it/s]



✅ Generated 5000 training examples (sampled)
Label distribution:
label
1    2710
0    2290
Name: count, dtype: int64
Like rate: 54.20%

Score statistics:
count    5000.000000
mean        0.545114
std         0.161235
min         0.038000
25%         0.439000
50%         0.551000
75%         0.654000
max         0.966000
Name: compatibility_score, dtype: float64


# Used for generating the synthetic users (writes `synthetic_users.csv`, which the data-loading cell at the top reads)

In [None]:
def generate_user():
    """Draw one random synthetic traveller profile.

    Returns:
        A dict with keys 'age_group', 'primary_style', 'secondary_interests'
        (one or two travel styles, never the primary one), 'group_type',
        'budget', 'climate_preference', 'safety_importance' (uniform in
        [0.5, 1.0]) and 'popularity_preference' (uniform in [0.3, 1.0]).
    """
    # Primary travel style
    primary = random.choice(travel_styles)

    # Secondary interests: 1-2 additional styles, excluding the primary one
    remaining_styles = [candidate for candidate in travel_styles if candidate != primary]
    how_many = random.randint(1, 2)
    secondary = random.sample(remaining_styles, k=how_many)

    # Remaining attributes, drawn in the order in which they are stored
    age = random.choice(age_groups)
    group = random.choice(group_types)
    budget_level = random.choice(budget_levels)
    climate = random.choice(climate_preferences)
    safety = random.uniform(0.5, 1.0)
    popularity = random.uniform(0.3, 1.0)

    return {
        "age_group": age,
        "primary_style": primary,
        "secondary_interests": secondary,
        "group_type": group,
        "budget": budget_level,
        "climate_preference": climate,
        "safety_importance": safety,
        "popularity_preference": popularity,
    }

num_users = 200
csv_filename = "synthetic_users.csv"

# Column order of the output file
header = ["age_group", "primary_style", "secondary_interests",
          "group_type", "budget", "climate_preference",
          "safety_importance", "popularity_preference"]

with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    for _ in range(num_users):
        user = generate_user()
        # secondary_interests is the only list-valued field: save list as comma-separated string
        fields = dict(user, secondary_interests=",".join(user["secondary_interests"]))
        writer.writerow([fields[column] for column in header])

print(f"Saved {num_users} synthetic users to '{csv_filename}'")