In [2]:
pip install faker

Collecting faker
  Obtaining dependency information for faker from https://files.pythonhosted.org/packages/78/5e/c8c3c5ea0896ab747db2e2889bf5a6f618ed291606de6513df56ad8670a8/faker-37.4.0-py3-none-any.whl.metadata
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---- ----------------------------------- 0.2/1.9 MB 6.7 MB/s eta 0:00:01
   --------------- ------------------------ 0.7/1.9 MB 9.5 MB/s eta 0:00:01
   ---------------------- ----------------- 1.1/1.9 MB 8.5 MB/s eta 0:00:01
   -------------------------------- ------- 1.6/1.9 MB 9.1 MB/s eta 0:00:01
   ---------------------------------------  1.9/1.9 MB 8.8 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 8.2 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
# This is the code to generate members.csv file

import numpy as np
import pandas as pd
import random

from faker import Faker

SEED = 37  # shared by Faker and the stdlib RNG so reruns produce the same data

# Single Faker instance limited to the en_US locale (USA only, for simplicity).
fake = Faker('en_US')

Faker.seed(SEED)
random.seed(SEED)

# Travel preference -> sampling weight. The two parallel lists used by the
# generators are derived from this mapping so a category and its weight
# always stay aligned.
_PREFERENCE_WEIGHTS = {
    'beach': 0.2,
    'adventure': 0.25,
    'nature': 0.2,
    'luxury': 0.15,
    'budget': 0.1,
    'cultural': 0.1,
}
TRAVEL_PREFERENCES = list(_PREFERENCE_WEIGHTS)
WEIGHTS = list(_PREFERENCE_WEIGHTS.values())

# feature (age)
def random_age():
    """Return an age value that is intentionally dirty a small share of the time.

    A valid adult age in [18, 80] is always drawn first. It is returned with
    weight 0.92; otherwise one of the placeholder values '' (0.03),
    'unknown' (0.03) or -3 (0.02) is returned in its place.
    """
    valid_age = random.randint(18, 80)  # adults only (18+)
    candidates = (valid_age, '', 'unknown', -3)
    likelihoods = (0.92, 0.03, 0.03, 0.02)
    picked = random.choices(candidates, weights=likelihoods)
    return picked[0]

# feature (location)
def random_location():
    """Return a fake "<city><sep><state>" string with an inconsistent separator.

    The city and state abbreviation come from the module-level Faker instance
    (city first, then state). The separator is picked at random between
    ", " and "- " to simulate messy input.
    """
    town = fake.city()
    abbr = fake.state_abbr()
    separator = random.choice([", ", "- "])
    return f"{town}{separator}{abbr}"

# feature (name)
def messy_name(name):
    """Return `name` in one randomly chosen casing.

    The candidates are, in order: all lower-case, all upper-case, and
    `str.capitalize()` (first character upper-cased, the rest lower-cased).
    All three are built before one is picked.
    """
    variants = [getattr(name, method)() for method in ("lower", "upper", "capitalize")]
    return random.choice(variants)

# feature (signup_date)
def random_signup_date():
    """Return a fake signup date as a string in one of three formats.

    The date is requested from the module-level Faker instance with
    start_date='-4y' and end_date='today'. It is then rendered as either
    'YYYY-MM-DD', 'DD/MM/YYYY' or 'Mon DD, YYYY' (abbreviated month name),
    chosen at random to simulate inconsistent date entry.
    """
    signup = fake.date_between(start_date='-4y', end_date='today')
    patterns = ('%Y-%m-%d', '%d/%m/%Y', '%b %d, %Y')
    rendered = [signup.strftime(pattern) for pattern in patterns]
    return random.choice(rendered)

# Build the members dataset. The number of members to generate is configurable
# (raise or lower it as needed); it is currently set to 1000.

def generate_members(num_members=1000, num_duplicates=5):
    """Generate a synthetic, deliberately messy members table.

    Each member gets a sequential id starting at 1000, a randomly cased fake
    name, a possibly invalid age, a location string, a travel budget that is
    a multiple of 1000 between 100,000 and 900,000, a signup date string in
    mixed formats, a weighted travel preference, 0-3 dependents and a
    passport flag (True with weight 85, False with weight 15).

    Afterwards some rows are duplicated and the whole table is shuffled, both
    with a fixed `random_state` of 1.

    Parameters
    ----------
    num_members : int, default 1000
        Number of distinct members to generate.
    num_duplicates : int, default 5
        Number of rows to duplicate. Clamped to the range
        [0, number of generated rows], so small tables no longer fail with
        "Cannot take a larger sample than population".

    Returns
    -------
    pandas.DataFrame
        `num_members` rows plus the duplicated rows, shuffled, with a fresh
        0..n-1 index.
    """
    columns = [
        'member_id',
        'name',
        'age',
        'location',
        'travel_budget',
        'signup_date',
        'preference',
        'num_dependents',
        'has_passport',
    ]

    members = []
    for i in range(num_members):
        # NOTE: the order of these calls determines the random stream, so it
        # is kept stable to preserve the seeded output.
        member_id = 1000 + i  # ids start from 1000
        name = messy_name(fake.name())
        age = random_age()
        location = random_location()
        travel_budget = random.randint(100, 900) * 1000
        signup_date = random_signup_date()
        preference = random.choices(TRAVEL_PREFERENCES, weights=WEIGHTS)[0]
        num_dependents = random.randint(0, 3)
        has_passport = random.choices([True, False], weights=[85, 15])[0]

        members.append([
            member_id,
            name,
            age,
            location,
            travel_budget,
            signup_date,
            preference,
            num_dependents,
            has_passport,
        ])

    df = pd.DataFrame(members, columns=columns)
    if df.empty:
        # Nothing to duplicate or shuffle.
        return df

    # Introduce a few duplicates; never ask for more rows than exist.
    dupes_to_add = max(0, min(num_duplicates, len(df)))
    if dupes_to_add:
        duplicates = df.sample(dupes_to_add, random_state=1)
        # concat appends at the end, hence the shuffle below
        df = pd.concat([df, duplicates], ignore_index=True)

    # Shuffle the final table so the duplicates are not grouped at the bottom.
    df = df.sample(frac=1, random_state=1).reset_index(drop=True)
    return df


# Generate the members dataset (1000 requested members; generate_members also
# appends a few duplicate rows) and write it to members.csv in the working
# directory without the DataFrame index column.

df_members = generate_members(1000)
df_members.to_csv("members.csv", index=False)

print("Members csv file generation is successful.")
        
        

Members csv file generation is successful.


In [2]:
# Preview the first rows of the members table (note the mixed name/date/age formats).
df_members.head()

Unnamed: 0,member_id,name,age,location,travel_budget,signup_date,preference,num_dependents,has_passport
0,1685,michael williams,71,Stevenstad- IA,646000,17/07/2021,luxury,3,False
1,1614,Danielle george,50,"Jessicabury, NC",153000,2024-04-06,adventure,0,True
2,1602,denise brock,22,Salinasfort- ND,155000,2024-09-23,beach,3,True
3,1502,Terry holmes,unknown,South Megan- MP,403000,"Apr 24, 2024",beach,3,True
4,1341,michelle walker,25,"Davidburgh, CO",693000,22/01/2023,cultural,3,True


In [3]:
# This is the code to generate destinations.csv file

# Destination -> country lookup. Its insertion order also defines the order
# of DESTINATIONS below, so each destination is listed in exactly one place.
DESTINATION_COUNTRY_MAP = {
    'Cancun': 'Mexico',
    'Paris': 'France',
    'Tokyo': 'Japan',
    'Bali': 'Indonesia',
    'Cape Town': 'South Africa',
    'Denver': 'United States',
    'Banff': 'Canada',
    'Phuket': 'Thailand',
    'Rome': 'Italy',
    'Barcelona': 'Spain',
}

# Destination names, in the same order as the mapping above.
DESTINATIONS = list(DESTINATION_COUNTRY_MAP)

# Pool of descriptive tags a destination can be labelled with.
TAGS = [
    'beach',
    'romantic',
    'adventure',
    'historic',
    'party',
    'skiing',
    'budget-friendly',
    'luxury',
    'nature',
    'cultural',
    'family-friendly',
]

def generate_tags():
    """Return 2 to 4 distinct tags from TAGS as one comma-separated string.

    The tags are joined with "," and no space, which is the format
    generate_avg_cost splits on.
    """
    how_many = random.randint(2, 4)
    chosen = random.sample(TAGS, k=how_many)
    return ",".join(chosen)

def generate_safety_rating():
    """Return a random safety rating drawn from [2.5, 5.0], rounded to one decimal."""
    raw_score = random.uniform(2.5, 5.0)
    return round(raw_score, 1)

def generate_avg_cost(tags):
    """Return a random average cost (a multiple of 1000) based on the tags.

    `tags` is a comma-separated string. The cost band, in thousands, is:
    400-600 when "luxury" is present, otherwise 80-180 when
    "budget-friendly" is present, otherwise 180-400. "luxury" takes
    precedence when both tags are present.
    """
    tag_list = tags.split(',')
    if "luxury" in tag_list:
        low, high = 400, 600
    elif "budget-friendly" in tag_list:
        low, high = 80, 180
    else:
        low, high = 180, 400
    return random.randint(low, high) * 1000

def generate_destinations():
    """Build the destinations table, one row per entry in DESTINATIONS.

    For each destination the tags are generated first, then the country is
    looked up, then the average cost is derived from the tags, and finally a
    safety rating is drawn.

    Returns a DataFrame with columns destination, country, tags,
    avg_cost_usd and safety_rating.
    """
    def build_row(dest):
        # Tags come first because the cost band depends on them.
        tag_string = generate_tags()
        return [
            dest,
            DESTINATION_COUNTRY_MAP[dest],
            tag_string,
            generate_avg_cost(tag_string),
            generate_safety_rating(),
        ]

    records = [build_row(dest) for dest in DESTINATIONS]
    header = ['destination', 'country', 'tags', 'avg_cost_usd', 'safety_rating']
    return pd.DataFrame(records, columns=header)


# Build the destinations table and write it to destinations.csv in the
# working directory without the DataFrame index column.
df_dest = generate_destinations()
df_dest.to_csv("destinations.csv", index=False)

print("Destinations generated successfully.") 


Destinations generated successfully.


In [4]:
# Preview the first rows of the destinations table.
df_dest.head()

Unnamed: 0,destination,country,tags,avg_cost_usd,safety_rating
0,Cancun,Mexico,"budget-friendly,romantic,cultural,historic",80000,4.6
1,Paris,France,"luxury,beach",502000,3.1
2,Tokyo,Japan,"historic,party,beach,budget-friendly",101000,3.9
3,Bali,Indonesia,"budget-friendly,romantic,historic",109000,2.5
4,Cape Town,South Africa,"cultural,nature,adventure",279000,4.8


In [5]:
# (rows, columns) of the destinations table: one row per destination.
df_dest.shape

(10, 5)

In [6]:
# This is the code to generate trips.csv file


# Reuse the destinations table generated earlier in the notebook.
DESTINATIONS_DF = df_dest
DESTINATIONS = DESTINATIONS_DF['destination'].tolist()

# destination name -> average cost, used for the affordability filter and as
# the base price of each trip.
DEST_AVG_COST = {
    name: avg_cost
    for name, avg_cost in zip(
        DESTINATIONS_DF['destination'],
        DESTINATIONS_DF['avg_cost_usd'],
    )
}

# travel preference -> activities a member with that preference may book.
PREFERENCE_TO_ACTIVITIES = {
    'adventure': ['hiking', 'kayaking', 'safari'],
    'beach':     ['snorkeling', 'beach', 'spa'],
    'cultural':  ['museum tour', 'cultural show', 'food tour'],
    'luxury':    ['spa', 'shopping', 'wine tasting'],
    'budget':    ['hiking', 'museum tour', 'food tour'],
    'nature':    ['safari', 'hiking', 'kayaking'],
}

def messy_date():
    """Return a fake trip start date as a string in one of three formats.

    The date is requested from the module-level Faker instance with
    start_date='-4y' and end_date='today'. It is then rendered as either
    'YYYY-MM-DD', 'MM/DD/YYYY' or 'Month DD, YYYY' (full month name),
    chosen at random.
    """
    trip_day = fake.date_between(start_date='-4y', end_date='today')
    patterns = ('%Y-%m-%d', '%m/%d/%Y', '%B %d, %Y')
    rendered = [trip_day.strftime(pattern) for pattern in patterns]
    return random.choice(rendered)

def clean_cost(base):
    """Return `base` scaled by a random factor in [0.8, 1.2], rounded to the nearest 1000."""
    jitter = random.uniform(0.8, 1.2)
    thousands = round(base * jitter / 1000)
    return thousands * 1000

def generate_activities(preference):
    """Return 1 to 3 distinct activities for `preference`, joined with ", ".

    Activities come from PREFERENCE_TO_ACTIVITIES. The number picked is
    capped at the number of activities available, and a preference with no
    known activities (for example a missing or unrecognised value) yields an
    empty string instead of a "sample larger than population" ValueError.

    Parameters
    ----------
    preference : hashable
        Travel preference key to look up.

    Returns
    -------
    str
        Comma-and-space separated activity names, or "" when none apply.
    """
    options = PREFERENCE_TO_ACTIVITIES.get(preference, [])
    if not options:
        return ""
    # Never request more activities than exist for this preference.
    how_many = random.randint(1, min(3, len(options)))
    selected = random.sample(options, k=how_many)
    return ", ".join(selected)

def generate_trips(members_df):
    """Generate trips for every row of `members_df`.

    For each member row, 1 to 5 trips are planned. Only destinations whose
    average cost times 1.2 fits within the member's `travel_budget` are
    eligible; a member with no eligible destination gets no trips. Each trip
    has a random eligible destination, a jittered cost, a 3-10 day duration,
    activities matching the member's preference and a start date string.
    Trip ids are "T1000", "T1001", ... in generation order.

    Parameters
    ----------
    members_df : pandas.DataFrame
        Must provide the columns member_id, travel_budget and preference.

    Returns
    -------
    pandas.DataFrame
        Columns: trip_id, member_id, destination, start_date, duration_days,
        cost_usd, activities.
    """
    first_trip_number = 1000
    trip_rows = []

    for _, member in members_df.iterrows():
        who = member['member_id']
        max_spend = member['travel_budget']
        taste = member['preference']
        # Drawn before the affordability check, so it is consumed even for
        # members who end up with no trips.
        trips_wanted = random.randint(1, 5)

        within_budget = [
            name for name in DESTINATIONS
            if DEST_AVG_COST[name] * 1.2 <= max_spend
        ]

        if within_budget:
            for _trip in range(trips_wanted):
                where = random.choice(within_budget)
                price = clean_cost(DEST_AVG_COST[where])

                # NOTE(review): choosing from a one-element list always yields
                # that element, so the outer random.choice looks redundant. It
                # is kept because dropping it could change the sequence of
                # random draws and therefore the seeded output.
                length_in_days = random.choice([random.randint(3, 10)])

                things_to_do = generate_activities(taste)
                departs = messy_date()

                # Every stored row advances the id by one.
                trip_rows.append([
                    f"T{first_trip_number + len(trip_rows)}",
                    who,
                    where,
                    departs,
                    length_in_days,
                    price,
                    things_to_do,
                ])

    header = [
        'trip_id', 'member_id', 'destination', 'start_date',
        'duration_days', 'cost_usd', 'activities',
    ]
    return pd.DataFrame(trip_rows, columns=header)



# Generate trips for every row of the members table (duplicate member rows
# included) and write them to trips.csv without the DataFrame index column.
df_trips = generate_trips(df_members)
df_trips.to_csv("trips.csv", index=False)

print("Trips generated successfully.")

Trips generated successfully.


In [7]:
# Preview the first ten trips (note the mixed start_date formats).
df_trips.head(10)

Unnamed: 0,trip_id,member_id,destination,start_date,duration_days,cost_usd,activities
0,T1000,1685,Cape Town,2023-04-12,5,286000,"wine tasting, shopping"
1,T1001,1685,Banff,"January 15, 2025",10,259000,"shopping, wine tasting"
2,T1002,1685,Banff,11/18/2024,8,268000,"shopping, wine tasting, spa"
3,T1003,1685,Barcelona,04/18/2025,9,176000,"wine tasting, shopping, spa"
4,T1004,1685,Rome,2025-01-29,6,232000,"shopping, wine tasting"
5,T1005,1614,Bali,2022-06-06,7,120000,"kayaking, hiking"
6,T1006,1614,Bali,05/19/2023,4,117000,"hiking, safari, kayaking"
7,T1007,1614,Tokyo,12/05/2021,6,95000,"kayaking, hiking, safari"
8,T1008,1614,Tokyo,05/01/2024,3,108000,"kayaking, hiking, safari"
9,T1009,1614,Cancun,03/26/2023,4,71000,"hiking, safari"


In [8]:
# This is the code to generate weather.csv file

from itertools import product

# Destination names come from the destinations table built earlier.
DESTINATIONS = df_dest['destination'].to_list()

WEATHER_TYPES = ["sunny", "cloudy", "rainy", "windy", "stormy", "snowy"]

MONTHS = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Every (destination, month) pair, grouped by destination.
combinations = list(product(DESTINATIONS, MONTHS))

# One row per pair: the weather label is drawn first, then a seasonal rating
# from [2.5, 5.0] rounded to one decimal.
rows = [
    [dest, month, random.choice(WEATHER_TYPES), round(random.uniform(2.5, 5.0), 1)]
    for dest, month in combinations
]

weather_columns = ['destination', 'month', 'weather', 'seasonal_rating']
df_weather = pd.DataFrame(rows, columns=weather_columns)

# Persist without the DataFrame index column.
df_weather.to_csv("weather.csv", index = False)

print("Weather Events generated successfully")

Weather Events generated successfully


In [9]:
# Preview the first rows of the weather table (one row per destination and month).
df_weather.head()

Unnamed: 0,destination,month,weather,seasonal_rating
0,Cancun,January,stormy,4.1
1,Cancun,February,rainy,3.1
2,Cancun,March,cloudy,4.1
3,Cancun,April,rainy,4.6
4,Cancun,May,cloudy,4.8
