In [9]:
pip install faker pandas numpy

Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
import os
import random
from datetime import timedelta
from random import choice, randint

import numpy as np
import pandas as pd
from faker import Faker

# Set up Faker and basic config
fake = Faker()

# Seed every random source used below -- Faker, the stdlib `random` module behind
# `choice`/`randint`, and NumPy's global generator -- so that re-running the notebook
# replays the same random sequence instead of producing a different dataset each time.
# Dates are still anchored to the day of the run (they are drawn from '-1y' .. 'today').
RANDOM_SEED = 42
Faker.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Define Downloads folder path explicitly
# NOTE(review): this is a machine-specific absolute path; every output file of the
# generator below is written under it, so it must exist on the machine running this.
downloads_path = "/Users/swatisoni/Downloads"

# Value pools the generators sample from: Airlines, Cities, Hotels, Room Types,
# travel purposes and attractions.
airlines = ['Delta', 'United Airlines', 'Emirates', 'Lufthansa', 'Qatar Airways']
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Dubai', 'Sydney', 'Toronto']
hotels = ['Grand Plaza', 'Comfort Inn', 'City Lights Hotel', 'Oceanview Resort']
room_types = ['Single', 'Double', 'Suite']
purposes = ['Leisure', 'Business', 'Education', 'Medical']
attractions = ['Eiffel Tower', 'Great Wall', 'Statue of Liberty', 'Sydney Opera House', 'Burj Khalifa']

# --------------------------------------------------------------------------------
# 1. Flight Bookings CSV (1000 records)
# Each row is one booking: a random customer, a route between two distinct cities,
# a booking date within the past year and a departure 0-60 days after the booking.
flight_columns = [
    'booking_id', 'customer_id', 'flight_number', 'airline',
    'origin_city', 'destination_city', 'departure_date',
    'arrival_date', 'booking_date', 'ticket_price'
]

flight_records = []
for booking_id in range(1, 1001):
    customer_id = randint(1, 1000)

    # The destination is drawn from every city except the origin.
    origin = choice(cities)
    other_cities = [candidate for candidate in cities if candidate != origin]
    destination = choice(other_cities)

    booking_date = fake.date_between(start_date='-1y', end_date='today')
    latest_departure = booking_date + timedelta(days=60)
    departure_date = fake.date_between(start_date=booking_date, end_date=latest_departure)
    arrival_date = departure_date  # same day arrival for simplicity

    flight_number = f"FL{randint(1000,9999)}"
    airline = choice(airlines)
    ticket_price = round(np.random.uniform(100, 1500), 2)

    flight_records.append([
        booking_id, customer_id, flight_number, airline,
        origin, destination, departure_date,
        arrival_date, booking_date, ticket_price
    ])

df_flight = pd.DataFrame(flight_records, columns=flight_columns)

# Persist the table next to the other generated datasets.
flight_file_path = os.path.join(downloads_path, 'flight_bookings.csv')
df_flight.to_csv(flight_file_path, index=False)
print(f"✅ Flight Bookings saved to {flight_file_path}")

# --------------------------------------------------------------------------------
# 2. Hotel Stays CSV (1000 records)
# Each row is one reservation: booked within the past year, check-in 0-30 days
# after the booking, and a stay of 1 to 10 days.
hotel_columns = [
    'booking_id', 'customer_id', 'hotel_name', 'city',
    'room_type', 'check_in_date', 'check_out_date',
    'booking_date', 'total_amount'
]

hotel_records = []
for booking_id in range(1, 1001):
    customer_id = randint(1, 1000)
    city = choice(cities)

    booking_date = fake.date_between(start_date='-1y', end_date='today')
    latest_check_in = booking_date + timedelta(days=30)
    check_in = fake.date_between(start_date=booking_date, end_date=latest_check_in)

    # Check-out follows from the randomly chosen length of stay.
    stay_duration = randint(1, 10)
    check_out = check_in + timedelta(days=stay_duration)

    hotel_name = choice(hotels)
    room_type = choice(room_types)
    total_amount = round(np.random.uniform(50, 1000), 2)

    hotel_records.append([
        booking_id, customer_id, hotel_name, city,
        room_type, check_in, check_out,
        booking_date, total_amount
    ])

df_hotel = pd.DataFrame(hotel_records, columns=hotel_columns)

# Persist the table next to the other generated datasets.
hotel_file_path = os.path.join(downloads_path, 'hotel_stays.csv')
df_hotel.to_csv(hotel_file_path, index=False)
print(f"✅ Hotel Stays saved to {hotel_file_path}")

# --------------------------------------------------------------------------------
# 3. Tourist Demographics JSON (1000 records)
# One profile per customer_id 1..1000, i.e. the id range the flight and hotel
# generators draw their customer_id values from.
demographic_records = [
    {
        "customer_id": customer_id,
        "first_name": fake.first_name(),
        "last_name": fake.last_name(),
        "gender": choice(['Male', 'Female', 'Other']),
        "age": randint(18, 70),
        "nationality": fake.country(),
        "travel_purpose": choice(purposes)
    }
    for customer_id in range(1, 1001)
]
df_demographics = pd.DataFrame(demographic_records)

# Written as a single pretty-printed JSON array of record objects.
demographics_file_path = os.path.join(downloads_path, 'tourist_demographics.json')
df_demographics.to_json(demographics_file_path, orient='records', indent=4)
print(f"✅ Tourist Demographics saved to {demographics_file_path}")

# --------------------------------------------------------------------------------
# 4. Attractions Visited CSV (1000 records)
# Each row is one attraction/day observation with a visitor count, the revenue
# those visitors generated and an average rating.

# Attraction -> host city lookup. It never changes, so it is built once here
# rather than on every loop iteration.
# NOTE(review): 'Beijing' does not appear in the `cities` list used for flights and
# hotels -- confirm whether that is intended.
city_map = {
    'Eiffel Tower': 'Paris',
    'Great Wall': 'Beijing',
    'Statue of Liberty': 'New York',
    'Sydney Opera House': 'Sydney',
    'Burj Khalifa': 'Dubai'
}

attraction_records = []
for i in range(1, 1001):
    attraction_name = choice(attractions)
    city = city_map[attraction_name]
    visit_date = fake.date_between(start_date='-1y', end_date='today')

    visitors_count = randint(50, 1000)
    price_per_visitor = round(np.random.uniform(10, 100), 2)
    # Round the product as well: an int times a 2-decimal float can carry binary
    # floating-point residue (e.g. 3 * 1.1 == 3.3000000000000003), which would end
    # up verbatim in the CSV's revenue column.
    revenue = round(visitors_count * price_per_visitor, 2)
    avg_rating = round(np.random.uniform(3, 5), 2)

    record = [
        i,
        attraction_name,
        city,
        visit_date,
        visitors_count,
        revenue,
        avg_rating
    ]
    attraction_records.append(record)

df_attractions = pd.DataFrame(attraction_records, columns=[
    'attraction_id', 'attraction_name', 'city',
    'visit_date', 'visitors_count', 'revenue', 'average_rating'
])

# Persist the table next to the other generated datasets.
attractions_file_path = os.path.join(downloads_path, 'attractions_visited.csv')
df_attractions.to_csv(attractions_file_path, index=False)
print(f"✅ Attractions Visited saved to {attractions_file_path}")

# --------------------------------------------------------------------------------
# Final confirmation. The folder is taken from `downloads_path` so the message always
# names the location the files above were actually written to.
print(f"\n🎉 All datasets successfully generated and saved in {downloads_path} folder!")


✅ Flight Bookings saved to /Users/swatisoni/Downloads/flight_bookings.csv
✅ Hotel Stays saved to /Users/swatisoni/Downloads/hotel_stays.csv
✅ Tourist Demographics saved to /Users/swatisoni/Downloads/tourist_demographics.json
✅ Attractions Visited saved to /Users/swatisoni/Downloads/attractions_visited.csv

🎉 All datasets successfully generated and saved in /Users/swatisoni/Downloads folder!


In [19]:
# Create separate DataFrames for each table.
# Every data dictionary is written as (field name, data type, description) triples
# and expanded into one dict per field, keyed by the three column headers below.
_DICTIONARY_COLUMNS = ("Field Name", "Data Type", "Description")


def _as_field_dicts(rows):
    """Expand (field name, data type, description) triples into column-keyed dicts."""
    return [dict(zip(_DICTIONARY_COLUMNS, row)) for row in rows]


# Flight Bookings Data Dictionary
flight_bookings_dict = _as_field_dicts([
    ("booking_id", "INTEGER", "Unique identifier for each flight booking"),
    ("customer_id", "INTEGER", "Unique identifier for the customer making the booking"),
    ("flight_number", "TEXT", "Alphanumeric flight number assigned to the flight"),
    ("airline", "TEXT", "Name of the airline"),
    ("origin_city", "TEXT", "Departure city for the flight"),
    ("destination_city", "TEXT", "Arrival city for the flight"),
    ("departure_date", "DATE", "Date when the flight departs"),
    ("arrival_date", "DATE", "Date when the flight arrives"),
    ("booking_date", "DATE", "Date when the flight was booked"),
    ("ticket_price", "DECIMAL(10,2)", "Price paid for the flight ticket"),
])
df_flight_bookings_dict = pd.DataFrame(flight_bookings_dict)

# Hotel Stays Data Dictionary
hotel_stays_dict = _as_field_dicts([
    ("booking_id", "INTEGER", "Unique identifier for each hotel booking"),
    ("customer_id", "INTEGER", "Unique identifier for the customer making the hotel reservation"),
    ("hotel_name", "TEXT", "Name of the hotel"),
    ("city", "TEXT", "City where the hotel is located"),
    ("room_type", "TEXT", "Type of room booked (Single, Double, Suite)"),
    ("check_in_date", "DATE", "Date when the customer checks into the hotel"),
    ("check_out_date", "DATE", "Date when the customer checks out of the hotel"),
    ("booking_date", "DATE", "Date when the hotel booking was made"),
    ("total_amount", "DECIMAL(10,2)", "Total amount paid for the hotel stay"),
])
df_hotel_stays_dict = pd.DataFrame(hotel_stays_dict)

# Tourist Demographics Data Dictionary
tourist_demographics_dict = _as_field_dicts([
    ("customer_id", "INTEGER", "Unique identifier for the tourist"),
    ("first_name", "TEXT", "First name of the tourist"),
    ("last_name", "TEXT", "Last name of the tourist"),
    ("gender", "TEXT", "Gender of the tourist (Male, Female, Other)"),
    ("age", "INTEGER", "Age of the tourist"),
    ("nationality", "TEXT", "Country of origin for the tourist"),
    ("travel_purpose", "TEXT", "Purpose of the tourist's travel (Leisure, Business, Education, Medical)"),
])
df_tourist_demographics_dict = pd.DataFrame(tourist_demographics_dict)

# Attractions Visited Data Dictionary
attractions_visited_dict = _as_field_dicts([
    ("attraction_id", "INTEGER", "Unique identifier for the attraction record"),
    ("attraction_name", "TEXT", "Name of the attraction visited"),
    ("city", "TEXT", "City where the attraction is located"),
    ("visit_date", "DATE", "Date when the attraction was visited"),
    ("visitors_count", "INTEGER", "Number of visitors to the attraction on that date"),
    ("revenue", "DECIMAL(10,2)", "Total revenue generated from visitors on that date"),
    ("average_rating", "DECIMAL(3,2)", "Average visitor rating for the attraction (scale of 1.0 to 5.0)"),
])
df_attractions_visited_dict = pd.DataFrame(attractions_visited_dict)

# Save all data dictionaries to separate CSV files.
# The target paths are derived from the shared `downloads_path` (set in the
# data-generation cell) instead of repeating the absolute folder four times, so the
# dictionaries always land in the same folder as the datasets they describe.
flight_csv = os.path.join(downloads_path, "flight_bookings_dictionary.csv")
hotel_csv = os.path.join(downloads_path, "hotel_stays_dictionary.csv")
tourist_csv = os.path.join(downloads_path, "tourist_demographics_dictionary.csv")
attractions_csv = os.path.join(downloads_path, "attractions_visited_dictionary.csv")

df_flight_bookings_dict.to_csv(flight_csv, index=False)
df_hotel_stays_dict.to_csv(hotel_csv, index=False)
df_tourist_demographics_dict.to_csv(tourist_csv, index=False)
df_attractions_visited_dict.to_csv(attractions_csv, index=False)

# Echo the written paths as the cell's output.
flight_csv, hotel_csv, tourist_csv, attractions_csv


('/Users/swatisoni/Downloads/flight_bookings_dictionary.csv',
 '/Users/swatisoni/Downloads/hotel_stays_dictionary.csv',
 '/Users/swatisoni/Downloads/tourist_demographics_dictionary.csv',
 '/Users/swatisoni/Downloads/attractions_visited_dictionary.csv')

In [4]:
import json

# Input and output file paths
# NOTE(review): the input is read from "datawarehouse/Data Source /" (the trailing
# space is part of the folder name), not from the folder the generator cell writes
# tourist_demographics.json to -- presumably the file was moved by hand; confirm.
input_file = '/Users/swatisoni/Downloads/datawarehouse/Data Source /tourist_demographics.json'
output_file = '/Users/swatisoni/Downloads/tourist_demographics_lines.json'

# Read the JSON array from the input file.
# The encoding is pinned to UTF-8 so that parsing does not depend on the platform's
# default locale encoding.
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Write as JSON Lines to the output file: one JSON object per line.
with open(output_file, 'w', encoding='utf-8') as f:
    for record in data:
        json_line = json.dumps(record)
        f.write(json_line + '\n')

print("Conversion to JSON Lines completed!")


Conversion to JSON Lines completed!
