#### APAN5310 Project Checkpoint4
by Yulong Zhang(yz4461)
Kaitai Yang(ky2493)

Import Required Packages

In [1]:
import pandas as pd
from sqlalchemy import create_engine, text
import numpy as np
import random
from datetime import datetime

Create an engine that connects to PostgreSQL

In [2]:
#Please add a new database in Postgre, and modify the following code to your own database connection, but the rest of the code should be the same.
conn_url = 'postgresql://postgres:123@localhost:5432/5310_project_checkpont4'

In [3]:
engine = create_engine(conn_url)
#establish a connection
connection = engine.connect()

Creating schemas by using SQL codes from Checkpoint3:

In [4]:
#pass sql statement
stmt = """
DROP TABLE IF EXISTS reviews;
DROP TABLE IF EXISTS car_rental_bookings;
DROP TABLE IF EXISTS hotel_bookings;
DROP TABLE IF EXISTS flight_bookings;
DROP TABLE IF EXISTS flight_search_recommendations;
DROP TABLE IF EXISTS car_rental_search_recommendations;
DROP TABLE IF EXISTS hotel_search_recommendations;
DROP TABLE IF EXISTS user_preferences;
DROP TABLE IF EXISTS payment_information;
DROP TABLE IF EXISTS discounts;
DROP TABLE IF EXISTS car_types;
DROP TABLE IF EXISTS rental_companies;
DROP TABLE IF EXISTS room_types;
DROP TABLE IF EXISTS hotels;
DROP TABLE IF EXISTS flights;
DROP TABLE IF EXISTS airlines;
DROP TABLE IF EXISTS customers;
DROP TABLE IF EXISTS locations;


-- locations table
CREATE TABLE locations (
    location_id SERIAL PRIMARY KEY,
    address VARCHAR(255) NOT NULL,
    city VARCHAR(255),
    state VARCHAR(255),
    country VARCHAR(255) NOT NULL
);

-- customers table
CREATE TABLE customers (
    customer_id SERIAL PRIMARY KEY,
    first_name VARCHAR(255) NOT NULL,
    last_name VARCHAR(255) NOT NULL,
    email VARCHAR(255) NOT NULL,
    phone_number CHAR(10),
    loyalty_degree INTEGER CHECK (loyalty_degree BETWEEN 1 AND 5) NOT NULL,
    customer_address INTEGER NOT NULL,
    FOREIGN KEY (customer_address) REFERENCES locations(location_id)
);

-- airlines table
CREATE TABLE airlines (
    airline_id SERIAL PRIMARY KEY,
    airline_name VARCHAR(255) NOT NULL
);

-- flights table
CREATE TABLE flights (
    flight_id SERIAL PRIMARY KEY,
    airline_id INTEGER NOT NULL,
    depart_airport CHAR(3) NOT NULL,
    depart_location INTEGER NOT NULL,
    destin_airport CHAR(3) NOT NULL,
    destin_location INTEGER NOT NULL,
    departure_time TIMESTAMP NOT NULL,
    arrival_time TIMESTAMP NOT NULL,
    duration INTERVAL NOT NULL,
    price DECIMAL(10, 2) NOT NULL,
    FOREIGN KEY (airline_id) REFERENCES airlines(airline_id),
    FOREIGN KEY (depart_location) REFERENCES locations(location_id),
    FOREIGN KEY (destin_location) REFERENCES locations(location_id)
);

-- hotels table
CREATE TABLE hotels (
    hotel_id SERIAL PRIMARY KEY,
    hotel_name VARCHAR(255) NOT NULL,
    hotel_address INTEGER NOT NULL,
    star_rating INTEGER NOT NULL,
    FOREIGN KEY (hotel_address) REFERENCES locations(location_id)
);

-- room_type table
CREATE TABLE room_types (
    room_type_id SERIAL PRIMARY KEY,
    room_type VARCHAR(255) NOT NULL UNIQUE,
    capacity INTEGER NOT NULL
);

-- rental_company table
CREATE TABLE rental_companies (
    rental_company_id SERIAL PRIMARY KEY,
    rental_company_name VARCHAR(255) NOT NULL,
    rental_company_address INTEGER NOT NULL,
    FOREIGN KEY (rental_company_address) REFERENCES locations(location_id)
);

-- car_type table
CREATE TABLE car_types (
    car_type_id SERIAL PRIMARY KEY,
    car_type VARCHAR(255) NOT NULL,
    manufacturer VARCHAR(255) NOT NULL,
    model VARCHAR(255) NOT NULL
);

-- discount table
CREATE TABLE discounts (
    customer_id INTEGER PRIMARY KEY,
    discount_degree DECIMAL(3, 2),
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
);

-- payment_information table
CREATE TABLE payment_information (
    payment_id SERIAL PRIMARY KEY,
    customer_id INTEGER NOT NULL,
    card_details VARCHAR(19) NOT NULL,
    payment DECIMAL(10, 2) NOT NULL,
    billing_address INTEGER NOT NULL,
    discount INTEGER,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    FOREIGN KEY (billing_address) REFERENCES locations(location_id),
    FOREIGN KEY (discount) REFERENCES discounts(customer_id)
);

-- user_preferences table
CREATE TABLE user_preferences (
    customer_id INTEGER PRIMARY KEY,
    preferred_hotel_id INTEGER,
    preferred_car_model_id INTEGER,
    preferred_airline_id INTEGER,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    FOREIGN KEY (preferred_hotel_id) REFERENCES hotels(hotel_id),
    FOREIGN KEY (preferred_car_model_id) REFERENCES car_types(car_type_id),
    FOREIGN KEY (preferred_airline_id) REFERENCES airlines(airline_id)
);

-- hotel_search_recommendation table
CREATE TABLE hotel_search_recommendations (
    hotel_id INTEGER PRIMARY KEY,
    room_type_id INTEGER,
    avg_price DECIMAL(10, 2),
    avg_review_score DECIMAL(3, 2),
    FOREIGN KEY (hotel_id) REFERENCES hotels(hotel_id),
    FOREIGN KEY (room_type_id) REFERENCES room_types(room_type_id)
);

-- car_rental_search_recommendation table
CREATE TABLE car_rental_search_recommendations (
    rental_company_id INTEGER PRIMARY KEY,
    car_type_id INTEGER,
    avg_price DECIMAL(10, 2),
    avg_review_score DECIMAL(3, 2),
    FOREIGN KEY (rental_company_id) REFERENCES rental_companies(rental_company_id),
    FOREIGN KEY (car_type_id) REFERENCES car_types(car_type_id)
);

-- flight_search_recommendation table
CREATE TABLE flight_search_recommendations (
    flight_id INTEGER PRIMARY KEY,
    avg_price DECIMAL(10, 2),
    avg_review_score DECIMAL(3, 2),
    FOREIGN KEY (flight_id) REFERENCES flights(flight_id)
);

-- flight_bookings table
CREATE TABLE flight_bookings (
    booking_id SERIAL PRIMARY KEY,
    customer_id INTEGER NOT NULL,
    payment_id INTEGER NOT NULL,
    flight_id INTEGER,
    adults_count INTEGER,
    children_count INTEGER,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    FOREIGN KEY (payment_id) REFERENCES payment_information(payment_id),
    FOREIGN KEY (flight_id) REFERENCES flights(flight_id)
);

-- hotel_bookings table
CREATE TABLE hotel_bookings (
    booking_id SERIAL PRIMARY KEY,
    customer_id INTEGER NOT NULL,
    payment_id INTEGER NOT NULL,
    hotel_id INTEGER,
    room_type INTEGER,
    adults_count INTEGER,
    children_count INTEGER,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    FOREIGN KEY (payment_id) REFERENCES payment_information(payment_id),
    FOREIGN KEY (hotel_id) REFERENCES hotels(hotel_id),
    FOREIGN KEY (room_type) REFERENCES room_types(room_type_id)
);

-- car_rental_bookings table
CREATE TABLE car_rental_bookings (
    booking_id SERIAL PRIMARY KEY,
    customer_id INTEGER NOT NULL,
    payment_id INTEGER NOT NULL,
    rental_company_id INTEGER,
    car_type INTEGER,
    adults_count INTEGER,
    children_count INTEGER,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    FOREIGN KEY (payment_id) REFERENCES payment_information(payment_id),
    FOREIGN KEY (rental_company_id) REFERENCES rental_companies(rental_company_id),
    FOREIGN KEY (car_type) REFERENCES car_types(car_type_id)
);

-- reviews table
CREATE TABLE reviews (
    review_id SERIAL PRIMARY KEY,
    customer_id INTEGER NOT NULL,
    hotel_booking_id INTEGER,
    car_rental_booking_id INTEGER,
    flight_booking_id INTEGER,
    date DATE NOT NULL,
    review_text TEXT,
    review_score DECIMAL(2,1) NOT NULL,
    --FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    --FOREIGN KEY (hotel_booking_id) REFERENCES hotel_bookings(booking_id),
    --FOREIGN KEY (car_rental_booking_id) REFERENCES car_rental_bookings(booking_id),
    FOREIGN KEY (flight_booking_id) REFERENCES flight_bookings(booking_id)
);


SELECT * FROM Customers;
"""
connection.execute(stmt).fetchall()
#now have no values in tables

[]

Since Online datasets can not sufficiently fulfill our data schemas, we will generate our own data to fill the data warehouse.

In [5]:
#we generated our own data in this part to make sure that our project is complete and accessiable in final submission/analysis.
#Once again, the point of our ETL pipeline is to show our project is feasible, and the data is only the means to the end.
#case on working with real world data will be demonstrated in later section

#code for generating the data:
'''
num_rows = 600

locations_df = pd.DataFrame({
    'location_id': range(1, num_rows + 1),
    'address': [f'Address {i}' for i in range(1, num_rows + 1)],
    'city': [f'City {random.randint(1, 100)}' for _ in range(num_rows)],
    'state': [f'State {random.randint(1, 50)}' for _ in range(num_rows)],
    'country': [f'Country {random.randint(1, 50)}' for _ in range(num_rows)],
})

locations_df.to_csv('locations.csv', index=False)

# 100 common first name
first_names = [
    "James", "John", "Robert", "Michael", "William", "David", "Richard", "Charles", "Joseph", "Thomas",
    "Christopher", "Daniel", "Paul", "Mark", "Donald", "George", "Kenneth", "Steven", "Edward", "Brian",
    "Ronald", "Anthony", "Kevin", "Jason", "Matthew", "Gary", "Timothy", "Jose", "Larry", "Jeffrey",
    "Frank", "Scott", "Eric", "Stephen", "Andrew", "Raymond", "Gregory", "Joshua", "Jerry", "Dennis",
    "Walter", "Patrick", "Peter", "Harold", "Douglas", "Henry", "Carl", "Arthur", "Ryan", "Roger",
    "Joe", "Juan", "Jack", "Albert", "Jonathan", "Justin", "Terry", "Gerald", "Keith", "Samuel",
    "Willie", "Ralph", "Lawrence", "Nicholas", "Roy", "Benjamin", "Bruce", "Brandon", "Adam", "Harry",
    "Fred", "Wayne", "Billy", "Steve", "Louis", "Jeremy", "Aaron", "Randy", "Howard", "Eugene",
    "Carlos", "Russell", "Bobby", "Victor", "Martin", "Ernest", "Phillip", "Todd", "Jesse", "Craig",
    "Alan", "Shawn", "Clarence", "Sean", "Philip", "Chris", "Johnny", "Earl", "Jimmy", "Antonio"
]

# 100 common last name
last_names = [
    "Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson", "Moore", "Taylor",
    "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin", "Thompson", "Garcia", "Martinez", "Robinson",
    "Clark", "Rodriguez", "Lewis", "Lee", "Walker", "Hall", "Allen", "Young", "Hernandez", "King",
    "Wright", "Lopez", "Hill", "Scott", "Green", "Adams", "Baker", "Gonzalez", "Nelson", "Carter",
    "Mitchell", "Perez", "Roberts", "Turner", "Phillips", "Campbell", "Parker", "Evans", "Edwards", "Collins",
    "Stewart", "Sanchez", "Morris", "Rogers", "Reed", "Cook", "Morgan", "Bell", "Murphy", "Bailey",
    "Rivera", "Cooper", "Richardson", "Cox", "Howard", "Ward", "Torres", "Peterson", "Gray", "Ramirez",
    "James", "Watson", "Brooks", "Kelly", "Sanders", "Price", "Bennett", "Wood", "Barnes", "Ross",
    "Henderson", "Coleman", "Jenkins", "Perry", "Powell", "Long", "Patterson", "Hughes", "Flores", "Washington"
]

customers_df = pd.DataFrame({
    'customer_id': range(1, num_rows + 1),
    'first_name': [random.choice(first_names) for _ in range(num_rows)],
    'last_name': [random.choice(last_names) for _ in range(num_rows)],
    'email': [random_email() for _ in range(num_rows)],
    'phone_number': [str(random.randint(1000000000, 9999999999)) for _ in range(num_rows)],
    'loyalty_degree': [random.randint(1, 5) for _ in range(num_rows)],
    'customer_address': [random.choice(locations_df['location_id'].tolist()) for _ in range(num_rows)],
})

customers_df.to_csv('customers.csv', index=False)

airlines_df = pd.DataFrame({
    'airline_id': range(1, 11),  # Assume 10 airlines
    'airline_name': [f'Airline {i}' for i in range(1, 11)],
})

airlines_df.to_csv('airlines.csv', index=False)

from datetime import timedelta

flights_df = pd.DataFrame({
    'flight_id': range(1, num_rows + 1),
    'airline_id': [random.choice(airlines_df['airline_id'].tolist()) for _ in range(num_rows)],
    'depart_airport': [f'{random_string(3).upper()}' for _ in range(num_rows)],
    'depart_location': [random.choice(locations_df['location_id'].tolist()) for _ in range(num_rows)],
    'destin_airport': [f'{random_string(3).upper()}' for _ in range(num_rows)],
    'destin_location': [random.choice(locations_df['location_id'].tolist()) for _ in range(num_rows)],
    'departure_time': pd.date_range(start='2022-01-01', periods=num_rows, freq='H'),
    'arrival_time': pd.date_range(start='2022-01-01', periods=num_rows, freq='H') + timedelta(hours=5),
    'duration': [timedelta(hours=random.randint(1, 15)) for _ in range(num_rows)],
    'price': [random.uniform(50, 1000) for _ in range(num_rows)]
})

flights_df.to_csv('flights.csv', index=False)

hotels_df = pd.DataFrame({
    'hotel_id': range(1, num_rows + 1),
    'hotel_name': [f'Hotel {i}' for i in range(1, num_rows + 1)],
    'hotel_address': [random.choice(locations_df['location_id'].tolist()) for _ in range(num_rows)],
    'star_rating': [random.randint(1, 5) for _ in range(num_rows)],
})

hotels_df.to_csv('hotels.csv', index=False)

room_types_df = pd.DataFrame({
    'room_type_id': range(1, 6),
    'room_type': ['Single', 'Double', 'Triple', 'Quad', 'Suite'],
    'capacity': [1, 2, 3, 4, 5]
})

room_types_df.to_csv('room_types.csv', index=False)



# Fake rental company names
rental_company_names = [
    'Speedy Rentals', 'City Cars', 'Express Auto', 'Budget Rentals',
    'Deluxe Cars', 'Easy Drive', 'Prestige Autos', 'Economy Rentals',
    'First Class Cars', 'Luxury Drive', 'Savvy Rentals', 'Prime Autos'
]

#Select the location ID, which is randomly selected from the previously created locations_df
rental_company_addresses = random.choices(locations_df['location_id'].tolist(), k=len(rental_company_names))

# create a dataframe to show the rental_companies table
rental_companies_df = pd.DataFrame({
    'rental_company_id': range(1, len(rental_company_names) + 1),
    'rental_company_name': rental_company_names,
    'rental_company_address': rental_company_addresses
})

# save the rental_companies table to a csv file
rental_companies_df.to_csv('rental_companies.csv', index=False)

car_types_df = pd.DataFrame({
    'car_type_id': range(1, 101),  # Assume 100 different types of cars
    'car_type': [random.choice(['Sedan', 'SUV', 'Convertible', 'Hatchback', 'Coupe']) for _ in range(100)],
    'manufacturer': [random.choice(['Toyota', 'Honda', 'Ford', 'BMW', 'Audi']) for _ in range(100)],
    'model': [random_string(5) for _ in range(100)]  # Assume 5 characters for model name
})

car_types_df.to_csv('car_types.csv', index=False)


discounts_df = pd.DataFrame({
    'customer_id': range(1, 601),
    'discount_degree': [round(random.uniform(0.00, 0.20), 2) for _ in range(600)]  # discount is from 0% to 20%
})

discounts_df.to_csv('discounts.csv', index=False)


payment_information_df = pd.DataFrame({
    'payment_id': range(1, 601),
    'customer_id': range(1, 601),
    'card_details': [str(random.randint(1000_0000_0000_0000, 9999_9999_9999_9999)) for _ in range(600)],
    'payment': [round(random.uniform(100, 5000), 2) for _ in range(600)],
    'billing_address': random.choices(locations_df['location_id'].tolist(), k=600),
    'discount': random.choices(discounts_df['customer_id'].tolist(), k=600)
})

payment_information_df.to_csv('payment_information.csv', index=False)


user_preferences_df = pd.DataFrame({
    'customer_id': range(1, 601),
    'preferred_hotel_id': random.choices(hotels_df['hotel_id'].tolist(), k=600),
    'preferred_car_model_id': random.choices(car_types_df['car_type_id'].tolist(), k=600),
    'preferred_airline_id': random.choices(airlines_df['airline_id'].tolist(), k=600)
})

user_preferences_df.to_csv('user_preferences.csv', index=False)


hotel_search_recommendations_df = pd.DataFrame({
    'hotel_id': hotels_df['hotel_id'],
    'room_type_id': random.choices(room_types_df['room_type_id'].tolist(), k=len(hotels_df)),
    'avg_price': [round(random.uniform(50, 300), 2) for _ in range(len(hotels_df))],
    'avg_review_score': [round(random.uniform(1, 5), 2) for _ in range(len(hotels_df))]
})

hotel_search_recommendations_df.to_csv('hotel_search_recommendations.csv', index=False)


car_rental_search_recommendations_df = pd.DataFrame({
    'rental_company_id': rental_companies_df['rental_company_id'],
    'car_type_id': random.choices(car_types_df['car_type_id'].tolist(), k=len(rental_companies_df)),
    'avg_price': [round(random.uniform(30, 100), 2) for _ in range(len(rental_companies_df))],
    'avg_review_score': [round(random.uniform(1, 5), 2) for _ in range(len(rental_companies_df))]
})

car_rental_search_recommendations_df.to_csv('car_rental_search_recommendations.csv', index=False)


flight_search_recommendations_df = pd.DataFrame({
    'flight_id': flights_df['flight_id'],
    'avg_price': [round(random.uniform(100, 1000), 2) for _ in range(len(flights_df))],
    'avg_review_score': [round(random.uniform(1, 5), 2) for _ in range(len(flights_df))]
})

flight_search_recommendations_df.to_csv('flight_search_recommendations.csv', index=False)

flight_bookings_df = pd.DataFrame({
    'booking_id': range(1, 601),
    'customer_id': range(1, 601),
    'payment_id': range(1, 601),
    'flight_id': random.choices(flights_df['flight_id'].tolist(), k=600),
    'adults_count': [random.randint(1, 5) for _ in range(600)],
    'children_count': [random.randint(0, 3) for _ in range(600)]
})

flight_bookings_df.to_csv('flight_bookings.csv', index=False)


hotel_bookings_df = pd.DataFrame({
    'booking_id': range(1, 601),
    'customer_id': range(1, 601),
    'payment_id': range(1, 601),
    'hotel_id': random.choices(hotels_df['hotel_id'].tolist(), k=600),
    'room_type': random.choices(room_types_df['room_type_id'].tolist(), k=600),
    'adults_count': [random.randint(1, 5) for _ in range(600)],
    'children_count': [random.randint(0, 3) for _ in range(600)]
})

hotel_bookings_df.to_csv('hotel_bookings.csv', index=False)



car_rental_bookings_df = pd.DataFrame({
    'booking_id': range(1, 601),
    'customer_id': range(1, 601),
    'payment_id': range(1, 601),
    'rental_company_id': random.choices(rental_companies_df['rental_company_id'].tolist(), k=600),
    'car_type': random.choices(car_types_df['car_type_id'].tolist(), k=600),
    'adults_count': [random.randint(1, 5) for _ in range(600)],
    'children_count': [random.randint(0, 3) for _ in range(600)]
})

car_rental_bookings_df.to_csv('car_rental_bookings.csv', index=False)



reviews_df = pd.DataFrame({
    'review_id': range(1, 601),
    'customer_id': range(1, 601),
    'hotel_booking_id': random.choices(hotel_bookings_df['booking_id'].tolist(), k=600),
    'car_rental_booking_id': random.choices(car_rental_bookings_df['booking_id'].tolist(), k=600),
    'flight_booking_id': random.choices(flight_bookings_df['booking_id'].tolist(), k=600),
    'date': pd.date_range(start='1/1/2020', periods=600),
    'review_text': [random_string(200) for _ in range(600)]
})

reviews_df.to_csv('reviews.csv', index=False)
'''

'\nnum_rows = 600\n\nlocations_df = pd.DataFrame({\n    \'location_id\': range(1, num_rows + 1),\n    \'address\': [f\'Address {i}\' for i in range(1, num_rows + 1)],\n    \'city\': [f\'City {random.randint(1, 100)}\' for _ in range(num_rows)],\n    \'state\': [f\'State {random.randint(1, 50)}\' for _ in range(num_rows)],\n    \'country\': [f\'Country {random.randint(1, 50)}\' for _ in range(num_rows)],\n})\n\nlocations_df.to_csv(\'locations.csv\', index=False)\n\n# 100 common first name\nfirst_names = [\n    "James", "John", "Robert", "Michael", "William", "David", "Richard", "Charles", "Joseph", "Thomas",\n    "Christopher", "Daniel", "Paul", "Mark", "Donald", "George", "Kenneth", "Steven", "Edward", "Brian",\n    "Ronald", "Anthony", "Kevin", "Jason", "Matthew", "Gary", "Timothy", "Jose", "Larry", "Jeffrey",\n    "Frank", "Scott", "Eric", "Stephen", "Andrew", "Raymond", "Gregory", "Joshua", "Jerry", "Dennis",\n    "Walter", "Patrick", "Peter", "Harold", "Douglas", "Henry", "Carl", "

In [6]:
#define a function that push all data to the table
def import_csv_to_table(csv_file, table_name):
    file_path = "C:\\Users\\zzyul\\OneDrive\\Desktop\\School\\Columbia\\Summer2023\\APAN5310\\Project\\CheckPoint4\\generated_datasets\\" + csv_file
    df = pd.read_csv(file_path)
    df.to_sql(table_name, engine, index=False, if_exists='append')

#push each csv file to the table
import_csv_to_table('locations.csv', 'locations')
import_csv_to_table('airlines.csv', 'airlines')
import_csv_to_table('car_types.csv', 'car_types')
import_csv_to_table('room_types.csv', 'room_types')
import_csv_to_table('rental_companies.csv', 'rental_companies')
import_csv_to_table('hotels.csv', 'hotels')
import_csv_to_table('customers.csv', 'customers')
import_csv_to_table('flights.csv', 'flights')
import_csv_to_table('discounts.csv', 'discounts')
import_csv_to_table('payment_information.csv', 'payment_information')
import_csv_to_table('user_preferences.csv', 'user_preferences')
import_csv_to_table('hotel_search_recommendations.csv', 'hotel_search_recommendations')
import_csv_to_table('car_rental_search_recommendations.csv', 'car_rental_search_recommendations')
import_csv_to_table('flight_search_recommendations.csv', 'flight_search_recommendations')
import_csv_to_table('flight_bookings.csv', 'flight_bookings')
import_csv_to_table('hotel_bookings.csv', 'hotel_bookings')
import_csv_to_table('car_rental_bookings.csv', 'car_rental_bookings')
#import_csv_to_table('reviews.csv', 'reviews')

print("All CSV files imported successfully!")

All CSV files imported successfully!


In [7]:
#print some generated results from the xx table
stmt = """
SELECT * FROM customers LIMIT 20;
"""

connection.execute(stmt).fetchall()

[(1, 'Victor', 'Reed', 'uBXvZlcHBg@TDBEc.com', '2619142097', 3, 320),
 (2, 'Michael', 'Cook', 'UPxEfyUMUC@UNLyL.com', '5067862620', 3, 265),
 (3, 'Jose', 'Anderson', 'kPSHEaGcTQ@MNOsX.com', '8011355067', 3, 405),
 (4, 'Jesse', 'Anderson', 'oZHbOrykeh@GRvom.com', '5341572200', 2, 538),
 (5, 'Antonio', 'Morris', 'tebldcBbSj@eHVHl.com', '1288120527', 1, 451),
 (6, 'Eric', 'Martin', 'rJnxYVtFdc@WtGFD.com', '3327473156', 5, 117),
 (7, 'Robert', 'Thomas', 'ImHAfMKGso@ZoTAL.com', '9137037832', 2, 13),
 (8, 'Shawn', 'Lee', 'khkejjXhZG@RVwaQ.com', '7818908039', 4, 214),
 (9, 'Dennis', 'Adams', 'VYWdlgQUDe@BkcXs.com', '8577479354', 2, 125),
 (10, 'Henry', 'Murphy', 'xRyOCRMsiQ@JkgJp.com', '5726368714', 3, 61),
 (11, 'Matthew', 'Jones', 'wYqBRknvgM@hmrzC.com', '4858071912', 2, 169),
 (12, 'Steven', 'Nelson', 'zqrWVCVgVg@RqkJk.com', '8686122445', 5, 205),
 (13, 'Craig', 'Hill', 'lyDsrnyIDe@hwTWK.com', '7860372967', 5, 327),
 (14, 'Brandon', 'Howard', 'uFpaCYTUUM@JTclb.com', '3330688611', 5, 421),


Use the datasets we found from Kaggle, import some data to our database.

In [8]:
#this part is to show that our ETL pipeline is capable of handling real world datasets
#therefore only a small demonstration is shown here using hotel reviews dataset from Kaggle
#since we cannot find a dataset online that exactly matches the database schema that we design.

df = pd.read_csv('Hotel_Reviews.csv')
#EDA
print(df.head())
print(len(df))
print(df.columns)

                                       Hotel_Address  \
0   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
1   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
2   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
3   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   
4   s Gravesandestraat 55 Oost 1092 AA Amsterdam ...   

   Additional_Number_of_Scoring Review_Date  Average_Score   Hotel_Name  \
0                           194    8/3/2017            7.7  Hotel Arena   
1                           194    8/3/2017            7.7  Hotel Arena   
2                           194   7/31/2017            7.7  Hotel Arena   
3                           194   7/31/2017            7.7  Hotel Arena   
4                           194   7/24/2017            7.7  Hotel Arena   

  Reviewer_Nationality                                    Negative_Review  \
0              Russia    I am so angry that i made this post available...   
1             Ireland                                     

In [9]:
#INSERT INTO reviews table
#insert reviews that are positive when random number is 1, negative when random number is 0
for i in range(600):
    random_int = random.randint(0, 1)
    if random == 1:
        k = 6
    else:
        k = 9

    stmt = f"""
    INSERT INTO reviews(customer_id, hotel_booking_id, car_rental_booking_id, flight_booking_id, date, review_text, review_score)
    VALUES({i+1},{i+1},{i+1},{i+1},'{datetime.strptime(df.iloc[i,2], '%m/%d/%Y').strftime('%Y/%m/%d')}','{df.iloc[i, k]}', {df.iloc[i,3]});
    """
    connection.execute(stmt)


In [10]:
#print some generated results from the xx table
stmt = """
SELECT * FROM reviews LIMIT 20;
"""

connection.execute(stmt).fetchall()

[(1, 1, 1, 1, 1, datetime.date(2017, 8, 3), ' Only the park outside of the hotel was beautiful ', Decimal('7.7')),
 (2, 2, 2, 2, 2, datetime.date(2017, 8, 3), ' No real complaints the hotel was great great location surroundings rooms amenities and service Two recommendations however firstly the staff upon ch ... (313 characters truncated) ... even a wrap or toasted sandwich option would be great Aside from those minor minor things fantastic spot and will be back when i return to Amsterdam ', Decimal('7.7')),
 (3, 3, 3, 3, 3, datetime.date(2017, 7, 31), ' Location was good and staff were ok It is cute hotel the breakfast range is nice Will go back ', Decimal('7.7')),
 (4, 4, 4, 4, 4, datetime.date(2017, 7, 31), ' Great location in nice surroundings the bar and restaurant are nice and have a lovely outdoor area The building also has quite some character ', Decimal('7.7')),
 (5, 5, 5, 5, 5, datetime.date(2017, 7, 24), ' Amazing location and building Romantic setting ', Decimal('7.7')),
 (