# Clean Data

In [1]:
import pandas as pd
import numpy as np
import json
import csv

In [2]:
from faker import Faker

In [3]:
import datetime

In [4]:
import random
import re

## Customer Info

Customer Profile

In [5]:
#Randomly Generate Customer Personal Info
fake = Faker()

num_cus = 10000

profiles = []
for i in range(num_cus):
    first_name = fake.first_name()
    last_name = fake.last_name()
    email = fake.email()
    profiles.append((first_name, last_name, email))
    
customer_df = pd.DataFrame(profiles).reset_index().rename(columns={'index':'customer_id',
                                                    0: 'first_name',
                                                    1: 'last_name',
                                                    2: 'email'})

#Randomly Generate royalty points with a specific distribution
weights = [0.6, 0.2, 0.15, 0.05]
royalty_points = np.random.choice([np.random.randint(1, 501),
                                   np.random.randint(501, 3001),
                                   np.random.randint(3001, 10001),
                                   np.random.randint(10001, 100000)],
                                   size=num_cus, p=weights)
customer_df['royalty_points'] = royalty_points

#Add royalty level base on royalty points. 

#royalty 分数
#1-500 Silver
#500-3000 Gold
#3000-10000 Platium
#10000+ Diamond

conditions = [
    (customer_df['royalty_points'] >= 1) & (customer_df['royalty_points'] <= 500),
    (customer_df['royalty_points'] > 500) & (customer_df['royalty_points'] <= 3000),
    (customer_df['royalty_points'] > 3000) & (customer_df['royalty_points'] <= 10000),
    (customer_df['royalty_points'] > 10000)
]
values = ['Silver', 'Gold', 'Platinum', 'Diamond']

customer_df['royalty_level'] = np.select(conditions, values)

In [6]:
customer_df.to_csv('customer_profiles.csv', index=False)

In [7]:
customer_df

Unnamed: 0,customer_id,first_name,last_name,email,royalty_points,royalty_level
0,0,Sara,Phillips,morgan34@example.org,77553,Diamond
1,1,Juan,Cummings,tamaralopez@example.net,456,Silver
2,2,Jeanne,Riley,taylorjacob@example.org,456,Silver
3,3,Lisa,Santos,michellesimpson@example.org,7196,Platinum
4,4,Kaitlyn,Watkins,ksmith@example.net,456,Silver
...,...,...,...,...,...,...
9995,9995,Edgar,Woods,lewisjoseph@example.com,2383,Gold
9996,9996,Cameron,Barnes,ebeck@example.net,456,Silver
9997,9997,Erin,Sanders,brianolsen@example.com,77553,Diamond
9998,9998,Alex,Collins,janetrussell@example.org,2383,Gold


In [8]:
customer_id_list = customer_df['customer_id'].tolist()

Membership_status Table

In [9]:
membership_status_df = pd.DataFrame({
    'royalty_level':['Silver', 'Gold', 'Platinum', 'Diamond'],
    'discount': [0.98,0.95,0.9,0.85]
})

In [10]:
membership_status_df

Unnamed: 0,royalty_level,discount
0,Silver,0.98
1,Gold,0.95
2,Platinum,0.9
3,Diamond,0.85


## Car Profiles

In [11]:
car_df_full = pd.read_csv('CarRentalData.csv')

car_df_full = car_df_full.rename(columns={
    'vehicle.make': 'car_brand',
    'vehicle.model': 'car_model',
    'vehicle.type': 'car_type',
    'fuelType': 'fuel_type',
    'vehicle.year': 'car_year',
    'owner.id': 'owner_id'
})

In [12]:
car_df = car_df_full[['car_brand',
                      'car_model',
                      'car_type',
                      'fuel_type',
                      'car_year',
                      'owner_id']]

#Randomly Generate color for cars
colors = np.random.choice(['red', 'blue', 'green','grey','black','white','silver','yellow','champagne'], 
                          size=len(car_df))
car_df['color'] = colors

#Randomly Generate plate numbers for cars


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car_df['color'] = colors


In [13]:
unique_plate_numbers = set()

# Generate plate numbers until the desired number of unique values is reached
while len(unique_plate_numbers) < len(car_df):
    plate_number = ''.join(np.random.choice(list('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'), size=5))
    unique_plate_numbers.add(plate_number)

plate_numbers = list(unique_plate_numbers)
car_df['plate_number'] = plate_numbers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car_df['plate_number'] = plate_numbers


In [14]:
car_df.iloc[:5,]

Unnamed: 0,car_brand,car_model,car_type,fuel_type,car_year,owner_id,color,plate_number
0,Tesla,Model X,suv,ELECTRIC,2019,12847615,champagne,8S8SG
1,Tesla,Model X,suv,ELECTRIC,2018,15621242,red,ZX1FN
2,Toyota,Prius,car,HYBRID,2012,10199256,silver,KMZA9
3,Ford,Mustang,car,GASOLINE,2018,9365496,yellow,ULFFC
4,Chrysler,Sebring,car,GASOLINE,2010,3553565,champagne,RR18T


In [15]:
plate_list = car_df['plate_number'].tolist()

## Hotel Profiles

Hotels

In [16]:
with open("hotels.json",encoding='utf-8') as file:
    hotels_data = json.load(file)

In [17]:
hotels = []
for item in hotels_data['root']['page']:
    hotel = {}
    if 'hotel_id' in item['record']:
        hotel['hotel_id'] = item['record']['hotel_id']
        hotel['hotel_name'] = item['record']['hotel_name']
    hotels.append(hotel)

In [18]:
hotels_df = pd.DataFrame(hotels)
hotels_df = hotels_df.drop_duplicates(subset='hotel_id').reset_index(drop=True)
hotels_df.dropna(subset=['hotel_id'], inplace=True)

In [19]:
hotels_df

Unnamed: 0,hotel_id,hotel_name
0,4521185,OYO 23501 Log Inn Plaza
1,5323150,SPOT ON 40211 Hotel Prakash Inn SPOT
2,1662781,OYO Flagship 205 Bandhan
3,4048734,Paradise Rooms in Gangtok
4,5617914,Dream house. G-72/2
...,...,...
15443,4031776,OYO Flagship 15698 Citiotel Shivajinagar
15444,4625077,OYO 19008 Hotel Doon 999
15445,5556801,Hexa Royal Residency
15446,5311720,Illas Domain Orange Apartment


In [20]:
hotels_id_list = hotels_df['hotel_id'].tolist()

Hotel Rooms

In [21]:
rooms = []
for item in hotels_data['root']['page']:
    if 'hotel_id' in item['record']:
        hotel_id = item['record']['hotel_id']
        for room_type in item['record']['room_type']:
            room = {}
            room['hotel_id'] = hotel_id
            room['room_type_name'] = room_type['room_type_name']
            room['room_type_price'] = room_type['room_type_price']
            room['room_type_occupancy'] = room_type['room_type_occupancy']
            room['room_type_breakfast'] = room_type['room_type_breakfast']
            room['room_type_cancellation'] = room_type['room_type_cancellation']
            rooms.append(room)

room_types_df = pd.DataFrame(rooms)

In [22]:
room_types_df[:5]

Unnamed: 0,hotel_id,room_type_name,room_type_price,room_type_occupancy,room_type_breakfast,room_type_cancellation
0,4521185,Small Double Room,1304.0,2,breakfast,free_cancellation
1,4521185,Small Double Room,1077.0,1,breakfast,free_cancellation
2,4521185,Classic Triple Room,1379.0,2,Room Only,free_cancellation
3,4521185,Classic Triple Room,1663.0,3,Room Only,free_cancellation
4,4521185,Classic Triple Room,1077.0,1,Room Only,free_cancellation


In [23]:
def random_room_number():
    room_number = str(random.randint(1, 5)) + str(random.randint(0, 9)) + str(random.randint(0, 3)) + str(random.randint(1, 9))
    return room_number

In [24]:
def duplicate_rooms(group):
    num_duplicates = random.randint(5, 30)
    room_numbers = [random_room_number() for i in range(num_duplicates)]
    new_df = group.sample(n=num_duplicates, replace=True)
    new_df['room_number'] = room_numbers
    return new_df

In [25]:
hotel_rooms_df = room_types_df.groupby(['hotel_id', 'room_type_name']).apply(duplicate_rooms).reset_index(drop=True)

In [26]:
hotel_rooms_df['room_type_price'].replace('N/A', None, inplace=True)
hotel_rooms_df.drop_duplicates(subset=['hotel_id', 'room_number'], inplace=True)

In [27]:
hotel_rooms_df

Unnamed: 0,hotel_id,room_type_name,room_type_price,room_type_occupancy,room_type_breakfast,room_type_cancellation,room_number
0,1002081,Deluxe Double Room,,3,Room Only,Non-refundable,4406
1,1002081,Deluxe Double Room,,3,Room Only,Non-refundable,1321
2,1002081,Deluxe Double Room,,3,Room Only,Non-refundable,2302
3,1002081,Deluxe Double Room,,3,Room Only,Non-refundable,5022
4,1002081,Deluxe Double Room,,3,Room Only,Non-refundable,4933
...,...,...,...,...,...,...,...
560712,909966,Superior Double or Twin Room,2912.0,2,half_board,free_cancellation,1028
560713,909966,Superior Double or Twin Room,2912.0,2,half_board,free_cancellation,2709
560714,909966,Superior Double or Twin Room,2912.0,2,half_board,free_cancellation,4634
560715,909966,Superior Double or Twin Room,2912.0,2,half_board,free_cancellation,1835


## Flights Profiles

In [28]:
flights_df = pd.read_csv('flights.csv')

In [29]:
flights_df['flight_route_id'] = flights_df.groupby(['origin', 'dest']).ngroup()

In [30]:
flights_df

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,flight_route_id
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T05:00:00Z,34
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T05:00:00Z,187
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T05:00:00Z,122
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T05:00:00Z,93
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T06:00:00Z,156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336771,2013,9,30,,1455,,,1634,,9E,3393,,JFK,DCA,,213,14,55,2013-09-30T14:00:00Z,103
336772,2013,9,30,,2200,,,2312,,9E,3525,,LGA,SYR,,198,22,0,2013-09-30T22:00:00Z,219
336773,2013,9,30,,1210,,,1330,,MQ,3461,N535MQ,LGA,BNA,,764,12,10,2013-09-30T12:00:00Z,160
336774,2013,9,30,,1159,,,1344,,MQ,3572,N511MQ,LGA,CLE,,419,11,59,2013-09-30T11:00:00Z,169


Flight Info

In [31]:
flight_info_df = flights_df.loc[:, ['flight','flight_route_id','carrier','tailnum',
                                    'dep_time','arr_time','dep_delay','arr_delay','time_hour']]
flight_info_df = flight_info_df.rename(columns={'flight': 'flight_info_id'})
flight_info_df['time_hour'] = pd.to_datetime(flight_info_df['time_hour'])
flight_info_df = flight_info_df.replace({np.nan: None})
flight_info_df = flight_info_df.drop_duplicates(subset='flight_info_id')

In [32]:
flight_info_df

Unnamed: 0,flight_info_id,flight_route_id,carrier,tailnum,dep_time,arr_time,dep_delay,arr_delay,time_hour
0,1545,34,UA,N14228,517.0,830.0,2.0,11.0,2013-01-01 05:00:00+00:00
1,1714,187,UA,N24211,533.0,850.0,4.0,20.0,2013-01-01 05:00:00+00:00
2,1141,122,AA,N619AA,542.0,923.0,2.0,33.0,2013-01-01 05:00:00+00:00
3,725,93,B6,N804JB,544.0,1004.0,-1.0,-18.0,2013-01-01 05:00:00+00:00
4,461,156,DL,N668DN,554.0,812.0,-6.0,-25.0,2013-01-01 06:00:00+00:00
...,...,...,...,...,...,...,...,...,...
330040,6093,4,EV,N11192,1807.0,1957.0,128.0,127.0,2013-09-23 15:00:00+00:00
332375,3988,95,9E,N8412F,811.0,950.0,-4.0,2.0,2013-09-26 08:00:00+00:00
334398,3583,106,9E,N8554A,952.0,1150.0,95.0,89.0,2013-09-28 08:00:00+00:00
334837,3857,103,9E,N805AY,2021.0,2121.0,-14.0,-48.0,2013-09-28 20:00:00+00:00


Airlines

In [33]:
airlines_df = pd.read_csv('airlines.csv',usecols=['name','alias','IATA','ICAO'])

airlines_df = airlines_df.reset_index().rename(columns={'index': 'id'})

In [34]:
airlines_df

Unnamed: 0,id,name,alias,IATA,ICAO
0,0,Private flight,\N,-,
1,1,135 Airways,\N,,GNL
2,2,1Time Airline,\N,1T,RNX
3,3,2 Sqn No 1 Elementary Flying Training School,\N,,WYT
4,4,213 Flight Unit,\N,,TFU
...,...,...,...,...,...
6156,6156,GX Airlines,,,CBG
6157,6157,Lynx Aviation (L3/SSX),,,SSX
6158,6158,Jetgo Australia,,JG,\N
6159,6159,Air Carnival,,2S,\N


Airports

In [35]:
airports_df = pd.read_csv('airports_info.csv',usecols=['name','IATA','ICAO'])

airports_df = airports_df.reset_index().rename(columns={'index': 'airport_id'})

In [36]:
airports_df

Unnamed: 0,airport_id,name,IATA,ICAO
0,0,Goroka Airport,GKA,AYGA
1,1,Madang Airport,MAG,AYMD
2,2,Mount Hagen Kagamuga Airport,HGU,AYMH
3,3,Nadzab Airport,LAE,AYNZ
4,4,Port Moresby Jacksons International Airport,POM,AYPY
...,...,...,...,...
7693,7693,Rogachyovo Air Base,\N,ULDA
7694,7694,Ulan-Ude East Airport,\N,XIUW
7695,7695,Krechevitsy Air Base,\N,ULLK
7696,7696,Desierto de Atacama Airport,CPO,SCAT


In [37]:
#可以加country, timezone

Flight Routes

In [38]:
flight_routes_df = flights_df.loc[:, ['flight_route_id','origin','dest','distance','hour','minute']]
flight_routes_df.drop_duplicates(subset=['origin', 'dest'], inplace=True)

In [39]:
#Transform origin airport from IATA code to airport id
ori_id_trans = pd.merge(flight_routes_df, airports_df[['IATA', 'airport_id']], 
                     how='left', left_on='origin', right_on='IATA')

ori_id_trans = ori_id_trans.drop(columns=['origin','IATA']).rename(columns={'airport_id': 'origin'})

#Transform dest airport from IATA code to airport id
dest_id_trans = pd.merge(ori_id_trans, airports_df[['IATA', 'airport_id']], 
                     how='left', left_on='dest', right_on='IATA')

dest_id_trans = dest_id_trans.drop(columns=['dest','IATA']).rename(columns={'airport_id': 'dest'})

flight_routes_df = dest_id_trans

In [40]:
flight_routes_df

Unnamed: 0,flight_route_id,distance,hour,minute,origin,dest
0,34,1400,5,15,3295,3350
1,187,1416,5,29,3497,3350
2,122,1089,5,40,3597,3376
3,93,1576,5,45,3597,2740
4,156,762,6,0,3497,3482
...,...,...,...,...,...,...
219,221,655,18,0,3497,3809
220,202,563,15,20,3497,3316
221,83,644,14,0,3295,3809
222,1,3370,16,15,3295,3574


Airplanes

In [41]:
airplane_df = pd.read_csv('planes.csv')
airplane_df = airplane_df.reset_index().rename(columns={'index': 'airplane_id','name':'airplane_name'})

In [42]:
airplane_df

Unnamed: 0,airplane_id,airplane_name,IATA,ICAO
0,0,Aerospatiale (Nord) 262,ND2,N262
1,1,Aerospatiale (Sud Aviation) Se.210 Caravelle,CRV,S210
2,2,Aerospatiale SN.601 Corvette,NDC,S601
3,3,Aerospatiale/Alenia ATR 42-300,AT4,AT43
4,4,Aerospatiale/Alenia ATR 42-500,AT5,AT45
...,...,...,...,...
241,241,Tupolev Tu-144,\N,T144
242,242,Tupolev Tu-154,TU5,T154
243,243,Tupolev Tu-204,T20,T204
244,244,Yakovlev Yak-40,YK4,YK40


### Order Table

In [43]:
#Order table
customer_id = [random.choice(customer_id_list) for i in range(3000)]

start_date = datetime.date(2023, 1, 1)
today = datetime.date.today()
order_date = [fake.date_between(start_date=start_date, end_date=today) for i in range(3000)]

order_tracker_df = pd.DataFrame({'customer_id': customer_id,
                              'order_time': order_date
                                   })

order_types = ['hotel'] * 1000 + ['car'] * 1000 + ['flight'] * 1000
order_tracker_df['order_type'] = order_types
order_ids = list(range(1, 3001))
order_tracker_df['order_id'] = order_ids

In [44]:
order_tracker_df

Unnamed: 0,customer_id,order_time,order_type,order_id
0,2414,2023-02-07,hotel,1
1,6373,2023-04-20,hotel,2
2,4525,2023-01-05,hotel,3
3,8485,2023-03-24,hotel,4
4,6115,2023-04-18,hotel,5
...,...,...,...,...
2995,2067,2023-01-07,flight,2996
2996,8493,2023-01-04,flight,2997
2997,8009,2023-02-22,flight,2998
2998,4002,2023-03-20,flight,2999


In [45]:
order_id_list = order_tracker_df['order_id'].tolist()

reservation_tracker Table

In [46]:
#Suppose 3500 reservations
num_res = 3500

dup_order_id = [random.choice(order_id_list) for i in range(num_res-len(order_id_list))]
res_order_id = order_id_list + dup_order_id

reservation_tracker_df = pd.DataFrame({
    'order_id':res_order_id,
    'reservation_type': np.random.choice(['flight', 'car', 'hotel'], size=num_res)
})

reservation_tracker_df = reservation_tracker_df.reset_index().rename(columns={'index': 'reservation_id'})

In [47]:
reservation_tracker_df

Unnamed: 0,reservation_id,order_id,reservation_type
0,0,1,hotel
1,1,2,flight
2,2,3,flight
3,3,4,flight
4,4,5,flight
...,...,...,...
3495,3495,1727,flight
3496,3496,1525,flight
3497,3497,2578,flight
3498,3498,1847,flight


In [48]:
#Get the list of reservation id for car reservations
car_res_id_list = reservation_tracker_df.loc[reservation_tracker_df['reservation_type'] == 'car', 'reservation_id']
car_res_num = len(car_res_id_list)

#Get the list of reservation id for hotel reservations
hotel_res_id_list = reservation_tracker_df.loc[reservation_tracker_df['reservation_type'] == 'hotel', 'reservation_id']
hotel_res_num = len(hotel_res_id_list)

#Get the list of reservation id for flight reservations
flight_res_id_list = reservation_tracker_df.loc[reservation_tracker_df['reservation_type'] == 'flight', 'reservation_id']
flight_res_num = len(flight_res_id_list)

### Car Reservations

In [49]:
#Randomly choose plate number
plate_number = [random.choice(plate_list) for i in range(car_res_num)]

#Randomly Generate Pickup and Dropoff Dates
start_date = datetime.date(2023, 1, 1)
end_date = datetime.date(2023, 12, 31)
pick_up_date = [fake.date_between(start_date=start_date, end_date=end_date) for i in range(car_res_num)]
drop_date = [pick_up_date[i] + datetime.timedelta(days=random.randint(1, 7)) for i in range(car_res_num)]

#Randomly Generate price
price = [round(random.uniform(50, 500), 2) for i in range(car_res_num)]

In [50]:
#car_reservation table without reservation id
car_reservation_df = pd.DataFrame({'car_resevation_id': car_res_id_list,
                                    'plate_number': plate_number,
                                    'pick_up_date': pick_up_date,
                                    'drop_date': drop_date,
                                    'price': price
                                   })

In [51]:
car_reservation_df

Unnamed: 0,car_resevation_id,plate_number,pick_up_date,drop_date,price
5,5,A8AEE,2023-04-09,2023-04-10,158.66
6,6,5QKBH,2023-06-13,2023-06-15,201.76
10,10,ICUSK,2023-01-14,2023-01-16,122.58
14,14,ZQBG4,2023-04-21,2023-04-22,395.43
15,15,E21WZ,2023-01-05,2023-01-06,166.82
...,...,...,...,...,...
3483,3483,299LW,2023-07-02,2023-07-07,241.71
3485,3485,QLSQO,2023-04-20,2023-04-22,240.86
3491,3491,79ATG,2023-07-28,2023-07-29,179.80
3492,3492,T16YG,2023-09-18,2023-09-19,223.74


### Hotel Reservations

In [52]:
len(hotel_id)

7

In [53]:
#Suppose some hotels have more than one reservation

#First filter the hotel_id assume only 1 res
hotel_id = [random.choice(hotels_id_list) for i in range(hotel_res_num)]
filtered_df = hotel_rooms_df.loc[hotel_rooms_df['hotel_id'].isin(hotel_id)]
room_numbers = filtered_df.groupby('hotel_id')['room_number'].apply(lambda x: np.random.choice(x)).reset_index()

random_subset = room_numbers.sample(n=(hotel_res_num-len(room_numbers)), replace=True)
hotel_id_room = pd.concat([room_numbers, random_subset], ignore_index=True)
hotel_id_room['hotel_reservation_id'] = hotel_res_id_list

#randomly generate date
arrival_date = [fake.date_between(start_date=start_date, end_date=end_date) for i in range(hotel_res_num)]
departure_date = [arrival_date[i] + datetime.timedelta(days=random.randint(1, 7)) for i in range(hotel_res_num)]

#Create hotel reservation dataframe
hotel_reservation_df = pd.DataFrame({'hotel_reservation_id': hotel_res_id_list,
                                    'arrival_date': arrival_date,
                                    'departure_date': departure_date,
                                    'car_parking_needed': [random.choice([0,1]) for i in range(hotel_res_num)]
                                   })

hotel_reservation_df = pd.merge(hotel_reservation_df, hotel_id_room, on='hotel_reservation_id')

In [54]:
hotel_reservation_df

Unnamed: 0,hotel_reservation_id,arrival_date,departure_date,car_parking_needed,hotel_id,room_number
0,0,2023-03-17,2023-03-19,0,1004487,2311
1,20,2023-02-03,2023-02-08,1,1226924,2933
2,26,2023-10-09,2023-10-16,0,1268930,4138
3,27,2023-05-20,2023-05-25,0,1270832,3833
4,33,2023-08-12,2023-08-16,0,1321594,5604
...,...,...,...,...,...,...
377,1147,2023-05-24,2023-05-31,0,5557157,2012
378,1152,2023-09-23,2023-09-27,1,311500,4214
379,1155,2023-12-27,2024-01-03,0,2593927,4528
380,1160,2023-03-24,2023-03-25,0,2694858,2406


### Flight Reservations

In [55]:
def generate_seat_number(row):
    seat_class = row['class']
    if seat_class == 'First Class':
        return (''.join(np.random.choice(list('1'), size=1)) + ''.join(np.random.choice(list('0123456789'), size=1)) +''.join(np.random.choice(list('ABCDEF'), size=1)))
    if seat_class == 'Business Class':
        return (''.join(np.random.choice(list('1'), size=1)) + ''.join(np.random.choice(list('0123456789'), size=1)) +''.join(np.random.choice(list('ABCDEF'), size=1)))
    if seat_class == 'Premium Economy':
        return (''.join(np.random.choice(list('2345'), size=1)) + ''.join(np.random.choice(list('0123456789'), size=1)) +''.join(np.random.choice(list('ABCDEF'), size=1)))
    else:
        return (''.join(np.random.choice(list('3456'), size=1)) + ''.join(np.random.choice(list('0123456789'), size=1)) +''.join(np.random.choice(list('ABCDEF'), size=1)))

In [56]:
flight_reservation_df = pd.DataFrame({'flight_reservation_id': flight_res_id_list,
                                    'flight_status': [random.choice([0,1]) for i in range(flight_res_num)],
                                    'class': [random.choice(['Economy Class', 'Premium Economy', 'Business Class', 'First Class']) for i in range(flight_res_num)]                       
                                   })

In [57]:
flight_reservation_df['seat'] = flight_reservation_df.apply(generate_seat_number, axis=1)

In [58]:
means = {'Economy Class': 500, 'Premium Economy': 1000, 'Business Class': 3000, 'First Class': 6000}
stds = {'Economy Class': 50, 'Premium Economy': 200, 'Business Class': 500, 'First Class': 1000}

# generate prices for each class
flight_reservation_df['price']  = flight_reservation_df['class'].apply(lambda x: np.random.normal(means[x], stds[x]))

In [59]:
flight_reservation_df

Unnamed: 0,flight_reservation_id,flight_status,class,seat,price
1,1,0,Business Class,15E,2888.507549
2,2,1,First Class,16A,6805.126452
3,3,1,Economy Class,32A,536.554242
4,4,1,Business Class,15A,2411.187375
7,7,0,Business Class,10D,2222.838002
...,...,...,...,...,...
3495,3495,1,Premium Economy,23C,1108.100199
3496,3496,0,First Class,17D,6439.173305
3497,3497,1,Economy Class,46A,511.369628
3498,3498,1,Economy Class,36E,503.891601


# Address Profile

In [60]:
# Randomly generate 50,000 address profiles
num_addresses = 50000
addresses = []
for i in range(num_addresses):
    street = fake.street_address()
    city = fake.city()
    state = fake.state()
    country = fake.country()
    zipcode = fake.zipcode()
    addresses.append((street, city, state, country, zipcode))

address_df = pd.DataFrame(addresses, columns=['street', 'city', 'state', 'country', 'zipcode'])
address_df = address_df.reset_index().rename(columns={'index': 'address_id'})

In [61]:
address_df.to_csv('address.csv', index=False)

In [62]:
address_id_list = address_df['address_id'].tolist()

In [63]:
hotel_add_num = len(hotels_df)
hotel_address_list = address_id_list[0:hotel_add_num]

airport_add_num = len(airports_df)
airports_address_list = address_id_list[hotel_add_num:(hotel_add_num+airport_add_num)]

customer_add_num = len(customer_df)
customer_address_list = address_id_list[(hotel_add_num+airport_add_num):(hotel_add_num+airport_add_num+customer_add_num)]

#car pickup and dropoff location, assume 1.5x num of reservations
car_add_num = int(1.5 * len(car_reservation_df))
car_address_list = address_id_list[(hotel_add_num+airport_add_num+customer_add_num):(hotel_add_num+airport_add_num+customer_add_num+ car_add_num)]

#### Assign Address Id to different tables

In [64]:
hotels_df['address_id'] = [random.choice(hotel_address_list) for i in range(len(hotels_df))]

airports_df['address_id']= [random.choice(airports_address_list) for i in range(len(airports_df))]

customer_df['address_id']= [random.choice(customer_address_list) for i in range(len(customer_df))]

car_reservation_df['pick_up_location'] = [random.choice(car_address_list) for i in range(car_res_num)]
car_reservation_df['drop_location'] = [random.choice(car_address_list) for i in range(car_res_num)]

# Create Connection to PostgreSQL

In [65]:
from sqlalchemy import create_engine

In [66]:
# Pass the connection string to a variable, conn_url
conn_url = 'postgresql://postgres:tyyy@localhost/5310_group'

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection
connection = engine.connect()

# Create Tables in PostgreSQL

# Import data into PostgreSQL

In [67]:
car_reservation_df.columns

Index(['car_resevation_id', 'plate_number', 'pick_up_date', 'drop_date',
       'price', 'pick_up_location', 'drop_location'],
      dtype='object')

In [68]:
customer_df = customer_df.reindex(columns=['customer_id', 'first_name', 'last_name', 'email',  'address_id',
                                           'royalty_points','royalty_level'])

hotel_rooms_df = hotel_rooms_df.reindex(columns = ['hotel_id', 'room_number', 'room_type_name', 'room_type_price', 
                                                   'room_type_occupancy','room_type_breakfast', 'room_type_cancellation'])

hotel_reservation_df = hotel_reservation_df.reindex(columns = ['hotel_reservation_id', 'hotel_id', 'room_number', 
                                                               'arrival_date', 'departure_date', 'car_parking_needed'])

flight_routes_df = flight_routes_df.reindex(columns = ['flight_route_id', 'origin', 'dest', 'distance', 'hour', 'minute'])

car_reservation_df = car_reservation_df.reindex(columns = ['car_resevation_id', 'plate_number', 'pick_up_date', 'pick_up_location',
                                                           'drop_date', 'drop_location','price'])

In [69]:
#membership_status
membership_status_df.to_sql(name='membership_status', con=engine, if_exists='append', index=False)

#address
address_df.to_sql(name='address', con=engine, if_exists='append', index=False)

#customers 
customer_df.to_sql(name='customers', con=engine, if_exists='append', index=False)

#order_tracker
order_tracker_df.to_sql(name='order_tracker', con=engine, if_exists='append', index=False)

#reservation_tracker
reservation_tracker_df.to_sql(name='reservation_tracker', con=engine, if_exists='append', index=False)


##Hotel Tables
#hotels
hotels_df.to_sql(name='hotels', con=engine, if_exists='append', index=False)
#hotel_rooms
hotel_rooms_df.to_sql(name='hotel_rooms', con=engine, if_exists='append', index=False)
#hotel_reservations
hotel_reservation_df.to_sql(name='hotel_reservations', con=engine, if_exists='append', index=False)



#Flights Tables

#airports
airports_df.to_sql(name='airports', con=engine, if_exists='append', index=False)
#airlines
airlines_df.to_sql(name='airlines', con=engine, if_exists='append', index=False)
#flight_routes
flight_routes_df.to_sql(name='flight_routes', con=engine, if_exists='append', index=False)
#airplanes
airplane_df.to_sql(name='airplanes', con=engine, if_exists='append', index=False)
#flight_info
flight_info_df.to_sql(name='flight_info', con=engine, if_exists='append', index=False)
#flight_reservation
flight_reservation_df.to_sql(name='flight_reservation', con=engine, if_exists='append', index=False)



#Car Rental Tables

#cars
car_df.to_sql(name='cars', con=engine, if_exists='append', index=False)
#car_reservations
car_reservation_df.to_sql(name='car_reservations', con=engine, if_exists='append', index=False)



166