In [735]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# set the max columns to none
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

import statistics
from statistics import mode

In [171]:
df_datafiniti = pd.read_csv('Datafiniti_Hotel_Reviews.csv')
# select potential parameter only (drop)
df_datafiniti.drop(columns=['dateAdded','dateUpdated','keys','postalCode','reviews.dateAdded','reviews.dateSeen','reviews.sourceURLs','sourceURLs','websites'], inplace=True)
df_datafiniti.shape

(10000, 17)

In [172]:
# drop data null dan duplicate
df_datafiniti.dropna(inplace=True)
df_datafiniti.drop_duplicates(inplace=True)
print('jumlah data setelah drop null: ',len(df_datafiniti))

jumlah data setelah drop null:  9787


## Eksplorasi Data

In [173]:
# process data review
print('jumlah nama hotel: ',len(df_datafiniti['name'].unique())) 
print('jumlah alamat hotel: ',len(df_datafiniti['address'].unique())) 
print('jumlah kota hotel: ',len(df_datafiniti['city'].unique())) 
print('jumlah provinsi hotel: ',len(df_datafiniti['province'].unique())) 
print('jumlah country hotel: ',len(df_datafiniti['country'].unique())) 

print('jumlah categories: ',len(df_datafiniti['categories'].unique())) 
# categories berupa nested data mungkin di drop
print('jumlah primary categories: ',len(df_datafiniti['primaryCategories'].unique()), df_datafiniti['primaryCategories'].unique())
# primary categories kurang bermanfaat kemungkinan di drop

# create review dataframe
df_review = df_datafiniti.copy()
df_review.drop(columns=['id','categories','primaryCategories','city','country','latitude','longitude','province','address'], inplace=True)
df_review.reset_index(drop=True, inplace=True)
print('jumlah data hotel: ',len(df_review))

# recreate the property_id for better indexing
rid = []
for n in range(1, len(df_review)+1):
    rid.append('R{0:06}'.format(n))
df_review['review_id'] = rid

# rearrange column
df_review = df_review[['review_id','reviews.username','reviews.userCity','reviews.userProvince','name','reviews.rating','reviews.text','reviews.title','reviews.date']]


jumlah nama hotel:  1311
jumlah alamat hotel:  1432
jumlah kota hotel:  842
jumlah provinsi hotel:  46
jumlah country hotel:  1
jumlah categories:  631
jumlah primary categories:  4 ['Accommodation & Food Services'
 'Accommodation & Food Services,Arts Entertainment & Recreation'
 'Accommodation & Food Services,Administrative & Support & Waste Management & Remediation'
 'Accommodation & Food Services,Agriculture']
jumlah data hotel:  9787


In [174]:
df_review

Unnamed: 0,review_id,reviews.username,reviews.userCity,reviews.userProvince,name,reviews.rating,reviews.text,reviews.title,reviews.date
0,R000001,tatsurok2018,San Jose,UnitedStates,Best Western Plus South Coast Inn,3,"This hotel was nice and quiet. Did not know, t...",Best Western Plus Hotel,2018-01-01T00:00:00.000Z
1,R000002,STEPHEN N,San Francisco,CA,Best Western Carmel's Town House Lodge,4,We stayed in the king suite with the separatio...,Clean rooms at solid rates in the heart of Carmel,2016-04-02T00:00:00Z
2,R000003,15Deborah,Prescott Valley,AZ,Best Western Carmel's Town House Lodge,3,"Parking was horrible, somebody ran into my ren...",Business,2016-01-06T00:00:00Z
3,R000004,Wilfredo M,Guaynabo,PR,Best Western Carmel's Town House Lodge,5,Not cheap but excellent location. Price is som...,Very good,2016-08-22T00:00:00Z
4,R000005,Luc D,Reno,NV,Best Western Carmel's Town House Lodge,2,If you get the room that they advertised on th...,Low chance to come back here,2016-03-21T00:00:00Z
...,...,...,...,...,...,...,...,...,...
9782,R009783,Tiffany1017,Wallingford,CT,Hampton Inn Hampton-newport News,4,My friends and I took a trip to Hampton for th...,Very accommodating and friendly staff!,2015-12-24T00:00:00Z
9783,R009784,bobg187,Homer,AK,Hampton Inn Hampton-newport News,5,"from check in to departure, staff is friendly,...","comfortable, friendly, clean, professional",2015-11-17T00:00:00Z
9784,R009785,K261ANbrendah,Conway,AR,Hampton Inn Hampton-newport News,5,This Hampton is located on a quiet street acro...,Great location,2016-07-06T00:00:00Z
9785,R009786,soccerrocks2016,Hunter,NewYork,Roseberry's Inn,5,Awesome wings (my favorite was garlic parmesan...,Great Atmosphere!,2016-10-26T00:00:00.000Z


In [175]:
# process data hotel

# alamat terindikasi duplikat, sehingga hotel menyesuaikan dengan nama hotel
df_hotel = df_datafiniti.copy()
df_hotel = df_hotel.drop_duplicates(subset=['name'])
df_hotel.drop(columns=['id','categories','primaryCategories','reviews.date','reviews.rating','reviews.text','reviews.title','reviews.userCity','reviews.userProvince','reviews.username'], inplace=True)
df_hotel.reset_index(drop=True, inplace=True)
print('jumlah data hotel: ',len(df_hotel))

# calculate hotel review mean by hotel name
df_hotel['mean_review_rating'] = df_review[['reviews.rating','name']].groupby(by="name").mean()['reviews.rating'].values

# recreate the property_id for better indexing, but gonna take the ID from hotel data goibibo
# hid = []
# for n in range(1, len(df_goibibo)+1):
#     hid.append('H{0:06}'.format(n))
# df_goibibo['property_id'] = hid

jumlah data hotel:  1311


In [176]:
df_hotel

Unnamed: 0,address,city,country,latitude,longitude,name,province,mean_review_rating
0,5620 Calle Real,Goleta,US,34.441780,-119.819790,Best Western Plus South Coast Inn,CA,1.0
1,5th And San Carlos PO Box 3574,Carmel by the Sea,US,36.557220,-121.921940,Best Western Carmel's Town House Lodge,CA,2.5
2,167 W Main St,Lexington,US,38.047014,-84.497742,21c Museum Hotel Lexington,KY,4.0
3,115 W Steve Wariner Dr,Russell Springs,US,37.065296,-85.073580,Springs Motel LLC,KY,3.0
4,2240 Buena Vista Rd,Lexington,US,38.042100,-84.427100,Microtel Inn Suites By Wyndham Lexington,KY,4.0
...,...,...,...,...,...,...,...,...
1306,1820 5th Ave S,Birmingham,US,33.513333,-86.797809,Courtyard Birmingham Downtown at UAB,AL,4.0
1307,2555 Hilton Garden Dr,Auburn,US,32.605260,-85.430530,Hilton Garden Inn Auburn/Opelika,AL,3.7
1308,3101 Coliseum Dr,Hampton,US,37.057650,-76.393310,Hampton Inn Hampton-newport News,VA,2.0
1309,7886 Main Street,Hunter,US,42.210915,-74.215309,Roseberry's Inn,NY,5.0


In [177]:
# process data user
df_user = df_datafiniti.copy()
df_user = df_user.drop_duplicates(subset=['reviews.username'])
# select user related parameter
df_user = df_user[['reviews.username','reviews.userCity','reviews.userProvince']]
df_user.reset_index(drop=True, inplace=True)
print('jumlah data user: ',len(df_user))

# recreate the user_id for better indexing
uid = []
for n in range(1, len(df_user)+1):
    uid.append('U{0:06}'.format(n))
df_user['user_id'] = uid

# rearrange column
df_user = df_user[['user_id','reviews.username','reviews.userCity','reviews.userProvince']]

# calculate total review by each user
# df_user['total_review'] = df_datafiniti[['reviews.username','id']].groupby(by="reviews.username").count()['id'].values
# print('Variasi dari jumlah review per user : ', df_datafiniti[['reviews.username','id']].groupby(by="reviews.username").count()['id'].unique())

jumlah data user:  9219


In [178]:
df_user

Unnamed: 0,user_id,reviews.username,reviews.userCity,reviews.userProvince
0,U000001,tatsurok2018,San Jose,UnitedStates
1,U000002,STEPHEN N,San Francisco,CA
2,U000003,15Deborah,Prescott Valley,AZ
3,U000004,Wilfredo M,Guaynabo,PR
4,U000005,Luc D,Reno,NV
...,...,...,...,...
9214,U009215,Tiffany1017,Wallingford,CT
9215,U009216,bobg187,Homer,AK
9216,U009217,K261ANbrendah,Conway,AR
9217,U009218,soccerrocks2016,Hunter,NewYork


## Merge Hotel Data

In [179]:
# merge hotel data from goibibo and hotel data extracted from datafiniti
df_hotel_goibibo = pd.read_csv('DataHotel_ML.csv', index_col=[0])
df_hotel_datafiniti = df_hotel.copy()
print(df_hotel_goibibo.shape)
print(df_hotel_datafiniti.shape)

(1311, 383)
(1311, 8)


In [180]:
# see datafiniti hotel review rating unique values
df_hotel_datafiniti.sort_values(by='mean_review_rating', ascending=False)['mean_review_rating'].unique()

array([5.        , 4.91304348, 4.90909091, 4.9       , 4.88888889,
       4.88      , 4.86713287, 4.84615385, 4.83333333, 4.81818182,
       4.81578947, 4.8       , 4.77027027, 4.76      , 4.75      ,
       4.72413793, 4.71428571, 4.68888889, 4.6875    , 4.68421053,
       4.67307692, 4.66666667, 4.65625   , 4.63636364, 4.63461538,
       4.63157895, 4.625     , 4.61538462, 4.61363636, 4.61111111,
       4.6       , 4.58333333, 4.57894737, 4.57692308, 4.57142857,
       4.57037037, 4.55714286, 4.54545455, 4.53846154, 4.51428571,
       4.504     , 4.5       , 4.49315068, 4.49295775, 4.48214286,
       4.47368421, 4.47297297, 4.46226415, 4.46153846, 4.45833333,
       4.44444444, 4.43243243, 4.425     , 4.41666667, 4.41573034,
       4.4       , 4.39285714, 4.38596491, 4.38461538, 4.38271605,
       4.37142857, 4.36842105, 4.36363636, 4.35714286, 4.35294118,
       4.33333333, 4.32903226, 4.30882353, 4.3       , 4.29545455,
       4.29166667, 4.28571429, 4.27777778, 4.27272727, 4.27142

In [181]:
# see goibibo hotel review rating unique values
df_hotel_goibibo = df_hotel_goibibo.sort_values(by='site_review_rating', ascending=False)
df_hotel_goibibo.reset_index(drop=True, inplace=True)
df_hotel_goibibo['site_review_rating'].unique()

array([5. , 4.9, 4.8, 4.7, 4.6, 4.5, 4.4, 4.3, 4.2, 4.1, 4. , 3.9, 3.8,
       3.7, 3.6, 3.5, 3.4, 3.3, 3.2, 3.1, 3. , 2.9, 2.8, 2.7, 2.6, 2.5,
       2.4, 2.3, 2.2, 2. , 1.9, 1.7, 1.5, 1.3, 1. ])

In [182]:
# based on the datafiniti data, we need to round the number to 1 decimal
df_hotel_datafiniti = df_hotel_datafiniti.sort_values(by='mean_review_rating', ascending=False).round(decimals=1)
df_hotel_datafiniti.reset_index(drop=True, inplace=True)
df_hotel_datafiniti['mean_review_rating'].unique()

array([5. , 4.9, 4.8, 4.7, 4.6, 4.5, 4.4, 4.3, 4.2, 4.1, 4. , 3.9, 3.8,
       3.7, 3.6, 3.5, 3.4, 3.3, 3.2, 3.1, 3. , 2.9, 2.8, 2.7, 2.6, 2.5,
       2.4, 2.3, 2.2, 2. , 1.9, 1.7, 1.5, 1.3, 1. ])

In [183]:
# distribusi data hotel goibibo berdasarkan rating untuk pencocokan merger
df_hotel_goibibo[['site_review_rating','property_id']].groupby(by="site_review_rating").count()

Unnamed: 0_level_0,property_id
site_review_rating,Unnamed: 1_level_1
1.0,64
1.3,2
1.5,11
1.7,3
1.9,2
2.0,51
2.2,4
2.3,7
2.4,3
2.5,18


In [184]:
# distribusi data hotel datafiniti berdasarkan rating untuk pencocokan merger
df_hotel_datafiniti[['mean_review_rating','name']].groupby(by="mean_review_rating").count()

Unnamed: 0_level_0,name
mean_review_rating,Unnamed: 1_level_1
1.0,64
1.3,2
1.5,11
1.7,3
1.9,2
2.0,51
2.2,4
2.3,7
2.4,3
2.5,18


In [185]:
# drop random data dari hotel goibibo agar cocok dengan datafiniti
print('jumlah selisih data : ', len(df_hotel_goibibo)-len(df_hotel_datafiniti))

np.random.seed(10)
remove_n = 642 #jumlah data
drop_indices = np.random.choice(df_hotel_goibibo.index, remove_n, replace=False)
df_hotel_goibibo = df_hotel_goibibo.drop(drop_indices)
df_hotel_goibibo.reset_index(drop=True, inplace=True)
df_hotel_goibibo.shape

jumlah selisih data :  0


(669, 383)

In [186]:
# merge data hotel
df_hotel_final = df_hotel_goibibo.copy()
df_hotel_final['property_name'] = df_hotel_datafiniti['name']
df_hotel_final['address'] = df_hotel_datafiniti['address']
df_hotel_final['city'] = df_hotel_datafiniti['city']
df_hotel_final['state'] = df_hotel_datafiniti['province']
df_hotel_final['country'] = df_hotel_datafiniti['country']
df_hotel_final['latitude'] = df_hotel_datafiniti['latitude']
df_hotel_final['longitude'] = df_hotel_datafiniti['longitude']
df_hotel_final['site_review_rating'] = df_hotel_datafiniti['mean_review_rating']

In [187]:
# re-indexing the hotel data
df_hotel_final = df_hotel_final.sort_values(by='property_id', ascending=True)
df_hotel_final.reset_index(drop=True, inplace=True)

hid = []
for n in range(1, len(df_hotel_final)+1):
    hid.append('H{0:06}'.format(n))
df_hotel_final['property_id'] = hid

In [188]:
df_hotel_final

Unnamed: 0,property_id,property_name,hotel_description,address,point_of_interest,city,state,country,latitude,longitude,property_type,hotel_star_rating,hotel_facilities,room_facilities,site_review_rating,Bike on Rent,Cook Services,Drivers Rest Room,24 Hour Front Desk,ATM / Banking,Access Via Exterior Corridors,Adjoining Rooms,Air / Rail Booking,Air-Conditioning - Central,Airport Transfer Available / Surcharge,Airport Transfer Free,Amphitheatre,Anti-Slip Ramps,Audio - Visual Equipment,Ayurvedic Facilities,Ayurvedic Facilities.1,Babysitting/ Child Care (surcharge),Badminttion Court,Baggage Room,Ballroom,Banquet Facilities,Bar / Lounge,Barbecue Area,Basketball Court,Beauty Salon - on charge,Billiards,Board Room,Boating,Bowling,Business Center,Business Center.1,CCTV,Camel Ride,Campfire / Bon Fire,Car Rental,Casino,Catering,Cell Phone Rental,Children's Park,City Shuttle Service,Clubhouse,Coffee Shop / Cafe,Computer Rental,Concierge,Conference Facility,Conference Facility.1,Courier Service,Courtesy Coach,Currency Exchange,Dance Performances (on demand),Disabled Friendly Facilities,Disabled Friendly Rooms,Discotheque,Doctor on Call,Doorman,Dry Cleaning,Electronic / Magnetic Keys,Electronic Check-Out,Exhibit Space,Express Check-In / Check-Out,Express Laundry,Extra Person / Child - Share the same room,Family Room,Fax Machine,Fire Exit Signs,Fire Place Available,Fire Safety,First-Aid Kit at Front Desk,Fishing,Fitness Equipment,Floor Butler,Flower Shop,Food Facility,Free Internet Access,Free Local Calls,Front Desk,Full Time Operation for All 7 Days,Fun Floats,Games Zone,Gardens,Gift Shop,Golf Course,Grocery,Guide / Sightseeing Service,Health Club / Gym Facility Available,Hookah Lounge,Horse Ride ( Chargeable ),Hotel Trading License,In House Events,Internet / Fax (Reception area only),Internet Access,Internet Access.2,Internet Access - Surcharge,Internet Access Free,Jacuzzi,Karaoke,Karaoke.1,Kids Play Zone,Kitchen available (home cook food on request),Laptop Available on Hire,Laundry Service Available,Library,Lift / Elevator,Limousine Service Available,Lobby,Local Tour / Travel Desk,Locker Facility,Luggage Storage,Major Credit Cards Accepted,Marina On Site,Massage Services,Massage Services.1,Medical Assistance Available,Meditation Room,Metal Detectors,Mini Theatre,Multi - Line Phone,Multi Cuisine Restaurant,Multi Cuisine Restaurant.1,Multi Lingual Staff,Nature Walk,Newspapers In Lobby,Night Club,Non Smoking Hotel,Non Smoking Rooms,Open Air Restaurant / Dining,Open Air Restaurant / Dining.1,Outdoor Parking - Non Secured,Outdoor Parking - Secured,Oxygen Cylinder (on request),Paid Transfers,Parking (Surcharge),Parking Facilities Available,Parking Facility,Party hall,Pest Control Facility,Pets Allowed,"Pick Up and Drop (Rly Station, Bus Stand, Temple)",Picnic Area,Pool Cafe,Pool Table,Poolside Bar,Porter / Bellhop,Porter Services Available,Postal / Parcel Services,Power Backup,Private / Plunge Pool,Private Beach,Public Restrooms,RO Water Purification System,Recreation Zone,Restaurant,Rooftop Garden,Room Service,Room Service.2,Room Service (24 Hours),STD / ISD,Safari,Sauna,Secretarial Services,Security at All Hotel Entrances,Self Check - In,Separate Sitting Area,Shopping Drop Facility (on fixed timings),Ski Storage,Skiing Facility Available,Smoke Detector,Smoking Area,Smoking Area.1,Smoking Rooms,Solarium,Spa On Site - Chargeable,Spa Services Nearby,Spa Services Onsite - Free,Speciality Restaurant,Speciality Restaurant.1,Squash court,Stamps and Mailing Facility,Suitable For Children,Suites,Swimming Pool,Table Tennis,Taxi Services,Tennis Court,Transfers Available,Translation Services,Trekking,Valet service,Veg / Non Veg Kitchens Separate,Vegetarian Food / Jain Food Available,Veranda,Village Visit ( Chargeable ),Virtual Golf,Voicemail,Wake-up Call / Service,Water Dispenser Available,Wedding Services Facility,Welcome Drinks,Wheelchair Accessibility - Room,Whirlpool Bath / Shower Cubicle,X-Ray Machine at Entrance,Yoga Hall / Classes,Hot / Cold Running Water,120 AC,220 AC,Air Conditioning,Air Conditioning.1,Air Cooler,Alarm Clock,Attached Bathroom,Balcony,Balcony / Terrace,Basic Bathroom Amenities,Bath Tub,Bathrobe,Bathrobe (on request),Bathrobes,Bathroom Amenities,Bathtub,Bathtub Only,Bedside Table,Bidet,Blackout Drapes,Breakfast Available,Breakfast Available (surcharge),Breakfast chargeable,Cable / Satellite / Pay TV available,Cable/Satellite TV,Ceiling Fan,Centrally Heated,Channel Music,Citrus Juicer,Climate Control,Clock Radio,Closet,Clothes Dryer,Clothes Hook in Bathroom,Clothes Press,Coffee Table,Color TV,Complete Bedding,Complimentary Use of Bicycle,Computer,Cribs Available,Crockery / Cutlery,Curtains/Shades,Custom Duvets / Linen,Daily Room Cleaning,Desk,Desk in Room,Direct Dial Phone,Disabled Friendly Bathrooms,Do-Not-Disturb Notice,Dressing Area,Dry Cleaning.1,Dustbins,Electronic Weighing Machine,Ensuite / Private Bathroom,"Extra Bed, Towels, Linens, Bedding (on request)",Extra Toilteries on Demand,Free Newspaper,Fruit Basket,Full Kitchen,Gas Cylinder - Chargeable,HD / LCD Satellite TV,HD / LCD Satellite TV.1,Hairdryer,Hairdryer (on request),Handheld Showers,Handicapped Facilities,Heating Facility (on request),Heating Facility (on request).1,Hill View from all Rooms,Hot / Cold Running Water.1,Housekeeping,Ice Bucket on Demand,In-Room Safe,In-room Storage Space,Induction Plate,International Switch Plugs (on request),Internet Access.1,Internet Connection In Room,Iron / Ironing board available,Iron/Ironing Board,Kitchenette,Laundry Service Available.1,Local Calls Free,Locker Facility.1,Luggage Rack,Luxurious Feather Mattresses,Makeup / Shaving Mirror,Microwave Oven Available,Mineral Water,Mini Bar,Mini Refrigerator,Mini bar - On Charge,Mirror,Modern Toilet Facilities,Moisturiser,Movie Channels,Movies &amp; Games on demand,Newspapers,No Smoking Room,Patio - Property,Pay Movies,Phone Messaging Services,Pillow menu,Pinup Boards,Premium Amenities / Toiletries,Premium Beddings / Hypo-Allergenic,Primium Bathrooms,Private Garden,Radio,Reading Lamps,Refrigerator,Remote Control TV,Remote Controlled Lights and Fan,Rollaway Beds,Room Door Fitted with Lock and Key,Room Heater,Room Service.1,Room Service .1,Room with Window/Ventilation,Rooms with Attached Bathrooms,Safe (on request),Safe - In - Room,Safe - In - Room.1,Sandwich Maker,Sanitary Bin,Second Bathrooms,Separate Bathtubs And Shower,Shared Bathroom,Shaving Kit /Dental Kit,Shoe Shine,Shower,Shower Cap,Shower Caps Available,Shower Facility Available,Slippers,Smoke Detector.1,Snacks / Beverage,Sofa Cum Bed Available,Sofa Table / Extra Seats,Sofa Table / Extra Seats.1,Soft Bar,Soundproof Windows,Split A.C's,Stationery Kit,Stove / Oven,Studio Apartment - Large,Sufficient Lighting,TV,Tea Coffee Kettle / Maker,Telephone,Telephone in Toilet,Television,Toaster,Toiletries,Two Line Telephone,Utensils (on request),Video/ DVD/ CD Player Available,Voicemail.1,Wardrobe,Washing machine,Western Toilet Seat,Window Opens,Work Station,Working Desk Phone
0,H000001,Dolphin Cove Motel,The standard check-in time is 12:00 PM and the...,170 Main St,"['Hadimba Temple', 'Naggar Village', 'Himalaya...",Pismo Beach,CA,US,35.1,-120.6,Resort,2,"['Doctor on Call', 'Dry Cleaning', 'Laundry Se...","['Room Service ', 'Basic Bathroom Amenities', ...",5.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,H000002,Residence Inn Mobile,The standard check-in time is 12:00 PM and the...,950 W I65 Service Rd S,"['The Mall', 'Tibetian Monastery', 'Vashisht H...",Mobile,AL,US,30.7,-88.1,Cottage,2,"['Doctor on Call', 'Dry Cleaning', 'Laundry Se...","['Basic Bathroom Amenities', 'Cable / Satellit...",5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,H000003,Hampton Inn Suites Knoxville-turkey Creek/farr...,The standard check-in time is 12:00 PM and the...,11340 Campbell Lakes Dr,"['Rg Baruah Road', 'Nehru Stadium', 'Guwahati ...",Knoxville,TN,US,35.9,-84.2,Hotel,0,"['Business Center ', 'Doctor on Call', 'Dry Cl...","['Room Service ', 'Basic Bathroom Amenities', ...",5.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,H000004,Best Western Webster Hotel Nasa,The standard check-in time is 12:00 PM and the...,889 W Bay Area Blvd,['Thuraipakkam'],Webster,TX,US,29.5,-95.1,Hotel,0,"['Internet Access - Surcharge', 'Laundry Servi...","['Air Conditioning ', 'Basic Bathroom Amenitie...",4.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,H000005,El Pueblo Lodge,The standard check-in time is 12:00 PM and the...,412 Paseo Del Pueblo Norte,"['Airport Area, Pallavaram', 'Chennai Internat...",Taos,NM,US,36.4,-105.6,Hotel,2,"['Parking Facilities Available', 'Front Desk',...","['Room Service ', 'Air Conditioning ', 'Basic ...",4.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,H000665,Hilton Garden Inn Clovis,The standard check-in time is 02:00 PM and the...,520 W Shaw Ave,"['Mashobra', 'Dhalli Bus Station']",Clovis,CA,US,36.8,-119.7,Hotel,5,"['Swimming Pool', 'Bar / Lounge ', 'Business C...","['Air Conditioning ', 'Cable / Satellite / Pay...",5.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
665,H000666,Best Western Plus Woodland Hills Hotel & Suites,The standard check-in time is 01:00 PM and the...,10143 E 62nd St S,"['International Cricket Stadium', 'Bhagsunath ...",Tulsa,OK,US,36.1,-95.9,Hotel,4,"['Swimming Pool', 'Airport Transfer Available ...","['Room Service ', 'Air Conditioning ', 'Basic ...",5.0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
666,H000667,Super 8-Midland,The standard check-in time is 02:00 PM and the...,3828 W Wall St,['Vivekananda Polyclinic &amp; Institute of Me...,Midland,TX,US,32.0,-102.1,Hotel,4,"['Bar / Lounge ', 'Internet Access - Surcharge...","['Room Service ', 'Air Conditioning ', 'Alarm ...",5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0,1,1,1,1,1,0,0,1,1,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,1,0,1,0,0,1,1,0,1,0,0,1,1,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,1,1,0,1,0,1,1,0,0,0,0,0,1,1,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0,1,1,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,1,1,1,0,0,0,1,0,0,1,1,1,0,0,1,1,1,0,1,1,0,1,0,1,1,1,0,0,1,1,1,1,1,0,0,1,0,1,0,0,1,0,1,0,1,0
667,H000668,Hampton Inn Orlando-Maingate South,The standard check-in time is 12:00 PM and the...,44117 Highway 27,"['Rock Garden', 'Composite Hospital ITBP', 'Zi...",Davenport,FL,US,28.2,-81.7,Hotel,0,"['Airport Transfer Available / Surcharge', 'Ba...","['Air Conditioning ', 'Basic Bathroom Amenitie...",5.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Export Data and Finalization

In [31]:
%%time
# insert user_id and hotel_id to review data (CAUTION TO RE RUN, TAKE LONG TIME)
arr_user = []
arr_hotel = []
for i,review in df_review.iterrows():
    for h,hotel in df_hotel_final.iterrows():
        if hotel['property_name'] == review['name']:
            arr_hotel.append(hotel['property_id'])
    for u,user in df_user.iterrows():
        if user['reviews.username'] == review['reviews.username']:
            arr_user.append(user['user_id'])

CPU times: total: 42min 14s
Wall time: 42min 20s


In [35]:
df_review['user_id'] = arr_user
df_review['hotel_id'] = arr_hotel
df_review = df_review[['review_id','user_id','hotel_id','reviews.username','reviews.userCity','reviews.userProvince','name','reviews.rating','reviews.text','reviews.title','reviews.date']]

In [38]:
df_review.sample(5)

Unnamed: 0,review_id,user_id,hotel_id,reviews.username,reviews.userCity,reviews.userProvince,name,reviews.rating,reviews.text,reviews.title,reviews.date
9634,R009635,U009078,H001118,catherinemQ2687CP,Miami,Florida,Conrad Chicago,1,I was working so I ordered room service which ...,Horrible,2017-11-10T00:00:00.000Z
6463,R006464,U006193,H000882,waynel337,Grosse Pointe,MI,Microtel Inn Suites By Wyndham Caldwell,5,"Small but highly functional, clean and priced ...",Eurostyle,2016-04-13T00:00:00Z
5091,R005092,U004922,H000340,gulliver55,Seattle,WA,Briar Rose Inn,5,We've stayed here a number of times and it fee...,"A Little Oasis in Vancouver, WA",2014-08-25T00:00:00Z
9226,R009227,U008711,H000499,Shasta B,Chicago,IL,Hilton Garden Inn Oakland-San Leandro,4,Staff very helpful and professional. Furniture...,Hilton Inn - San Leandro,2015-10-11T00:00:00Z
1279,R001280,U001261,H000361,ambereagon,Cuyahoga Falls,Ohio,Fairfield Inn by Marriott Port Huron,5,This hotel had great customer service. The roo...,Very nice,2017-12-31T00:00:00.000Z


In [190]:
# save data hotel for ML
# df_hotel_final.to_csv('DataHotel_ML.csv')
# save data review 
# df_review.to_csv('DataReview.csv')
# save data review 
# df_user.to_csv('DataUser.csv')

## Eksperimen dan Percobaan

In [752]:
p_hotel = pd.read_csv('DataHotel_ML.csv', index_col=[0])
p_user = pd.read_csv('DataUser.csv', index_col=[0])
p_review = pd.read_csv('DataReview.csv', index_col=[0])

In [753]:
# slicing 50 feature + 1 target class (property_id)
p_hotel2 = p_hotel.iloc[:, np.r_[0, 11,14:63]].copy()

# normalize hotel_star_rating and site_review_rating
max_value = p_hotel2['hotel_star_rating'].max()
min_value = p_hotel2['hotel_star_rating'].min()
p_hotel2['hotel_star_rating'] = (p_hotel2['hotel_star_rating'] - min_value) / (max_value - min_value)

# normalize hotel_star_rating and site_review_rating
max_value = p_hotel2['site_review_rating'].max()
min_value = p_hotel2['site_review_rating'].min()
p_hotel2['site_review_rating'] = (p_hotel2['site_review_rating'] - min_value) / (max_value - min_value)

p_hotel2

Unnamed: 0,property_id,hotel_star_rating,site_review_rating,Bike on Rent,Cook Services,Drivers Rest Room,24 Hour Front Desk,ATM / Banking,Access Via Exterior Corridors,Adjoining Rooms,Air / Rail Booking,Air-Conditioning - Central,Airport Transfer Available / Surcharge,Airport Transfer Free,Amphitheatre,Anti-Slip Ramps,Audio - Visual Equipment,Ayurvedic Facilities,Ayurvedic Facilities.1,Babysitting/ Child Care (surcharge),Badminttion Court,Baggage Room,Ballroom,Banquet Facilities,Bar / Lounge,Barbecue Area,Basketball Court,Beauty Salon - on charge,Billiards,Board Room,Boating,Bowling,Business Center,Business Center.1,CCTV,Camel Ride,Campfire / Bon Fire,Car Rental,Casino,Catering,Cell Phone Rental,Children's Park,City Shuttle Service,Clubhouse,Coffee Shop / Cafe,Computer Rental,Concierge,Conference Facility,Conference Facility.1,Courier Service,Courtesy Coach
0,H000001,0.4,0.80,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0
1,H000002,0.0,1.00,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,H000003,0.4,1.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,H000004,0.6,0.75,0,0,1,1,1,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0
4,H000005,0.2,0.50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1306,H001307,0.4,0.75,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1307,H001308,0.6,0.75,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1308,H001309,0.0,0.50,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1309,H001310,0.4,1.00,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [754]:
p_review

Unnamed: 0,review_id,user_id,hotel_id,reviews.username,reviews.userCity,reviews.userProvince,name,reviews.rating,reviews.text,reviews.title,reviews.date
0,R000001,U000001,H000021,tatsurok2018,San Jose,UnitedStates,Best Western Plus South Coast Inn,3,"This hotel was nice and quiet. Did not know, t...",Best Western Plus Hotel,2018-01-01T00:00:00.000Z
1,R000002,U000002,H001054,STEPHEN N,San Francisco,CA,Best Western Carmel's Town House Lodge,4,We stayed in the king suite with the separatio...,Clean rooms at solid rates in the heart of Carmel,2016-04-02T00:00:00Z
2,R000003,U000003,H001054,15Deborah,Prescott Valley,AZ,Best Western Carmel's Town House Lodge,3,"Parking was horrible, somebody ran into my ren...",Business,2016-01-06T00:00:00Z
3,R000004,U000004,H001054,Wilfredo M,Guaynabo,PR,Best Western Carmel's Town House Lodge,5,Not cheap but excellent location. Price is som...,Very good,2016-08-22T00:00:00Z
4,R000005,U000005,H001054,Luc D,Reno,NV,Best Western Carmel's Town House Lodge,2,If you get the room that they advertised on th...,Low chance to come back here,2016-03-21T00:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...
9782,R009783,U009215,H001298,Tiffany1017,Wallingford,CT,Hampton Inn Hampton-newport News,4,My friends and I took a trip to Hampton for th...,Very accommodating and friendly staff!,2015-12-24T00:00:00Z
9783,R009784,U009216,H001298,bobg187,Homer,AK,Hampton Inn Hampton-newport News,5,"from check in to departure, staff is friendly,...","comfortable, friendly, clean, professional",2015-11-17T00:00:00Z
9784,R009785,U009217,H001298,K261ANbrendah,Conway,AR,Hampton Inn Hampton-newport News,5,This Hampton is located on a quiet street acro...,Great location,2016-07-06T00:00:00Z
9785,R009786,U009218,H001103,soccerrocks2016,Hunter,NewYork,Roseberry's Inn,5,Awesome wings (my favorite was garlic parmesan...,Great Atmosphere!,2016-10-26T00:00:00.000Z


In [764]:
p_user2 = p_user[['user_id']].copy()
for col in p_hotel2.columns:
    p_user2[col] = 0

p_user2['property_id'] = np.empty((len(p_user2), 0)).tolist()
p_user2

Unnamed: 0,user_id,property_id,hotel_star_rating,site_review_rating,Bike on Rent,Cook Services,Drivers Rest Room,24 Hour Front Desk,ATM / Banking,Access Via Exterior Corridors,Adjoining Rooms,Air / Rail Booking,Air-Conditioning - Central,Airport Transfer Available / Surcharge,Airport Transfer Free,Amphitheatre,Anti-Slip Ramps,Audio - Visual Equipment,Ayurvedic Facilities,Ayurvedic Facilities.1,Babysitting/ Child Care (surcharge),Badminttion Court,Baggage Room,Ballroom,Banquet Facilities,Bar / Lounge,Barbecue Area,Basketball Court,Beauty Salon - on charge,Billiards,Board Room,Boating,Bowling,Business Center,Business Center.1,CCTV,Camel Ride,Campfire / Bon Fire,Car Rental,Casino,Catering,Cell Phone Rental,Children's Park,City Shuttle Service,Clubhouse,Coffee Shop / Cafe,Computer Rental,Concierge,Conference Facility,Conference Facility.1,Courier Service,Courtesy Coach
0,U000001,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,U000002,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,U000003,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,U000004,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,U000005,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9214,U009215,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9215,U009216,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9216,U009217,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9217,U009218,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [765]:
%%time
for i,item in p_review.iterrows():
#     melakukan append terhadap array property_id pada df p_user2
    p_user2.loc[p_user2['user_id'] == item['user_id'], 'property_id'][p_user2.loc[p_user2['user_id'] == item['user_id'], 'property_id'].index.values[0]].append(item['hotel_id'])
#     melakukan assign fitur hotel facilities dan room facilities
    p_user2.loc[p_user2['user_id'] == item['user_id'], p_user2.columns[2:]] += p_hotel2.loc[p_hotel2['property_id'] == item['hotel_id']].iloc[:,1:].values

CPU times: total: 1min 5s
Wall time: 1min 5s


In [770]:
%%time
# updating value for property_id to the most frequent hotel reviewed
for i,item in p_user2.iterrows():
    p_user2.loc[i, 'property_id'] = mode(p_user2.loc[i, 'property_id'])
# normalize each feature based on the total review by each user
    for col in p_user2.columns[2:]:
        p_user2.loc[i, col] = item[col]/len(item['property_id'])

CPU times: total: 21.4 s
Wall time: 21.4 s


In [751]:
# save csv for training user data
# p_user2.to_csv('for_training_user.csv')

In [761]:
p_review['user_id'].mode()

0    U001231
1    U001785
Name: user_id, dtype: object

In [772]:
p_user2.loc[p_user2['user_id'] == 'U001231']

Unnamed: 0,user_id,property_id,hotel_star_rating,site_review_rating,Bike on Rent,Cook Services,Drivers Rest Room,24 Hour Front Desk,ATM / Banking,Access Via Exterior Corridors,Adjoining Rooms,Air / Rail Booking,Air-Conditioning - Central,Airport Transfer Available / Surcharge,Airport Transfer Free,Amphitheatre,Anti-Slip Ramps,Audio - Visual Equipment,Ayurvedic Facilities,Ayurvedic Facilities.1,Babysitting/ Child Care (surcharge),Badminttion Court,Baggage Room,Ballroom,Banquet Facilities,Bar / Lounge,Barbecue Area,Basketball Court,Beauty Salon - on charge,Billiards,Board Room,Boating,Bowling,Business Center,Business Center.1,CCTV,Camel Ride,Campfire / Bon Fire,Car Rental,Casino,Catering,Cell Phone Rental,Children's Park,City Shuttle Service,Clubhouse,Coffee Shop / Cafe,Computer Rental,Concierge,Conference Facility,Conference Facility.1,Courier Service,Courtesy Coach
1230,U001231,H000354,0.428571,0.660714,0.428571,0.142857,0.428571,0.285714,0.0,0.0,0.0,0.285714,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.285714,0.0,0.142857,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [748]:
p_user2.loc[p_user2['user_id'] == 'U001231']

Unnamed: 0,user_id,property_id,hotel_star_rating,site_review_rating,Bike on Rent,Cook Services,Drivers Rest Room,24 Hour Front Desk,ATM / Banking,Access Via Exterior Corridors,Adjoining Rooms,Air / Rail Booking,Air-Conditioning - Central,Airport Transfer Available / Surcharge,Airport Transfer Free,Amphitheatre,Anti-Slip Ramps,Audio - Visual Equipment,Ayurvedic Facilities,Ayurvedic Facilities.1,Babysitting/ Child Care (surcharge),Badminttion Court,Baggage Room,Ballroom,Banquet Facilities,Bar / Lounge,Barbecue Area,Basketball Court,Beauty Salon - on charge,Billiards,Board Room,Boating,Bowling,Business Center,Business Center.1,CCTV,Camel Ride,Campfire / Bon Fire,Car Rental,Casino,Catering,Cell Phone Rental,Children's Park,City Shuttle Service,Clubhouse,Coffee Shop / Cafe,Computer Rental,Concierge,Conference Facility,Conference Facility.1,Courier Service,Courtesy Coach
1230,U001231,H000354,0.428571,0.660714,0.428571,0.142857,0.428571,0.285714,0.0,0.0,0.0,0.285714,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.285714,0.0,0.142857,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
