In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn
import seaborn as sns
import re
from collections import Counter
%matplotlib inline

# Hotel Review Workshop Notebook

This notebook will guide you through the creation of a simple bag of words model for text matching.

In [2]:
# Import the Data Set.
data = pd.read_csv('https://github.com/Thinkful-Ed/data-201-resources/raw/master/hotel-reviews.csv')

In [3]:
data.head(5)

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


In [4]:
# Perform some basic cleaning and character removal.

# Make everything lower case.
data['reviews.text'] = data['reviews.text'].str.lower()

# Remove non-text characters.
data['reviews.text'] = data['reviews.text'].str.replace(r'\.|\!|\?|\'|,|-|\(|\)', "",)

# Fill in black reviews with '' rather than Null (which would give us errors).
data['reviews.text'] = data['reviews.text'].fillna('')

In [15]:
data.head(5)
#data.shape # (35912, 19)
#data['reviews.text']
#data[['reviews.text', 'reviews.rating']]
#data.isnull().sum()

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,really lovely hotel stayed on the very top flo...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,ett mycket bra hotell det som drog ner betyget...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,we stayed here for four nights in october the ...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,we stayed here for four nights in october the ...,������ ���������������,,sungchul,


In [17]:
# Import and initiate a vectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# The max features is how many words we want to allow us to create columns for.
vectorizer = CountVectorizer(max_features=5000)

In [20]:
# Vectorize our reviews to transform sentences into columns.
X = vectorizer.fit_transform(data['reviews.text'])

# And then put all of that in a table.
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

bag_of_words.head(3)

Unnamed: 0,00,00am,00pm,10,100,1000,10000,101,1015,10am,...,yummy,zero,zimmer,zion,zona,zone,zoo,zu,zum,zur
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Rename some columns for clarity.
data.rename(columns={'address': 'hotel_address', 'city': 'hotel_city',
                     'country':'hotel_country', 'name':'hotel_name'},
            inplace=True)

# Join our bag of words back to our initial hotel data.
full_df = data.join(bag_of_words)

full_df.head(3)

Unnamed: 0,hotel_address,categories,hotel_city,hotel_country,latitude,longitude,hotel_name,postalCode,province,reviews.date,...,yummy,zero,zimmer,zion,zona,zone,zoo,zu,zum,zur
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# X is our words.
X = bag_of_words

# Y is our hotel name (the outcome we care about).
Y_hotel = data['hotel_name']

In [26]:
# Import a random forest model.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

# Fit that random forest model to our data.
rfc.fit(X,Y_hotel)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# If you want to run a different test review, start from here.

In [37]:
# Write your own dream vacation review here...
test_review = ['''
    I loved the beach and the sunshine and the clean and modern room.
    ''']



In [38]:
# Convert your test review into a vector.
X_test = vectorizer.transform(test_review).toarray()

X_test

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [39]:
# Match your review.
prediction = rfc.predict(X_test)[0]

prediction

#rfc.predict(X_test)

"Harrah's Laughlin Hotel and Casino"

In [40]:
# Return the essential information about your match.
data[data['hotel_name'] == prediction][['hotel_name', 'hotel_address', 
                                        'hotel_city', 'hotel_country']].head(1)

Unnamed: 0,hotel_name,hotel_address,hotel_city,hotel_country
16011,Harrah's Laughlin Hotel and Casino,2900 S Casino Dr,Laughlin,US
