### Import Libraries

In [1]:
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import pandas as pd

### Load data

In [2]:
# Read the hotel-review CSV (path is relative to the notebook's working
# directory) into the DataFrame that every later cell adds columns to.
data = pd.read_csv('tripadvisor_hotel_reviews.csv')

In [3]:
# Preview the first five rows (head() defaults to n=5) to check what was loaded.
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
# Display the complete text of the review at row label 0; the table preview
# above truncates it.
data.loc[0, 'Review']

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  '

### Lowercase

In [5]:
# Lower-case every review with the vectorised string accessor and keep the
# result in its own column, leaving the raw 'Review' text untouched.
lowercase_reviews = data['Review'].str.lower()
data['Review_lower'] = lowercase_reviews

### Stop word removal

In [6]:
# NLTK's English stop-word list with "not" taken out, so the negation is
# kept in the reviews when the stop words are filtered away.
NEGATION_TO_KEEP = 'not'
en_stopwords = stopwords.words('english')
en_stopwords.remove(NEGATION_TO_KEEP)

In [7]:
# Remove stop words from each lower-cased review: split on whitespace, drop
# every word found in `en_stopwords`, and re-join the rest with single spaces.
#
# A set gives constant-time membership tests; scanning the `en_stopwords` list
# once per word of every review is needlessly slow.
stopword_lookup = set(en_stopwords)

# Non-string cells (e.g. NaN for a missing review) are handed back unchanged
# rather than raising AttributeError on `.split()`.
data['Review_no_stopwords'] = data['Review_lower'].apply(
    lambda review: (
        ' '.join(
            token for token in review.split() if token not in stopword_lookup
        )
        if isinstance(review, str)
        else review
    )
)

### Punctuation removal

In [8]:
# Two substitutions, applied in this order to the stop-word-free text:
#   1. "*" -> "star", so a rating written as "4*" becomes "4star" instead of
#      losing the asterisk in step 2;
#   2. every remaining character that is neither a word character (\w) nor
#      whitespace (\s) is deleted.
#
# The patterns are compiled once and mapped over the single source column.
# This replaces two DataFrame.apply(axis=1) passes, each of which builds a
# Series object for every row just to read one field.
star_pattern = re.compile(r"[*]")
punctuation_pattern = re.compile(r"([^\w\s])")

data['Review_no_stopwords_no_punct'] = data['Review_no_stopwords'].map(
    lambda review: punctuation_pattern.sub("", star_pattern.sub("star", review)),
    na_action='ignore',  # keep missing values as-is instead of raising TypeError
)

### Tokenizing

In [9]:
def tokenizing(word):
    """Split one cleaned review into a list of tokens.

    Despite its name, ``word`` is expected to be a whole review string (one
    cell of the cleaned text column), not a single word.

    Parameters:
        word: text to tokenize. Values that are not already strings are
            converted with ``str()`` before tokenizing.

    Returns:
        The tokens produced by NLTK's ``word_tokenize``, or an empty list when
        ``word`` is missing (``None``, ``pd.NA`` or a float NaN).
    """
    # Only genuinely missing values are skipped. A blanket
    # `isinstance(word, float)` check would also throw away real numbers such
    # as 4.5, and would let `pd.NA` through to be tokenized as the text "<NA>".
    if word is None or word is pd.NA:
        return []
    # NaN is the only float that does not compare equal to itself.
    if isinstance(word, float) and word != word:
        return []
    return word_tokenize(str(word))

In [10]:
# Tokenize each cleaned review; the new column stores tokenizing()'s result
# for every row.
data['Review_Token'] = data['Review_no_stopwords_no_punct'].map(tokenizing)

### Stemming

In [11]:
# Single Porter stemmer instance, created once and reused by porterStem()
# for every token.
ps = PorterStemmer()

In [12]:
def porterStem(word):
    """Return the Porter stem of every token in ``word``.

    ``word`` is an iterable of token strings (one tokenized review). The
    result is a new list holding the stems in the same order.
    """
    stems = []
    for token in word:
        stems.append(ps.stem(token))
    return stems

In [13]:
# Stem every token list; the stems go into their own column so they can be
# compared with the lemmatized version.
data['Review_Stemmed'] = data['Review_Token'].map(porterStem)

### Lemmatization

In [14]:
# Single WordNet lemmatizer instance, created once and reused by
# lemmatization() for every token.
lemmatizer = WordNetLemmatizer()

In [15]:
def lemmatization(word, pos='n'):
    """Lemmatize every token in ``word`` with the WordNet lemmatizer.

    Parameters:
        word: iterable of token strings (one tokenized review).
        pos: WordNet part-of-speech tag applied to every token: 'n' (noun),
            'v' (verb), 'a' (adjective), 'r' (adverb) or 's' (satellite
            adjective). The default 'n' is the lemmatizer's own default, so
            existing one-argument calls behave exactly as before. With it,
            inflected verbs such as "stayed" are returned unchanged; pass
            ``pos='v'`` to reduce them to their base form.

    Returns:
        A new list with one lemma per input token, in the same order.
    """
    return [lemmatizer.lemmatize(token, pos=pos) for token in word]

In [16]:
# Lemmatize every token list and store the lemmas in their own column.
data['Review_Lemmatized'] = data['Review_Token'].map(lemmatization)

In [17]:
# Show stems and lemmas side by side to compare the two normalisations.
data.loc[:, ['Review_Stemmed', 'Review_Lemmatized']]

Unnamed: 0,Review_Stemmed,Review_Lemmatized
0,"[nice, hotel, expens, park, got, good, deal, s...","[nice, hotel, expensive, parking, got, good, d..."
1,"[ok, noth, special, charg, diamond, member, hi...","[ok, nothing, special, charge, diamond, member..."
2,"[nice, room, not, 4star, experi, hotel, monaco...","[nice, room, not, 4star, experience, hotel, mo..."
3,"[uniqu, great, stay, wonder, time, hotel, mona...","[unique, great, stay, wonderful, time, hotel, ..."
4,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game..."
...,...,...
104,"[fairmont, hotel, singapor, splendid, stay, fa...","[fairmont, hotel, singapore, splendid, stayed,..."
105,"[great, hotel, stay, hotel, night, rout, perth...","[great, hotel, stayed, hotel, night, route, pe..."
106,"[great, hotel, centr, stay, 5, night, room, ov...","[great, hotel, centre, stayed, 5, night, room,..."
107,"[great, locat, great, valu, great, hotel, husb...","[great, location, great, value, great, hotel, ..."
