#### Import data set

In [None]:
import datatable as dt
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# Parse the CSV with datatable's reader and convert the result to a pandas
# DataFrame, which is what the rest of the notebook works with.
df = dt.fread('tripadvisor_hotel_reviews.csv').to_pandas()
# Normalise the column labels to lower case so later cells can address the
# text column as df['review'].
df = df.rename(columns=str.lower)

#### Cleaning, lemmatizing, and removing stop words

In [None]:
import re
import spacy


# Patterns are compiled once, not looked up again on every review.
_SYMBOLS_RE = re.compile(r'[^\w\s]')               # anything that is not a word char or whitespace
_NT_RE = re.compile(r'\bnt\b')                     # stand-alone "nt" left over from "n't"
_REPEATED_LETTER_RE = re.compile(r'([a-zA-Z])\1{2,}')  # same letter 3+ times in a row

# spaCy pipeline shared by every clean_text() call; loaded on first use so that
# the (slow) model load happens once instead of once per review.
_NLP = None


def _get_nlp():
    """Return the shared spaCy pipeline, loading 'en_core_web_sm' only once."""
    global _NLP
    if _NLP is None:
        _NLP = spacy.load('en_core_web_sm')
    return _NLP


def clean_text(text: str) -> str:
    """Clean, lemmatize and remove stop words from a single review.

    Steps, in order:
      1. delete every character that is neither a word character nor
         whitespace;
      2. rewrite the stand-alone token "nt" as "not";
      3. collapse a letter repeated three or more times to a single letter
         ("knowwwwwww" -> "know");
      4. strip leading/trailing whitespace;
      5. run the spaCy pipeline and keep the lower-cased lemma of every token
         whose lower-cased lemma is not in the pipeline's default stop words.

    Args:
        text: The raw review text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    nlp = _get_nlp()
    stopwords = nlp.Defaults.stop_words

    text = _SYMBOLS_RE.sub('', text)
    text = _NT_RE.sub('not', text)
    # The previous pattern (r'\\s{2,}') looked for a literal backslash followed
    # by "ss", which can never be present after the symbol removal above, so
    # the elongated-word clean-up never ran. Only runs of 3+ are collapsed so
    # ordinary double letters ("good", "hello") are left untouched.
    # NOTE(review): collapsing to ONE letter follows the original comment
    # ("knowwwwwww -> know") but turns "goooood" into "god"; confirm this is
    # the desired trade-off.
    text = _REPEATED_LETTER_RE.sub(r'\1', text)
    text = text.strip()

    # Lower-case each lemma once, then drop the stop words.
    lemmas = (token.lemma_.lower() for token in nlp(text))
    return ' '.join(lemma for lemma in lemmas if lemma not in stopwords)


# Clean every review in place.
# Series.map is used rather than np.vectorize: np.vectorize is a convenience
# wrapper around a Python-level loop (no speed-up), and without `otypes` it
# calls the function one extra time on the first element and raises on an
# empty input. Series.map calls clean_text once per row and returns a Series
# directly.
df['review'] = df['review'].map(clean_text)

#### Save the cleaned data to a pickle file

In [None]:
import pickle

# Serialise only the cleaned review texts (the column's underlying array),
# not the whole DataFrame, into a binary pickle file.
with open('cleaned reviews.pkl', mode='wb') as pkl_file:
    cleaned_reviews = df['review'].values
    pickle.dump(cleaned_reviews, pkl_file)