#### Import data set

In [1]:
import datatable as dt
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# Read the raw reviews with datatable, convert to pandas and lower-case the column names
df = (
    dt.fread('tripadvisor_hotel_reviews.csv')
    .to_pandas()
    .rename(columns=str.lower)
)

#### Data cleaning

In [2]:
# Count the missing values in each column
df.isna().sum()

review    0
rating    0
dtype: int64

In [3]:
import re
import spacy
from spacy.matcher import Matcher

def clean_text(text):
    '''Normalises a raw review string.

    Steps, in order:
      1. remove every character that is neither a word character nor whitespace;
      2. expand the standalone token "nt" to "not";
      3. trim leading and trailing whitespace;
      4. replace each run of two or more whitespace characters with ". ".

    Args:
        text: the review as a str.

    Returns:
        The cleaned str.
    '''
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\bnt\b', 'not', text)
    # Trim first so whitespace at the edges is not turned into a stray period.
    text = text.strip()
    # The pattern must be \s (whitespace); the previous '\\s{2,}' looked for a
    # literal backslash followed by "ss...", which can never occur once
    # punctuation has been removed. The replacement keeps one space after the
    # period so the neighbouring words stay separate tokens.
    text = re.sub(r'\s{2,}', '. ', text)
    return text

# Clean every review. Series.map applies clean_text element by element and keeps
# the results as ordinary Python strings; np.vectorize is also just a Python-level
# loop, but additionally converts the results into a NumPy string array first and
# cannot be called on an empty column.
df['review'] = df['review'].map(clean_text)

# Add a column holding the character length of each cleaned review
df['review_length'] = df['review'].str.len()

In [4]:
def get_chunks(limit, total):
    '''Splits the range [0, total) into consecutive (start, end) chunks.

    Every chunk spans `limit` items except possibly the last one, which is
    clipped so that its end never exceeds `total`.

    Args:
        limit: maximum number of items per chunk; must be a positive integer.
        total: total number of items to cover.

    Returns:
        A list of (start, end) tuples with `end` exclusive. The list is empty
        when `total` is 0 or negative.

    Raises:
        ValueError: if `limit` is not positive.
    '''
    if limit <= 0:
        raise ValueError('limit must be a positive integer')
    # min() clips the final chunk at `total` instead of overshooting it.
    return [(start, min(start + limit, total)) for start in range(0, total, limit)]


# Loaded spaCy pipelines, keyed by model name, so each model is loaded only once.
_NLP_CACHE = {}


def _get_nlp(model_name="en_core_web_sm"):
    '''Returns the spaCy pipeline for `model_name`, loading it on first use only'''
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def get_features(text, pattern_list):
    '''Uses Spacy rule-based matcher to extract phrases from a text.

    Args:
        text: the string to analyse.
        pattern_list: list of token patterns passed to `Matcher.add`.

    Returns:
        A str containing the text of every matched span, each followed by a
        newline. The string is empty when nothing matches.
    '''
    # Reuse the cached pipeline; calling spacy.load for every text repeats the
    # model loading on each call.
    nlp = _get_nlp()
    doc = nlp(text)

    matcher = Matcher(nlp.vocab)
    matcher.add('features', pattern_list, greedy='LONGEST')

    return ''.join(doc[start:end].text + '\n' for _, start, end in matcher(doc))

In [5]:
# Write the cleaned review text to disk, one review per row, with no index and no header
cleaned_review_name = 'cleaned reviews.txt'
df.to_csv(cleaned_review_name, columns=['review'], header=False, index=False)

In [None]:
# Extract the nouns of every cleaned review, reading the saved file line by line
pattern = [{'POS': 'NOUN'}]
# Create the collection here so the cell also runs on a fresh kernel
nouns = set()
# Both files are opened once for the whole loop; the output stays in append mode
with open(cleaned_review_name, "r", encoding="utf-8") as f, \
        open('extracted nouns.txt', "a", encoding="utf-8") as extracted:
    for l in f:
        # Run the matcher once per review and reuse the result for both sinks
        features = get_features(l, [pattern])
        # NOTE(review): this stores one newline-joined string per review, not
        # individual nouns - confirm whether nouns.update(features.splitlines())
        # was intended.
        nouns.add(features)
        extracted.write(features)