In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
import re
from joblib import Parallel, delayed
import multiprocessing
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
# Download the NLTK data packages this notebook relies on (each call is a
# no-op when the package is already up to date), then build a lemmatizer.
for nltk_package in ('stopwords', 'punkt', 'wordnet', "averaged_perceptron_tagger"):
    nltk.download(nltk_package)
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pinecone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pinecone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pinecone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pinecone/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Read the three input tables from CSV files in the working directory.
business, review, tip = (
    pd.read_csv(csv_path)
    for csv_path in ('business.csv', 'review.csv', 'tip.csv')
)

In [3]:
# Print the (rows, columns) shape of each loaded table, one per line.
for frame in (business, review, tip):
    print(frame.shape)

(1603, 14)
(243536, 9)
(38980, 5)


In [4]:
def lemmatize_all(sentence):
    """Tokenise a sentence, drop English stop words and lemmatise the rest.

    Every token is POS-tagged; tokens whose tag starts with ``NN``, ``VB``,
    ``JJ`` or ``R`` are lemmatised as noun, verb, adjective or adverb
    respectively, and tokens with any other tag are kept unchanged.

    Parameters
    ----------
    sentence : str
        Text to process. Stop-word filtering is an exact string match
        against ``stopwords.words('english')``, so the caller decides the
        casing of the input.

    Returns
    -------
    list of str
        The surviving tokens, lemmatised where applicable, in input order.
    """
    wnl = WordNetLemmatizer()
    # Build the stop-word set once per call. Evaluating
    # ``stopwords.words('english')`` inside the loop repeated that lookup for
    # every token and then did a linear scan of the returned list.
    stop_words = set(stopwords.words('english'))
    res = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        if word in stop_words:
            continue
        if tag.startswith('NN'):
            res.append(wnl.lemmatize(word, pos='n'))
        elif tag.startswith('VB'):
            res.append(wnl.lemmatize(word, pos='v'))
        elif tag.startswith('JJ'):
            res.append(wnl.lemmatize(word, pos='a'))
        elif tag.startswith('R'):
            res.append(wnl.lemmatize(word, pos='r'))
        else:
            # Any other part of speech is passed through untouched.
            res.append(word)
    return res

In [5]:
def clean_sentences(sent):
    """Strip markup and non-letters from one text and lemmatise it.

    The text is passed through BeautifulSoup to drop any HTML markup, every
    character outside ``a-z``/``A-Z`` is replaced by a space, and the
    lower-cased result is handed to ``lemmatize_all``.

    Parameters
    ----------
    sent : str
        Raw text of a single review or tip.

    Returns
    -------
    list
        A one-element list whose only item is the list of lemmatised tokens.
        The extra level of nesting is kept so existing callers see the same
        shape as before.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ between machines.
    review_text = BeautifulSoup(sent, "html.parser").get_text()

    # remove non-alphabetic characters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    lemma_words = lemmatize_all(review_text.lower())

    return [lemma_words]


def tmp_func(df):
    """Apply ``clean_sentences`` to each entry of ``df['text']``.

    Returns the Series produced by ``Series.apply``, indexed like ``df``.
    """
    text_column = df['text']
    return text_column.apply(clean_sentences)

In [6]:
def apply_parallel(df_grouped, func):
    """Run ``func`` on every group of ``df_grouped`` via joblib and concatenate.

    ``df_grouped`` is iterated as ``(name, group)`` pairs; the name is
    ignored and each group is submitted as one joblib task with
    ``n_jobs=-1``. The per-group results are joined with ``pd.concat`` in
    submission order.
    """
    worker = delayed(func)
    tasks = (worker(chunk) for _, chunk in df_grouped)
    pieces = Parallel(n_jobs=-1)(tasks)
    return pd.concat(pieces)

In [10]:
# Split the reviews into one contiguous chunk per CPU core instead of one
# group per row: grouping on the row index hands joblib a separate one-row
# DataFrame for every review, which adds per-row scheduling and
# DataFrame-construction overhead. Each chunk keeps its original index
# labels, so the concatenated result can still be aligned back onto `review`.
review_chunk_size = max(1, -(-len(review) // multiprocessing.cpu_count()))  # ceil division
df_review = review.groupby(np.arange(len(review)) // review_chunk_size)
review_sentences = apply_parallel(df_review, tmp_func)

In [12]:
# Spot check: display the processed entry stored under key 10000.
review_sentences[10000]

[['decent',
  'wait',
  'staff',
  'rib',
  'dry',
  'chicken',
  'wing',
  'overcook',
  'stay',
  'away',
  'mac',
  'n',
  'cheese',
  'plastic',
  'chair',
  'good',
  'skinny',
  'folk',
  'get',
  'good',
  'chair']]

In [13]:
# Replace the raw 'text' column with the processed result.
# NOTE: this mutates `review` in place, so re-running the cleaning step after
# this cell would feed it the processed values instead of the raw text.
review['text'] = review_sentences

In [15]:
# Write the processed reviews to disk; index=False leaves the row index out
# of the CSV.
review.to_csv('review_split.csv', index=False)

In [18]:
# Group the `tip` frame itself. Grouping `review` by `tip.index` (as this cell
# previously did) would process review rows, not tips.
# The rows are split into one contiguous chunk per CPU core rather than one
# group per row, which avoids creating a joblib task and a one-row DataFrame
# for every tip. Each chunk keeps its original index labels, so the
# concatenated result can still be aligned back onto `tip`.
tip_chunk_size = max(1, -(-len(tip) // multiprocessing.cpu_count()))  # ceil division
df_tip = tip.groupby(np.arange(len(tip)) // tip_chunk_size)
tip_sentences = apply_parallel(df_tip, tmp_func)

In [20]:
# Spot check: display the processed entry stored under key 1.
tip_sentences[1]

[['hey', 'mom', 'look', 'nancy', 'friggen', 'kerrigan']]

In [21]:
# Replace the raw 'text' column with the processed result (in-place mutation
# of `tip`), then write the frame to disk without the row index.
tip['text'] = tip_sentences
tip.to_csv('tip_split.csv', index=False)