In [2]:

'''
train a model with all restaurant review
'''
import pandas as pd

# read
train = pd.read_table("/Users/luxiaopeng/notebook/inls613/closure_prediction/review_by_business.txt")

# Verify the number of reviews that were read (100,000 in total)
print("Read %d labeled train reviews" % (train["text"].size))



train.rename(columns={"text":"review"}, inplace=True)

train = train[["business_id","review"]]

len(train)

# drop na
train = train.dropna()

len(train)



# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords = False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

'''
Next, we want a specific input format. Word2Vec expects single sentences, 
each one as a list of words. In other words, the input format is a list of lists.

It is not at all straightforward how to split a paragraph into sentences. 
There are all kinds of gotchas in natural language. English sentences can 
end with "?", "!", """, or ".", among other things, and spacing and 
capitalization are not reliable guides either. For this reason, 
we'll use NLTK's punkt tokenizer for sentence splitting. In order to 
use this, you will need to install NLTK and use nltk.download() to 
download the relevant training file for punkt.
'''
# Download the punkt tokenizer for sentence splitting
import nltk.data
import nltk
# nltk.download('punkt')
# nltk.download('popular')
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords = False):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    # raw_sentences = tokenizer.tokenize(review.decode('utf-8').strip())
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

sentences = []
count = 0
print('parsing sentences from training set')
for review in train['review']:
    sentences += review_to_sentences(review, tokenizer)
    count += 1
    if count % 1000 == 0:
        print(count)

# check
print(len(sentences))
print("parsing finished!")

# try:
#     import pickle
#     with open('sentences.pkl', 'wb') as f:
#         pickle.dump(sentences, f)
# except:
#     print("cannot save as pkl")
#     pass


# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 3   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 5          # Context window size                                                                                    
# downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print('training model...')
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context)# , sample = downsampling
model_name = "yelp_restaurant_300features_3minwords_5context"
model.save(model_name)
print("model saved!")
model.most_similar("man")

Read 2087 labeled train reviews
parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


1000
2000
1018368
parsing finished!


2018-05-02 16:49:15,172 : INFO : 'pattern' package not found; tag filters are not available for English
2018-05-02 16:49:15,179 : INFO : collecting all words and their counts
2018-05-02 16:49:15,180 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-02 16:49:15,223 : INFO : PROGRESS: at sentence #10000, processed 127734 words, keeping 7660 word types
2018-05-02 16:49:15,258 : INFO : PROGRESS: at sentence #20000, processed 263109 words, keeping 11213 word types
2018-05-02 16:49:15,296 : INFO : PROGRESS: at sentence #30000, processed 400444 words, keeping 13614 word types
2018-05-02 16:49:15,326 : INFO : PROGRESS: at sentence #40000, processed 534494 words, keeping 15826 word types
2018-05-02 16:49:15,363 : INFO : PROGRESS: at sentence #50000, processed 669076 words, keeping 17439 word types


training model...


2018-05-02 16:49:15,399 : INFO : PROGRESS: at sentence #60000, processed 805023 words, keeping 18954 word types
2018-05-02 16:49:15,457 : INFO : PROGRESS: at sentence #70000, processed 941028 words, keeping 20254 word types
2018-05-02 16:49:15,520 : INFO : PROGRESS: at sentence #80000, processed 1076273 words, keeping 21541 word types
2018-05-02 16:49:15,586 : INFO : PROGRESS: at sentence #90000, processed 1209192 words, keeping 22499 word types
2018-05-02 16:49:15,633 : INFO : PROGRESS: at sentence #100000, processed 1340161 words, keeping 23537 word types
2018-05-02 16:49:15,694 : INFO : PROGRESS: at sentence #110000, processed 1475649 words, keeping 24865 word types
2018-05-02 16:49:15,755 : INFO : PROGRESS: at sentence #120000, processed 1613248 words, keeping 26035 word types
2018-05-02 16:49:15,815 : INFO : PROGRESS: at sentence #130000, processed 1750338 words, keeping 27130 word types
2018-05-02 16:49:15,879 : INFO : PROGRESS: at sentence #140000, processed 1885806 words, keepi

2018-05-02 16:49:18,242 : INFO : PROGRESS: at sentence #780000, processed 10581814 words, keeping 63774 word types
2018-05-02 16:49:18,279 : INFO : PROGRESS: at sentence #790000, processed 10720625 words, keeping 64135 word types
2018-05-02 16:49:18,312 : INFO : PROGRESS: at sentence #800000, processed 10856799 words, keeping 64519 word types
2018-05-02 16:49:18,346 : INFO : PROGRESS: at sentence #810000, processed 10996193 words, keeping 64984 word types
2018-05-02 16:49:18,378 : INFO : PROGRESS: at sentence #820000, processed 11129959 words, keeping 65420 word types
2018-05-02 16:49:18,410 : INFO : PROGRESS: at sentence #830000, processed 11265782 words, keeping 65760 word types
2018-05-02 16:49:18,446 : INFO : PROGRESS: at sentence #840000, processed 11401362 words, keeping 66109 word types
2018-05-02 16:49:18,479 : INFO : PROGRESS: at sentence #850000, processed 11539208 words, keeping 66463 word types
2018-05-02 16:49:18,512 : INFO : PROGRESS: at sentence #860000, processed 116744

2018-05-02 16:49:53,013 : INFO : EPOCH 3 - PROGRESS: at 28.97% examples, 725369 words/s, in_qsize 7, out_qsize 0
2018-05-02 16:49:54,014 : INFO : EPOCH 3 - PROGRESS: at 36.24% examples, 725113 words/s, in_qsize 7, out_qsize 0
2018-05-02 16:49:55,020 : INFO : EPOCH 3 - PROGRESS: at 43.11% examples, 719568 words/s, in_qsize 8, out_qsize 0
2018-05-02 16:49:56,025 : INFO : EPOCH 3 - PROGRESS: at 50.03% examples, 716472 words/s, in_qsize 7, out_qsize 0
2018-05-02 16:49:57,034 : INFO : EPOCH 3 - PROGRESS: at 56.88% examples, 713811 words/s, in_qsize 7, out_qsize 0
2018-05-02 16:49:58,035 : INFO : EPOCH 3 - PROGRESS: at 63.28% examples, 707780 words/s, in_qsize 7, out_qsize 0
2018-05-02 16:49:59,038 : INFO : EPOCH 3 - PROGRESS: at 70.37% examples, 709267 words/s, in_qsize 7, out_qsize 0
2018-05-02 16:50:00,043 : INFO : EPOCH 3 - PROGRESS: at 78.00% examples, 714292 words/s, in_qsize 7, out_qsize 0
2018-05-02 16:50:01,049 : INFO : EPOCH 3 - PROGRESS: at 84.62% examples, 710621 words/s, in_qsiz

model saved!


[('guy', 0.7689632773399353),
 ('woman', 0.7436196208000183),
 ('lady', 0.7430142164230347),
 ('gentleman', 0.7339794039726257),
 ('girl', 0.7078391909599304),
 ('gal', 0.6827062964439392),
 ('dude', 0.6572169065475464),
 ('gentlemen', 0.5664964914321899),
 ('boy', 0.544607400894165),
 ('cashier', 0.507962703704834)]

In [1]:
similar_list = model.most_similar(positive=["great"],topn=20)
for tupl in similar_list:
    print("'",tupl[0],"'",",",sep='',end='')

NameError: name 'model' is not defined

In [1]:
from gensim.models import Word2Vec
model = Word2Vec.load("yelp_restaurant_300features_3minwords_5context")

In [2]:
similar_list = model.most_similar(positive=["dessert"],topn=20)
for tupl in similar_list:
    print("'",tupl[0],"'",",",sep='',end='')

'desert','tiramisu','cheesecake','gelato','desserts','cannoli','baklava','flan','brownie','panna','sundae','nutella','smoothie','creme','torte','souffle','beignets','cupcakes','cobbler','starters',

  """Entry point for launching an IPython kernel.
