In [1]:
#load the csv to the notebook
import pandas as pd

In [2]:
rawInput=pd.read_csv('tripadvisor millimum park.csv',encoding="ISO-8859-1")
rawInput.head()

Unnamed: 0.1,Unnamed: 0,user,location,date,rating,title,comments
0,0,,unknown,"June 21, 2018",50,Thanks again Go Card,Go early. They have weekend yoga activities. L...
1,0,,,"June 21, 2018",50,"Great people watching, skyline viewing","Really enjoyed the Gospel Fest, so much to see..."
2,0,,unknown,"June 21, 2018",40,Very Interesting,"I love the atmosphere, the art, and the struct..."
3,0,,,"June 20, 2018",40,Lovely place,Second visit to this smashing park between the...
4,0,,unknown,"June 20, 2018",40,great greenery sightseeing,"A perfect place for photos and driving a bike,..."


## preprocess text

In [6]:
#import the nltk lib for preprocessing
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer

In [7]:
#nltk.download() #make sure the computer has all nltk corpus

In [8]:
import string #for getting punctuation
import gensim

In [9]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()##WordNetLemmatizer is a class

In [10]:
##a def to clean the text(typical way), very nice
def clean(doc):
    stop_free = " ".join([i for i in gensim.utils.simple_preprocess(doc,min_len=3) if i not in stop])
    ##simple_preprocess can lowercase words and tokenize them,you can set up min len of token too,here is 3, so word like AI will not be included
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())##这个用来找stem
    return normalized

In [11]:
doc_complete=rawInput['comments'][:100]

In [12]:
doc_clean = [clean(doc).split() for doc in doc_complete]    
#doc_clean is a list中包list,每个item是一个doc的tokens

In [43]:
doc_clean[:2]

[['early',
  'weekend',
  'yoga',
  'activity',
  'lot',
  'great',
  'area',
  'take',
  'picture',
  'restaurant',
  'line',
  'side',
  'visit',
  'gift',
  'shop',
  'reasonable',
  'priced',
  'souvenir'],
 ['really',
  'enjoyed',
  'gospel',
  'fest',
  'much',
  'see',
  'around',
  'stage',
  'awesome',
  'could',
  'spend',
  'hour']]

## Train Word2Vec

In [17]:
#import gensim.models.word2vec as w2v
import gensim
from gensim import corpora
import multiprocessing


In [18]:
#build model
#3 main tasks that vectors help with
#DISTANCE, SIMILARITY, RANKING

# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train but also more accurate
#more dimensions = more generalized
num_features = 300
# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel, more workers, faster we train
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
#0 - 1e-5 is good for this
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
#random number generator
#deterministic, good for debugging
seed = 1

In [77]:
word2vec = gensim.models.word2vec.Word2Vec(doc_clean,
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [78]:
#train the sentences
word2vec.train(doc_clean,total_examples=len(doc_clean), epochs=word2vec.iter)#epochs is training iterations

  


(4706, 11665)

In [85]:
#explore the trained model
word2vec.most_similar('enjoy',topn=20)#topn is the number of similar words

  


[('free', 0.999837338924408),
 ('nice', 0.9998323917388916),
 ('food', 0.9998291730880737),
 ('park', 0.9998283386230469),
 ('take', 0.9998214244842529),
 ('garden', 0.9998208284378052),
 ('kid', 0.9998206496238708),
 ('easy', 0.9998193383216858),
 ('water', 0.9998184442520142),
 ('place', 0.9998180866241455),
 ('also', 0.9998164772987366),
 ('picture', 0.999816358089447),
 ('concert', 0.9998162388801575),
 ('music', 0.9998158812522888),
 ('right', 0.999815046787262),
 ('picnic', 0.9998116493225098),
 ('view', 0.9998114705085754),
 ('around', 0.999810516834259),
 ('photo', 0.9998100996017456),
 ('keep', 0.9998099207878113)]