In [1]:
import spacy
import pandas as pd
import itertools as it

In [3]:
nlp = spacy.load('en')

In [4]:
des = "Don’t Starve is an uncompromising wilderness survival game full of science and magic. \
Enter a strange and unexplored world full of strange creatures, dangers, and surprises. Gather \
resources to craft items and structures that match your survival style."

In [24]:
des_unicode = unicode(des,encoding="utf-8")

In [25]:
parsed_des = nlp(des_unicode)

In [13]:
print(parsed_des)

Don’t Starve is an uncompromising wilderness survival game full of science and magic. Enter a strange and unexplored world full of strange creatures, dangers, and surprises. Gather resources to craft items and structures that match your survival style.


In [17]:
# parsing the sentences
for num, sentence in enumerate(parsed_des.sents):
    print 'Sentence {}:'.format(num + 1)
    print sentence
    print ''

Sentence 1:
Don’t Starve is an uncompromising wilderness survival game full of science and magic.

Sentence 2:
Enter a strange and unexplored world full of strange creatures, dangers, and surprises.

Sentence 3:
Gather resources to craft items and structures that match your survival style.



In [26]:
# parsing the entities - there were no entities
for num, entity in enumerate(parsed_des.ents):
    print 'Entity {}:'.format(num + 1), entity, '-', entity.label_
    print ''

In [46]:
# part of speech
token_text = [token.orth_ for token in parsed_des]
token_pos = [token.pos_ for token in parsed_des]

pd.DataFrame(zip(token_text, token_pos),
             columns=['token_text', 'part_of_speech']).head()

Unnamed: 0,token_text,part_of_speech
0,Do,VERB
1,n’t,ADV
2,Starve,VERB
3,is,VERB
4,an,DET


In [47]:
# lemmatization and shape
token_lemma = [token.lemma_ for token in parsed_des]
token_shape = [token.shape_ for token in parsed_des]

pd.DataFrame(zip(token_text, token_lemma, token_shape),
             columns=['token_text', 'token_lemma', 'token_shape']).head()

Unnamed: 0,token_text,token_lemma,token_shape
0,Do,do,Xx
1,n’t,not,x’x
2,Starve,starve,Xxxxx
3,is,be,xx
4,an,an,xx


In [48]:
token_entity_type = [token.ent_type_ for token in parsed_des]
token_entity_iob = [token.ent_iob_ for token in parsed_des]

pd.DataFrame(zip(token_text, token_entity_type, token_entity_iob),
             columns=['token_text', 'entity_type', 'inside_outside_begin']).head()

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,Do,,O
1,n’t,,O
2,Starve,,O
3,is,,O
4,an,,O


In [49]:
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_des]

df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: u'Yes' if x else u''))
                                               
df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Do,-7.660115,Yes,,,,
1,n’t,-19.579313,,,,,Yes
2,Starve,-19.579313,,,,,
3,is,-4.329765,Yes,,,,
4,an,-5.953294,Yes,,,,
5,uncompromising,-19.579313,,,,,
6,wilderness,-19.579313,,,,,
7,survival,-10.912963,,,,,
8,game,-8.066769,,,,,
9,full,-8.511314,Yes,,,,


In [50]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [51]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [52]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(des):
            f.write(sentence + '\n')

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs


In [63]:
file_object  = open("/Users/jasonchiu0803/Desktop/data_bootcamp/mcd.txt", "rb")

In [65]:
a = file_object.read()

In [66]:
print(a)

{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810
{\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset0 LucidaGrande;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww10800\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

\f0\fs24 \cf0 ITEM 1. Business McDonald\'92s Corporation, the registrant, together with its sub-sidiaries, is referred to herein as the \'93Company.\'94\
a. General\
During 2016, there were no material changes to the Company's corporate structure or in its method of conducting business. The business is structured with segments that combine markets with similar characteristics and opportunities for growth. Significant reportable segments include the United States ("U.S."), International Lead Markets and High Growth Markets. In addition, throughout this report we present the Foundational Markets & Corporate segment, which includes m

In [67]:
a_unicode = unicode(a,encoding="utf-8")

In [68]:
parsed_a = nlp(a_unicode)

In [69]:
print(parsed_a)

{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810
{\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset0 LucidaGrande;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww10800\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

\f0\fs24 \cf0 ITEM 1. Business McDonald\'92s Corporation, the registrant, together with its sub-sidiaries, is referred to herein as the \'93Company.\'94\
a. General\
During 2016, there were no material changes to the Company's corporate structure or in its method of conducting business. The business is structured with segments that combine markets with similar characteristics and opportunities for growth. Significant reportable segments include the United States ("U.S."), International Lead Markets and High Growth Markets. In addition, throughout this report we present the Foundational Markets & Corporate segment, which includes m

In [70]:
for num, sentence in enumerate(parsed_a.sents):
    print 'Sentence {}:'.format(num + 1)
    print sentence
    print ''

Sentence 1:
{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810
{\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset0 LucidaGrande;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww10800\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

\f0\fs24 \cf0 ITEM 1.

Sentence 2:
Business McDonald\'92s Corporation, the registrant, together with its sub-sidiaries, is referred to herein as the \'93Company.\'94\
a. General\
During 2016, there were no material changes to the Company's corporate structure or in its method of conducting business.

Sentence 3:
The business is structured with segments that combine markets with similar characteristics and opportunities for growth.

Sentence 4:
Significant reportable segments include the United States ("U.S."), International Lead Markets and High Growth Markets.

Sentence 5:
In addition, throughout this report we presen

In [71]:
for num, entity in enumerate(parsed_a.ents):
    print 'Entity {}:'.format(num + 1), entity, '-', entity.label_
    print ''

Entity 1: ITEM - ORG

Entity 2: Business McDonald\'92s Corporation - ORG

Entity 3: General\ - CARDINAL

Entity 4: 2016 - DATE

Entity 5: Company - ORG

Entity 6: the United States - GPE

Entity 7: U.S. - GPE

Entity 8: International Lead Markets - ORG

Entity 9: the Foundational Markets & Corporate - ORG

Entity 10: 80 - CARDINAL

Entity 11: the years ended December 31, 2016, 2015 - DATE

Entity 12: 2014 - DATE

Entity 13: II, - ORG

Entity 14: 8 - CARDINAL

Entity 15: 44 - CARDINAL

Entity 16: Narrative - GPE

Entity 17: Company - ORG

Entity 18: McDonald\'92s - ORG

Entity 19: more than 100 - CARDINAL

Entity 20: McDonald\'92s - FAC

Entity 21: Company-owned - PERSON

Entity 22: approximately 95% franchised - PERCENT

Entity 23: McDonald\'92s - ORG

Entity 24: McDonald\'92s - ORG

Entity 25: 
The Company - WORK_OF_ART

Entity 26: approximately 85% of - PERCENT

Entity 27: McDonald - ORG

Entity 28: McDonald\'92s - ORG

Entity 29: One - CARDINAL

Entity 30: Company - ORG

Entity 31: 

In [72]:
# part of speech
token_text = [token.orth_ for token in parsed_a]
token_pos = [token.pos_ for token in parsed_a]

pd.DataFrame(zip(token_text, token_pos),
             columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,{,PUNCT
1,\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrt...,PROPN
2,\n,SPACE
3,{,PUNCT
4,\fonttbl\f0\fswiss\fcharset0,NOUN
5,Helvetica;\f1\fnil\fcharset0,PROPN
6,LucidaGrande,PROPN
7,;,PUNCT
8,},PUNCT
9,\n,SPACE


In [73]:
token_lemma = [token.lemma_ for token in parsed_a]

In [74]:
token_lemma

[u'{',
 u'\\rtf1\\ansi\\ansicpg1252\\cocoartf1504\\cocoasubrtf810',
 u'\n',
 u'{',
 u'\\fonttbl\\f0\\fswiss\\fcharset0',
 u'helvetica;\\f1\\fnil\\fcharset0',
 u'lucidagrande',
 u';',
 u'}',
 u'\n',
 u'{',
 u'\\colortbl;\\red255\\green255\\blue255',
 u';',
 u'}',
 u'\n',
 u'{',
 u'\\*\\expandedcolortbl',
 u';',
 u';',
 u'}',
 u'\n',
 u'\\margl1440\\margr1440\\vieww10800\\viewh8400\\viewkind0',
 u'\n',
 u'\\pard\\tx720\\tx1440\\tx2160\\tx2880\\tx3600\\tx4320\\tx5040\\tx5760\\tx6480\\tx7200\\tx7920\\tx8640\\pardirnatural\\partightenfactor0',
 u'\n\n',
 u'\\f0\\fs24',
 u'\\cf0',
 u'item',
 u'1',
 u'.',
 u'business',
 u"mcdonald\\'92s",
 u'corporation',
 u',',
 u'the',
 u'registrant',
 u',',
 u'together',
 u'with',
 u'-PRON-',
 u'sub',
 u'-',
 u'sidiarie',
 u',',
 u'be',
 u'refer',
 u'to',
 u'herein',
 u'as',
 u'the',
 u"\\'93company.\\'94\\",
 u'\n',
 u'a.',
 u'general\\',
 u'\n',
 u'during',
 u'2016',
 u',',
 u'there',
 u'be',
 u'no',
 u'material',
 u'change',
 u'to',
 u'the',
 u'company'

In [98]:
token_nopunc = [token.is_punct for token in parsed_a]

In [103]:
words = [token for token in parsed_a if token.is_punct == False][16:]

In [107]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [108]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [132]:
import os
import codecs
unigram_sentences_filepath = os.path.join('./','unigram_sentences_all.txt')

In [142]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus("/Users/jasonchiu0803/Desktop/data_bootcamp/mcd.txt"):
            f.write(sentence + '\n')

CPU times: user 589 ms, sys: 441 ms, total: 1.03 s
Wall time: 1.22 s


In [143]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [144]:
for unigram_sentence in unigram_sentences:
    print u' '.join(unigram_sentence)
    print u''

\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810

\fonttbl\f0\fswiss\fcharset0 helvetica;\f1\fnil\fcharset0 lucidagrande

\colortbl;\red255\green255\blue255

\*\expandedcolortbl

\margl1440\margr1440\vieww10800\viewh8400\viewkind0

\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

\f0\fs24 \cf0 item 1

business mcdonald\'92s corporation the registrant together with -PRON- sub sidiarie be refer to herein as the \'93company.\'94\

a. general\

during 2016 there be no material change to the company 's corporate structure or in -PRON- method of conduct business

the business be structure with segment that combine market with similar characteristic and opportunity for growth

significant reportable segment include the united states u.s. international lead markets and high growth markets

in addition throughout this report -PRON- present the foundational markets corporate segment which include market in over 80 country

In [147]:
bigram_model_filepath = os.path.join('./', 'bigram_model_all')

In [148]:

%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 0:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 33.9 ms, sys: 10.4 ms, total: 44.3 ms
Wall time: 50.5 ms


In [149]:
bigram_sentences_filepath = os.path.join("./",
                                         'bigram_sentences_all.txt')

In [155]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

CPU times: user 73.8 ms, sys: 4.37 ms, total: 78.2 ms
Wall time: 128 ms


In [156]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [157]:
for bigram_sentence in it.islice(bigram_sentences, 0, 240):
    print u' '.join(bigram_sentence)
    print u''

\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810

\fonttbl\f0\fswiss\fcharset0 helvetica;\f1\fnil\fcharset0 lucidagrande

\colortbl;\red255\green255\blue255

\*\expandedcolortbl

\margl1440\margr1440\vieww10800\viewh8400\viewkind0

\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

\f0\fs24 \cf0 item 1

business mcdonald\'92s corporation the registrant together with -PRON- sub sidiarie be refer to herein as the \'93company.\'94\

a. general\

during 2016 there be no material change to the_company 's corporate structure or in -PRON- method of conduct business

the business be structure with segment that combine market with similar characteristic and opportunity for growth

significant reportable segment include the united_states u.s. international lead markets and high growth markets

in_addition throughout this report -PRON- present the foundational markets corporate segment which include market in over 80 country

In [159]:
trigram_model_filepath = os.path.join('./',
                                      'trigram_model_all')

In [160]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 0:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

CPU times: user 31.2 ms, sys: 9.37 ms, total: 40.6 ms
Wall time: 33.6 ms


In [161]:
trigram_sentences_filepath = os.path.join("./",
                                          'trigram_sentences_all.txt')

In [163]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

CPU times: user 67.6 ms, sys: 3.81 ms, total: 71.4 ms
Wall time: 71 ms


In [164]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [166]:
for trigram_sentence in it.islice(trigram_sentences, 0, 240):
    print u' '.join(trigram_sentence)
    print u''

\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810

\fonttbl\f0\fswiss\fcharset0 helvetica;\f1\fnil\fcharset0 lucidagrande

\colortbl;\red255\green255\blue255

\*\expandedcolortbl

\margl1440\margr1440\vieww10800\viewh8400\viewkind0

\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0

\f0\fs24 \cf0 item 1

business mcdonald\'92s corporation the registrant together with -PRON- sub sidiarie be refer to herein as the \'93company.\'94\

a. general\

during 2016 there be no material change to the_company 's corporate structure or in -PRON- method of conduct business

the business be structure with segment that combine market with similar characteristic and opportunity for growth

significant reportable segment include the united_states u.s. international lead markets and high growth markets

in_addition throughout this report -PRON- present the foundational markets corporate segment which include market in over 80 country

In [189]:
trigram_reviews_filepath = os.path.join("./",
                                        'trigram_transformed_reviews_all.txt')

In [190]:
# ways to run through all three at once

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review("/Users/jasonchiu0803/Desktop/data_bootcamp/mcd.txt"),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.language_data.STOP_WORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

In [191]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import cPickle as pickle

In [192]:
trigram_dictionary_filepath = os.path.join("./",
                                           'trigram_dict_all.dict')

In [193]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 0 == 0:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)


CPU times: user 20.5 ms, sys: 4.68 ms, total: 25.2 ms
Wall time: 21.3 ms


In [195]:
trigram_bow_filepath = os.path.join("./",
                                    'trigram_bow_corpus_all.mm')

In [196]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [198]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 0 == 0:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

CPU times: user 17.4 ms, sys: 2.81 ms, total: 20.2 ms
Wall time: 18.8 ms


In [199]:
lda_model_filepath = os.path.join("./", 'lda_model_all')

In [200]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 0:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

CPU times: user 330 ms, sys: 152 ms, total: 482 ms
Wall time: 621 ms


In [201]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print u'{:20} {}'.format(u'term', u'frequency') + u'\n'

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print u'{:20} {:.3f}'.format(term, round(frequency, 3))

In [246]:
topic_names = {0: u'a',
               1: u'b',
               2: u'c',
               3: u'd',
               4: u'e',
               5: u'f',
               6: u'g',
               7: u'h',
               8: u'i',
               9: u'j',
               10: u'k',
               11: u'l',
               12: u'm',
               13: u'n',
               14: u'o',
               15: u'p',
               16: u'q',
               17: u'r',
               18: u's',
               19: u't',
               20: u'u',
               21: u'v',
               22: u'w',
               23: u'x',
               24: u'y',
               25: u'z',
               26: u'aa',
               27: u'ab',
               28: u'ac',
               29: u'ad',
               30: u'ae',
               31: u'af',
               32: u'ag',
               33: u'ah',
               34: u'ai',
               35: u'aj',
               36: u'ak',
               37: u'al',
               38: u'am',
               39: u'an',
               40: u'ao',
               41: u'ap',
               42: u'aq',
               43: u'ar',
               44: u'as',
               45: u'at',
               46: u'au',
               47: u'av',
               48: u'aw',
               49: u'ax'}

In [247]:
def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review("/Users/jasonchiu0803/Desktop/data_bootcamp/mcd.txt"),
                          review_number, review_number+1))[0]

In [249]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.en.language_data.STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda = sorted(review_lda, key=lambda (topic_number, freq): -freq)
    
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        print '{:25} {}'.format(topic_names[topic_number],
                                round(freq, 3))

In [250]:
sample_review = get_sample_review(40)
print sample_review

McDonald\'92s restaurants in the U.S. and many international markets offer a full or limited breakfast menu. Breakfast offerings may include Egg McMuffin, Sausage McMuffin with Egg, McGriddles, biscuit and bagel sandwiches and hotcakes.\



In [251]:
lda_description(sample_review)

ak                        0.67
am                        0.17


In [252]:
topic_names_filepath = os.path.join('./', 'topic_names.pkl')

with open(topic_names_filepath, 'w') as f:
    pickle.dump(topic_names, f)

In [253]:
LDAvis_data_filepath = os.path.join("./", 'ldavis_prepared')

In [255]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'w') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = pickle.load(f)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


CPU times: user 864 ms, sys: 76.1 ms, total: 940 ms
Wall time: 1.83 s


In [256]:
pyLDAvis.display(LDAvis_prepared)


In [260]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = os.path.join("./", 'word2vec_model_all')

In [262]:
token_count = sum([len(sentence) for sentence in trigram_sentences])

In [263]:
token_count

7247

In [267]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the word2vec model yourself.
if 0 == 0:

    # initiate the model and perform the first epoch of training
    food2vec = Word2Vec(trigram_sentences, size=100, window=5,
                        min_count=20, sg=1, workers=4)
    
    food2vec.save(word2vec_filepath)

    # perform another 11 epochs of training
    for i in range(1,12):

        food2vec.train(trigram_sentences, total_examples = token_count, epochs=food2vec.iter)
        food2vec.save(word2vec_filepath)
        
# load the finished model from disk
food2vec = Word2Vec.load(word2vec_filepath)
food2vec.init_sims()

print u'{} training epochs so far.'.format(food2vec.train_count)

12 training epochs so far.
CPU times: user 497 ms, sys: 45.6 ms, total: 542 ms
Wall time: 561 ms


In [270]:
print u'{:,} terms in the food2vec vocabulary.'.format(len(food2vec.wv.vocab))

46 terms in the food2vec vocabulary.


In [277]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in food2vec.wv.vocab.iteritems()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda (term, index, count): -count)

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(food2vec.wv.syn0norm[term_indices, :],
                            index=ordered_terms)

word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
-PRON-,0.071421,-0.001411,-0.130727,0.171738,0.168539,0.204586,0.10212,-0.101801,-0.006015,0.047438,...,-0.109683,0.073886,-0.088152,-0.082956,0.069986,-0.100864,0.208725,-0.044635,0.018535,-0.018462
and,0.155849,0.006213,-0.13057,0.078606,0.032627,0.231023,0.074588,-0.127097,0.019867,0.035506,...,-0.177488,0.12741,-0.216536,-0.093412,0.059267,0.009622,0.099667,0.011719,0.026815,-0.115114
the,0.224218,-0.038701,-0.070422,0.079195,0.025803,0.126871,0.000512,-0.217058,-0.006914,0.034563,...,-0.23292,0.037623,-0.200053,-0.0532,-0.011712,0.008506,0.059083,0.088834,-0.069187,-0.065261
of,0.235987,-0.031847,-0.101493,0.066414,0.033176,0.07499,-0.033531,-0.174869,-0.076832,0.011644,...,-0.208143,0.014227,-0.232382,-0.036851,-0.040678,-0.041212,0.114656,0.101213,-0.055822,-0.01681
to,0.184837,0.049603,-0.168775,0.030522,-0.086197,0.172532,0.035045,-0.174679,-0.002708,-0.007116,...,-0.238221,0.082218,-0.193133,-0.08316,0.05228,-0.031314,0.073937,0.078793,-0.081293,-0.058763
be,0.160383,-0.001855,-0.125057,0.099042,-0.062676,0.104698,-0.064665,-0.052527,-0.025689,-0.035448,...,-0.093766,0.135511,-0.276702,0.009389,0.02939,-0.022774,0.133094,0.087334,-0.069397,0.013143
or,0.068484,0.027303,-0.194242,0.158276,0.261592,0.199756,-0.055404,-0.052003,-0.076413,0.021032,...,-0.159643,0.007094,-0.113585,-0.114039,0.046185,-0.081733,0.239358,-0.048087,0.00165,-0.050211
in,0.192825,-0.074724,-0.072616,0.052185,0.18379,0.139361,-0.040082,-0.20746,-0.04459,0.042163,...,-0.165194,-0.015614,-0.169355,-0.155835,0.018982,-0.068063,0.190457,0.023204,0.02999,-0.083352
a,0.232143,-0.013473,-0.060615,0.053787,-0.073921,-0.010602,0.002669,-0.177025,-0.081551,0.04189,...,-0.175443,0.024232,-0.176908,0.069321,-0.048419,-0.065208,-0.002981,0.112671,-0.123704,0.035947
may,0.093058,-0.041572,-0.146209,0.035166,0.182895,0.199576,-0.056406,-0.0953,-0.04598,0.002971,...,-0.14604,-0.007627,-0.172743,-0.226624,0.065028,-0.040982,0.259364,0.013446,0.031935,-0.088504


array([[ 0.07142086, -0.00141114, -0.13072662, ..., -0.04463548,
         0.0185354 , -0.01846205],
       [ 0.15584873,  0.00621272, -0.13056965, ...,  0.01171853,
         0.0268152 , -0.11511421],
       [ 0.22421838, -0.03870134, -0.07042163, ...,  0.0888344 ,
        -0.06918655, -0.06526057],
       ..., 
       [ 0.15398295, -0.04186517,  0.07578854, ..., -0.06090095,
         0.08582148, -0.15130161],
       [ 0.19145985,  0.03319355, -0.13039264, ...,  0.03825092,
         0.07283761, -0.04542649],
       [ 0.21185714, -0.11049097,  0.02566633, ...,  0.12871933,
        -0.04678935, -0.05578256]], dtype=float32)

In [278]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in food2vec.most_similar(positive=[token], topn=topn):

        print u'{:20} {}'.format(word, round(similarity, 3))

In [281]:
get_related_terms(u'cost',topn=50)

increase             0.954
-PRON-               0.857
could                0.85
business             0.801
risk                 0.799
include              0.778
brand                0.768
which                0.757
may                  0.746
these                0.721
and                  0.701
have                 0.697
market               0.68
ability_to           0.668
or                   0.667
in                   0.662
can                  0.652
operation            0.649
initiative           0.645
that                 0.631
if                   0.62
not                  0.593
result               0.589
also                 0.585
product              0.583
strategy             0.578
financial            0.548
other                0.543
on                   0.535
food                 0.499
with                 0.493
franchisee           0.438
to                   0.437
the                  0.403
of                   0.39
for                  0.388
information          0.286
be   