# WordNet

In [1]:
import nltk
from nltk.corpus import wordnet

In [2]:
syns = wordnet.synsets("new")
print(syns)
print(wordnet.synset('new.a.06').definition())
# we should use creative.a.02, by definition.
# but it doesn't have synonyms.
# so we use also creative.a.01, and gives

wordnet.synset('creative.a.01').lemma_names()

[Synset('new.a.01'), Synset('fresh.s.04'), Synset('raw.s.12'), Synset('new.s.04'), Synset('new.s.05'), Synset('new.a.06'), Synset('newfangled.s.01'), Synset('new.s.08'), Synset('modern.s.05'), Synset('new.s.10'), Synset('new.s.11'), Synset('newly.r.01')]
unaffected by use or exposure


['creative', 'originative']

In [3]:
first_word = wordnet.synset("creative.a.01")
second_word = wordnet.synset("originative.a.01")
print('Similarity: ' + str(first_word.wup_similarity(second_word)))

Similarity: 1.0


In [4]:
creativity_dict = ['creative', 'new', 'novel', 'interesting', 'genius', 
                   'imaginative', 'ingenious','innovative', 'inventive']

synonyms = []
antonyms = []

for creativity_word in creativity_dict:
    for syn in wordnet.synsets(creativity_word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
            if lemma.antonyms():
                for a in lemma.antonyms():
                    antonyms.append(a.name())

creative_list = list(set(synonyms))
antonyms = list(set(antonyms))
print(creative_list)
print(antonyms)

noncreative_list = []
for word in antonyms:
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            noncreative_list.append(lemma.name())

print(list(set(noncreative_list)))

['New', 'refreshing', 'interest', 'inventive', 'raw', 'genius', 'Einstein', 'maven', 'clever', 'innovational', 'ace', 'star', 'superstar', 'groundbreaking', 'brainiac', 'sensation', 'concern', 'unexampled', 'occupy', 'fresh', 'newly', 'adept', 'worry', 'champion', 'modern', 'wizard', 'imaginative', 'wizardry', 'Modern', 'mastermind', 'ingenious', 'advanced', 'matter_to', 'mavin', 'wiz', 'new', 'cunning', 'whiz', 'originative', 'whizz', 'flair', 'brilliance', 'young', 'interesting', 'virtuoso', 'creative', 'hotshot', 'freshly', 'forward-looking', 'novel', 'brain', 'newfangled', 'innovative']
['old', 'worn', 'uninteresting', 'bore', 'uncreative']
['hold', 'turn_out', 'abide', 'deport', 'drawn', 'fag_out', 'acquit', 'expect', 'gestate', 'have_a_bun_in_the_oven', 'eager', 'bore-hole', 'bear', 'gauge', 'sometime', 'have', 'haggard', 'accept', 'bust', 'erstwhile', 'honest-to-goodness', 'deliver', 'take_over', 'comport', 'stand', 'outwear', 'drill_hole', 'older', 'honest-to-god', 'raddled', '

# Word2Vec

# Pre-trained Google News Model

In [5]:
import gensim.downloader as api
pretrained_model = api.load('word2vec-google-news-300')

KeyboardInterrupt: 

In [None]:
from gensim import models

google_model = models.KeyedVectors.load_word2vec_format(
    '/Users/nessyliu/Desktop/RA/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
len(list(pretrained_model.vocab))

In [None]:
print(pretrained_model['creative'])

In [None]:
# # Similarity
# pairs = [
#     ('creative', 'interesting'),   
#     ('banana', 'apple') 
# ]
# for w1, w2 in pairs:
#     print('%r\t%r\t%.2f' % (w1, w2, pretrained_model.similarity(w1, w2)))
    
# top similar words to the words in our dict
top_similar_1 = list(set(pretrained_model.most_similar(positive=creativity_dict, topn=20)))
print("similar words to the words in our dict:")
print(top_similar_1)

# top similar words to "uncreative"
print("\nsimilar words to 'uncreative':")
print(list(set(pretrained_model.most_similar(positive=['creative'], topn=10))))


# top negative words to the words in our dict
top_negative_1 = list(set(pretrained_model.most_similar(negative=creativity_dict, topn=20)))
print("\nnegative words to the words in our dict:")
print(top_negative_1)


# Build our own word2vec model with review data

In [None]:
import pandas as pd
import gensim
from gensim import utils
import tempfile

class ReviewCorpus(object):
    """An interator that yields sentences (list of tokens)."""

    def __iter__(self):
        df = pd.read_csv('/Users/nessyliu/Desktop/RA/AllReviews_26thNov2019.csv')
        for review in df['review_text']:
            yield utils.simple_preprocess(review)

reviews = ReviewCorpus()
review_model = gensim.models.Word2Vec(reviews, min_count=5, size=300, window = 5)

In [None]:
# save the model
with tempfile.NamedTemporaryFile(prefix='review-model-', delete=False) as tmp:
        review_model_path = tmp.name
        print(review_model_path)
        review_model.save(review_model_path)

In [None]:
only_review_model = gensim.models.Word2Vec.load(review_model_path)

In [None]:
words = list(only_review_model.wv.vocab)
print(len(words))

In [None]:
# # Similarity
# pairs = [
#     ('king', 'queen'),   
#     ('banana', 'apple') 
# ]
# for w1, w2 in pairs:
#     print('%r\t%r\t%.2f' % (w1, w2, only_review_model.wv.similarity(w1, w2)))


    
top_similar_2 = list(set(only_review_model.wv.most_similar(positive=creativity_dict, topn=20)))
print("similar words to the words in our dict:")
print(top_similar_2)

# # top similar words to "uncreative"
# print("\nsimilar words to 'uncreative':")
# print(list(set(only_review_model.most_similar(positive=['uncreative'], topn=10))))

# top negative words to the words in our dict
top_negative_2 = list(set(only_review_model.wv.most_similar(negative=creativity_dict, topn=20)))
print("\nnegative words to the words in our dict:")
print(top_negative_2)


In [None]:
print(only_review_model.wv['creative'])

# Train model with our review data based on pre-trained model

In [None]:
print(len(list(pretrained_model.vocab)))

df = pd.read_csv('/Users/nessyliu/Desktop/RA/AllReviews_26thNov2019.csv')
review_list = []
for review in df['review_text']:
    review_list.append(utils.simple_preprocess(review))
print(len(review_list))

In [None]:
new_model = gensim.models.Word2Vec(min_count=5, size=300, window = 5)

new_model.build_vocab(review_list)
new_model.intersect_word2vec_format("/Users/nessyliu/Desktop/RA/GoogleNews-vectors-negative300.bin", 
                                  binary=True, lockf=1.0)
new_model.train(review_list, total_examples=len(review_list), epochs=new_model.epochs)

print(len(list(new_model.wv.vocab)))

# pairs = [
#     ('creative', 'interesting'),   
#     ('banana', 'apple') 
# ]
# for w1, w2 in pairs:
#     print('%r\t%r\t%.2f' % (w1, w2, new_model.wv.similarity(w1, w2)))

print(new_model.wv['creative'])

In [None]:
# top similar words to the words in our dict
top_similar_3 = list(set(new_model.wv.most_similar(positive=creativity_dict, topn=100)))
print("similar words to the words in our dict:")
print(top_similar_3)

# # top similar words to "uncreative"
# print("\nsimilar words to 'uncreative':")
# print(list(set(new_model.most_similar(positive=['uncreative'], topn=10))))

# top negative words to the words in our dict
top_negative_3 = list(set(new_model.wv.most_similar(negative=creativity_dict, topn=20)))
print("\nnegative words to the words in our dict:")
print(top_negative_3)

In [None]:
uncreative_3 = list(set(new_model.wv.most_similar(positive=['old', 'bore', 'worn', 'uninteresting',
                                                            'uninteresting','uninspired',
                                                            'boring','bland'], topn=30)))
print("similar words to the 'uncreative' words:")
print(uncreative_3)

for i in uncreative_3:
    print(i[0])

In [None]:
with tempfile.NamedTemporaryFile(prefix='new-model-', delete=False) as tmp:
        new_model_path = tmp.name
        print(new_model_path)
        new_model.save(new_model_path)

In [None]:
new_model_load = gensim.models.Word2Vec.load(new_model_path)

# Build Matrix

In [6]:
gensim.models.Word2Vec.load('/Users/nessyliu/Documents/GitHub/Recipe_Project/OUT/word2vec_model/model_3')

NameError: name 'gensim' is not defined