In [1]:
import gensim
import pandas as pd

In [2]:
# seed words for addition
AddSeed_list = ['add']

# seed words for deletion
DeleteSeed_list = ['remove', 'subtract', 'omit']

# seed words for replacement
ReplaceSeed_list = ['substitute', 'replace', 'instead']

## Read in and do simple preprocessing to review text

In [3]:
df = pd.read_csv('input_data/AllReviews_26thNov2019.csv')
review_list = []
for review in df['review_text']:
    review_list.append(gensim.utils.simple_preprocess(review))

## Train Word2Vec Model with Google Model Result as Initialization

In [4]:
new_model = gensim.models.Word2Vec(min_count=5, size=300, window = 5)

new_model.build_vocab(review_list)
new_model.intersect_word2vec_format("input_data/GoogleNews-vectors-negative300.bin", 
                                  binary=True, lockf=1.0)
new_model.train(review_list, total_examples=len(review_list), epochs=new_model.epochs)

(149894901, 206896570)

## Expand the Seed Word List

In [5]:
# Addition: top similar words to the words in our seed list
top_similar_add = list(set(new_model.wv.most_similar(positive=AddSeed_list, topn=10)))
print("similar words to the words in Addition Seed list:")
print(top_similar_add)

similar words to the words in Addition Seed list:
[('replace', 0.4866390526294708), ('use', 0.6703165173530579), ('eliminate', 0.5008467435836792), ('increase', 0.5632421970367432), ('added', 0.6035432815551758), ('ad', 0.5459101796150208), ('adding', 0.5992623567581177), ('omit', 0.5326415300369263), ('substitute', 0.5167902112007141), ('include', 0.5351263880729675)]


In [6]:
# Deletion: top similar words to the words in our seed list
top_similar_delete = list(set(new_model.wv.most_similar(positive=DeleteSeed_list, topn=10)))
print("similar words to the words in Deletion Seed list:")
print(top_similar_delete)

similar words to the words in Deletion Seed list:
[('discard', 0.5125858783721924), ('eliminate', 0.6815260648727417), ('leave', 0.5072888135910034), ('increase', 0.4705949127674103), ('skip', 0.536050021648407), ('add', 0.47948920726776123), ('nix', 0.4743996262550354), ('drain', 0.47148722410202026), ('choose', 0.46553415060043335), ('delete', 0.701276421546936)]


In [7]:
# Replacement: top similar words to the words in our seed list
top_similar_replace = list(set(new_model.wv.most_similar(positive=ReplaceSeed_list, topn=10)))
print("similar words to the words in Replacement Seed list:")
print(top_similar_replace)

similar words to the words in Replacement Seed list:
[('lieu', 0.510610818862915), ('replaced', 0.5776525139808655), ('substitue', 0.6454289555549622), ('sub', 0.7224389314651489), ('place', 0.5112087726593018), ('omit', 0.5549572706222534), ('subbed', 0.5459792017936707), ('substitued', 0.5180657505989075), ('substituted', 0.5913184881210327), ('subsitute', 0.5261411666870117)]
