In [1]:
from lib.amazon_reviews_loader import AmazonReviewsDS
from lib.amazon_reviews_cfg import DS_CFG_NO_SW, DS_CFG_SW
import gensim
from multiprocessing import cpu_count

# Review text files handed to AmazonReviewsDS as the positive / negative sources.
_POS_REV_FILE = 'dataset/pos.txt'
_NEG_REV_FILE = 'dataset/neg.txt'
# Destination for the trained word vectors (written with `wv.save`).
_MODEL_WV_PATH = 'dataset/word2vec_embeddings.kv'

# Build the dataset using the DS_CFG_SW configuration. The guard only lets
# this run when the code is executed directly rather than imported.
if __name__ == '__main__':
    print('Retrieving Amazon Reviews Dataset with Stopwords')
    amazon_rev_sw = AmazonReviewsDS(
        _POS_REV_FILE,
        _NEG_REV_FILE,
        DS_CFG_SW,
    )
    # `.data` is what later gets passed to gensim.models.Word2Vec as input.
    amazon_rev_sw_sents = amazon_rev_sw.data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/varsrao/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Retrieving Amazon Reviews Dataset with Stopwords
----- Dataset Synthesis Start -----
Loading Positive Reviews from dataset/pos.txt
Loading Negative Reviews from dataset/neg.txt
Generating data and labels
Tokenizing the data
Shuffling the data
----- Dataset Synthesis Complete -----


In [2]:
    # Word2Vec hyperparameters for this run.
    _EMBEDDING_SIZE = 300   # dimensionality of each learned word vector
    _WINDOW_SIZE = 4        # value passed as Word2Vec's `window`
    _MIN_TOKEN_COUNT = 4    # value passed as Word2Vec's `min_count`

    # gensim 4.0 renamed the `size` keyword to `vector_size`; the old spelling
    # raises TypeError there. Pick the keyword from the installed major version
    # so the cell runs on both gensim 3.x and 4.x.
    _gensim_major = int(gensim.__version__.split('.')[0])
    _size_kwarg = 'vector_size' if _gensim_major >= 4 else 'size'

    print('Creating Word2Vec Model: With Stopwords')
    sw_model = gensim.models.Word2Vec(
            amazon_rev_sw_sents,
            window=_WINDOW_SIZE,
            min_count=_MIN_TOKEN_COUNT,
            workers=cpu_count(),
            **{_size_kwarg: _EMBEDDING_SIZE})
    # Persist only the keyed vectors (not the full trainable model).
    sw_model.wv.save(_MODEL_WV_PATH)

Creating Word2Vec Model: With Stopwords


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
    # Report the 20 nearest neighbours of each probe word, one section per
    # word, each preceded by a separator line.
    print('Finding 20 Similar Words To Good & Bad: With Stopwords')
    for _label, _probe in (('Good', 'good'), ('Bad', 'bad')):
        print('-----------------------------------------------------')
        _neighbours = sw_model.wv.most_similar(positive=_probe, topn=20)
        print(f'{_label}: {_neighbours}')

Finding 20 Similar Words To Good & Bad: With Stopwords
-----------------------------------------------------
Good: [('decent', 0.7684434652328491), ('great', 0.7373508214950562), ('terrific', 0.7015677690505981), ('fantastic', 0.6987839937210083), ('nice', 0.6600608825683594), ('wonderful', 0.6583725214004517), ('superb', 0.6548490524291992), ('fabulous', 0.638202965259552), ('excellent', 0.6118650436401367), ('reasonable', 0.6083284616470337), ('bad', 0.5981848835945129), ('impressive', 0.5981660485267639), ('awesome', 0.586902916431427), ('poor', 0.5721795558929443), ('neat', 0.5720945596694946), ('amazing', 0.5595406293869019), ('terrible', 0.5532640814781189), ('hefty', 0.5461064577102661), ('classy', 0.5422768592834473), ('clever', 0.541985034942627)]
-----------------------------------------------------
Bad: [('horrible', 0.6540060639381409), ('terrible', 0.6364455819129944), ('good', 0.5981849431991577), ('awful', 0.596996545791626), ('funny', 0.5528815388679504), ('strange', 0.