In [1]:
from lib.amazon_reviews_loader import AmazonReviewsDS
from lib.amazon_reviews_cfg import DS_CFG_NO_SW, DS_CFG_SW
import gensim
from multiprocessing import cpu_count

# Raw review text files: one for positive reviews, one for negative reviews.
_POS_REV_FILE = 'dataset/pos.txt'
_NEG_REV_FILE = 'dataset/neg.txt'

if __name__ == '__main__':
    # Build the same corpus twice, differing only in the dataset config, so the
    # effect of stop-word removal on the embeddings can be compared later.
    # The `.data` attribute of each dataset is what gets fed to Word2Vec.

    # Variant 1: DS_CFG_NO_SW (stop words removed from the reviews).
    print('Retrieving Amazon Reviews Dataset with No Stopwords')
    amazon_rev_no_sw = AmazonReviewsDS(
        _POS_REV_FILE,
        _NEG_REV_FILE,
        DS_CFG_NO_SW,
    )
    amazon_rev_no_sw_sents = amazon_rev_no_sw.data

    # Variant 2: DS_CFG_SW (stop words kept).
    print('Retrieving Amazon Reviews Dataset with Stopwords')
    amazon_rev_sw = AmazonReviewsDS(
        _POS_REV_FILE,
        _NEG_REV_FILE,
        DS_CFG_SW,
    )
    amazon_rev_sw_sents = amazon_rev_sw.data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/varsrao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Retrieving Amazon Reviews Dataset with No Stopwords
----- Dataset Synthesis Start -----
Loading Positive Reviews from dataset/pos.txt
Loading Negative Reviews from dataset/neg.txt
Generating data and labels
Tokenizing the data
Removing stop words
Shuffling the data
----- Dataset Synthesis Complete -----
Retrieving Amazon Reviews Dataset with Stopwords
----- Dataset Synthesis Start -----
Loading Positive Reviews from dataset/pos.txt
Loading Negative Reviews from dataset/neg.txt
Generating data and labels
Tokenizing the data
Shuffling the data
----- Dataset Synthesis Complete -----


In [2]:
    # Hyperparameters shared by both Word2Vec models trained in this cell.
    _EMBEDDING_SIZE = 150    # forwarded as `size`
    _WINDOW_SIZE = 4         # forwarded as `window`
    _MIN_TOKEN_COUNT = 4     # forwarded as `min_count`

    def _train_word2vec(sentences):
        """Train a gensim Word2Vec model on `sentences` using the shared settings.

        The worker count is taken from `cpu_count()` at call time.
        NOTE(review): `size=` is the pre-4.0 gensim keyword (renamed to
        `vector_size` in gensim 4) — confirm against the installed version.
        """
        return gensim.models.Word2Vec(
            sentences,
            size=_EMBEDDING_SIZE,
            window=_WINDOW_SIZE,
            min_count=_MIN_TOKEN_COUNT,
            workers=cpu_count(),
        )

    # Same configuration for both corpora, so any difference between the two
    # models comes from the presence or absence of stop words.
    print('Creating Word2Vec Model: Without Stopwords')
    no_sw_model = _train_word2vec(amazon_rev_no_sw_sents)

    print('Creating Word2Vec Model: With Stopwords')
    sw_model = _train_word2vec(amazon_rev_sw_sents)

Creating Word2Vec Model: Without Stopwords
Creating Word2Vec Model: With Stopwords


In [3]:
    # Inspect the 20 nearest neighbours of "good" and "bad" in the model
    # trained without stop words. In the recorded run each word shows up in
    # the other's neighbour list.
    print('Finding 20 Similar Words To Good & Bad: Without Stopwords')

    no_sw_good_neighbours = no_sw_model.wv.most_similar(positive="good", topn=20)
    print(f'Good: {no_sw_good_neighbours}')

    no_sw_bad_neighbours = no_sw_model.wv.most_similar(positive="bad", topn=20)
    print(f'Bad: {no_sw_bad_neighbours}')

Finding 20 Similar Words To Good & Bad: Without Stopwords
Good: [('great', 0.773969829082489), ('decent', 0.7685281038284302), ('fantastic', 0.6710556745529175), ('excellent', 0.6628658771514893), ('amazing', 0.6426146626472473), ('terrific', 0.6412367224693298), ('nice', 0.6279391646385193), ('awesome', 0.6170191764831543), ('wonderful', 0.583147406578064), ('superb', 0.5648380517959595), ('ok', 0.562408447265625), ('bad', 0.550562858581543), ('impressive', 0.5503613948822021), ('okay', 0.5492343902587891), ('reasonable', 0.5296091437339783), ('perfect', 0.5269806385040283), ('exceptional', 0.503654420375824), ('outstanding', 0.5002436637878418), ('fabulous', 0.4893912374973297), ('well', 0.474298357963562)]
Bad: [('awful', 0.611964225769043), ('terrible', 0.6007453203201294), ('horrible', 0.5842011570930481), ('good', 0.550562858581543), ('poor', 0.5173922181129456), ('funny', 0.5113478899002075), ('lousy', 0.4984070062637329), ('weird', 0.4943544864654541), ('strange', 0.48082044720

In [4]:
    # Same neighbour query as for the stopword-free model, now against the
    # model trained with stop words kept, so the two lists can be compared.
    print('Finding 20 Similar Words To Good & Bad: With Stopwords')

    sw_good_neighbours = sw_model.wv.most_similar(positive="good", topn=20)
    print(f'Good: {sw_good_neighbours}')

    sw_bad_neighbours = sw_model.wv.most_similar(positive="bad", topn=20)
    print(f'Bad: {sw_bad_neighbours}')

Finding 20 Similar Words To Good & Bad: With Stopwords
Good: [('decent', 0.7879394888877869), ('great', 0.7771520614624023), ('fantastic', 0.7244586944580078), ('nice', 0.6965898275375366), ('terrific', 0.6937927007675171), ('wonderful', 0.6735690832138062), ('impressive', 0.6575987935066223), ('superb', 0.6506286859512329), ('bad', 0.6454977989196777), ('reasonable', 0.6350195407867432), ('fabulous', 0.623285710811615), ('excellent', 0.6213153600692749), ('awesome', 0.6116971969604492), ('amazing', 0.609173059463501), ('poor', 0.6045313477516174), ('clever', 0.5801587104797363), ('terrible', 0.5770386457443237), ('ok', 0.5687574148178101), ('okay', 0.5666078329086304), ('neat', 0.5659569501876831)]
Bad: [('horrible', 0.6774163246154785), ('terrible', 0.6629313230514526), ('good', 0.6454977989196777), ('awful', 0.603400468826294), ('strange', 0.5964905619621277), ('funny', 0.5800220370292664), ('sad', 0.5780524015426636), ('poor', 0.5764942169189453), ('lame', 0.572816014289856), ('stu