In [36]:
# Word2vec, published by Google in 2013, is a neural network implementation that learns distributed representations for
# words. Other deep or recurrent neural network architectures had been proposed for learning word representations prior to 
# this, but the major problem with these was the long time required to train the models. Word2vec learns quickly relative 
# to other models.

In [37]:
# Word2Vec does not need labels in order to create meaningful representations. This is useful, since most data in
# the real world is unlabeled. If the network is given enough training data (tens of billions of words), it 
# produces word vectors with intriguing characteristics. Words with similar meanings appear in clusters, and 
# clusters are spaced such that some word relationships, such as analogies, can be reproduced using vector math.
# The famous example is that, with highly trained word vectors, "king - man + woman = queen."

In [38]:
# Word2Vec can learn from unlabeled data

import pandas as pd
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter='\t', quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter='\t', quoting=3)

In [39]:
print( "read %d labled train data, %d labeled test data, %d unlabeled reviews" %(train["review"].size, test["review"].size, unlabeled_train["review"].size))

read 25000 labled train data, 25000 labeled test data, 50000 unlabeled reviews


In [40]:
# to train Word2Vec it is better not to remove stop words because the algorithm relies on the broader context
# of the sentence in order to produce high-quality word vectors

# Import various model for clening the text
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [76]:
def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2.remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. convert word to lower case and split
    words = review_text.lower().split()
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [77]:
# Word2Vec expects single sentences, each one as a list of words. In other words, 
# the input format is a list of lists.

# Download the punkt tokenizer for sentence splitting
import nltk.data

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [78]:
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
#     print(raw_sentences)
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [79]:
sentences = []

print("parsing the sentence from training set")
for review in train["review"]:
    rev = review_to_sentences(review, tokenizer)
    sentences += rev
    
print("parsing sentence from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

parsing the sentence from training set




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


parsing sentence from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [80]:
len(sentences)

795538

In [82]:
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [83]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

In [84]:
# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

2018-03-20 15:55:09,852 : INFO : collecting all words and their counts
2018-03-20 15:55:09,854 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2018-03-20 15:55:09,958 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2018-03-20 15:55:10,024 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-03-20 15:55:10,078 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2018-03-20 15:55:10,140 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-03-20 15:55:10,195 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2018-03-20 15:55:10,259 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-03-20 15:55:10,318 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-03-20 15:55:10,385 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2018-03-20 15:55:10,458 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 4813

2018-03-20 15:55:14,517 : INFO : PROGRESS: at sentence #730000, processed 16331870 words, keeping 118954 word types
2018-03-20 15:55:14,578 : INFO : PROGRESS: at sentence #740000, processed 16552903 words, keeping 119668 word types
2018-03-20 15:55:14,636 : INFO : PROGRESS: at sentence #750000, processed 16771230 words, keeping 120295 word types
2018-03-20 15:55:14,693 : INFO : PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2018-03-20 15:55:14,764 : INFO : PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2018-03-20 15:55:14,839 : INFO : PROGRESS: at sentence #780000, processed 17447905 words, keeping 122402 word types
2018-03-20 15:55:14,913 : INFO : PROGRESS: at sentence #790000, processed 17674981 words, keeping 123066 word types
2018-03-20 15:55:14,952 : INFO : collected 123504 word types from a corpus of 17798082 raw words and 795538 sentences
2018-03-20 15:55:14,954 : INFO : Loading a fresh vocabulary
2018-03-20

2018-03-20 15:56:09,966 : INFO : EPOCH 2 - PROGRESS: at 99.94% examples, 482472 words/s, in_qsize 1, out_qsize 1
2018-03-20 15:56:09,966 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-03-20 15:56:09,977 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-03-20 15:56:09,978 : INFO : EPOCH - 2 : training on 17798082 raw words (12749071 effective words) took 26.4s, 482510 effective words/s
2018-03-20 15:56:10,992 : INFO : EPOCH 3 - PROGRESS: at 4.05% examples, 511613 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:56:12,010 : INFO : EPOCH 3 - PROGRESS: at 8.11% examples, 508217 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:56:13,029 : INFO : EPOCH 3 - PROGRESS: at 12.12% examples, 505017 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:56:14,041 : INFO : EPOCH 3 - PROGRESS: at 15.84% examples, 495557 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:56:15,041 : INFO : EPOCH 3 - PROGRESS: at 20.05% examples, 502068 words/s, in_qsize 7, out_qsize 0
20

2018-03-20 15:57:13,462 : INFO : EPOCH 5 - PROGRESS: at 3.99% examples, 503413 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:57:14,468 : INFO : EPOCH 5 - PROGRESS: at 7.88% examples, 496760 words/s, in_qsize 8, out_qsize 1
2018-03-20 15:57:15,475 : INFO : EPOCH 5 - PROGRESS: at 12.01% examples, 504217 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:57:16,495 : INFO : EPOCH 5 - PROGRESS: at 16.06% examples, 504648 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:57:17,534 : INFO : EPOCH 5 - PROGRESS: at 19.66% examples, 489895 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:57:18,550 : INFO : EPOCH 5 - PROGRESS: at 22.37% examples, 464593 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:57:19,556 : INFO : EPOCH 5 - PROGRESS: at 25.91% examples, 462216 words/s, in_qsize 7, out_qsize 0
2018-03-20 15:57:20,569 : INFO : EPOCH 5 - PROGRESS: at 29.93% examples, 467865 words/s, in_qsize 6, out_qsize 1
2018-03-20 15:57:21,573 : INFO : EPOCH 5 - PROGRESS: at 33.55% examples, 465692 words/s, in_qsize 

In [85]:
# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

2018-03-20 16:01:46,440 : INFO : precomputing L2-norms of word weight vectors


In [86]:
# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-03-20 16:01:48,875 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2018-03-20 16:01:48,878 : INFO : not storing attribute vectors_norm
2018-03-20 16:01:48,879 : INFO : not storing attribute cum_table
2018-03-20 16:01:49,553 : INFO : saved 300features_40minwords_10context


In [90]:
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

In [89]:
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [91]:
model.wv.most_similar('man')

[('woman', 0.6231456398963928),
 ('lad', 0.5931454300880432),
 ('lady', 0.5881258845329285),
 ('soldier', 0.5257889032363892),
 ('guy', 0.5224387049674988),
 ('farmer', 0.5221095681190491),
 ('businessman', 0.5186640024185181),
 ('sailor', 0.511244535446167),
 ('person', 0.5107656121253967),
 ('men', 0.5106342434883118)]

In [93]:
model.wv.most_similar('queen')

[('princess', 0.6704393625259399),
 ('bride', 0.6298378705978394),
 ('maid', 0.6206814646720886),
 ('mistress', 0.6033903956413269),
 ('kristel', 0.595789909362793),
 ('regina', 0.5935704708099365),
 ('belle', 0.5932125449180603),
 ('stepmother', 0.5913530588150024),
 ('eva', 0.5894976854324341),
 ('victoria', 0.5854543447494507)]

In [99]:
model.wv.most_similar('awful')

[('terrible', 0.7584094405174255),
 ('horrible', 0.7371830940246582),
 ('dreadful', 0.726965606212616),
 ('atrocious', 0.7194123864173889),
 ('abysmal', 0.7061768174171448),
 ('horrendous', 0.6896570920944214),
 ('horrid', 0.6790741086006165),
 ('appalling', 0.6767245531082153),
 ('lousy', 0.6429832577705383),
 ('crappy', 0.6205602884292603)]