In [1]:
import pandas as pd

# Load the three tab-separated review files. quoting=3 is csv.QUOTE_NONE, so
# double quotes inside the review text are kept as ordinary characters.
train = pd.read_csv("labeledTrainData.tsv", sep="\t", header=0, quoting=3)
test = pd.read_csv("testData.tsv", sep="\t", header=0, quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", sep="\t", header=0, quoting=3)

# Report how many reviews each file contributed (100,000 in total)
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n"
      % (len(train["review"]), len(test["review"]), len(unlabeled_train["review"])))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [11]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords


def review_to_wordlist(review, remove_stopwords=False):
    """Convert a document to a list of lower-case words.

    Args:
        review: Raw text of a review (or of one sentence of a review); HTML
            markup is stripped before the words are extracted.
        remove_stopwords: When true, drop NLTK's English stop words from the
            result. Defaults to False.

    Returns:
        A list of lower-case words, in their original order, each consisting
        only of the letters a-z.
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(review, 'lxml').get_text()

    # 2. Replace every non-letter with a space. Non-ASCII characters are not
    #    in a-zA-Z, so this single substitution removes them as well.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert words to lower case and split them
    words = review_text.lower().split()

    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        # This function runs once per sentence, so the stop-word list is
        # loaded a single time and cached on the function object instead of
        # being rebuilt on every call.
        stops = getattr(review_to_wordlist, "_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            review_to_wordlist._stops = stops
        words = [w for w in words if w not in stops]

    # 5. Return a list of words
    return words

In [24]:
import nltk
# Load the English punkt sentence tokenizer resource through NLTK's data
# loader. The resulting object is passed to review_to_sentences, which calls
# its tokenize() method to split each review into sentences.
# NOTE(review): presumably the punkt data must already be installed locally
# (e.g. via nltk.download()) — confirm for the environment in use.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into parsed sentences.

    Args:
        review: Raw text of one review.
        tokenizer: Sentence tokenizer whose ``tokenize(text)`` method returns
            the sentences found in ``text``.
        remove_stopwords: Forwarded to ``review_to_wordlist``. Defaults to
            False.

    Returns:
        A list of sentences, where each sentence is the list of words
        produced by ``review_to_wordlist``.
    """
    # Replace each run of non-ASCII characters with a single space so the
    # tokenizer only ever sees ASCII text.
    review = re.sub(r'[^\x00-\x7F]+', ' ', review)

    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())

    # 2. Convert every non-empty sentence to a list of words; empty
    #    sentences are skipped.
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if raw_sentence
    ]

In [25]:
# Gather the parsed sentences of every labeled and unlabeled training review
sentences = []

for message, reviews in (
    ("Parsing sentences from training set", train["review"]),
    ("Parsing sentences from unlabeled set", unlabeled_train["review"]),
):
    print(message)
    for review in reviews:
        sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set


  ' Beautiful Soup.' % markup)


  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup


  ' that document to Beautiful Soup.' % decoded_markup


  ' that document to Beautiful Soup.' % decoded_markup


  ' Beautiful Soup.' % markup)


  ' that document to Beautiful Soup.' % decoded_markup


  ' that document to Beautiful Soup.' % decoded_markup


In [28]:
# Report how many sentences were parsed in total
print(len(sentences))

795317


In [30]:
# Import the built-in logging module and configure it so that Word2Vec
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Save the model under a name derived from the parameters above, so the file
# name cannot drift out of sync when they are tuned (the current values give
# "300features_40minwords_10context"). You can load it later using
# Word2Vec.load()
model_name = "%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
model.save(model_name)

2017-08-29 12:58:35,407 : INFO : collecting all words and their counts


2017-08-29 12:58:35,409 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2017-08-29 12:58:35,528 : INFO : PROGRESS: at sentence #10000, processed 225818 words, keeping 17776 word types


Training model...


2017-08-29 12:58:35,610 : INFO : PROGRESS: at sentence #20000, processed 451941 words, keeping 24951 word types


2017-08-29 12:58:35,695 : INFO : PROGRESS: at sentence #30000, processed 671563 words, keeping 30034 word types


2017-08-29 12:58:35,792 : INFO : PROGRESS: at sentence #40000, processed 898095 words, keeping 34353 word types


2017-08-29 12:58:35,877 : INFO : PROGRESS: at sentence #50000, processed 1117439 words, keeping 37766 word types


2017-08-29 12:58:35,964 : INFO : PROGRESS: at sentence #60000, processed 1338710 words, keeping 40724 word types


2017-08-29 12:58:36,057 : INFO : PROGRESS: at sentence #70000, processed 1561925 words, keeping 43335 word types


2017-08-29 12:58:36,151 : INFO : PROGRESS: at sentence #80000, processed 1781522 words, keeping 45720 word types


2017-08-29 12:58:36,280 : INFO : PROGRESS: at sentence #90000, processed 2005549 words, keeping 48138 word types


2017-08-29 12:58:36,399 : INFO : PROGRESS: at sentence #100000, processed 2227545 words, keeping 50213 word types


2017-08-29 12:58:36,517 : INFO : PROGRESS: at sentence #110000, processed 2447461 words, keeping 52088 word types


2017-08-29 12:58:36,616 : INFO : PROGRESS: at sentence #120000, processed 2669386 words, keeping 54126 word types


2017-08-29 12:58:36,717 : INFO : PROGRESS: at sentence #130000, processed 2895056 words, keeping 55852 word types


2017-08-29 12:58:36,809 : INFO : PROGRESS: at sentence #140000, processed 3108024 words, keeping 57356 word types


2017-08-29 12:58:36,902 : INFO : PROGRESS: at sentence #150000, processed 3333321 words, keeping 59058 word types


2017-08-29 12:58:36,991 : INFO : PROGRESS: at sentence #160000, processed 3556013 words, keeping 60625 word types


2017-08-29 12:58:37,080 : INFO : PROGRESS: at sentence #170000, processed 3779485 words, keeping 62084 word types


2017-08-29 12:58:37,171 : INFO : PROGRESS: at sentence #180000, processed 4000389 words, keeping 63502 word types


2017-08-29 12:58:37,261 : INFO : PROGRESS: at sentence #190000, processed 4225408 words, keeping 64806 word types


2017-08-29 12:58:37,353 : INFO : PROGRESS: at sentence #200000, processed 4449472 words, keeping 66090 word types


2017-08-29 12:58:37,449 : INFO : PROGRESS: at sentence #210000, processed 4671292 words, keeping 67401 word types


2017-08-29 12:58:37,544 : INFO : PROGRESS: at sentence #220000, processed 4895793 words, keeping 68699 word types


2017-08-29 12:58:37,647 : INFO : PROGRESS: at sentence #230000, processed 5118949 words, keeping 69968 word types


2017-08-29 12:58:37,743 : INFO : PROGRESS: at sentence #240000, processed 5346171 words, keeping 71173 word types


2017-08-29 12:58:37,832 : INFO : PROGRESS: at sentence #250000, processed 5560415 words, keeping 72358 word types


2017-08-29 12:58:37,926 : INFO : PROGRESS: at sentence #260000, processed 5780447 words, keeping 73489 word types


2017-08-29 12:58:38,021 : INFO : PROGRESS: at sentence #270000, processed 6002196 words, keeping 74772 word types


2017-08-29 12:58:38,114 : INFO : PROGRESS: at sentence #280000, processed 6228572 words, keeping 76424 word types


2017-08-29 12:58:38,208 : INFO : PROGRESS: at sentence #290000, processed 6451053 words, keeping 77848 word types


2017-08-29 12:58:38,320 : INFO : PROGRESS: at sentence #300000, processed 6675522 words, keeping 79175 word types


2017-08-29 12:58:38,422 : INFO : PROGRESS: at sentence #310000, processed 6901436 words, keeping 80500 word types


2017-08-29 12:58:38,513 : INFO : PROGRESS: at sentence #320000, processed 7126126 words, keeping 81820 word types


2017-08-29 12:58:38,608 : INFO : PROGRESS: at sentence #330000, processed 7348178 words, keeping 83042 word types


2017-08-29 12:58:38,699 : INFO : PROGRESS: at sentence #340000, processed 7578088 words, keeping 84291 word types


2017-08-29 12:58:38,793 : INFO : PROGRESS: at sentence #350000, processed 7800635 words, keeping 85442 word types


2017-08-29 12:58:38,897 : INFO : PROGRESS: at sentence #360000, processed 8021017 words, keeping 86600 word types


2017-08-29 12:58:38,995 : INFO : PROGRESS: at sentence #370000, processed 8249991 words, keeping 87726 word types


2017-08-29 12:58:39,090 : INFO : PROGRESS: at sentence #380000, processed 8473709 words, keeping 88881 word types


2017-08-29 12:58:39,269 : INFO : PROGRESS: at sentence #390000, processed 8704026 words, keeping 89916 word types


2017-08-29 12:58:39,377 : INFO : PROGRESS: at sentence #400000, processed 8926414 words, keeping 90928 word types


2017-08-29 12:58:39,472 : INFO : PROGRESS: at sentence #410000, processed 9148267 words, keeping 91893 word types


2017-08-29 12:58:39,561 : INFO : PROGRESS: at sentence #420000, processed 9369273 words, keeping 92922 word types


2017-08-29 12:58:39,652 : INFO : PROGRESS: at sentence #430000, processed 9597199 words, keeping 93940 word types


2017-08-29 12:58:39,742 : INFO : PROGRESS: at sentence #440000, processed 9823194 words, keeping 94922 word types


2017-08-29 12:58:39,830 : INFO : PROGRESS: at sentence #450000, processed 10047676 words, keeping 96048 word types


2017-08-29 12:58:39,928 : INFO : PROGRESS: at sentence #460000, processed 10280044 words, keeping 97091 word types


2017-08-29 12:58:40,015 : INFO : PROGRESS: at sentence #470000, processed 10507823 words, keeping 97940 word types


2017-08-29 12:58:40,106 : INFO : PROGRESS: at sentence #480000, processed 10728623 words, keeping 98876 word types


2017-08-29 12:58:40,194 : INFO : PROGRESS: at sentence #490000, processed 10955190 words, keeping 99876 word types


2017-08-29 12:58:40,276 : INFO : PROGRESS: at sentence #500000, processed 11177176 words, keeping 100784 word types


2017-08-29 12:58:40,395 : INFO : PROGRESS: at sentence #510000, processed 11402379 words, keeping 101703 word types


2017-08-29 12:58:40,488 : INFO : PROGRESS: at sentence #520000, processed 11625843 words, keeping 102605 word types


2017-08-29 12:58:40,575 : INFO : PROGRESS: at sentence #530000, processed 11850322 words, keeping 103405 word types


2017-08-29 12:58:40,660 : INFO : PROGRESS: at sentence #540000, processed 12075165 words, keeping 104274 word types


2017-08-29 12:58:40,753 : INFO : PROGRESS: at sentence #550000, processed 12300792 words, keeping 105151 word types


2017-08-29 12:58:40,841 : INFO : PROGRESS: at sentence #560000, processed 12521788 words, keeping 106004 word types


2017-08-29 12:58:40,936 : INFO : PROGRESS: at sentence #570000, processed 12750762 words, keeping 106792 word types


2017-08-29 12:58:41,024 : INFO : PROGRESS: at sentence #580000, processed 12972890 words, keeping 107678 word types


2017-08-29 12:58:41,115 : INFO : PROGRESS: at sentence #590000, processed 13198677 words, keeping 108516 word types


2017-08-29 12:58:41,197 : INFO : PROGRESS: at sentence #600000, processed 13420428 words, keeping 109223 word types


2017-08-29 12:58:41,307 : INFO : PROGRESS: at sentence #610000, processed 13642212 words, keeping 110102 word types


2017-08-29 12:58:41,399 : INFO : PROGRESS: at sentence #620000, processed 13867815 words, keeping 110845 word types


2017-08-29 12:58:41,500 : INFO : PROGRESS: at sentence #630000, processed 14092269 words, keeping 111623 word types


2017-08-29 12:58:41,581 : INFO : PROGRESS: at sentence #640000, processed 14313958 words, keeping 112426 word types


2017-08-29 12:58:41,671 : INFO : PROGRESS: at sentence #650000, processed 14538897 words, keeping 113205 word types


2017-08-29 12:58:41,755 : INFO : PROGRESS: at sentence #660000, processed 14762210 words, keeping 113964 word types


2017-08-29 12:58:41,864 : INFO : PROGRESS: at sentence #670000, processed 14985012 words, keeping 114655 word types


2017-08-29 12:58:41,959 : INFO : PROGRESS: at sentence #680000, processed 15210504 words, keeping 115367 word types


2017-08-29 12:58:42,040 : INFO : PROGRESS: at sentence #690000, processed 15432787 words, keeping 116152 word types


2017-08-29 12:58:42,154 : INFO : PROGRESS: at sentence #700000, processed 15660699 words, keeping 116954 word types


2017-08-29 12:58:42,247 : INFO : PROGRESS: at sentence #710000, processed 15884058 words, keeping 117618 word types


2017-08-29 12:58:42,414 : INFO : PROGRESS: at sentence #720000, processed 16109616 words, keeping 118239 word types


2017-08-29 12:58:42,554 : INFO : PROGRESS: at sentence #730000, processed 16335611 words, keeping 118967 word types


2017-08-29 12:58:42,653 : INFO : PROGRESS: at sentence #740000, processed 16557240 words, keeping 119682 word types


2017-08-29 12:58:42,743 : INFO : PROGRESS: at sentence #750000, processed 16777080 words, keeping 120312 word types


2017-08-29 12:58:42,835 : INFO : PROGRESS: at sentence #760000, processed 16994936 words, keeping 120941 word types


2017-08-29 12:58:42,932 : INFO : PROGRESS: at sentence #770000, processed 17222687 words, keeping 121716 word types


2017-08-29 12:58:43,027 : INFO : PROGRESS: at sentence #780000, processed 17452375 words, keeping 122417 word types


2017-08-29 12:58:43,122 : INFO : PROGRESS: at sentence #790000, processed 17679708 words, keeping 123085 word types


2017-08-29 12:58:43,183 : INFO : collected 123504 word types from a corpus of 17798082 raw words and 795317 sentences


2017-08-29 12:58:43,184 : INFO : Loading a fresh vocabulary


2017-08-29 12:58:43,317 : INFO : min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)


2017-08-29 12:58:43,318 : INFO : min_count=40 leaves 17238940 word corpus (96% of original 17798082, drops 559142)


2017-08-29 12:58:43,375 : INFO : deleting the raw counts dictionary of 123504 items


2017-08-29 12:58:43,393 : INFO : sample=0.001 downsamples 48 most-common words


2017-08-29 12:58:43,395 : INFO : downsampling leaves estimated 12749658 word corpus (74.0% of prior 17238940)


2017-08-29 12:58:43,397 : INFO : estimated required memory for 16490 words and 300 dimensions: 47821000 bytes


2017-08-29 12:58:43,510 : INFO : resetting layer weights


2017-08-29 12:58:43,838 : INFO : training model with 4 workers on 16490 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10


2017-08-29 12:58:44,851 : INFO : PROGRESS: at 0.95% examples, 602414 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:45,855 : INFO : PROGRESS: at 1.90% examples, 600217 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:46,876 : INFO : PROGRESS: at 2.66% examples, 556524 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:47,879 : INFO : PROGRESS: at 3.66% examples, 573943 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:58:48,889 : INFO : PROGRESS: at 4.61% examples, 578095 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:49,903 : INFO : PROGRESS: at 5.42% examples, 566205 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:58:50,913 : INFO : PROGRESS: at 6.21% examples, 556048 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:51,922 : INFO : PROGRESS: at 7.18% examples, 562645 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:52,926 : INFO : PROGRESS: at 8.14% examples, 568229 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:58:53,931 : INFO : PROGRESS: at 9.13% examples, 574697 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:54,948 : INFO : PROGRESS: at 10.17% examples, 582456 words/s, in_qsize 7, out_qsize 1


2017-08-29 12:58:55,955 : INFO : PROGRESS: at 11.03% examples, 579043 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:56,960 : INFO : PROGRESS: at 11.97% examples, 580980 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:57,963 : INFO : PROGRESS: at 12.99% examples, 585820 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:58:58,968 : INFO : PROGRESS: at 14.00% examples, 589458 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:58:59,972 : INFO : PROGRESS: at 14.99% examples, 592267 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:00,987 : INFO : PROGRESS: at 15.53% examples, 577229 words/s, in_qsize 7, out_qsize 1


2017-08-29 12:59:02,002 : INFO : PROGRESS: at 16.27% examples, 571212 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:02,996 : INFO : PROGRESS: at 16.77% examples, 557726 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:04,018 : INFO : PROGRESS: at 17.53% examples, 553596 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:05,020 : INFO : PROGRESS: at 18.31% examples, 551037 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:06,024 : INFO : PROGRESS: at 19.06% examples, 547383 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:07,038 : INFO : PROGRESS: at 19.92% examples, 547504 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:08,049 : INFO : PROGRESS: at 20.85% examples, 549116 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:09,054 : INFO : PROGRESS: at 21.86% examples, 552477 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:10,059 : INFO : PROGRESS: at 22.81% examples, 554216 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:11,141 : INFO : PROGRESS: at 23.65% examples, 551650 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:12,172 : INFO : PROGRESS: at 24.06% examples, 540661 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:13,175 : INFO : PROGRESS: at 24.58% examples, 533405 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:14,190 : INFO : PROGRESS: at 25.55% examples, 536131 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:15,195 : INFO : PROGRESS: at 26.51% examples, 538225 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:16,219 : INFO : PROGRESS: at 27.50% examples, 540852 words/s, in_qsize 8, out_qsize 1


2017-08-29 12:59:17,229 : INFO : PROGRESS: at 28.37% examples, 540980 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:18,246 : INFO : PROGRESS: at 29.18% examples, 539950 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:19,282 : INFO : PROGRESS: at 29.52% examples, 530410 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:20,303 : INFO : PROGRESS: at 29.90% examples, 522224 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:21,311 : INFO : PROGRESS: at 30.25% examples, 514102 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:22,330 : INFO : PROGRESS: at 31.12% examples, 514983 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:23,341 : INFO : PROGRESS: at 31.92% examples, 515019 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:24,350 : INFO : PROGRESS: at 32.80% examples, 515964 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:25,363 : INFO : PROGRESS: at 33.64% examples, 516287 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:26,381 : INFO : PROGRESS: at 34.48% examples, 516646 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:27,377 : INFO : PROGRESS: at 35.42% examples, 518524 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:28,396 : INFO : PROGRESS: at 36.41% examples, 520784 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:29,414 : INFO : PROGRESS: at 37.39% examples, 522957 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:30,421 : INFO : PROGRESS: at 38.13% examples, 521794 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:31,456 : INFO : PROGRESS: at 38.94% examples, 521269 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:32,459 : INFO : PROGRESS: at 39.62% examples, 519488 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:33,475 : INFO : PROGRESS: at 40.41% examples, 519143 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:34,475 : INFO : PROGRESS: at 41.21% examples, 518851 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:35,475 : INFO : PROGRESS: at 42.09% examples, 519596 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:36,558 : INFO : PROGRESS: at 42.92% examples, 519042 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:37,553 : INFO : PROGRESS: at 43.84% examples, 520007 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:38,561 : INFO : PROGRESS: at 44.65% examples, 519846 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:39,598 : INFO : PROGRESS: at 45.54% examples, 520315 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:40,604 : INFO : PROGRESS: at 46.37% examples, 520288 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:41,622 : INFO : PROGRESS: at 47.23% examples, 520631 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:42,626 : INFO : PROGRESS: at 47.91% examples, 519059 words/s, in_qsize 6, out_qsize 0


2017-08-29 12:59:43,630 : INFO : PROGRESS: at 48.60% examples, 517743 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:44,637 : INFO : PROGRESS: at 49.46% examples, 518224 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:45,638 : INFO : PROGRESS: at 50.32% examples, 518748 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:46,651 : INFO : PROGRESS: at 51.12% examples, 518572 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:47,668 : INFO : PROGRESS: at 51.85% examples, 517772 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:48,669 : INFO : PROGRESS: at 52.75% examples, 518526 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:49,675 : INFO : PROGRESS: at 53.42% examples, 517231 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:50,685 : INFO : PROGRESS: at 54.17% examples, 516471 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:51,686 : INFO : PROGRESS: at 54.95% examples, 516236 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:52,701 : INFO : PROGRESS: at 55.84% examples, 516840 words/s, in_qsize 6, out_qsize 0


2017-08-29 12:59:53,717 : INFO : PROGRESS: at 56.62% examples, 516482 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:54,726 : INFO : PROGRESS: at 57.37% examples, 515886 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:55,730 : INFO : PROGRESS: at 58.20% examples, 516047 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:56,730 : INFO : PROGRESS: at 59.14% examples, 517110 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:57,730 : INFO : PROGRESS: at 59.99% examples, 517564 words/s, in_qsize 8, out_qsize 0


2017-08-29 12:59:58,748 : INFO : PROGRESS: at 60.83% examples, 517692 words/s, in_qsize 7, out_qsize 0


2017-08-29 12:59:59,789 : INFO : PROGRESS: at 61.52% examples, 516332 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:00,799 : INFO : PROGRESS: at 62.25% examples, 515595 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:01,807 : INFO : PROGRESS: at 63.20% examples, 516647 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:02,815 : INFO : PROGRESS: at 64.08% examples, 517023 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:03,825 : INFO : PROGRESS: at 65.08% examples, 518450 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:04,841 : INFO : PROGRESS: at 65.96% examples, 518924 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:05,857 : INFO : PROGRESS: at 66.96% examples, 520155 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:06,869 : INFO : PROGRESS: at 67.93% examples, 521294 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:07,892 : INFO : PROGRESS: at 68.63% examples, 520241 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:08,945 : INFO : PROGRESS: at 69.32% examples, 519008 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:10,055 : INFO : PROGRESS: at 70.05% examples, 517723 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:11,067 : INFO : PROGRESS: at 70.91% examples, 518024 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:12,075 : INFO : PROGRESS: at 71.64% examples, 517448 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:13,105 : INFO : PROGRESS: at 72.23% examples, 515791 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:14,102 : INFO : PROGRESS: at 72.77% examples, 513833 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:15,118 : INFO : PROGRESS: at 73.63% examples, 514151 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:16,150 : INFO : PROGRESS: at 74.61% examples, 515292 words/s, in_qsize 8, out_qsize 1


2017-08-29 13:00:17,141 : INFO : PROGRESS: at 75.64% examples, 516799 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:18,145 : INFO : PROGRESS: at 76.69% examples, 518353 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:19,153 : INFO : PROGRESS: at 77.69% examples, 519627 words/s, in_qsize 8, out_qsize 1


2017-08-29 13:00:20,153 : INFO : PROGRESS: at 78.73% examples, 521059 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:21,171 : INFO : PROGRESS: at 79.76% examples, 522450 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:22,173 : INFO : PROGRESS: at 80.79% examples, 523741 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:23,174 : INFO : PROGRESS: at 81.81% examples, 525011 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:24,178 : INFO : PROGRESS: at 82.84% examples, 526182 words/s, in_qsize 6, out_qsize 0


2017-08-29 13:00:25,185 : INFO : PROGRESS: at 83.85% examples, 527307 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:26,185 : INFO : PROGRESS: at 84.84% examples, 528307 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:27,185 : INFO : PROGRESS: at 85.82% examples, 529201 words/s, in_qsize 6, out_qsize 0


2017-08-29 13:00:28,187 : INFO : PROGRESS: at 86.74% examples, 529664 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:29,190 : INFO : PROGRESS: at 87.47% examples, 529037 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:30,196 : INFO : PROGRESS: at 88.44% examples, 529884 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:31,208 : INFO : PROGRESS: at 89.39% examples, 530549 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:32,208 : INFO : PROGRESS: at 90.35% examples, 531336 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:33,218 : INFO : PROGRESS: at 91.31% examples, 532049 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:34,229 : INFO : PROGRESS: at 92.29% examples, 532936 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:35,275 : INFO : PROGRESS: at 92.96% examples, 531866 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:36,250 : INFO : PROGRESS: at 93.28% examples, 528955 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:37,311 : INFO : PROGRESS: at 93.62% examples, 525902 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:38,316 : INFO : PROGRESS: at 94.42% examples, 525780 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:39,331 : INFO : PROGRESS: at 95.29% examples, 525924 words/s, in_qsize 6, out_qsize 1


2017-08-29 13:00:40,344 : INFO : PROGRESS: at 96.22% examples, 526487 words/s, in_qsize 7, out_qsize 2


2017-08-29 13:00:41,346 : INFO : PROGRESS: at 97.19% examples, 527250 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:42,349 : INFO : PROGRESS: at 98.13% examples, 527857 words/s, in_qsize 7, out_qsize 0


2017-08-29 13:00:43,354 : INFO : PROGRESS: at 99.06% examples, 528329 words/s, in_qsize 8, out_qsize 0


2017-08-29 13:00:44,303 : INFO : worker thread finished; awaiting finish of 3 more threads


2017-08-29 13:00:44,324 : INFO : worker thread finished; awaiting finish of 2 more threads


2017-08-29 13:00:44,353 : INFO : worker thread finished; awaiting finish of 1 more threads


2017-08-29 13:00:44,360 : INFO : PROGRESS: at 100.00% examples, 528962 words/s, in_qsize 1, out_qsize 0


2017-08-29 13:00:44,367 : INFO : worker thread finished; awaiting finish of 0 more threads


2017-08-29 13:00:44,373 : INFO : training on 88990410 raw words (63749554 effective words) took 120.5s, 528905 effective words/s


2017-08-29 13:00:44,378 : INFO : precomputing L2-norms of word weight vectors


2017-08-29 13:00:44,575 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None


2017-08-29 13:00:44,576 : INFO : not storing attribute syn0norm


2017-08-29 13:00:44,577 : INFO : not storing attribute cum_table


2017-08-29 13:00:44,901 : INFO : saved 300features_40minwords_10context


In [34]:
# Show the words the model ranks as most similar to "awful" and "disgusting"
# taken together as positive examples
print(model.most_similar(positive='awful disgusting'.split()))

[(u'horrible', 0.7145130038261414), (u'laughable', 0.7140963077545166), (u'appalling', 0.6930283308029175), (u'terrible', 0.6911394596099854), (u'atrocious', 0.6908338665962219), (u'dreadful', 0.6855851411819458), (u'horrendous', 0.685319185256958), (u'pathetic', 0.6839738488197327), (u'stupid', 0.6766130924224854), (u'embarrassing', 0.6764904260635376)]
