In [13]:
# Import packages and load the data
import pandas as pd
import re, string
import gensim
import logging
# Emit INFO-level log records with a timestamp so library progress is visible.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Load the tab-separated reviews file. The first row is the header and
# quoting=3 (csv.QUOTE_NONE) leaves quote characters in the fields untouched.
df = pd.read_csv(
    'unlabeledTrainData.tsv',
    header=0,
    delimiter="\t",
    quoting=3,
)
print(df.shape)

(50000, 2)


In [14]:
# List the column labels of the loaded frame.
print (df.columns)

Index(['id', 'review'], dtype='object')


In [15]:
#Cleaning the string

def clean_str(string):
    """
    Normalize raw review text into lowercase, space-separated words.

    Processing steps, in order:
      1. remove http/https URLs,
      2. replace every character that is not an ASCII letter with a space,
      3. lowercase and split on whitespace,
      4. drop single-character tokens.

    Parameters
    ----------
    string : str
        Raw text to clean. Any non-string value (for example ``None`` or a
        float NaN from a missing cell) produces an empty string.

    Returns
    -------
    str
        The surviving words joined by single spaces, or ``""`` when no word
        survives or the input is not a string.
    """
    # Explicit type guard instead of a bare `except:`; non-text input still
    # maps to "" but unrelated errors (e.g. KeyboardInterrupt) are no longer
    # swallowed.
    if not isinstance(string, str):
        return ""

    # Strip URLs wherever they occur. The earlier pattern demanded a literal
    # "<>" right after "://" (and a line-start anchor), so it never matched a
    # real URL and their fragments leaked through as words.
    text = re.sub(r"https?://\S+", " ", string)

    # Keep ASCII letters only; digits and punctuation become separators.
    text = re.sub(r"[^A-Za-z]", " ", text)

    words = [w for w in text.lower().split() if len(w) > 1]
    return " ".join(words)

# Store the cleaned text next to the raw review, then preview the first two rows.
df['clean_review'] = df['review'].apply(clean_str)
print(df.head(2))

          id                                             review  \
0   "9999_0"  "Watching Time Chasers, it obvious that it was...   
1  "45057_0"  "I saw this film about 20 years ago and rememb...   

                                        clean_review  
0  watching time chasers it obvious that it was m...  
1  saw this film about years ago and remember it ...  


In [16]:
# Build one token list per review to feed to Word2Vec.
# split() with no argument returns [] for an empty review, whereas
# split(' ') returned [''] and injected an empty-string token into the corpus.
documents = [doc.split() for doc in df['clean_review']]

In [18]:
# Train the Word2Vec model on the tokenized reviews.
w2v_movie_review = gensim.models.Word2Vec(
    documents,
    size=300,       # features per word (the embedding matrix is 300 wide)
    window=5,
    min_count=10,   # the training log reports which words this threshold retains
    sample=1e-3,
    workers=4,      # the log shows four worker threads finishing
    iter=10,        # the log's trained raw-word total is 10x the corpus size
)

2017-08-07 10:50:08,227 : INFO : collecting all words and their counts
2017-08-07 10:50:08,227 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-07 10:50:08,579 : INFO : PROGRESS: at sentence #10000, processed 2253398 words, keeping 51628 word types
2017-08-07 10:50:08,949 : INFO : PROGRESS: at sentence #20000, processed 4540660 words, keeping 69051 word types
2017-08-07 10:50:09,318 : INFO : PROGRESS: at sentence #30000, processed 6825039 words, keeping 81489 word types
2017-08-07 10:50:09,671 : INFO : PROGRESS: at sentence #40000, processed 9080729 words, keeping 91659 word types
2017-08-07 10:50:10,024 : INFO : collected 100453 word types from a corpus of 11348950 raw words and 50000 sentences
2017-08-07 10:50:10,040 : INFO : Loading a fresh vocabulary
2017-08-07 10:50:10,358 : INFO : min_count=10 retains 28296 unique words (28% of original 100453, drops 72157)
2017-08-07 10:50:10,358 : INFO : min_count=10 leaves 11174747 word corpus (98% of original

2017-08-07 10:51:16,199 : INFO : PROGRESS: at 96.16% examples, 1249286 words/s, in_qsize 8, out_qsize 0
2017-08-07 10:51:17,199 : INFO : PROGRESS: at 97.66% examples, 1249544 words/s, in_qsize 7, out_qsize 0
2017-08-07 10:51:18,214 : INFO : PROGRESS: at 99.19% examples, 1249776 words/s, in_qsize 7, out_qsize 0
2017-08-07 10:51:18,730 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-08-07 10:51:18,730 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-08-07 10:51:18,746 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-08-07 10:51:18,746 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-08-07 10:51:18,746 : INFO : training on 113489500 raw words (84801167 effective words) took 67.8s, 1250133 effective words/s


In [20]:
# Shape of the learned embedding matrix: (words in the model, features per word).
w2v_movie_review.wv.syn0.shape

(28296, 300)

### Saving Word2Vec Model

In [26]:
# Persist the trained model to disk at a hard-coded path under /tmp.
w2v_movie_review.save('/tmp/w2v-movie-review')

2017-08-07 11:25:23,013 : INFO : saving Word2Vec object under /tmp/w2v-movie-review, separately None
2017-08-07 11:25:23,014 : INFO : not storing attribute syn0norm
2017-08-07 11:25:23,015 : INFO : not storing attribute cum_table
2017-08-07 11:25:23,774 : INFO : saved /tmp/w2v-movie-review


In [28]:
# Echo the model object so the cell displays its repr.
w2v_movie_review

<gensim.models.word2vec.Word2Vec at 0x1dbe191f7f0>