In [1]:
import pandas as pd
# data cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
from tokenizer import tokenize, TOK
import logging
from pattern.en import conjugate, lemma, lexeme, parse
import multiprocessing
from gensim.models import word2vec
from gensim.models import Doc2Vec
import gensim.models.doc2vec

# Read data from files 
train = pd.read_csv("labeledTrainData.tsv\labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

In [2]:
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # Remove HTML and stop words
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [3]:
#this line was done once only for the install
#nltk.download()   

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())

    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, remove_stopwords ))

    # Return the list of sentences (each sentence is a list of words)
    return sentences

In [4]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)


Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [5]:
# Check how many sentences we have in total - should be around 850,000+
print(len(sentences))
print(sentences[0])
print("break break \n")
print(sentences[1])


795538
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']
break break 

['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [6]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 8       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
    
# Initialize and train the model (this will take some time)
assert gensim.models.doc2vec.FAST_VERSION > -1
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

Training model...


2019-07-18 16:57:08,081 : INFO : precomputing L2-norms of word weight vectors
2019-07-18 16:57:08,105 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2019-07-18 16:57:08,107 : INFO : not storing attribute vectors_norm
2019-07-18 16:57:08,108 : INFO : not storing attribute cum_table
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-18 16:57:08,936 : INFO : saved 300features_40minwords_10context


In [7]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [8]:
model.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

In [9]:
model.doesnt_match("paris berlin london austria".split())

  """Entry point for launching an IPython kernel.


'austria'

In [10]:
model.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6350013017654419),
 ('lad', 0.5965719223022461),
 ('lady', 0.5806177854537964),
 ('millionaire', 0.5322234034538269),
 ('guy', 0.5308572053909302),
 ('chap', 0.5260276794433594),
 ('men', 0.5251904726028442),
 ('monk', 0.5250006914138794),
 ('soldier', 0.5113788843154907),
 ('person', 0.506030797958374)]

In [11]:
model.most_similar("queen")

  """Entry point for launching an IPython kernel.


[('princess', 0.6499308347702026),
 ('belle', 0.6282992362976074),
 ('victoria', 0.619742214679718),
 ('bride', 0.6057969331741333),
 ('goddess', 0.6030101776123047),
 ('mistress', 0.5972389578819275),
 ('marlene', 0.5859156847000122),
 ('stepmother', 0.5787820219993591),
 ('prince', 0.578509509563446),
 ('catherine', 0.576341986656189)]

In [12]:
model.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.768750786781311),
 ('horrible', 0.7342603206634521),
 ('abysmal', 0.7291581630706787),
 ('atrocious', 0.721089243888855),
 ('dreadful', 0.7041036486625671),
 ('horrendous', 0.6979161500930786),
 ('horrid', 0.6871669888496399),
 ('appalling', 0.6693041920661926),
 ('lousy', 0.6156965494155884),
 ('laughable', 0.6114096641540527)]