In [19]:
import re

import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [20]:
train = pd.read_csv("./datasets/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("./datasets/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("./datasets/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [21]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or sentence) into a list of lowercase words.

    HTML markup is stripped, every character that is not an ASCII letter is
    replaced by a space, and the result is lower-cased and split on
    whitespace.

    Args:
        review: Raw text, possibly containing HTML tags.
        remove_stopwords: When true, drop every word found in NLTK's English
            stopword list (the NLTK "stopwords" corpus must be installed).

    Returns:
        list[str]: The remaining words, in their original order.
    """
    # Name the parser explicitly: without it Beautiful Soup picks one itself
    # and emits a "no parser was explicitly specified" warning. "lxml" is the
    # parser that warning recommended for this notebook's environment.
    review_text = BeautifulSoup(review, "lxml").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()
    if remove_stopwords:
        # A set gives constant-time membership tests for the filter below.
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    return words

In [22]:
import nltk.data
# One-time setup: uncomment the next line to download the Punkt tokenizer data.
# nltk.download("punkt")
# Load the English Punkt model; it is handed to review_to_sentences, which
# calls its tokenize() method to split a review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Break a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text; leading/trailing whitespace is stripped
            before tokenizing.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentence strings found in ``text``.
        remove_stopwords: Passed through unchanged to ``review_to_wordlist``.

    Returns:
        list: One ``review_to_wordlist`` result per non-empty sentence, in
        the order the tokenizer produced them.
    """
    stripped = review.strip()
    # Zero-length sentences are skipped rather than turned into empty lists.
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(stripped)
        if len(chunk) > 0
    ]

In [23]:
# Gather the word lists of every sentence from the labeled and the unlabeled
# review sets into one flat list (the list is rebuilt from scratch each run).
sentences = []
print("Parsing sentences from training set")
for frame in (train, unlabeled_train):
    for review in frame["review"]:
        sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [24]:
# Number of tokenized sentences gathered from the labeled and unlabeled sets.
len(sentences)

795538

In [36]:
# Hyperparameters for the Word2Vec training call further down.
downsampling = 0.001    # forwarded as `sample`
context = 10            # forwarded as `window`
num_workers = 8         # forwarded as `workers`
min_word_count = 40     # forwarded as `min_count`
num_features = 300      # forwarded as `size`

In [37]:
from gensim.models import word2vec

In [39]:
# Fit Word2Vec on the parsed sentences using the hyperparameters set earlier.
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)
# NOTE(review): `size=` and `init_sims` look like the pre-4.0 gensim
# spellings; confirm the installed gensim version before upgrading.
model.init_sims(replace=True)



Training model...


In [41]:
# Derive the file name from the hyperparameters used for training so the
# saved artifact's name cannot drift from the configuration. With the current
# settings this yields "300features_40minwords_10context".
model_name = "{}features_{}minwords_{}context".format(
    num_features, min_word_count, context
)
model.save(model_name)

In [42]:
# Query through `model.wv`: calling doesnt_match on the model itself is
# deprecated (it triggered the warning captured in this cell's output).
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [43]:
# Query through `model.wv`: calling doesnt_match on the model itself is
# deprecated (it triggered the warning captured in this cell's output).
model.wv.doesnt_match("paris berlin london austria".split())

  """Entry point for launching an IPython kernel.


'austria'

In [44]:
# Query through `model.wv`: calling most_similar on the model itself is
# deprecated (it triggered the warning captured in this cell's output).
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6276388764381409),
 ('lad', 0.6071393489837646),
 ('lady', 0.58219313621521),
 ('men', 0.5235027074813843),
 ('guy', 0.5211378931999207),
 ('person', 0.5167438983917236),
 ('monk', 0.5147053599357605),
 ('soldier', 0.510221004486084),
 ('farmer', 0.5099583864212036),
 ('businessman', 0.5084689855575562)]

In [45]:
# Query through `model.wv`: calling most_similar on the model itself is
# deprecated (it triggered the warning captured in this cell's output).
model.wv.most_similar("queen")

  """Entry point for launching an IPython kernel.


[('princess', 0.6749973893165588),
 ('bride', 0.6070979833602905),
 ('stepmother', 0.5906005501747131),
 ('goddess', 0.5840883255004883),
 ('belle', 0.572691023349762),
 ('latifah', 0.5719773173332214),
 ('dame', 0.5698574185371399),
 ('victoria', 0.563430905342102),
 ('kristel', 0.5623289942741394),
 ('hunchback', 0.5578924417495728)]

In [46]:
# Query through `model.wv`: calling most_similar on the model itself is
# deprecated (it triggered the warning captured in this cell's output).
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7633955478668213),
 ('horrible', 0.734089732170105),
 ('atrocious', 0.7212321758270264),
 ('abysmal', 0.7150809168815613),
 ('dreadful', 0.706222414970398),
 ('horrendous', 0.6922901272773743),
 ('appalling', 0.666459858417511),
 ('horrid', 0.6415724158287048),
 ('lousy', 0.6407931447029114),
 ('laughable', 0.6062197089195251)]