In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re

In [2]:
# All three Kaggle files share one layout: tab-separated with a header row.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside reviews are kept as text.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("./datasets/labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("./datasets/testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("./datasets/unlabeledTrainData.tsv", **tsv_options)

In [3]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or a single sentence of it) into a list of words.

    The text is stripped of HTML markup, every character other than an ASCII
    letter is replaced by a space, and the result is lower-cased and split on
    whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML tags.
    remove_stopwords : bool, default False
        When True, words found in NLTK's English stop-word list are dropped.
        This requires the NLTK ``stopwords`` corpus to be installed.

    Returns
    -------
    list of str
        Lower-case words in their original order (may be empty).
    """
    # Name the parser explicitly: without it BeautifulSoup emits a warning on
    # every call and picks whichever parser happens to be installed, so results
    # could differ between machines. "html.parser" ships with Python.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()
    if remove_stopwords:
        # Imported here because the notebook never imports `stopwords` at the
        # top level; the original code raised NameError on this branch.
        from nltk.corpus import stopwords

        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    return words

In [4]:
import nltk.data
# Load the pre-trained Punkt English sentence splitter. On a machine where the
# model has not been installed yet, nltk.data.load raises LookupError, so fetch
# it once and retry instead of relying on a manually un-commented download line.
try:
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
except LookupError:
    nltk.download("punkt")
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Parameters
    ----------
    review : str
        Raw review text; surrounding whitespace is stripped before splitting.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)`` that returns the
        sentences of ``text``.
    remove_stopwords : bool, default False
        Forwarded unchanged to ``review_to_wordlist``.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in document order.
    """
    return [
        review_to_wordlist(sentence, remove_stopwords)
        for sentence in tokenizer.tokenize(review.strip())
        if sentence  # skip empty strings the splitter may hand back
    ]

In [5]:
# Only the first `number` reviews of each data set are parsed.
number = 1000
sentences = []

print("Parsing sentences from training set")
# Labeled reviews first, then unlabeled ones.
for reviews in (train["review"], unlabeled_train["review"]):
    for review in reviews[:number]:
        sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [7]:
# Sanity check: how many tokenized sentences were collected for training.
len(sentences)

20812

In [12]:
# Hyper-parameters handed to word2vec.Word2Vec in the training cell.
num_features = 300    # passed as `size`: dimensionality of each word vector
min_word_count = 40   # passed as `min_count`: rarer words are left out of the vocabulary
num_workers = 8       # passed as `workers`: number of training threads
context = 10          # passed as `window`: context window size, in words
downsampling = 1e-3   # passed as `sample`: down-sampling threshold for frequent words

In [13]:
from gensim.models import word2vec

In [14]:
print("Training model...")
# Fit word vectors on the tokenized sentences using the hyper-parameters
# defined in the configuration cell.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)
# replace=True keeps only the normalized vectors; per the gensim docs this
# saves memory but the model can no longer be trained further.
model.init_sims(replace=True)



Training model...


In [15]:
# Build the file name from the hyper-parameters themselves so it cannot drift
# out of sync with them; with the current settings this yields
# "300features_40minwords_10context".
model_name = "{}features_{}minwords_{}context".format(num_features, min_word_count, context)
model.save(model_name)

In [17]:
# Query through model.wv: calling doesnt_match on the model object itself is
# deprecated in gensim and triggers a warning on every call.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'child'

In [18]:
# Query through model.wv: calling doesnt_match on the model object itself is
# deprecated in gensim and triggers a warning on every call.
model.wv.doesnt_match("paris berlin london austria".split())

  """Entry point for launching an IPython kernel.


'london'

In [19]:
# Nearest neighbours of "man"; queried via model.wv because most_similar on
# the model object itself is deprecated in gensim.
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.9758438467979431),
 ('father', 0.951120138168335),
 ('daughter', 0.949702799320221),
 ('girl', 0.9490815997123718),
 ('mother', 0.9442487359046936),
 ('wife', 0.9383818507194519),
 ('young', 0.9375650882720947),
 ('son', 0.9369031190872192),
 ('finds', 0.9272494912147522),
 ('boy', 0.9223595261573792)]

In [21]:
# Nearest neighbours of "boy"; queried via model.wv because most_similar on
# the model object itself is deprecated in gensim.
model.wv.most_similar("boy")

  """Entry point for launching an IPython kernel.


[('son', 0.975487470626831),
 ('starts', 0.9747697114944458),
 ('named', 0.9739161729812622),
 ('finds', 0.9711717367172241),
 ('daughter', 0.9708755016326904),
 ('falls', 0.9697022438049316),
 ('mother', 0.9693663120269775),
 ('brother', 0.9688738584518433),
 ('girl', 0.966877281665802),
 ('young', 0.9649040102958679)]

In [22]:
# Nearest neighbours of "awful"; queried via model.wv because most_similar on
# the model object itself is deprecated in gensim.
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.9705706238746643),
 ('boring', 0.9602379202842712),
 ('truly', 0.9583942294120789),
 ('quality', 0.9551643133163452),
 ('writing', 0.9492000937461853),
 ('entertaining', 0.9477874040603638),
 ('ending', 0.9319590330123901),
 ('original', 0.9301999807357788),
 ('lame', 0.9283192753791809),
 ('overall', 0.927104115486145)]