In [1]:
import pandas as pd
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter='\t', quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter='\t', quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter='\t', quoting=3)

In [2]:
# Import various model for clening the text
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [3]:
def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2.remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. convert word to lower case and split
    words = review_text.lower().split()
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [4]:
# Download the punkt tokenizer for sentence splitting
import nltk.data

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
#     print(raw_sentences)
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [5]:
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")

In [7]:
model.wv["flower"]

array([-0.0514271 , -0.05480901, -0.03706408,  0.0451298 ,  0.03551459,
       -0.06432813, -0.01661876, -0.00520615, -0.08906938, -0.13924482,
       -0.07806679,  0.12562075,  0.08069624,  0.05373814,  0.06561399,
       -0.00235362, -0.07317895,  0.09190165,  0.02494637,  0.04594135,
        0.10622973, -0.03211069, -0.01861409, -0.00083636,  0.02217456,
        0.01866037, -0.04152292, -0.0349835 , -0.03670044,  0.02415779,
        0.02476616,  0.03164262,  0.00752363,  0.03579576, -0.0991041 ,
       -0.01247764, -0.0346579 ,  0.00638266, -0.0077857 , -0.01602524,
       -0.00717401,  0.00167522, -0.05680299,  0.07383982,  0.08701462,
       -0.05897753, -0.01924523,  0.10350288, -0.02288168, -0.04093151,
       -0.01484664, -0.04878024, -0.01388485,  0.01716304, -0.03698753,
       -0.00460978, -0.07099182, -0.00913173, -0.03816442, -0.04769471,
        0.07382052,  0.08291068, -0.0315371 , -0.04461202, -0.03119381,
        0.06610774,  0.09737919, -0.01033472,  0.03383221,  0.02

In [16]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
       #
       # Print a status message every 1000th review
       if counter%1000. == 0.:
           print("Review %d of %d" % (counter, len(reviews)))
       # 
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
           num_features)
       #
       # Increment the counter
       counter = counter + 1.
    return reviewFeatureVecs

In [17]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop word
# removal.
num_features = 300
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

print("Creating average feature vecs for test reviews")
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Review 0 of 25000




IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [21]:
len(model.wv.index2word)

16490

In [23]:
model.wv.vectors

array([[  5.38068218e-03,  -6.26454428e-02,  -1.63734323e-04, ...,
         -4.87488806e-02,  -4.90719161e-04,   7.77191967e-02],
       [ -9.39467847e-02,  -5.51806152e-05,   7.60902883e-03, ...,
         -3.16449143e-02,  -1.24446005e-01,   6.42076358e-02],
       [  1.18126146e-01,   2.26852903e-03,  -1.68994609e-02, ...,
         -8.63448605e-02,   1.86871621e-03,   7.55333528e-02],
       ..., 
       [ -2.36577522e-02,  -8.50679800e-02,  -2.29315069e-02, ...,
          5.30488119e-02,  -6.13216534e-02,   1.64637044e-02],
       [  4.18576486e-02,  -4.97073010e-02,  -5.73279336e-02, ...,
         -3.17289606e-02,  -2.48052180e-03,   9.36293900e-02],
       [  4.84285504e-02,  -7.94675350e-02,   4.23087962e-02, ...,
         -1.62440315e-02,  -1.18006162e-01,   9.66558699e-03]], dtype=float32)