In [1]:
import pandas as pd

train = pd.read_csv('labeledTrainData.tsv',header=0,delimiter='\t',quoting=3)
test = pd.read_csv('testData.tsv',header=0,delimiter='\t',quoting=3)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv',header=0,delimiter='\t',quoting=3)

print("Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled review\n"%(train['review'].size,test['review'].size,unlabeled_train['review'].size))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled review



In [2]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review,remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Return a list of words.
    review_text = BeautifulSoup(review).get_text()
    
    review_text = re.sub("[^a-zA-Z1-9]"," ",review_text)
    
    words = review_text.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    return(words)

In [3]:
import nltk.data

# Use NLTK's punkt tokenizer for sentence splitting
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Return a list of sentences, where each sentence is a list of words
    raw_sentences = tokenizer.tokenize(review.strip())
    
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0 :
            sentences.append(review_to_wordlist(raw_sentence,remove_stopwords))
    return sentences

In [4]:
sentences = []
print('Parsing sentences from training set')
for review in train['review']:
    sentences += review_to_sentences(review,tokenizer)
    
print("Paring sentences from unlabelled set")
for review in unlabeled_train['review']:
    sentences += review_to_sentences(review,tokenizer)
# "append" will only append the first list
# "+=" will join all of the lists at once
    

Parsing sentences from training set




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Paring sentences from unlabelled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [8]:
import logging
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s',level=logging.INFO)

num_features = 300  # Word vector dimensionality
min_word_count = 40 # Minimum word count, helps limit the size of the vocabulary to meaningful words
num_workers = 4 # Number of threads to run in parallel
context = 40 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words

# Initialize and train the model
from gensim.models import Word2Vec
print("Training the model...")
model = word2vec.Word2Vec(sentences, workers = num_workers, size = num_features, min_count=min_word_count,\
                          window = context, sample=downsampling)

model.init_sims(replace=True) # make the model more memory-efficient

model_name = "300features_40minwords_10context"
model.save(model_name)

2017-11-08 11:20:33,796:INFO: collecting all words and their counts
2017-11-08 11:20:33,797:INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-11-08 11:20:33,867:INFO: PROGRESS: at sentence #10000, processed 227353 words, keeping 17965 word types
2017-11-08 11:20:33,960:INFO: PROGRESS: at sentence #20000, processed 454813 words, keeping 25212 word types


Training the model...


2017-11-08 11:20:34,014:INFO: PROGRESS: at sentence #30000, processed 675603 words, keeping 30355 word types
2017-11-08 11:20:34,087:INFO: PROGRESS: at sentence #40000, processed 903462 words, keeping 34720 word types
2017-11-08 11:20:34,135:INFO: PROGRESS: at sentence #50000, processed 1124077 words, keeping 38170 word types
2017-11-08 11:20:34,181:INFO: PROGRESS: at sentence #60000, processed 1346959 words, keeping 41167 word types
2017-11-08 11:20:34,230:INFO: PROGRESS: at sentence #70000, processed 1571546 words, keeping 43800 word types
2017-11-08 11:20:34,279:INFO: PROGRESS: at sentence #80000, processed 1792149 words, keeping 46207 word types
2017-11-08 11:20:34,332:INFO: PROGRESS: at sentence #90000, processed 2017757 words, keeping 48659 word types
2017-11-08 11:20:34,388:INFO: PROGRESS: at sentence #100000, processed 2241043 words, keeping 50755 word types
2017-11-08 11:20:34,446:INFO: PROGRESS: at sentence #110000, processed 2462195 words, keeping 52649 word types
2017-11-08

2017-11-08 11:20:38,539:INFO: PROGRESS: at sentence #770000, processed 17326890 words, keeping 123559 word types
2017-11-08 11:20:38,592:INFO: PROGRESS: at sentence #780000, processed 17558529 words, keeping 124280 word types
2017-11-08 11:20:38,640:INFO: PROGRESS: at sentence #790000, processed 17786955 words, keeping 124962 word types
2017-11-08 11:20:38,668:INFO: collected 125407 word types from a corpus of 17910811 raw words and 795538 sentences
2017-11-08 11:20:38,669:INFO: Loading a fresh vocabulary
2017-11-08 11:20:39,788:INFO: min_count=40 retains 16670 unique words (13% of original 125407, drops 108737)
2017-11-08 11:20:39,789:INFO: min_count=40 leaves 17346833 word corpus (96% of original 17910811, drops 563978)
2017-11-08 11:20:39,842:INFO: deleting the raw counts dictionary of 125407 items
2017-11-08 11:20:39,847:INFO: sample=0.001 downsamples 48 most-common words
2017-11-08 11:20:39,848:INFO: downsampling leaves estimated 12870163 word corpus (74.2% of prior 17346833)
2017

2017-11-08 11:21:50,652:INFO: PROGRESS: at 75.34% examples, 688044 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:51,662:INFO: PROGRESS: at 76.45% examples, 688354 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:52,664:INFO: PROGRESS: at 77.54% examples, 688538 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:53,668:INFO: PROGRESS: at 78.64% examples, 688789 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:54,681:INFO: PROGRESS: at 79.76% examples, 689063 words/s, in_qsize 8, out_qsize 0
2017-11-08 11:21:55,688:INFO: PROGRESS: at 80.87% examples, 689370 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:56,696:INFO: PROGRESS: at 82.00% examples, 689658 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:57,698:INFO: PROGRESS: at 83.09% examples, 689815 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:58,706:INFO: PROGRESS: at 84.20% examples, 689915 words/s, in_qsize 7, out_qsize 0
2017-11-08 11:21:59,723:INFO: PROGRESS: at 85.33% examples, 690204 words/s, in_qsize 7, out_qsize 0


In [13]:
# From Words To Paragraphs : Vector Averaging
import numpy as np

def makeFeatureVec(words,model,num_features):
    # Funciton to averaging all of the word vectors in a given paragraph
    featureVec=np.zeros((num_features,),dtype='float32')
    
    nwords = 0
    
    index2word_set = set(model.wv.index2word)
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed
    
    for word in words:
        if word in index2word_set:
            nwords+=1
            featureVec = np.add(featureVec,model[word])
            
    # Divide the result by the number of vector to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews,model,num_features):
    # Given a set of reviews(each one a list of words),calculate
    # the average feature vector for each one and return a 2D numpy array
    
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype='float32')
    
    for review in reviews:
        
        if counter%5000 == 0:
            print("Review %d of %d" % (counter,len(reviews)))
            
        reviewFeatureVecs[counter] = makeFeatureVec(review,model,num_features)
        
        counter += 1
        
    return reviewFeatureVecs

In [15]:
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(review_to_wordlist(review,remove_stopwords=True))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews,model,num_features)



print("Creating average feature vecs fro test reviews")
clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))
    
testDataVecs=getAvgFeatureVecs(clean_test_reviews,model,num_features)




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Creating average feature vecs fro test reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [16]:
# Fit a random forest to the training data,using 100 trees
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100)

print("Fitting a random forest to labelled training data...")
forest = forest.fit(trainDataVecs,train['sentiment'])

# Test & extract results
result = forest.predict(testDataVecs)

output = pd.DataFrame(data={'id':test['id'],'sentiment':result})
output.to_csv("Word2Vec_AverageVectors.csv", index=False,quoting=3)

Fitting a random forest to labelled training data...


In [23]:
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.wv.syn0
num_clusters = word_vectors.shape[0] / 5

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

TypeError: 'float' object cannot be interpreted as an integer