In [1]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
from sklearn.cluster import KMeans
import time

In [2]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert a raw document into a list of lower-case words.

    The text is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, and the result is lower-cased and
    split on whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML.
    remove_stopwords : bool, default False
        When True, words found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        The words of the document, in their original order.
    """
    # 1. Remove HTML. The parser is named explicitly: without it
    #    BeautifulSoup picks whichever parser happens to be installed (and
    #    emits GuessedAtParserWarning), so results could differ by machine.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #
    # 2. Replace anything that is not an ASCII letter with a space
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default); a set gives
    #    constant-time membership checks
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words


def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each represented as a list of words.

    Parameters
    ----------
    review : str
        Raw review text; leading/trailing whitespace is stripped first.
    tokenizer : object
        Sentence splitter exposing a ``tokenize(text)`` method that returns
        the individual sentences.
    remove_stopwords : bool, default False
        Forwarded unchanged to ``review_to_wordlist``.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in document order.
    """
    stripped_review = review.strip()
    # Empty sentences are skipped; every other sentence is converted to a
    # word list by review_to_wordlist.
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(stripped_review)
        if len(sentence_text) > 0
    ]


def create_bag_of_centroids(wordlist, word_centroid_map, num_centroids=None):
    """Count how many words of a document fall into each word cluster.

    Parameters
    ----------
    wordlist : iterable of str
        The words of one document.
    word_centroid_map : dict
        Maps each vocabulary word to the integer index of its cluster.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is derived as the
        highest cluster index in ``word_centroid_map`` plus one (0 for an
        empty map). Passing it explicitly avoids rescanning the whole map
        on every call and guarantees a fixed vector length.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array in which position ``i`` holds the number of
        words in ``wordlist`` that belong to cluster ``i``. Words missing
        from ``word_centroid_map`` are ignored.

    Raises
    ------
    IndexError
        If ``num_centroids`` is given and a word maps to a cluster index
        that is not smaller than it.
    """
    if num_centroids is None:
        # default=-1 makes an empty map yield an empty vector instead of
        # the ValueError that max() raises on an empty sequence.
        num_centroids = max(word_centroid_map.values(), default=-1) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    #
    # For every in-vocabulary word, increment the count of its cluster
    for word in wordlist:
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

## Reading data

In [3]:
# Load the three tab-separated review files and the pre-trained Word2Vec
# model. Paths use forward slashes so the notebook runs on Windows, Linux
# and macOS alike (a backslash is not a path separator outside Windows).
# quoting=3 is csv.QUOTE_NONE: quote characters in the reviews are kept as
# ordinary text.
train = pd.read_csv("./data/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv("./data/testData.tsv",
                   header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("./data/unlabeledTrainData.tsv",
                              header=0, delimiter="\t", quoting=3)
model = Word2Vec.load("./data/300features_40minwords_10context")

## K-means clustering

In [5]:
start = time.time()  # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster. Integer division keeps the value an int;
# the lower bound of 1 avoids asking for zero clusters on a tiny vocabulary.
word_vectors = model.wv.vectors
num_clusters = max(1, word_vectors.shape[0] // 5)

# Initialize a k-means object and use it to assign every word vector to a
# cluster. K-means starts from random centroids, so the seed is fixed to
# make the cluster assignments reproducible between runs.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=0)
idx = kmeans_clustering.fit_predict(word_vectors)

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  1177.8344213962555 seconds.


In [6]:
# Build the word -> cluster lookup: the i-th vocabulary word is paired with
# the i-th cluster label returned by k-means.
word_centroid_map = {
    vocab_word: cluster_label
    for vocab_word, cluster_label in zip(model.wv.index_to_key, idx)
}

In [8]:
# Show the words assigned to each of the first 10 clusters
for cluster in range(0, 10):
    #
    # Print the cluster number
    print("\nCluster {}".format(cluster))
    #
    # Collect the words of this cluster in one pass over the map, in
    # vocabulary order. (Indexing into freshly built list(...values()) /
    # list(...keys()) copies on every step would be quadratic in the
    # vocabulary size.)
    words = [word for word, index in word_centroid_map.items()
             if index == cluster]
    print(words)


Cluster 0
['meyer', 'russ', 'tamblyn']

Cluster 1
['inhuman', 'unspeakable', 'ruthlessly']

Cluster 2
['province', 'summation']

Cluster 3
['booth', 'operator']

Cluster 4
['questionable', 'disastrous', 'baffling', 'deplorable']

Cluster 5
['demand', 'schedule']

Cluster 6
['soldiers', 'forces', 'terrorists', 'germans', 'troops', 'flag', 'russians', 'resistance', 'fighters', 'protest', 'judges', 'allies', 'rebels', 'marines', 'surrender', 'allied', 'nuke', 'peasants', 'ban', 'pows', 'infantry']

Cluster 7
['dress', 'underwear', 'panties', 'undress', 'garb']

Cluster 8
['quirks', 'asides']

Cluster 9
['land', 'treasure', 'kingdom', 'newly', 'tokyo', 'frontier', 'patrol', 'fortress', 'council', 'colony', 'viking', 'flood', 'mining', 'coal', 'mines', 'tibet', 'belonging', 'miners', 'soil', 'populace', 'gilligan', 'rebuild', 'outpost', 'dolphins']


In [9]:
# Turn every training and test review into a list of words using
# review_to_wordlist. Notice that we now use stop word removal.

clean_train_reviews = [
    review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in train["review"]
]

print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in test["review"]
]

Creating average feature vecs for test reviews


In [10]:
# Training set: one row per review, one column per cluster.
# The matrix is allocated up front and filled row by row.
train_centroids = np.zeros((train["review"].size, num_clusters),
                           dtype="float32")
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

# Test set: same layout as the training matrix.
test_centroids = np.zeros((test["review"].size, num_clusters),
                          dtype="float32")
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

In [11]:
# Fit a random forest and extract predictions.
# The forest is built from random bootstrap samples and feature subsets,
# so the seed is fixed to make the predictions reproducible between runs.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

# Fitting the forest may take a few minutes
print("Fitting a random forest to labeled training data...")
forest = forest.fit(train_centroids, train["sentiment"])
result = forest.predict(test_centroids)

# Write the test results. The path uses forward slashes so it is valid on
# Windows, Linux and macOS; the "result" directory must already exist.
# quoting=3 is csv.QUOTE_NONE, matching how the input files were read.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("./result/result_word2vec_clustering.csv", index=False, quoting=3)

Fitting a random forest to labeled training data...
