# 1. Read the raw data

## Imports

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Read data from files 

In [2]:
# All three files are tab-separated with a header row. quoting=3 is
# csv.QUOTE_NONE, so double quotes inside a review are kept as plain text.
train = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
test = pd.read_csv(
    "testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

## Verify the number of reviews that were read (100,000 in total)

In [3]:
# Number of rows in the "review" column of each loaded frame, in the order
# the message reports them.
review_counts = (
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
)
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % review_counts)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



# 2. Cleaning the data

## Imports

In [None]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

## Convert a document to a sequence of words

In [4]:
def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a document to a list of lower-case words.

    Parameters
    ----------
    review : str or unicode
        Raw review text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When True, drop NLTK's English stop words from the result.
        Defaults to False, so every word is kept.

    Returns
    -------
    list
        The words of the review, in order, lower-cased, with every
        non-letter character treated as a separator.
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Replace every non-letter character with a space
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (off by default). The stop-word set
    #    is only built when it is actually needed.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words

## Split a review into parsed sentences

In [5]:
import nltk.data

# Download only the Punkt sentence-tokenizer models. Calling
# nltk.download() with no argument opens the interactive downloader and
# waits for user input, which stops the notebook from running unattended.
nltk.download('punkt')

# Load the English Punkt sentence tokenizer from the NLTK data directory
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer ):
    """Split a review into sentences, each given as a list of words.

    Parameters
    ----------
    review : str or unicode
        The review text; leading and trailing whitespace is stripped
        before it is handed to the tokenizer.
    tokenizer : object
        Any object with a ``tokenize(text)`` method that returns the
        sentences of ``text`` as strings.

    Returns
    -------
    list of list
        One entry per non-empty sentence, each produced by
        ``review_to_wordlist``.
    """
    raw_sentences = tokenizer.tokenize(review.strip())
    # Empty sentences are skipped; every other sentence becomes a word list.
    return [
        review_to_wordlist( raw_sentence )
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

showing info http://www.nltk.org/nltk_data/


## Computing on the data

In [6]:
# Collect the tokenized sentences of every labeled and unlabeled training
# review into one flat list (each sentence is a list of words).
sentences = []

print("Parsing sentences from training set")
for review in train["review"]:
    # Reviews are byte strings here; decode them as UTF-8 before tokenizing.
    sentences.extend(review_to_sentences(unicode(review, 'utf-8'), tokenizer))

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(unicode(review, 'utf-8'), tokenizer))

Parsing sentences from training set
Parsing sentences from unlabeled set


## Tests

In [7]:
print len(sentences)

795538


In [10]:
print sentences[4]

[u'some', u'may', u'call', u'mj', u'an', u'egotist', u'for', u'consenting', u'to', u'the', u'making', u'of', u'this', u'movie', u'but', u'mj', u'and', u'most', u'of', u'his', u'fans', u'would', u'say', u'that', u'he', u'made', u'it', u'for', u'the', u'fans', u'which', u'if', u'true', u'is', u'really', u'nice', u'of', u'him', u'the', u'actual', u'feature', u'film', u'bit', u'when', u'it', u'finally', u'starts', u'is', u'only', u'on', u'for', u'minutes', u'or', u'so', u'excluding', u'the', u'smooth', u'criminal', u'sequence', u'and', u'joe', u'pesci', u'is', u'convincing', u'as', u'a', u'psychopathic', u'all', u'powerful', u'drug', u'lord']


# 3. Training Word2Vec

## Set values for various parameters

In [12]:
# Hyper-parameters for Word2Vec training.
num_features = 300     # dimensionality of each word vector
min_word_count = 40    # minimum word count for a word to be kept
num_workers = 4        # number of threads to run in parallel
context = 10           # context window size
downsampling = 0.001   # downsampling setting for frequent words

## Imports

In [13]:
from gensim.models import word2vec

## Training our model

In [None]:
# Train Word2Vec on the parsed sentences with the parameters defined above.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

## We don't plan to train the model any further

In [None]:
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
# NOTE(review): per the gensim docs, replace=True keeps only the normalized
# vectors, so training cannot be continued afterwards - confirm for the
# installed gensim version.
model.init_sims(replace=True)

## Save the model

In [None]:
# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load().
# The name records the training parameters: 300 features, a minimum word
# count of 40 and a context window of 10.
model_name = "300features_40minwords_10context"
model.save(model_name)

## Tests

In [38]:
model

<gensim.models.word2vec.Word2Vec at 0x171fb8550>

In [39]:
model.doesnt_match("man woman child tree daughter".split())

'tree'

In [40]:
model.doesnt_match("france england germany berlin".split())

'berlin'

In [42]:
model.most_similar("fantastic")

[(u'brilliant', 0.6416057348251343),
 (u'wonderful', 0.6375398635864258),
 (u'superb', 0.6132605671882629),
 (u'amazing', 0.6075152158737183),
 (u'great', 0.6044968366622925),
 (u'terrific', 0.5843934416770935),
 (u'excellent', 0.5677196383476257),
 (u'fabulous', 0.563037633895874),
 (u'marvelous', 0.5626873970031738),
 (u'awesome', 0.5597708225250244)]

In [35]:
model.most_similar(positive=['woman', 'king'], negative=['man'], topn = 10)

[(u'queen', 0.4603006839752197),
 (u'lion', 0.40316981077194214),
 (u'princess', 0.39915820956230164),
 (u'aladdin', 0.3486478626728058),
 (u'kong', 0.3401307463645935),
 (u'prince', 0.32044780254364014),
 (u'witch', 0.3197129964828491),
 (u'belle', 0.30830180644989014),
 (u'du', 0.3080085217952728),
 (u'madeleine', 0.3068543076515198)]

-------

# 4. Compute clustering

## Load the model

In [34]:
from gensim.models import Word2Vec
# Reload the model from disk. The file name is the one the training section
# saved the model under, so this section does not need the model retrained.
model = Word2Vec.load("300features_40minwords_10context")

## Some tests

In [15]:
type(model.syn0)

numpy.ndarray

In [16]:
model.syn0.shape

(16490, 300)

In [17]:
model["flower"]

array([-0.06488955, -0.07009859,  0.00692126, -0.03094291,  0.05867873,
        0.05126816,  0.11081391, -0.02649415,  0.02058307,  0.05279991,
       -0.00633742, -0.06086138, -0.04158127,  0.0055691 , -0.02870898,
       -0.16036388,  0.06647004, -0.03284989, -0.04153325, -0.04816008,
       -0.11241508, -0.02980684, -0.00244578,  0.07241758,  0.03970441,
        0.06498393,  0.10887307, -0.05738701, -0.05569573,  0.00159743,
        0.03683783,  0.13546272,  0.00915493, -0.00978644, -0.00554668,
        0.12002576,  0.02318511, -0.01718634, -0.02011414, -0.17007081,
       -0.00674725,  0.02484204,  0.01301769, -0.04426865, -0.039248  ,
       -0.01579146,  0.0019246 ,  0.17274301, -0.05032826,  0.00842139,
       -0.08518724,  0.03673296, -0.0671815 , -0.04935077,  0.00463567,
        0.03251321, -0.02030898,  0.04250539, -0.05616349, -0.02230252,
        0.02729607,  0.07848719, -0.06396292,  0.05632715,  0.00628025,
       -0.15204202, -0.00303472,  0.09669513,  0.13205555, -0.06

## Set values for various parameters

In [21]:
# The word vectors to cluster, one row per vocabulary word.
word_vectors = model.syn0
# Use one cluster per 10 vocabulary words. Floor division keeps the cluster
# count an integer whatever division semantics are in effect.
num_clusters = word_vectors.shape[0] // 10

## Clustering with sklearn

In [23]:
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Set "k" (num_clusters) to be 1/10th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] / 10

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print "Time taken for K Means clustering: ", elapsed, "seconds."

Time taken for K Means clustering:  1116.1862781 seconds.


In [27]:
import cPickle

# Persist the cluster assignments so they can be reloaded later without
# repeating the clustering. The with-block closes the file even if the
# dump raises.
with open('idx.save', 'wb') as f:
    cPickle.dump(idx, f, protocol=cPickle.HIGHEST_PROTOCOL)

## Clustering with Spark

In [20]:
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans
from numpy import array
from math import sqrt

In [None]:
import time

start = time.time() # Start time
# -----
kmeans_clustering_spark = KMeans.train(sc.parallelize(word_vectors), num_clusters, \
                                 maxIterations=3, runs=30, initializationMode="random")
print "Processing predict"
idx_spark = kmeans_clustering_spark.predict(word_vectors)
# -----
end = time.time()
elapsed = end - start
print "Time taken for K Means Spark clustering: ", elapsed, "seconds."

------

# 5. Results

## Loading

In [29]:
# Reload the cluster assignments written to idx.save. The with-block closes
# the file even if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load an idx.save file
# that this notebook produced.
with open('idx.save', 'rb') as f:
    idx = cPickle.load(f)

## Mapping a word to its cluster

In [30]:
# Map each vocabulary word to its cluster index by pairing the two
# sequences position by position.
# NOTE(review): assumes model.index2word is in the same order as the rows
# that were clustered - confirm.
word_centroid_map = {
    word: cluster_id
    for word, cluster_id in zip( model.index2word, idx )
}

## Print some results

In [31]:
# For the first 20 clusters
for cluster in xrange(0,20):
    #
    # Print the cluster number
    print("\nCluster %d" % cluster)
    #
    # Find all of the words for that cluster number in a single pass over
    # the map. items() yields pairs in the same order as keys()/values(),
    # so the words come out in the same order as with index-based lookup,
    # without rebuilding the key and value lists on every iteration.
    words = [word for word, word_cluster in word_centroid_map.items()
             if word_cluster == cluster]
    print(words)


Cluster 0
[u'tendency']

Cluster 1
[u'krueger', u'kruger', u'glove', u'englund']

Cluster 2
[u'achilles', u'casualty']

Cluster 3
[u'shambles', u'ineffective', u'futile', u'ploy', u'moralistic', u'lumbering', u'curiously', u'unwelcome', u'uber', u'deathly', u'squarely', u'heinous', u'descended', u'catastrophe', u'contention', u'handedly']

Cluster 4
[u'inadvertently', u'disguise', u'stealing', u'cooks', u'clutches', u'threatens', u'unsuccessfully', u'ruse', u'suicidal', u'fakes', u'lawless', u'colleagues', u'vegetarian', u'sends', u'pose', u'unsuccessful', u'faking', u'overnight', u'kindly', u'threatening', u'threaten', u'crosses', u'kidnap', u'unemployed', u'therapy', u'blackmail', u'bets', u'deed', u'seduce', u'goody', u'cure', u'resurrect', u'lure', u'ridiculed', u'sells', u'satanist', u'prefers', u'complains', u'caller', u'serum', u'urges', u'execute']

Cluster 5
[u'dismissed', u'banned', u'hindsight', u'panned', u'critics', u'hype', u'recognised', u'praised', u'label', u'remade',