In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import nltk.data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import os
import re

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def evaluate( prediction, truth ):
    print np.sum(prediction == truth)
    return np.sum(prediction == truth) / float(len(prediction))
    
#Function to remove stopwords 
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)  
    
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences
    

In [3]:
os.chdir('C:\Users\schiang\Documents\yelp')
reviews = pd.read_csv('yelp_academic_dataset_review.csv', header=0, delimiter=',')
reviews = reviews.head(100000)

num_reviews, _ = reviews.shape
    
sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for i in range(num_reviews):
    temp = review_to_sentences((reviews['text'][i]).decode('utf-8'), tokenizer)
    sentences += temp

# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print "Training model..."
print len(sentences)
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

Parsing sentences from training set
Training model...
840012


In [None]:
model.doesnt_match("good okay kitchen poo".split())

In [None]:
model.most_similar("good")

In [4]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
       # Print a status message every 1000th review
       if counter%1000. == 0.:
           print "Review %d of %d" % (counter, len(reviews))
       # 
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
       #
       # Increment the counter
       counter = counter + 1.
    return reviewFeatureVecs

In [5]:
test = pd.read_csv('yelp_academic_dataset_review.csv', header=0, delimiter=',')
test = test.head(200000)
test = test.tail(100000)

num_tests, _ = test.shape

from sklearn import cross_validation
from numpy import float32
import math
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

print "Creating average feature vecs for training reviews"
clean_data = []
clean_results = []

data = test['text'].values
stars = test['stars'].values

print data[0]

for i in range(num_tests):
    clean_data.append( review_to_wordlist((data[i]).decode('utf-8'), remove_stopwords=True ))
    clean_results.append(stars[i])

dataVecs = getAvgFeatureVecs( clean_data, model, num_features )

print dataVecs.shape
dataVecs = imp.fit_transform(dataVecs)

# X_train, X_test, y_train, y_test = cross_validation.train_test_split(dataVecs.astype(float32), clean_results, test_size = 0.2, random_state = 0)

# # Fit a random forest to the training data, using 100 trees
# from sklearn.ensemble import RandomForestClassifier
# forest = RandomForestClassifier( n_estimators = 100 )

# print "Fitting a random forest to labeled training data..."
# forest = forest.fit( X_train, y_train )

# # Test & extract results 
# result = forest.predict( X_test )

# print evaluate(result, y_test)


Creating average feature vecs for training reviews
Going to Houston's either on Scottsdale or on Camelback is like throwing dice. Sometimes its good and sometimes its not at all. Over the years and I have been eating there for years the staff has become less and less polished. I've watched it go from slacks to jeans and very attractive staff to just ok. 

It's not cheap to eat there and you expect the food to be consistent but it always isn't. My wife loves the fall vegetable plate but the last two times it was not good the fish that I ordered was very overcooked. I sent it back and the wait staff rolled her eyes. 

One other time (years ago) I was waiting with my wife and another couple and we waiting for 30 mins which was normal and person came in with his friends and handed the front person 100.00 it looked like and they were seated right away. I asked the wait girl how much she made a week and she looked shocked that I caught her. She said excuse me walked away I didn't see her for

#Generating average stars for each words in vocabulary

In [7]:
rating = dict()

for key in set(model.vocab):
    key = str(key)
    for i, val in enumerate(data):
        
        if key in val:
            #if vocab doesn't exist in the dictionary yet
            if (key in rating):
                rating[key][0] += stars[i]
                rating[key][1] += 1
                
            #otherwise you want to add it on appropriately
            else:
                rating[key] = [stars[i], 1]
                

In [24]:
'cyprus'in rating

False

In [25]:
x,y = model.syn0.shape

words = np.empty([x,y])
labels = []
ratingLabels = []
i = 0

for key in set(model.vocab):
    labels.append(key)
    words[i,:] = model[key]
    if (key in rating):
        ratingLabels.append(float(rating[str(key)][0])/rating[str(key)][1])
    else:
        ratingLabels.append(0)
    i += 1

In [26]:
# %load tsne.py
#
#  tsne.py
#
# Implementation of t-SNE in Python. The implementation was tested on Python 2.7.10, and it requires a working
# installation of NumPy. The implementation comes with an example on the MNIST dataset. In order to plot the
# results of this example, a working installation of matplotlib is required.
#
# The example can be run by executing: `ipython tsne.py`
#
#
#  Created by Laurens van der Maaten on 20-12-08.
#  Copyright (c) 2008 Tilburg University. All rights reserved.

import numpy as Math
import pylab as Plot

def Hbeta(D = Math.array([]), beta = 1.0):
	"""Compute the perplexity and the P-row for a specific value of the precision of a Gaussian distribution."""

	# Compute P-row and corresponding perplexity
	P = Math.exp(-D.copy() * beta);
	sumP = sum(P);
	H = Math.log(sumP) + beta * Math.sum(D * P) / sumP;
	P = P / sumP;
	return H, P;


def x2p(X = Math.array([]), tol = 1e-5, perplexity = 30.0):
	"""Performs a binary search to get P-values in such a way that each conditional Gaussian has the same perplexity."""

	# Initialize some variables
	print "Computing pairwise distances..."
	(n, d) = X.shape;
	sum_X = Math.sum(Math.square(X), 1);
	D = Math.add(Math.add(-2 * Math.dot(X, X.T), sum_X).T, sum_X);
	P = Math.zeros((n, n));
	beta = Math.ones((n, 1));
	logU = Math.log(perplexity);

	# Loop over all datapoints
	for i in range(n):

		# Print progress
		if i % 500 == 0:
			print "Computing P-values for point ", i, " of ", n, "..."

		# Compute the Gaussian kernel and entropy for the current precision
		betamin = -Math.inf;
		betamax =  Math.inf;
		Di = D[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))];
		(H, thisP) = Hbeta(Di, beta[i]);

		# Evaluate whether the perplexity is within tolerance
		Hdiff = H - logU;
		tries = 0;
		while Math.abs(Hdiff) > tol and tries < 50:

			# If not, increase or decrease precision
			if Hdiff > 0:
				betamin = beta[i].copy();
				if betamax == Math.inf or betamax == -Math.inf:
					beta[i] = beta[i] * 2;
				else:
					beta[i] = (beta[i] + betamax) / 2;
			else:
				betamax = beta[i].copy();
				if betamin == Math.inf or betamin == -Math.inf:
					beta[i] = beta[i] / 2;
				else:
					beta[i] = (beta[i] + betamin) / 2;

			# Recompute the values
			(H, thisP) = Hbeta(Di, beta[i]);
			Hdiff = H - logU;
			tries = tries + 1;

		# Set the final row of P
		P[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))] = thisP;

	# Return final P-matrix
	print "Mean value of sigma: ", Math.mean(Math.sqrt(1 / beta));
	return P;


def pca(X = Math.array([]), no_dims = 50):
	"""Runs PCA on the NxD array X in order to reduce its dimensionality to no_dims dimensions."""

	print "Preprocessing the data using PCA..."
	(n, d) = X.shape;
	X = X - Math.tile(Math.mean(X, 0), (n, 1));
	(l, M) = Math.linalg.eig(Math.dot(X.T, X));
	Y = Math.dot(X, M[:,0:no_dims]);
	return Y;


def tsne(X = Math.array([]), no_dims = 2, initial_dims = 50, perplexity = 30.0):
	"""Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions.
	The syntaxis of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array."""

	# Check inputs
	if isinstance(no_dims, float):
		print "Error: array X should have type float.";
		return -1;
	if round(no_dims) != no_dims:
		print "Error: number of dimensions should be an integer.";
		return -1;

	# Initialize variables
	X = pca(X, initial_dims).real;
	(n, d) = X.shape;
	max_iter = 300;
	initial_momentum = 0.5;
	final_momentum = 0.8;
	eta = 500;
	min_gain = 0.01;
	Y = Math.random.randn(n, no_dims);
	dY = Math.zeros((n, no_dims));
	iY = Math.zeros((n, no_dims));
	gains = Math.ones((n, no_dims));

	# Compute P-values
	P = x2p(X, 1e-5, perplexity);
	P = P + Math.transpose(P);
	P = P / Math.sum(P);
	P = P * 4;									# early exaggeration
	P = Math.maximum(P, 1e-12);

	# Run iterations
	for iter in range(max_iter):

		# Compute pairwise affinities
		sum_Y = Math.sum(Math.square(Y), 1);
		num = 1 / (1 + Math.add(Math.add(-2 * Math.dot(Y, Y.T), sum_Y).T, sum_Y));
		num[range(n), range(n)] = 0;
		Q = num / Math.sum(num);
		Q = Math.maximum(Q, 1e-12);

		# Compute gradient
		PQ = P - Q;
		for i in range(n):
			dY[i,:] = Math.sum(Math.tile(PQ[:,i] * num[:,i], (no_dims, 1)).T * (Y[i,:] - Y), 0);

		# Perform the update
		if iter < 20:
			momentum = initial_momentum
		else:
			momentum = final_momentum
		gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * ((dY > 0) == (iY > 0));
		gains[gains < min_gain] = min_gain;
		iY = momentum * iY - eta * (gains * dY);
		Y = Y + iY;
		Y = Y - Math.tile(Math.mean(Y, 0), (n, 1));

		# Compute current value of cost function
		if (iter + 1) % 10 == 0:
			C = Math.sum(P * Math.log(P / Q));
			print "Iteration ", (iter + 1), ": error is ", C

		# Stop lying about P-values
		if iter == 100:
			P = P / 4;

	# Return solution
	return Y;

In [27]:
%pylab

import matplotlib.pyplot as plt

print "Run tsne.py to visualize word embedding data"

vocab = labels
Y = tsne(words, 2, 50, 20.0);
#fig, ax = plt.subplots()
#ax.scatter(Y[:,0], Y[:,1], 20);

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib
Run tsne.py to visualize word embedding data
Preprocessing the data using PCA...
Computing pairwise distances...
Computing P-values for point  0  of  9144 ...
Computing P-values for point  500  of  9144 ...
Computing P-values for point  1000  of  9144 ...
Computing P-values for point  1500  of  9144 ...
Computing P-values for point  2000  of  9144 ...
Computing P-values for point  2500  of  9144 ...
Computing P-values for point  3000  of  9144 ...
Computing P-values for point  3500  of  9144 ...
Computing P-values for point  4000  of  9144 ...
Computing P-values for point  4500  of  9144 ...
Computing P-values for point  5000  of  9144 ...
Computing P-values for point  5500  of  9144 ...
Computing P-values for point  6000  of  9144 ...
Computing P-values for point  6500  of  9144 ...
Computing P-values for point  7000  of  9144 ...
Computing P-values for point  7500  of  9144 ...
Computing P

`%matplotlib` prevents importing * from pylab and numpy


In [87]:
import mpld3
import matplotlib.pyplot as plt
import matplotlib.colors as cc
from matplotlib.pyplot import figure, show

def onpick3(event):
    ind = event.ind
    print labels[ind[0]]
    print ('onpick3 scatter:', labels[ind[0]], np.take(Y[:,0, ind[0]]), np.take(Y[:,1, ind[0]]))

colours = (label / 5 for label in ratingLabels)

fig = plt.figure()
ax = fig.add_subplot(111)
 
ax.scatter(Y[:,0], Y[:,1], alpha=0.5, c = list(colours), picker = True, cmap = 'RdYlGn')
#plt.imshow()

fig.canvas.mpl_connect('pick_event', onpick3)

# tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
# mpld3.plugins.connect(fig, tooltip)

# mpld3.show()




6

hibachi
hibachi
hibachi
hibachi
equivalent
excessively
forgettable
swallow
swallow
swallow
aloha
hawaiian
leaves
leaves
desired
fast
fast
fast
fast
quick
fast
fast
quick

In [71]:
Y

array([[ 16.86032037, -16.0196732 ],
       [-18.87756676,  16.32741071],
       [-29.84830284,  -4.46716483],
       ..., 
       [ 15.12297558, -22.55163348],
       [  4.3898635 ,   3.67277064],
       [-14.81883323,  26.0651307 ]])

In [90]:
os.chdir('C:\Users\schiang\Documents\yelp')
reviews = pd.read_csv('yelp_academic_dataset_review.csv', header=0, delimiter=',')
businesses = pd.read_csv('yelp_academic_dataset_business.csv', header=0, delimiter=',')
reviews = reviews.head(100000)

  data = self._reader.read(nrows)


#Identifying covariate shift

In [100]:
cities = ['Las Vegas', 'Karlsruhe']
businesses.columns.values

array(['attributes.Ambience.divey',
       'attributes.Dietary Restrictions.vegan', 'attributes.Happy Hour',
       'hours.Thursday.open', 'attributes.Order at Counter',
       'attributes.Hair Types Specialized In.africanamerican',
       'attributes.Hair Types Specialized In.kids', 'attributes.BYOB',
       'hours.Friday.open', 'categories', 'latitude',
       'attributes.Outdoor Seating', 'attributes.Alcohol',
       'attributes.Ambience.classy', 'attributes.Payment Types.mastercard',
       'attributes.Parking.lot', 'business_id',
       'attributes.Ambience.touristy', 'attributes.Corkage',
       'hours.Tuesday.open', 'attributes.Good For.brunch',
       'attributes.Payment Types.amex', 'name', 'hours.Monday.open',
       'attributes.Waiter Service', 'attributes.Parking.street',
       'attributes.Ambience.hipster', 'attributes.BYOB/Corkage',
       'attributes.Hair Types Specialized In.straightperms',
       'attributes.Music.live',
       'attributes.Dietary Restrictions.dairy-f

In [99]:
num_reviews, _ = reviews.shape

train1 = [np.zeros(2)]
train2 = [np.zeros(2)]

for i in range(num_reviews):
    bid = reviews['business_id'][i]
    bname = businesses.loc(businesses[businesses['business_id'] == bid].index.tolist())
    print bname
    print bid
    if len(bname == 1):
        index = bname[0]
        temp = [reviews['text'][i], reviews['stars'][i]]
        if (businesses['business_id'][index] == cities[0]):
            train1 = np.concatenate((train1, [temp]), axis = 0)
        elif (businesses['business_id'][index] == cities[1]):
            train1 = np.concatenate((train2, [temp]), axis = 0)
        
# sentences = []  # Initialize an empty list of sentences

# print "Parsing sentences from training set"
# for i in range(num_reviews):
#     temp = review_to_sentences((reviews['text'][i]).decode('utf-8'), tokenizer)
#     sentences += temp

# # Import the built-in logging module and configure it so that Word2Vec 
# # creates nice output messages
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
#     level=logging.INFO)

# # Set values for various parameters
# num_features = 300    # Word vector dimensionality                      
# min_word_count = 40   # Minimum word count                        
# num_workers = 4       # Number of threads to run in parallel
# context = 10          # Context window size                                                                                    
# downsampling = 1e-3   # Downsample setting for frequent words

# # Initialize and train the model (this will take some time)
# from gensim.models import word2vec
# print "Training model..."
# print len(sentences)
# model = word2vec.Word2Vec(sentences, workers=num_workers, \
#             size=num_features, min_count = min_word_count, \
#             window = context, sample = downsampling)

# # If you don't plan to train the model any further, calling 
# # init_sims will make the model much more memory-efficient.
# model.init_sims(replace=True)

# # It can be helpful to create a meaningful model name and 
# # save the model for later use. You can load it later using Word2Vec.load()
# model_name = "300features_40minwords_10context"
# model.save(model_name)

<pandas.core.indexing._LocIndexer object at 0x0000000146726C18>
vcNAWiLM4dR7D2nwwJ7nCA


TypeError: object of type 'bool' has no len()