In [1]:
import graphlab

A newer version of GraphLab Create (v1.10.1) is available! Your current version is v1.9.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


## Loading yelp review dataset

In [2]:
yelp_overall = graphlab.load_sframe('review/')

This non-commercial license of GraphLab Create is assigned to znasim@khi.iba.edu.pk and will expire on February 01, 2017. For commercial licensing options, visit https://dato.com/buy/.


2016-06-15 01:01:01,033 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.9 started. Logging: C:\Users\znasim\AppData\Local\Temp\graphlab_server_1465934456.log.0


In [3]:
yelp_overall.num_rows()

2225213

## Distribution of Stars in Review dataset

In [4]:
graphlab.canvas.set_target('ipynb')
yelp_overall['stars'].show(view='Categorical')

## Sampling dataset
Due to computational limitations, I worked on a 50% random sample of the review dataset.

In [5]:
yelp_subset,other = yelp_overall.random_split(0.5,seed = 1)

In [6]:
yelp_subset['stars'] = yelp_subset['stars'].astype(int)

In [7]:
yelp_subset.num_rows()

1112332

In [8]:
yelp_dataset = yelp_subset['text','stars']

In [9]:
del yelp_overall,yelp_subset,other

## Splitting data into Training and Test data

The sampled dataset is split at random into a training set (70%) and a test set (30%).

In [10]:
train_set,test_set = yelp_dataset.random_split(.7, seed=0)

In [11]:
#Distribution of stars in subset
yelp_dataset['stars'].show(view='Categorical')

In [12]:
train_set.num_rows()

778966

In [13]:
test_set.num_rows()

333366

## Feature Extraction using Word2Vec
We followed a vector-based approach to extract features from raw text. Word2vec turns words into numeric vectors, and by doing so it makes natural language computer-readable: we can then perform mathematical operations on words to detect their similarities.

Word2vec, published by Google in 2013, is a neural network implementation that learns distributed representations for words.
Word2vec is an unsupervised learning approach that clusters similar words together.
In this project, the word2vec implementation from the gensim package is used. The Word2vec model was trained on the training data, i.e. 778,966 reviews.

In [14]:
%matplotlib inline

import numpy as np
import pandas as pd
import gensim
import logging
import re
import nltk.data

from matplotlib import pyplot as plt
from nltk.corpus import stopwords

In [15]:
# Convert both SFrames to pandas DataFrames (to_dataframe() yields a
# DataFrame, not a numpy array) so the text can be iterated in plain Python.
train = train_set.to_dataframe()
test = test_set.to_dataframe()

In [15]:
train.head()

Unnamed: 0,text,stars
0,All the food is great here. But the best thing...,5
1,Wing sauce is like water. Pretty much a lot of...,1
2,This place is absolute garbage... Half of the...,1
3,Before I finally made it over to this range I ...,4
4,I drove by yesterday to get a sneak peak. It ...,4


In [16]:

print( 'train shape: {0}'.format( train.shape ) )
print( 'test shape: {0}'.format( test.shape ) )

train shape: (778966, 2)
test shape: (333366, 2)


In [21]:
# Words kept out of the stop-word list even though NLTK treats them as stop
# words (presumably because negations and intensifiers matter for ratings).
not_remove = ['most','not','very','few','more','only','nor','too','but']

# Set difference instead of stops.remove(word): remove() raises KeyError when
# a word is missing from the installed NLTK list, difference simply ignores it.
stops = set(stopwords.words("english")) - set(not_remove)

### Data Cleaning

In [16]:
# borrowed from Kaggle https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors
def review_to_wordlist( review, remove_stopwords=True ):
    """Turn a piece of text into a list of lowercase, letters-only tokens.

    Every character outside a-z / A-Z (digits, punctuation, apostrophes, ...)
    is replaced by a space before splitting, so "don't" becomes "don", "t".

    review           -- text to tokenize
    remove_stopwords -- when true, drop tokens found in the module-level
                        ``stops`` set
    Returns the list of tokens.
    """
    letters_only = re.sub("[^a-zA-Z]"," ", review)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [tok for tok in tokens if tok not in stops]

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split one review into sentences and tokenize each sentence.

    review           -- raw review text, either a byte string or unicode text
    tokenizer        -- object whose ``tokenize(text)`` returns the sentences
                        of ``text`` (the notebook passes NLTK's punkt tokenizer)
    remove_stopwords -- forwarded to ``review_to_wordlist``
    Returns a list with one word list per non-empty sentence.
    """
    text = review.strip()
    # Decode byte strings only. Calling .decode() on text that is already
    # unicode fails (no such method on Python 3 str; implicit ASCII encode
    # on Python 2 unicode), so such input is passed through unchanged.
    if isinstance(text, bytes):
        text = text.decode('utf8', 'ignore')

    return [
        review_to_wordlist( raw_sentence, remove_stopwords )
        for raw_sentence in tokenizer.tokenize(text)
        if len(raw_sentence) > 0
    ]

In [None]:
# Sentence splitter: NLTK's pickled English punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# One flat list holding every tokenized sentence of every training review;
# this list is what Word2Vec is trained on further down.
sentences = [
    sentence
    for review in train[ 'text' ]
    for sentence in review_to_sentences(review, tokenizer)
]

In [19]:
len(sentences)

6633268

In [None]:
# Route INFO-level log records to the notebook with a timestamped format.
logging.basicConfig( format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO )

# Word2Vec hyper-parameters. num_features is reused later as the width of
# the averaged review vectors, so it must match the model that gets loaded.
num_features = 500    # dimensionality
min_word_count = 20   # minimum word count
num_workers = 6       # number of threads to run in parallel
context = 5          # context window size
downsampling = 1e-3   # downsample setting for frequent words

# Build the model from the tokenized training sentences.
model = gensim.models.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# free memory
model.init_sims( replace=True )

# save model
# NOTE(review): the file name says "1.1MReviews", but `sentences` is built
# from the training split only (778,966 reviews) - consider renaming.
model_name = "500features_20minwords_5context_1.1MReviews"
model.save(model_name)

# stat
print( 'total run time: {0} [s]'.format( model.total_train_time ) )

In [25]:
del model

In [18]:
import gensim
model = gensim.models.Word2Vec.load("500features_20minwords_5context_1.1MReviews")

num_features = 500    # dimensionality                      
min_word_count = 20   # minimum word count                        
num_workers = 6       # number of threads to run in parallel
context = 5          # context window size                                                                                    
downsampling = 1e-3   # downsample setting for frequent words

In [19]:
# word -> number of times a vocabulary word with a NaN component was seen
nan_words = {}

def makeFeatureVec( words, model, num_features, index2word_set ):
    """Average the vectors of the in-vocabulary words of one review.

    words          -- iterable of tokens
    model          -- mapping from word to its vector (``model[word]``)
    num_features   -- length of each word vector
    index2word_set -- set of words that have a vector in ``model``
    Returns the mean vector, or all zeros when no word is in the vocabulary.
    Words whose vector contains NaN are tallied in ``nan_words`` and are
    still included in the average.
    """
    total = np.zeros((num_features,),dtype="float32")
    hits = 0.

    for token in words:
        if token not in index2word_set:
            continue
        hits += 1.
        vector = model[ token ]
        if np.isnan( vector ).any():
            nan_words[ token ] = nan_words.get( token, 0 ) + 1
        total = total + vector

    if hits == 0:
        return total
    return total / hits

def getAvgFeatureVecs(reviews, model, num_features, index2word_set ):
    """Build the matrix of averaged word vectors, one row per review.

    reviews        -- sequence of token lists (must support ``len``)
    model          -- word-vector model, passed through to makeFeatureVec
    num_features   -- length of each word vector / number of columns
    index2word_set -- set of words that have a vector in ``model``
    Returns a float32 array of shape (len(reviews), num_features).
    Prints a progress line every 1000 reviews.
    """
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")

    # enumerate() supplies an integer row index. The previous float counter
    # (0., 1., ...) was used as an array index, which NumPy >= 1.12 rejects
    # with IndexError.
    for counter, review in enumerate(reviews):
        if counter % 1000 == 0:
            # Parenthesised single argument: same output on Python 2 and 3.
            print("Review %d of %d" % (counter, len(reviews)))
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features, index2word_set )

    return reviewFeatureVecs

#### Creating Word Vectors

In [22]:
# Model vocabulary as a set, used for the membership test in makeFeatureVec
index2word_set = set( model.index2word )

# Training split: tokenize each review (stop words removed), then average
# the word vectors of every review into one feature row.
clean_train_reviews = [
    review_to_wordlist( review, remove_stopwords=True )
    for review in train['text']
]
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features, index2word_set )

# Test split: identical processing.
clean_test_reviews = [
    review_to_wordlist( review, remove_stopwords=True )
    for review in test['text']
]
testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features, index2word_set )

Review 0 of 778966
Review 1000 of 778966
Review 2000 of 778966
Review 3000 of 778966
Review 4000 of 778966
Review 5000 of 778966
Review 6000 of 778966
Review 7000 of 778966
Review 8000 of 778966
Review 9000 of 778966
Review 10000 of 778966
Review 11000 of 778966
Review 12000 of 778966
Review 13000 of 778966
Review 14000 of 778966
Review 15000 of 778966
Review 16000 of 778966
Review 17000 of 778966
Review 18000 of 778966
Review 19000 of 778966
Review 20000 of 778966
Review 21000 of 778966
Review 22000 of 778966
Review 23000 of 778966
Review 24000 of 778966
Review 25000 of 778966
Review 26000 of 778966
Review 27000 of 778966
Review 28000 of 778966
Review 29000 of 778966
Review 30000 of 778966
Review 31000 of 778966
Review 32000 of 778966
Review 33000 of 778966
Review 34000 of 778966
Review 35000 of 778966
Review 36000 of 778966
Review 37000 of 778966
Review 38000 of 778966
Review 39000 of 778966
Review 40000 of 778966
Review 41000 of 778966
Review 42000 of 778966
Review 43000 of 778966
R

In [22]:
model.most_similar('food')

[(u'cuisine', 0.5932974219322205),
 (u'foods', 0.505925178527832),
 (u'sushi', 0.5007404088973999),
 (u'fare', 0.4933341443538666),
 (u'meals', 0.4885599911212921),
 (u'meal', 0.47217613458633423),
 (u'grub', 0.4721677899360657),
 (u'cusine', 0.43543577194213867),
 (u'pizza', 0.4307407736778259),
 (u'banchan', 0.42488497495651245)]

In [23]:
model.most_similar('sushi')

[(u'nigiri', 0.6163148880004883),
 (u'sashimi', 0.6000047922134399),
 (u'ramen', 0.5397598743438721),
 (u'kbbq', 0.5378069877624512),
 (u'maki', 0.5368378162384033),
 (u'ayce', 0.5168564915657043),
 (u'food', 0.5007403492927551),
 (u'hibachi', 0.4971942603588104),
 (u'seafood', 0.4960891008377075),
 (u'unagi', 0.4955548644065857)]

In [26]:
del sentences

In [22]:
len(model.vocab) - 1

35106

In [23]:
len(trainDataVecs)

778966

## Modeling

### 1. Model Training Using Random Forest
I first train a model using Random Forest regression.

In [24]:
from sklearn.ensemble import RandomForestRegressor

print("Training the random forest regressionn...")

# Random Forest *regressor* (stars are treated as a continuous target):
# 800 trees of depth <= 10, fixed seed, all CPU cores.
forest = RandomForestRegressor(
    n_estimators=800,
    max_depth=10,
    random_state=12,
    verbose=1,
    n_jobs=-1,
)

forest = forest.fit(trainDataVecs, train['stars'])

Training the random forest regressionn...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 96.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 419.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 934.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 1693.2min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 1706.9min finished


In [25]:
pred = forest.predict(testDataVecs)
from sklearn.metrics import r2_score
r2_score(test['stars'],pred)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   10.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   17.9s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:   18.0s finished


0.53595168204489974

In [31]:
from sklearn.ensemble import RandomForestRegressor

print("Training the random forest regressionn...")

# Same Random Forest *regressor* setup as the previous run, with the tree
# count raised to 1000.
forest = RandomForestRegressor(
    n_estimators=1000,
    max_depth=10,
    random_state=12,
    verbose=1,
    n_jobs=-1,
)

forest = forest.fit(trainDataVecs, train['stars'])

Training the random forest regressionn...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 79.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 353.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 815.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 1457.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 1849.5min finished


### Evaluating the Model Trained Using Random Forest

In [32]:
pred = forest.predict(testDataVecs)
from sklearn.metrics import r2_score
r2_score(test['stars'],pred)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   16.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   20.6s finished


0.53607242286042156

In [36]:
## Saving Random Forest model
from sklearn.externals import joblib
joblib.dump(forest,'rf.pkl') 


['rf.pkl',
 'rf.pkl_01.npy',
 'rf.pkl_02.npy',
 'rf.pkl_03.npy',
 'rf.pkl_04.npy',
 'rf.pkl_05.npy',
 'rf.pkl_06.npy',
 'rf.pkl_07.npy',
 'rf.pkl_08.npy',
 'rf.pkl_09.npy',
 'rf.pkl_10.npy',
 'rf.pkl_11.npy',
 'rf.pkl_12.npy',
 'rf.pkl_13.npy',
 'rf.pkl_14.npy',
 'rf.pkl_15.npy',
 'rf.pkl_16.npy',
 'rf.pkl_17.npy',
 'rf.pkl_18.npy',
 'rf.pkl_19.npy',
 'rf.pkl_20.npy',
 'rf.pkl_21.npy',
 'rf.pkl_22.npy',
 'rf.pkl_23.npy',
 'rf.pkl_24.npy',
 'rf.pkl_25.npy',
 'rf.pkl_26.npy',
 'rf.pkl_27.npy',
 'rf.pkl_28.npy',
 'rf.pkl_29.npy',
 'rf.pkl_30.npy',
 'rf.pkl_31.npy',
 'rf.pkl_32.npy',
 'rf.pkl_33.npy',
 'rf.pkl_34.npy',
 'rf.pkl_35.npy',
 'rf.pkl_36.npy',
 'rf.pkl_37.npy',
 'rf.pkl_38.npy',
 'rf.pkl_39.npy',
 'rf.pkl_40.npy',
 'rf.pkl_41.npy',
 'rf.pkl_42.npy',
 'rf.pkl_43.npy',
 'rf.pkl_44.npy',
 'rf.pkl_45.npy',
 'rf.pkl_46.npy',
 'rf.pkl_47.npy',
 'rf.pkl_48.npy',
 'rf.pkl_49.npy',
 'rf.pkl_50.npy',
 'rf.pkl_51.npy',
 'rf.pkl_52.npy',
 'rf.pkl_53.npy',
 'rf.pkl_54.npy',
 'rf.pkl_55.npy',

In [37]:
del pred
del forest

from sklearn.externals import joblib
forest = joblib.load('rf.pkl')

In [38]:
pred = forest.predict(testDataVecs)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    9.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   16.9s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   21.1s finished


In [39]:
from sklearn.metrics import mean_squared_error
mean_squared_error(test['stars'],pred)

0.86795812333185396

In [40]:
rmse = np.sqrt(np.mean((pred-test['stars'])**2))
print rmse

0.93164270154


### Model Training Using Xgboost

In [23]:
import xgboost as xgb
# Booster settings handed to xgb.train() in a later cell
param = {
    'eta': 0.01,
    'max_depth': 10,
    'min_child_weight': 15,
    'subsample': 0.8,
    'nthread': 7,
    'colsample_bytree': 0.8,
    'seed': 12,
    'silent': False,
    'objective': 'reg:linear',
}

In [25]:
xg_train = xgb.DMatrix(trainDataVecs,train['stars'])
xg_test = xgb.DMatrix(testDataVecs,test['stars'])
watchlist = [(xg_train,'train'),(xg_test,'test')]

In [None]:
bst = xgb.train(param,xg_train,12000,watchlist)

In [None]:
#Saving model
bst.save_model('bst27may0.69.model')

### Evaluating Xgboost Regressor

In [26]:
bst = xgb.Booster(model_file='bst27may0.69.model')

In [27]:
pred = bst.predict(xg_test)

In [28]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the xgboost predictions against the true test-set stars.
actual_stars = test['stars']
rscore = r2_score(actual_stars, pred)
mse = mean_squared_error(actual_stars, pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. error on the star scale

print( 'R-score : {0}'.format(rscore ) )
print( 'MSE : {0}'.format(mse ) )
print( 'RMSE : {0}'.format(rmse ) )

R-score : 0.695573413884
MSE : 0.569549087826
RMSE : 0.754684760563


## Histogram of Error Values

In [29]:
test_set['predictedStars'] = pred

In [30]:
test['predictedStars'] = test_set['predictedStars']

In [31]:
test_set['errors'] = test_set['stars'] - test_set['predictedStars']

In [32]:
graphlab.canvas.set_target('ipynb')

In [33]:
test_set['errors'].show()

In [34]:
# Two-column SFrame pairing each raw (unrounded) prediction with its true rating.
sf = graphlab.SFrame()
sf['Predicted-Rating'] = test_set['predictedStars']
sf['Actual-Rating'] = test_set['stars']
# Per actual rating: how many reviews have it, and the mean predicted rating.
predict_count = sf.groupby('Actual-Rating', [graphlab.aggregate.COUNT('Actual-Rating')
                                             , graphlab.aggregate.AVG('Predicted-Rating')])
# Last expression of the cell, so the table is displayed; k=5 covers all
# five star values (the output lists ratings 1 through 5).
predict_count.topk('Actual-Rating', k=5, reverse=True)   

Actual-Rating,Count,Avg of Predicted-Rating
1,39216,1.83543923551
2,28335,2.53592618699
3,42468,3.28687600064
4,88344,4.05111180226
5,135003,4.52421377236


### Confusion Matrix

In [35]:
def eval_wrapper(yhat):
    """Map continuous predictions to integer star ratings in the range 1..5.

    Values are rounded with np.round, cast to int, then clipped so that
    anything below 1 becomes 1 and anything above 5 becomes 5.
    """
    nearest_star = np.round(yhat).astype(int)
    return np.clip(nearest_star, 1, 5)

In [36]:
test_set['predictedStar_rounded'] =  eval_wrapper(test_set['predictedStars'])

In [37]:
test['predictedStar_rounded'] = test_set['predictedStar_rounded']

In [38]:
from sklearn.metrics import confusion_matrix

# Actual stars are the first argument and rounded predictions the second;
# both are compared as strings, with the label order fixed to '1'..'5'.
confusion_matrix(
    test['stars'].astype(str),
    test['predictedStar_rounded'].astype(str),
    labels=['1', '2', '3', '4', '5'],
)

array([[15472, 15970,  6451,  1275,    48],
       [ 2515, 11215, 11569,  2950,    86],
       [  380,  5160, 19777, 16196,   955],
       [   67,  1189, 12258, 55294, 19536],
       [   41,   661,  4992, 44828, 84481]])