In [1]:
from __future__ import print_function
import json
import numpy as np
from util import *
from skutil import *

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Each dump is parsed line by line: every line is decoded as its own JSON
# document and the results are collected into a list.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used - confirm the dumps decode correctly on this machine.
with open('../data/yelp_academic_dataset_user.json') as f:
    users = list(map(json.loads, f))
with open('../data/yelp_academic_dataset_review.json') as f:
    reviews = list(map(json.loads, f))

In [3]:
# Split users into a train and a test set. random_state=0 is passed so the
# split is seeded; no test_size is given, so the library default applies.
usersTrain, usersTest = train_test_split(users, random_state=0)

In [4]:
# Build the review lookup that is later handed to UserReviewText.
# NOTE(review): presumably maps each user to that user's reviews - confirm
# against mapReviewsByUsers in the project helpers.
userReviewMap = mapReviewsByUsers(reviews)

In [5]:
# Keys handed to SelectKeys for the `base` pipeline.
baseUserFeatures = {'average_stars', 'fans', 'review_count'}

# Separate pipelines, one per subset of user features.
base = make_pipeline(SelectKeys(baseUserFeatures), DictVectorizer())
compliments = make_pipeline(SelectValue('compliments'), DictVectorizer())
votes = make_pipeline(SelectValue('votes'), DictVectorizer())
# The 'friends' value is reduced to its length before vectorizing.
numFriends = make_pipeline(SelectValue('friends'), ApplyFunc(len), IdentityVectorizer())
# Review text features, capped at 1000 TF-IDF terms.
tfidf = make_pipeline(
    UserReviewText(userReviewMap),
    TfidfVectorizer(max_features=1000),
)

# Union the feature pipelines into a single transformer.
# NOTE(review): only `base` and `tfidf` are combined here; `compliments`,
# `votes` and `numFriends` are built above but are not part of the union.
userFeatures = make_union(base, tfidf)

In [6]:
# Feature union followed by a multinomial naive Bayes classifier. The step
# names are the ones make_pipeline would generate (lower-cased class names).
# NOTE(review): `userFeatures` is the same object used by the other classifier
# pipelines in this notebook - confirm that sharing it is intended.
nbclf = Pipeline([
    ('featureunion', userFeatures),
    ('multinomialnb', MultinomialNB()),
])

In [7]:
# Train on the training users; the target for each user is isElite(user).
# The fit call stays the last expression so the fitted pipeline is echoed.
nbclf.fit(usersTrain, list(map(isElite, usersTrain)))

Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('selectkeys', SelectKeys(keys={'fans', 'average_stars', 'review_count'})), ('dictvectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('pi...ormer_weights=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [8]:
# Predict on the held-out users, then print the classification report
# comparing those predictions with isElite(user) for the same users.
predictions = nbclf.predict(usersTest)
print(classification_report(y_true=list(map(isElite, usersTest)), y_pred=predictions))

             precision    recall  f1-score   support

      False       0.99      0.94      0.97     85414
       True       0.53      0.92      0.67      6265

avg / total       0.96      0.94      0.95     91679



In [9]:
# Feature union followed by logistic regression with default settings. The
# step names are the ones make_pipeline would generate (lower-cased class names).
# NOTE(review): `userFeatures` is the same object used by the other classifier
# pipelines in this notebook - confirm that sharing it is intended.
logregclf = Pipeline([
    ('featureunion', userFeatures),
    ('logisticregression', LogisticRegression()),
])

In [10]:
# Train on the training users; the target for each user is isElite(user).
# The fit call stays the last expression so the fitted pipeline is echoed.
logregclf.fit(usersTrain, list(map(isElite, usersTrain)))

Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('selectkeys', SelectKeys(keys={'fans', 'average_stars', 'review_count'})), ('dictvectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('pi...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [11]:
# Predict on the held-out users, then print the classification report
# comparing those predictions with isElite(user) for the same users.
predictions = logregclf.predict(usersTest)
print(classification_report(y_true=list(map(isElite, usersTest)), y_pred=predictions))

             precision    recall  f1-score   support

      False       0.97      0.99      0.98     85414
       True       0.83      0.61      0.70      6265

avg / total       0.96      0.96      0.96     91679



In [12]:
# Feature union followed by a linear SVM with default settings. The step
# names are the ones make_pipeline would generate (lower-cased class names).
# NOTE(review): `userFeatures` is the same object used by the other classifier
# pipelines in this notebook - confirm that sharing it is intended.
svmclf = Pipeline([
    ('featureunion', userFeatures),
    ('linearsvc', LinearSVC()),
])

In [13]:
# Train on the training users; the target for each user is isElite(user).
# The fit call stays the last expression so the fitted pipeline is echoed.
svmclf.fit(usersTrain, list(map(isElite, usersTrain)))

Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('selectkeys', SelectKeys(keys={'fans', 'average_stars', 'review_count'})), ('dictvectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('pi...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [14]:
# Predict on the held-out users, then print the classification report
# comparing those predictions with isElite(user) for the same users.
predictions = svmclf.predict(usersTest)
print(classification_report(y_true=list(map(isElite, usersTest)), y_pred=predictions))

             precision    recall  f1-score   support

      False       0.98      0.98      0.98     85414
       True       0.72      0.79      0.75      6265

avg / total       0.97      0.96      0.96     91679

