In [None]:
from __future__ import print_function

import io
import json

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report

try:
    # train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
    # the old sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
def loadJsonLines(path):
    """Read a newline-delimited JSON file into a list of parsed records.

    Parameters
    ----------
    path : str
        Location of a file holding one JSON document per line.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order.

    Notes
    -----
    The file is decoded explicitly as UTF-8 (assumed here, as is standard for
    JSON interchange) instead of the platform default encoding, and blank
    lines are skipped instead of aborting the whole load in ``json.loads``.
    """
    # io.open accepts ``encoding`` on both Python 2 and Python 3.
    with io.open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


users = loadJsonLines('../data/yelp_academic_dataset_user.json')
reviews = loadJsonLines('../data/yelp_academic_dataset_review.json')

In [94]:
# Hold out part of the users as a test set; the fixed seed makes the split repeatable.
usersTrain, usersTest = train_test_split(
    users,
    random_state=0,
)

In [None]:
class SelectKeys(BaseEstimator, TransformerMixin):
    """Project every input dictionary onto a fixed collection of keys.

    Parameters
    ----------
    keys : iterable
        Keys to copy from each input dictionary into its reduced counterpart.
    """

    def __init__(self, keys):
        self.keys = keys

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, data):
        """Return a NumPy array holding one reduced dictionary per input item.

        Each reduced dictionary contains exactly the entries named in
        ``self.keys``; looking up a key that an item lacks raises the item's
        own lookup error.
        """
        subsets = []
        for record in data:
            subset = {}
            for key in self.keys:
                subset[key] = record[key]
            subsets.append(subset)
        return np.array(subsets)

In [None]:
class SelectValue(BaseEstimator, TransformerMixin):
    """Pull a single field out of every input dictionary.

    Parameters
    ----------
    key : hashable
        The key whose value is extracted from each dictionary.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, data):
        """Return a list with the value stored under ``self.key`` for each item."""
        selected = []
        for record in data:
            selected.append(record[self.key])
        return selected

In [None]:
class ApplyFunc(BaseEstimator, TransformerMixin):
    """Map a callable over the input values, one element at a time.

    Parameters
    ----------
    f : callable
        Function invoked with each individual value.
    """

    def __init__(self, f):
        self.f = f

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, data):
        """Return a list of ``self.f(value)`` for every value in ``data``."""
        results = []
        for value in data:
            results.append(self.f(value))
        return results

In [None]:
class IdentityVectorizer(BaseEstimator, TransformerMixin):
    """Turn a flat sequence of values into rows of length one."""

    def __init__(self):
        """This transformer takes no parameters."""

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, data):
        """Return a list in which every input value sits in its own one-element list."""
        rows = []
        for value in data:
            rows.append([value])
        return rows

In [None]:
def isElite(year, user):
    """Report whether ``year`` is listed among the user's elite years.

    ``user`` is a mapping whose ``'elite'`` entry is the container that is
    searched for ``year``.
    """
    eliteYears = user['elite']
    return year in eliteYears

In [None]:
# Display the first loaded user record to inspect which fields are available.
users[0]

In [None]:
baseUserFeatures = {'average_stars', 'fans', 'review_count'}


def scalarFeature(key, func):
    """Build a pipeline that yields one single-column feature per user.

    The pipeline selects ``key`` from every user dictionary, maps the selected
    value through ``func`` and wraps each result in a one-element row.
    """
    return make_pipeline(SelectValue(key), ApplyFunc(func), IdentityVectorizer())


# Separate pipelines for processing different subsets of features.
# Fields named in baseUserFeatures, vectorized by DictVectorizer.
base = make_pipeline(SelectKeys(baseUserFeatures), DictVectorizer())
# The value stored under 'compliments' (fed to DictVectorizer, so presumably a dict).
compliments = make_pipeline(SelectValue('compliments'), DictVectorizer())
# Number of entries in the user's 'friends' value.
numFriends = scalarFeature('friends', len)

## Features which take elite history into account.
# Reference year for the elite-history features. The lambdas below read this
# module-level name when they are called, not when they are defined.
curYear = 2015
# Was the user elite in the year immediately before curYear?
prevYearElite = scalarFeature('elite', lambda years: curYear - 1 in years)
# Was the user elite in any year before curYear?
wasElite = scalarFeature('elite', lambda years: any(y < curYear for y in years))

# Union all the features together.
userFeatures = make_union(base, compliments, numFriends, prevYearElite, wasElite)

In [None]:
# Logistic-regression classifier fed by the combined user feature union.
logregclf = make_pipeline(
    userFeatures,
    LogisticRegression(),
)

In [None]:
# Target: was each training user elite in curYear? Using curYear instead of a
# repeated literal keeps the label year tied to the year the features are
# computed relative to.
trainLabels = [isElite(curYear, u) for u in usersTrain]
logregclf.fit(usersTrain, trainLabels)

In [None]:
# Compare predictions for the held-out users with their true curYear elite
# status. curYear replaces the repeated literal so the label year stays in
# step with the features.
testLabels = [isElite(curYear, u) for u in usersTest]
predictions = logregclf.predict(usersTest)
print(classification_report(testLabels, predictions))

In [None]:
# Linear SVM classifier built on the same userFeatures object that is passed
# to logregclf, so fitting this pipeline refits those shared transformers.
svmclf = make_pipeline(
    userFeatures,
    LinearSVC(),
)

In [None]:
# Target: was each training user elite in curYear? Using curYear instead of a
# repeated literal keeps the label year tied to the year the features are
# computed relative to.
trainLabels = [isElite(curYear, u) for u in usersTrain]
svmclf.fit(usersTrain, trainLabels)

In [None]:
# Compare predictions for the held-out users with their true curYear elite
# status. curYear replaces the repeated literal so the label year stays in
# step with the features.
testLabels = [isElite(curYear, u) for u in usersTest]
predictions = svmclf.predict(usersTest)
print(classification_report(testLabels, predictions))