In [1]:
import csv
import datetime
import json

import networkx as nx
import numpy as np

# The star imports stay ahead of the sklearn imports so that, on a name clash,
# the sklearn name keeps winning exactly as before.
from util import *
from skutil import *

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [2]:
def _loadJsonLines(path):
    """Read a JSON-lines file (one JSON document per line) into a list.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding, and blank lines are skipped instead of being handed
    to json.loads.
    """
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# Raw Yelp academic dataset records, one list of dicts per file.
users = _loadJsonLines('../data/yelp_academic_dataset_user.json')
reviews = _loadJsonLines('../data/yelp_academic_dataset_review.json')
businesses = _loadJsonLines('../data/yelp_academic_dataset_business.json')
tips = _loadJsonLines('../data/yelp_academic_dataset_tip.json')

In [3]:
# Build lookup structures from the raw records with project helpers (presumably
# star-imported from util/skutil; their implementations are not visible here).
# As used in userFeatures: userMap is indexed by friend id, userReviewMap and
# userTipMap by user_id, and businessMap by business_id.
userMap = mapUsers(users)
userReviewMap = mapReviewsByUsers(reviews)
userTipMap = mapTipsByUser(tips)
businessMap = mapBusinesses(businesses)

In [None]:
# Build the user graph with the project helper graphUsers (implementation not
# visible here); it is the input to the nx.pagerank and nx.hits cells.
userGraph = graphUsers(users)

In [None]:
# PageRank score for the nodes of userGraph. Within this notebook it is only
# referenced by the commented-out 'pagerank' feature in userFeatures.
userPageRank = nx.pagerank(userGraph)

In [None]:
# HITS scores for userGraph; nx.hits returns the pair (hubs, authorities).
# Within this notebook only userHub is referenced, by the commented-out 'hub'
# feature in userFeatures.
userHub, userAuth = nx.hits(userGraph)

In [4]:
# Clustering step via the project helper getKmeans (implementation not visible
# here); presumably returns cluster centroids and labels -- TODO confirm in util.
centroid, label = getKmeans(reviews, businessMap)
# Lookup consumed by userFeatures, which reads localEliteFriends[user_id] as the
# 'localEliteFriends' feature.
localEliteFriends = getLocalEliteFriends(reviews, userMap, label)

In [None]:
# Display the first raw user record to inspect its fields.
users[0]

In [None]:
# Display the first raw review record to inspect its fields.
reviews[0]

In [None]:
# Display the first raw business record to inspect its fields.
businesses[0]

In [5]:
# Split the user records into train and test partitions. random_state=0 makes
# the shuffle reproducible; no test_size is given, so the library default applies.
usersTrain, usersTest = train_test_split(users, random_state=0)

In [6]:
def parseDateString(d):
    """Convert a '-'-separated numeric date string into a datetime.date.

    The integer fields are handed to datetime.date positionally, so the
    string is read as year-month-day (e.g. '2014-02-13').
    """
    fields = map(int, d.split('-'))
    return datetime.date(*fields)

In [17]:
def getFeatureWeights(pipe):
    """Pair each vectorizer feature name with its weight in the fitted model.

    pipe must expose named_steps with a 'featurize' entry (itself exposing
    named_steps['vectorize']) and a 'model' entry with a coef_ attribute.
    Only the first row of coef_ is used.

    Returns a list of (feature name, weight) tuples in vectorizer order.
    """
    outerSteps = pipe.named_steps
    vectorizer = outerSteps['featurize'].named_steps['vectorize']
    names = vectorizer.get_feature_names()
    weights = outerSteps['model'].coef_[0]
    return [(name, weight) for name, weight in zip(names, weights)]

In [27]:
def userFeatures(user, years=range(2004, 2016)):
    """Build the feature dict for one user record.

    Reads the notebook-level lookups userMap, userReviewMap, userTipMap,
    businessMap and localEliteFriends, and calls the helpers isElite and
    parseDateString.

    Parameters
    ----------
    user : dict
        Raw user record. Keys read: 'user_id', 'fans', 'review_count',
        'friends', 'yelping_since' (two '-'-separated integers, year first),
        'compliments' and 'votes'.
    years : iterable of int, optional
        Calendar years that each get a 'numReviewsYear/<year>' count feature.
        Defaults to 2004..2015.

    Returns
    -------
    dict
        Feature name -> value (a number, or True for indicator features).

    Raises
    ------
    KeyError
        If a friend id is missing from userMap, a reviewed business is missing
        from businessMap, or the user is missing from localEliteFriends.
    ValueError
        If 'yelping_since' does not consist of exactly two integer parts.
    """
    years = list(years)

    #Preprocess
    userID = user['user_id']
    numFans = user['fans']
    numReviews = user['review_count']
    friends = user['friends']
    numFriends = len(friends)
    numEliteFriends = sum(1 for f in friends if isElite(userMap[f]))
    # Only the year is used; the two-target unpacking still enforces the
    # "exactly two parts" shape of 'yelping_since'.
    yelpingYear, _ = [int(s) for s in user['yelping_since'].split('-')]

    # A user with no review on record gets an empty list rather than a
    # KeyError, mirroring how a missing userTipMap entry is handled below.
    userReviews = userReviewMap[userID] if userID in userReviewMap else []

    # Single pass over the reviews: collect the categories and cities of the
    # reviewed businesses and count the reviews that fall in each tracked year.
    categories = set()
    cities = set()
    reviewsPerYear = dict.fromkeys(years, 0)
    for r in userReviews:
        business = businessMap[r['business_id']]
        categories.update(business['categories'])
        cities.add(business['city'])
        reviewYear = parseDateString(r['date']).year
        if reviewYear in reviewsPerYear:
            reviewsPerYear[reviewYear] += 1
    numSlidingReviews = [reviewsPerYear[y] for y in years]

    #Features
    features = {}
    features['constant'] = 1

    #Time
    features['yelpingYear/{}'.format(yelpingYear)] = True
    #features['profileAge'] = 2015 - yelpingYear

    #Social Features
    features['fans'] = numFans
    features['fansFriendsRatio'] = numFans/(1 + numFriends)
    features['numFriends'] = numFriends
    features['numEliteFriends'] = numEliteFriends
    features['eliteFriendsRatio'] = numEliteFriends/(1 + numFriends)
    # Disabled graph features; they need userPageRank / userHub to be computed first.
    #features['pagerank'] = userPageRank[userID]
    #features['hub'] = userHub[userID]
    # The "+ 1" in each ratio denominator avoids division by zero.
    for c in user['compliments']:
        features['{}/{}'.format('complimentsFansRatio', c)] = user['compliments'][c]/(1 + numFans)

    #Review Features
    features['numReviews'] = numReviews
    features['numTips'] = len(userTipMap[userID]) if userID in userTipMap else 0
    for v in user['votes']:
        features['{}/{}'.format('votesReviewsRatio', v)] = user['votes'][v]/(1 + numReviews)
    # Largest number of reviews written in any single tracked year
    # (0 when no years are tracked).
    features['maxNumSlidingReviews'] = max(numSlidingReviews) if numSlidingReviews else 0
    for y, n in zip(years, numSlidingReviews):
        features['{}/{}'.format('numReviewsYear', y)] = n
    features['numCategories'] = len(categories)
    for c in categories:
        features['{}/{}'.format('category', c)] = True

    #Location
    for c in cities:
        features['{}/{}'.format('cities', c)] = True

    # Value precomputed per user by getLocalEliteFriends.
    # NOTE(review): its exact definition is not visible in this notebook -- confirm in util.
    features['localEliteFriends'] = localEliteFriends[userID]
    return features

In [28]:
# Featurizer shared by the classifiers below: ApplyFunc (project helper,
# presumably applying the function per user -- confirm in skutil) produces the
# userFeatures dicts, and DictVectorizer turns them into a sparse matrix.
# The lambda looks userFeatures up at call time, so re-running the userFeatures
# cell takes effect without rebuilding this pipeline.
featurizeUser = Pipeline(
    steps=[
        ('featurize', ApplyFunc(lambda u: userFeatures(u))),
        ('vectorize', DictVectorizer(sparse=True)),
    ]
)

In [29]:
# Logistic regression on top of the shared featurizer. fit_intercept=False:
# no separate intercept is fitted (userFeatures already emits a 'constant' feature).
logregclf = Pipeline(
    steps=[
        ('featurize', featurizeUser),
        # ('pca', PCA(20)),  # optional PCA step, currently disabled
        ('model', LogisticRegression(fit_intercept=False)),
    ]
)

In [30]:
# Fit on the training users; the target is isElite(u) (project helper) for each user.
logregclf.fit(usersTrain, [isElite(u) for u in usersTrain])

Pipeline(steps=[('featurize', Pipeline(steps=[('featurize', ApplyFunc(f=<function <lambda> at 0x000000017A506F28>)), ('vectorize', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [31]:
# Predict on the held-out users and print per-class precision/recall/F1
# (3 decimal digits) against the isElite labels.
predictions = logregclf.predict(usersTest)
print(classification_report([isElite(u) for u in usersTest], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.979     0.990     0.985     85414
       True      0.840     0.710     0.770      6265

avg / total      0.969     0.971     0.970     91679



In [32]:
# Rank the (feature name, weight) pairs of the logistic regression from the
# largest weight to the smallest.
sorted(getFeatureWeights(logregclf), key=lambda x: x[1], reverse=True)

[('eliteFriendsRatio', 4.2889445062462679),
 ('complimentsFansRatio/writer', 0.60272582306768363),
 ('complimentsFansRatio/more', 0.37163027560991946),
 ('category/Airports', 0.26264856235673911),
 ('complimentsFansRatio/hot', 0.24585045405915468),
 ('category/Breweries', 0.23776080644649566),
 ('category/Beer, Wine & Spirits', 0.20207065075708225),
 ('category/Bagels', 0.20038067508291779),
 ('category/Cinema', 0.19084259314314153),
 ('category/Day Spas', 0.17828442051193136),
 ('category/Dance Clubs', 0.16895260963759365),
 ('cities/Madison', 0.16765733124930893),
 ('category/Burgers', 0.15853250934284835),
 ('category/Food Trucks', 0.15781734327476749),
 ('category/Gastropubs', 0.1535376331536516),
 ('complimentsFansRatio/list', 0.1525787664790382),
 ('cities/Champaign', 0.15161101958705828),
 ('category/Grocery', 0.14932362163264273),
 ('category/Salad', 0.14273061500881548),
 ('fans', 0.14065086909747865),
 ('category/Wine Bars', 0.13363697180401835),
 ('category/Mongolian', 0.130

In [38]:
# Export the logistic-regression feature weights, largest first, as
# "feature,weight" rows. csv.writer quotes feature names that contain a comma
# (e.g. 'category/Beer, Wine & Spirits'); a plain comma join would split such
# a name into an extra column. newline='' is what the csv module expects of the
# file object, and UTF-8 keeps non-ASCII feature names intact.
with open('log_weights.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(
        sorted(getFeatureWeights(logregclf), key=lambda x: x[1], reverse=True)
    )

In [33]:
# Linear SVM on top of the shared featurizer, without a fitted intercept.
# Note: this passes the same featurizeUser object that logregclf uses, so the
# two pipelines share one vectorizer instance.
svmclf = Pipeline(
    steps=[
        ('featurize', featurizeUser),
        ('model', LinearSVC(fit_intercept=False, C=40, dual=False)),
    ]
)

In [34]:
# Fit on the training users; the target is isElite(u) (project helper) for each user.
svmclf.fit(usersTrain, [isElite(u) for u in usersTrain])

Pipeline(steps=[('featurize', Pipeline(steps=[('featurize', ApplyFunc(f=<function <lambda> at 0x000000017A506F28>)), ('vectorize', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('model', LinearSVC(C=40, class_weight=None, dual=False, fit_intercept=False,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [35]:
# Predict on the held-out users with the SVM and print per-class
# precision/recall/F1 (3 decimal digits). This rebinds `predictions`.
predictions = svmclf.predict(usersTest)
print(classification_report([isElite(u) for u in usersTest], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.977     0.990     0.983     85414
       True      0.838     0.677     0.749      6265

avg / total      0.967     0.969     0.967     91679



In [36]:
# Rank the (feature name, weight) pairs of the SVM from the largest weight
# to the smallest.
sorted(getFeatureWeights(svmclf), key=lambda x: x[1], reverse=True)

[('category/Castles', 2.0523685146088271),
 ('cities/Water of Leith', 1.7345438162297016),
 ('cities/Sainte-Ann-De-Bellevue', 1.3884064341304783),
 ('category/Television Stations', 1.373581433056098),
 ('category/Radio Stations', 1.3089779181602523),
 ('cities/Stutensee', 1.2950227261326324),
 ('cities/Newberry Springs', 1.2476301433687849),
 ('eliteFriendsRatio', 1.1323964900713503),
 ('cities/Banksville', 1.1057749557843861),
 ('cities/Loanhead', 1.0765916405614671),
 ('cities/Straiton', 0.97174917940186445),
 ('cities/Scotland', 0.96821581819881752),
 ('cities/Le Sud-Ouest', 0.96319541975986744),
 ('category/Rugs', 0.94278219185455869),
 ('cities/Cramond Bridge', 0.92317235433329403),
 ('cities/South Las Vegas', 0.90982147332431118),
 ('cities/Waddell', 0.89870037959159399),
 ('category/Doulas', 0.87128180139064526),
 ('cities/Lasswade', 0.85546567823605535),
 ('cities/Saint Jacobs', 0.82429244434542326),
 ('category/Prosthetics', 0.7996626674898164),
 ('category/Beer Garden', 0.799

In [39]:
# Export the SVM feature weights, largest first, as "feature,weight" rows.
# csv.writer quotes feature names that contain a comma; a plain comma join
# would split such a name into an extra column. newline='' is what the csv
# module expects of the file object, and UTF-8 keeps non-ASCII feature names
# intact. The 'results' directory must already exist.
with open('results/svm_weights.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(
        sorted(getFeatureWeights(svmclf), key=lambda x: x[1], reverse=True)
    )