In [77]:
from __future__ import print_function
import json
import datetime
import csv
import numpy as np
import networkx as nx
from util import *
from skutil import *

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [2]:
def _loadJsonLines(path):
    """Read a JSON-lines file and return its records as a list.

    Every non-blank line of the file at ``path`` is decoded with
    ``json.loads``. The file is opened as UTF-8 explicitly so the result does
    not depend on the platform's default encoding, and blank lines (for
    example a trailing newline at the end of the dump) are skipped instead of
    raising a decode error.
    """
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# Raw dataset dumps: one JSON object per line in each file.
users = _loadJsonLines('../data/yelp_academic_dataset_user.json')
reviews = _loadJsonLines('../data/yelp_academic_dataset_review.json')
businesses = _loadJsonLines('../data/yelp_academic_dataset_business.json')
tips = _loadJsonLines('../data/yelp_academic_dataset_tip.json')

In [3]:
# Build lookup tables from the raw records using helpers that come from the
# star imports (util / skutil). userFeatures indexes userMap, userReviewMap and
# userTipMap by user id, and businessMap by a review's 'business_id'.
userMap = mapUsers(users)
userReviewMap = mapReviewsByUsers(reviews)
userTipMap = mapTipsByUser(tips)
businessMap = mapBusinesses(businesses)

In [None]:
# Build a graph over the user records (graphUsers comes from the star imports).
# NOTE(review): presumably nodes are user ids and edges are friendships -- confirm in the helper.
userGraph = graphUsers(users)

In [None]:
# PageRank score for every node of userGraph. Only referenced by a
# commented-out feature in userFeatures at the moment.
userPageRank = nx.pagerank(userGraph)

In [None]:
# HITS hub and authority scores for every node of userGraph. Only referenced
# by a commented-out feature in userFeatures at the moment.
userHub, userAuth = nx.hits(userGraph)

In [4]:
# Cluster using the reviews and the business lookup, then derive the per-user
# "local elite friends" value that userFeatures reads by user id.
# NOTE(review): the clustering input and the meaning of `label` live in the
# project helpers (getKmeans / getLocalEliteFriends) -- confirm there.
centroid, label = getKmeans(reviews, businessMap)
localEliteFriends = getLocalEliteFriends(reviews, userMap, label)

In [None]:
# Peek at the first user record to see which fields are available.
users[0]

In [53]:
# Peek at the first review record to see which fields are available.
reviews[0]

{'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA',
 'date': '2007-05-17',
 'review_id': '15SdjuK7DmYqUAj6rjGowg',
 'stars': 5,
 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 'type': 'review',
 'user_id': 'Xqd0DzHaiyRqVH3WRG7hzg',
 'votes': {'cool': 1, 'funny': 0, 'useful': 2}}

In [None]:
# Peek at the first business record to see which fields are available.
businesses[0]

In [5]:
# Hold out a test set of users. random_state=0 fixes the shuffle so the split
# is reproducible; the split proportion is train_test_split's default.
usersTrain, usersTest = train_test_split(users, random_state=0)

In [6]:
def parseDateString(d):
    """Turn a dash-separated date string such as '2007-05-17' into a datetime.date.

    The pieces between the dashes are converted to integers and handed to
    ``datetime.date`` positionally (year, month, day).
    """
    parts = map(int, d.split('-'))
    return datetime.date(*parts)

In [17]:
def getFeatureWeights(pipe):
    """Pair every vectorized feature name with its learned weight.

    Parameters
    ----------
    pipe : fitted pipeline exposing ``named_steps``
        Must contain a 'featurize' step that is itself a pipeline with a
        'vectorize' step, and a 'model' step exposing ``coef_``.

    Returns
    -------
    list of (name, weight) tuples, in the vectorizer's feature order. The
    weights are taken from the first row of ``coef_``.
    """
    vectorizer = pipe.named_steps['featurize'].named_steps['vectorize']
    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); use whichever the installed version provides.
    if hasattr(vectorizer, 'get_feature_names_out'):
        featureNames = vectorizer.get_feature_names_out()
    else:
        featureNames = vectorizer.get_feature_names()
    featureWeights = pipe.named_steps['model'].coef_[0]
    return list(zip(featureNames, featureWeights))

In [78]:
def userFeatures(user, firstYear=2004, lastYear=2015):
    """Build the feature dict for one user record.

    Reads the module-level lookups ``userMap``, ``userReviewMap``,
    ``userTipMap``, ``businessMap`` and ``localEliteFriends`` and the helpers
    ``isElite`` and ``parseDateString``.

    Parameters
    ----------
    user : dict
        A user record with the keys 'user_id', 'fans', 'review_count',
        'friends', 'yelping_since', 'compliments' and 'votes'.
    firstYear, lastYear : int, optional
        Inclusive range of calendar years for the per-year review counts.
        The defaults reproduce the original hard-coded 2004..2015 window.

    Returns
    -------
    dict mapping feature name to a numeric or boolean value, suitable for a
    DictVectorizer. Indicator features (joining year, categories, cities) are
    stored as ``True`` under a 'prefix/value' key.

    Raises
    ------
    KeyError
        If the user, one of their friends, or a reviewed business is missing
        from a lookup that is indexed directly.
    """
    # Preprocess
    userID = user['user_id']
    numFans = user['fans']
    numReviews = user['review_count']
    friends = user['friends']
    numFriends = len(friends)
    numEliteFriends = sum(1 for f in friends if isElite(userMap[f]))
    # 'yelping_since' is unpacked as exactly two dash-separated integers; only
    # the year is used as a feature.
    yelpingYear, _yelpingMonth = [int(s) for s in user['yelping_since'].split('-')]
    # Local name deliberately differs from the module-level `reviews` list.
    userReviews = userReviewMap[userID]

    # Single pass over the user's reviews: collect categories, cities and
    # how many reviews fall in each calendar year.
    categories = set()
    cities = set()
    reviewsPerYear = {}
    for r in userReviews:
        business = businessMap[r['business_id']]
        categories.update(business['categories'])
        cities.add(business['city'])
        year = parseDateString(r['date']).year
        reviewsPerYear[year] = reviewsPerYear.get(year, 0) + 1

    # Review counts per calendar year within the requested window.
    years = range(firstYear, lastYear + 1)
    numSlidingReviews = [reviewsPerYear.get(y, 0) for y in years]

    # Features
    features = {}
    # Constant column; the classifiers in this notebook are built with fit_intercept=False.
    features['constant'] = 1

    # Time
    features['yelpingYear/{}'.format(yelpingYear)] = True
    #features['profileAge'] = 2015 - yelpingYear

    # Social features (the +1 in each denominator avoids division by zero)
    features['fans'] = numFans
    features['fansFriendsRatio'] = numFans/(1 + numFriends)
    features['numFriends'] = numFriends
    features['numEliteFriends'] = numEliteFriends
    features['eliteFriendsRatio'] = numEliteFriends/(1 + numFriends)
    #features['pagerank'] = userPageRank[userID]
    #features['hub'] = userHub[userID]
    for c in user['compliments']:
        features['{}/{}'.format('complimentsFansRatio', c)] = user['compliments'][c]/(1 + numFans)

    # Review features
    features['numReviews'] = numReviews
    features['numTips'] = len(userTipMap[userID]) if userID in userTipMap else 0
    for v in user['votes']:
        features['{}/{}'.format('votesReviewsRatio', v)] = user['votes'][v]/(1 + numReviews)
    features['maxNumSlidingReviews'] = max(numSlidingReviews)
    for y, n in zip(years, numSlidingReviews):
        features['{}/{}'.format('numReviewsYear', y)] = n
    features['numCategories'] = len(categories)
    for c in categories:
        features['{}/{}'.format('category', c)] = True
    for i in range(1, 6):
        features['numReviews/{} Star'.format(i)] = sum(1 for r in userReviews if r['stars'] == i)
    # np.mean of an empty list is NaN (with a RuntimeWarning), which would
    # poison the feature matrix; fall back to 0.0 when there are no reviews.
    reviewLengths = [len(r['text'].split()) for r in userReviews]
    features['avgReviewLength'] = np.mean(reviewLengths) if reviewLengths else 0.0

    # Location
    for c in cities:
        features['{}/{}'.format('cities', c)] = True

    # Value produced by getLocalEliteFriends for this user.
    # NOTE(review): exact meaning is defined in that helper -- confirm there.
    features['localEliteFriends'] = localEliteFriends[userID]
    return features

In [79]:
# Feature extraction stage shared by the classifiers below: map each user
# record to a feature dict, then turn the dicts into a sparse matrix.
# The lambda looks up userFeatures at call time, so re-running the cell that
# defines userFeatures is picked up without rebuilding this pipeline.
featurizeUser = Pipeline(steps=[
    ('featurize', ApplyFunc(lambda u: userFeatures(u))),
    ('vectorize', DictVectorizer(sparse=True)),
])

In [84]:
# Logistic-regression classifier on top of the shared featurizer.
# fit_intercept=False because userFeatures already emits a 'constant' feature.
# Optional steps left disabled between 'featurize' and 'model':
#   ('pca', PCA(20)) and ('normalize', Normalizer())
logregclf = Pipeline(steps=[
    ('featurize', featurizeUser),
    ('model', LogisticRegression(fit_intercept=False)),
])

In [85]:
# Fit on the training users; the target for each user is isElite(u).
logregclf.fit(usersTrain, [isElite(u) for u in usersTrain])

Pipeline(steps=[('featurize', Pipeline(steps=[('featurize', ApplyFunc(f=<function <lambda> at 0x000000017E4DEE18>)), ('vectorize', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [86]:
# Score the held-out users and report per-class precision/recall/F1 against
# the isElite labels. Note `predictions` is reassigned by the SVM cell later.
predictions = logregclf.predict(usersTest)
print(classification_report([isElite(u) for u in usersTest], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.978     0.991     0.984     85414
       True      0.844     0.697     0.764      6265

avg / total      0.969     0.971     0.969     91679



In [87]:
# Show (feature, weight) pairs of the logistic-regression model, largest weight first.
sorted(getFeatureWeights(logregclf), key=lambda x: x[1], reverse=True)

[('eliteFriendsRatio', 2.7476330073890294),
 ('complimentsFansRatio/writer', 0.6037351715840803),
 ('complimentsFansRatio/more', 0.39719020496297069),
 ('complimentsFansRatio/hot', 0.25601314670961051),
 ('category/Burgers', 0.16384662767341004),
 ('category/Beer, Wine & Spirits', 0.1592055050009811),
 ('category/Airports', 0.15718451937822286),
 ('fans', 0.15679688276334267),
 ('category/Buffets', 0.14995496998030838),
 ('category/Breweries', 0.1413667845242888),
 ('category/Lounges', 0.12602660153650569),
 ('category/Dance Clubs', 0.12495297957141252),
 ('category/American (New)', 0.11978408640816844),
 ('category/Grocery', 0.11856169785712524),
 ('category/Bagels', 0.11355343395582271),
 ('category/Day Spas', 0.11326429316198477),
 ('category/Pubs', 0.11164339135068013),
 ('category/Cinema', 0.10899441269077946),
 ('complimentsFansRatio/cool', 0.10747113143600158),
 ('category/Gastropubs', 0.10384594506510962),
 ('category/Salad', 0.10013294991836148),
 ('category/Steakhouses', 0.09

In [88]:
# Persist the logistic-regression weights, largest first, as a tab-separated file.
featureWeights = sorted(getFeatureWeights(logregclf), key=lambda x: x[1], reverse=True)
# Explicit UTF-8: feature names contain non-ASCII text (e.g. city names), and
# the platform default encoding is not guaranteed to be able to encode it.
with open('results/log_weights.tsv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)
    w.writerows(featureWeights)

In [89]:
# Linear SVM classifier on top of the same featurizer object used by logregclf.
# fit_intercept=False because userFeatures already emits a 'constant' feature.
svmclf = Pipeline(steps=[
    ('featurize', featurizeUser),
    ('model', LinearSVC(fit_intercept=False, C=1, dual=False)),
])

In [90]:
# Fit on the training users; the target for each user is isElite(u).
# This refits the featurizeUser pipeline object that logregclf also holds.
svmclf.fit(usersTrain, [isElite(u) for u in usersTrain])

Pipeline(steps=[('featurize', Pipeline(steps=[('featurize', ApplyFunc(f=<function <lambda> at 0x000000017E4DEE18>)), ('vectorize', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('model', LinearSVC(C=1, class_weight=None, dual=False, fit_intercept=False,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [91]:
# Score the held-out users with the SVM and report per-class
# precision/recall/F1 against the isElite labels.
predictions = svmclf.predict(usersTest)
print(classification_report([isElite(u) for u in usersTest], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.977     0.991     0.984     85414
       True      0.845     0.679     0.753      6265

avg / total      0.968     0.970     0.968     91679



In [92]:
# Show (feature, weight) pairs of the linear SVM, largest weight first.
sorted(getFeatureWeights(svmclf), key=lambda x: x[1], reverse=True)

[('eliteFriendsRatio', 1.1191314777179233),
 ('category/Castles', 0.86192095057055262),
 ('category/Radio Stations', 0.80964008422204936),
 ('category/Television Stations', 0.80456957700818976),
 ('cities/Waddell', 0.74555993274581678),
 ('category/Curry Sausage', 0.72052882568370979),
 ('cities/Le Sud-Ouest', 0.70915622194399286),
 ('category/Yelp Events', 0.66776459561786738),
 ('category/Gastroenterologist', 0.59984235331413016),
 ('cities/South Queensferry', 0.59163074900162993),
 ('cities/Île des Soeurs', 0.51753819721389582),
 ('cities/Chomedey', 0.51721187759546805),
 ('cities/South Las Vegas', 0.4880288000289254),
 ('cities/Banksville', 0.48365444941737024),
 ('category/Flight Instruction', 0.48197889694674484),
 ('category/General Litigation', 0.46651342189337669),
 ('cities/North Las Vegas ', 0.46466904026331934),
 ('category/Trinidadian', 0.4641790576860329),
 ('cities/Sainte-Ann-De-Bellevue', 0.4521068163796691),
 ('category/Brewing Supplies', 0.45190024100222548),
 ('categ

In [93]:
# Persist the linear-SVM weights, largest first, as a tab-separated file.
featureWeights = sorted(getFeatureWeights(svmclf), key=lambda x: x[1], reverse=True)
# Explicit UTF-8: feature names contain non-ASCII text (e.g. city names), and
# the platform default encoding is not guaranteed to be able to encode it.
with open('results/svm_weights.tsv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)
    w.writerows(featureWeights)