In [25]:
from __future__ import print_function
import json
import datetime
import csv
import math
import operator
import lda
import numpy as np
import networkx as nx
from util import *
from skutil import *

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [3]:
with open('../data/yelp_academic_dataset_user.json') as f:
    users = [json.loads(line) for line in f]
with open('../data/yelp_academic_dataset_review.json') as f:
    reviews = [json.loads(line) for line in f]
with open('../data/yelp_academic_dataset_business.json') as f:
    businesses = [json.loads(line) for line in f]
with open('../data/yelp_academic_dataset_tip.json') as f:
    tips = [json.loads(line) for line in f]

In [4]:
userMap = mapUsers(users)
userReviewMap = mapReviewsByUsers(reviews)
userTipMap = mapTipsByUser(tips)
businessMap = mapBusinesses(businesses)

In [None]:
userGraph = graphUsers(users)

In [None]:
userPageRank = nx.pagerank(userGraph)

In [None]:
userHub, userAuth = nx.hits(userGraph)

In [5]:
def HasEverBeenElite(u):
    if len(u['elite']) > 0:
        return 1.0
    return 0.0
    
def InsertCategoriesToDict(d, cats):
    for c in cats:
        if c not in d:
            d[c] = 0
        d[c] += 1
        
UsersCategoriesDict = {}
UsersCategoriesDictSorted = {}
UsersCategoriesDictSortedList = {}
for r in reviews:
    bid = r['business_id']
    uid = r['user_id']
    bcats = businessMap[bid]['categories']
    if uid not in UsersCategoriesDict:
        UsersCategoriesDict[uid] = {}
    InsertCategoriesToDict(UsersCategoriesDict[uid], bcats)

for k, v in UsersCategoriesDict.items():
    s = sum(v.values())
    for k1,v1 in v.items():
        v[k1] = v1 * 1.0/s
    a = sorted(v.items(), key=operator.itemgetter(1))
    a.reverse()
    b = {}
    for aa in a:
        b[aa[0]] = aa[1]
    UsersCategoriesDictSorted[k] = b
    UsersCategoriesDictSortedList[k] = a

In [6]:
def trueDiv(u):
    q = 2
    userId = u['user_id']
    # variance
    cats = UsersCategoriesDictSortedList[userId]
    lam = 0
    if len(cats) > 0:
        for c in cats:
            lam += math.pow(c[1]*1.0, q)

        return lam ** (1.0/(1.0-q))
    return 0

In [7]:
centroid, label = getKmeans(reviews, businessMap)
localEliteFriends = getLocalEliteFriends(reviews, userMap, label)

In [None]:
users[0]

In [53]:
reviews[0]

{'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA',
 'date': '2007-05-17',
 'review_id': '15SdjuK7DmYqUAj6rjGowg',
 'stars': 5,
 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 'type': 'review',
 'user_id': 'Xqd0DzHaiyRqVH3WRG7hzg',
 'votes': {'cool': 1, 'funny': 0, 'useful': 2}}

In [None]:
businesses[0]

In [10]:
usersTrain, usersTest = train_test_split(users, random_state=0, train_size=0.7)
usersTestValid, usersTestTest = train_test_split(usersTest, random_state=1, train_size=0.5)

In [11]:
def parseDateString(d):
    return datetime.date(*[int(s) for s in d.split('-')])

In [12]:
def getFeatureWeights(pipe):
    featureNames = pipe.named_steps['featurize'].named_steps['vectorize'].get_feature_names()
    featureWeights = pipe.named_steps['model'].coef_[0]
    return list(zip(featureNames, featureWeights))

In [13]:
def userFeatures(user):
    #Preprocess
    userID = user['user_id']
    numFans = user['fans']
    numReviews = user['review_count']
    numEliteFriends = len([f for f in user['friends'] if isElite(userMap[f])])
    numFriends = len(user['friends'])
    yelpingYear, yelpingMonth = [int(s) for s in user['yelping_since'].split('-')]
    reviews = userReviewMap[userID]
    categories = set()
    for cs in [businessMap[r['business_id']]['categories'] for r in reviews]:
        categories.update(cs)
    cities = set()
    for cs in [businessMap[r['business_id']]['city'] for r in reviews]:
        cities.add(cs)
    #Num Reviews sliding window
    numSlidingReviews = []
    userReviewDates = [parseDateString(r['date']) for r in userReviewMap[userID]]
    for i in range(2004, 2016):
        start, end = datetime.date(i, 1, 1), datetime.date(i, 12, 31)
        slidingReviewDates = [d for d in userReviewDates if d >= start and d <= end]
        numSlidingReviews.append(len(slidingReviewDates))
    
    #Features
    features = {}
    features['constant'] = 1
    
    #Time
    features['yelpingYear/{}'.format(yelpingYear)] = True
    #features['profileAge'] = 2015 - yelpingYear
    
    #Social Features
    features['fans'] = numFans
    features['fansFriendsRatio'] = user['fans']/(1 + numFriends)
    features['numFriends'] = numFriends
    features['numEliteFriends'] = numEliteFriends
    features['eliteFriendsRatio'] = numEliteFriends/(1 + numFriends)
    #features['pagerank'] = userPageRank[userID]
    #features['hub'] = userHub[userID]
    for c in user['compliments']:
        features['{}/{}'.format('complimentsFansRatio', c)] = user['compliments'][c]/(1 + numFans)
    
    #Review Features
    features['numReviews'] = user['review_count']
    features['numTips'] = len(userTipMap[userID]) if userID in userTipMap else 0
    for v in user['votes']:
        features['{}/{}'.format('votesReviewsRatio', v)] = user['votes'][v]/(1 + numReviews)
    features['maxNumSlidingReviews'] = max(numSlidingReviews)
    for y, n in zip(range(2004, 2016), numSlidingReviews):
        features['{}/{}'.format('numReviewsYear', y)] = n
    features['numCategories'] = len(categories)
    for c in categories:
        features['{}/{}'.format('category', c)] = True
    for i in range(1, 6):
        features['numReviews/{}stars'.format(i)] = len([r for r in reviews if r['stars'] == i])
    features['avgReviewLength'] = np.mean([len(r['text'].split()) for r in reviews])
    features['truediv'] = trueDiv(user)
        
    #Location
    for c in cities:
        features['{}/{}'.format('cities', c)] = True
        
    #??? Honam
    features['localEliteFriends'] = localEliteFriends[userID]/(1 + numFriends)
    
    return features

In [14]:
featurizeUser = Pipeline(steps=[('featurize', ApplyFunc(lambda u: userFeatures(u)))\
                               ,('vectorize', DictVectorizer(sparse=True))])

In [15]:
logregclf = Pipeline(steps=[('featurize', featurizeUser)\
                            #,('pca', PCA(20))
                            #,('normalize', Normalizer())
                            ,('model', LogisticRegression(fit_intercept=False))])

In [16]:
logregclf.fit(usersTrain, [isElite(u) for u in usersTrain])

Pipeline(steps=[('featurize', Pipeline(steps=[('featurize', ApplyFunc(f=<function <lambda> at 0x000000019EBE1D90>)), ('vectorize', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [19]:
predictions = logregclf.predict(usersTrain)
print(classification_report([isElite(u) for u in usersTrain], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.979     0.991     0.985    238952
       True      0.860     0.715     0.781     17748

avg / total      0.971     0.972     0.971    256700



In [17]:
predictions = logregclf.predict(usersTestValid)
print(classification_report([isElite(u) for u in usersTestValid], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.979     0.990     0.985     51265
       True      0.842     0.708     0.769      3742

avg / total      0.970     0.971     0.970     55007



In [18]:
predictions = logregclf.predict(usersTestTest)
print(classification_report([isElite(u) for u in usersTestTest], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.978     0.991     0.984     51197
       True      0.850     0.700     0.768      3811

avg / total      0.969     0.971     0.969     55008



In [130]:
sorted(getFeatureWeights(logregclf), key=lambda x: x[1], reverse=True)

[('eliteFriendsRatio', 3.131384929095272),
 ('complimentsFansRatio/writer', 0.58757512190254191),
 ('complimentsFansRatio/more', 0.39307880015320096),
 ('complimentsFansRatio/hot', 0.24951079471879184),
 ('category/Airports', 0.19919384769973342),
 ('category/Burgers', 0.15026339813923914),
 ('fans', 0.14838516479590766),
 ('category/Day Spas', 0.14777561086642285),
 ('category/Beer, Wine & Spirits', 0.14667154448344566),
 ('category/Dance Clubs', 0.14159863720421909),
 ('category/Breweries', 0.12805578513487356),
 ('category/Lounges', 0.12195432741839),
 ('category/Bagels', 0.11050374406251802),
 ('category/Arts & Entertainment', 0.10671274487984223),
 ('category/American (New)', 0.10655882668110929),
 ('category/Grocery', 0.10500182833655004),
 ('category/Buffets', 0.10438853440223332),
 ('category/Pubs', 0.10224242236354797),
 ('complimentsFansRatio/cool', 0.10078283916667644),
 ('cities/Montreal', 0.093779350084831986),
 ('category/Cinema', 0.091793857042089325),
 ('category/Hotels

In [131]:
featureWeights = sorted(getFeatureWeights(logregclf), key=lambda x: x[1], reverse=True)
with open('results/log_weights.tsv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)
    w.writerows(featureWeights)

In [20]:
svmclf = Pipeline(steps=[('featurize', featurizeUser)\
                         ,('model', LinearSVC(fit_intercept=False, C=1, dual=False))])

In [21]:
svmclf.fit(usersTrain, [isElite(u) for u in usersTrain])

Pipeline(steps=[('featurize', Pipeline(steps=[('featurize', ApplyFunc(f=<function <lambda> at 0x000000019EBE1D90>)), ('vectorize', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('model', LinearSVC(C=1, class_weight=None, dual=False, fit_intercept=False,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [22]:
predictions = svmclf.predict(usersTrain)
print(classification_report([isElite(u) for u in usersTrain], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.978     0.993     0.985    238952
       True      0.875     0.704     0.781     17748

avg / total      0.971     0.973     0.971    256700



In [23]:
predictions = svmclf.predict(usersTestValid)
print(classification_report([isElite(u) for u in usersTestValid], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.977     0.991     0.984     51265
       True      0.847     0.681     0.755      3742

avg / total      0.968     0.970     0.968     55007



In [24]:
predictions = svmclf.predict(usersTestTest)
print(classification_report([isElite(u) for u in usersTestTest], predictions, digits=3))

             precision    recall  f1-score   support

      False      0.976     0.991     0.984     51197
       True      0.851     0.677     0.754      3811

avg / total      0.968     0.969     0.968     55008



In [107]:
sorted(getFeatureWeights(svmclf), key=lambda x: x[1], reverse=True)

[('eliteFriendsRatio', 1.1191314777179233),
 ('category/Castles', 0.86192095057055262),
 ('category/Radio Stations', 0.80964008422204936),
 ('category/Television Stations', 0.80456957700818976),
 ('cities/Waddell', 0.74555993274581678),
 ('category/Curry Sausage', 0.72052882568370979),
 ('cities/Le Sud-Ouest', 0.70915622194399286),
 ('category/Yelp Events', 0.66776459561786738),
 ('category/Gastroenterologist', 0.59984235331413016),
 ('cities/South Queensferry', 0.59163074900162993),
 ('cities/Île des Soeurs', 0.51753819721389582),
 ('cities/Chomedey', 0.51721187759546805),
 ('cities/South Las Vegas', 0.4880288000289254),
 ('cities/Banksville', 0.48365444941737024),
 ('category/Flight Instruction', 0.48197889694674484),
 ('category/General Litigation', 0.46651342189337669),
 ('cities/North Las Vegas ', 0.46466904026331934),
 ('category/Trinidadian', 0.4641790576860329),
 ('cities/Sainte-Ann-De-Bellevue', 0.4521068163796691),
 ('category/Brewing Supplies', 0.45190024100222548),
 ('categ

In [108]:
featureWeights = sorted(getFeatureWeights(svmclf), key=lambda x: x[1], reverse=True)
with open('results/svm_weights.tsv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)
    w.writerows(featureWeights)

In [27]:
dir(lda)

['LDA',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_lda',
 'absolute_import',
 'datasets',
 'lda',
 'logging',
 'pbr',
 'unicode_literals',
 'utils']

In [30]:
X = lda.datasets.load_reuters()

In [31]:
X

array([[1, 0, 1, ..., 0, 0, 0],
       [7, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]], dtype=int32)

In [28]:
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)

In [29]:
model.fit([r['text'] for r in reviews])

AttributeError: 'list' object has no attribute 'shape'