In [49]:
from __future__ import print_function
import json
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from collections import Counter
from util import *
from skutil import *

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from scipy.cluster.vq import kmeans2, whiten

def mostCommon(lst):
    """Return the element that appears most frequently in ``lst``.

    Raises IndexError when ``lst`` is empty, because there is no winner to
    pick.
    """
    ranked = Counter(lst).most_common(1)  # [(element, count)] or []
    top_element, _count = ranked[0]
    return top_element

def getKmeans(reviews, businesses, k=10, n_iter=30):
    """Cluster review locations with k-means.

    Every review contributes one (latitude, longitude) point, looked up from
    the business it refers to, so a business with many reviews contributes
    many identical points.

    Parameters
    ----------
    reviews : iterable of dict
        Review records; each must provide a 'business_id' key.
    businesses : mapping
        business_id -> record providing 'latitude' and 'longitude'.
    k : int, optional
        Number of clusters. Defaults to 10, the value that used to be
        hard-coded.
    n_iter : int, optional
        Iteration count handed to ``kmeans2``. Defaults to 30, the value that
        used to be hard-coded.

    Returns
    -------
    centroid : ndarray, shape (k, 2)
        Cluster centres. These live in the rescaled space produced by
        ``whiten`` (each column divided by its standard deviation), not in
        raw degrees.
    label : ndarray
        label[i] is the cluster index assigned to the i-th review.

    Notes
    -----
    ``kmeans2`` is called with its default random initialisation, so the
    clustering can differ from run to run.
    """
    geo_data = []
    for review in reviews:
        business = businesses[review['business_id']]
        geo_data.append([business['latitude'], business['longitude']])

    # whiten() normalises both coordinates to unit variance before clustering.
    centroid, label = kmeans2(whiten(geo_data), k, iter=n_iter)

    return centroid, label

def getLocalEliteFriends(reviews, users, label):
    """Count, per user, the friends whose home cluster equals the user's own.

    A user's home cluster is the cluster label that occurs most often among
    that user's reviews. Note that nothing here checks any "elite" status;
    only cluster membership is compared.

    Parameters
    ----------
    reviews : sequence of dict
        Review records providing a 'user_id' key, in the same order that was
        used to produce ``label``.
    users : mapping
        user_id -> record whose 'friends' entry is an iterable of user ids.
    label : sequence
        label[i] is the cluster index of reviews[i].

    Returns
    -------
    defaultdict
        user_id -> number of friends in the same home cluster. Every user in
        ``users`` receives an integer entry; users without any review get 0.
        The container type (``defaultdict(list)``) is kept as before for
        backward compatibility.

    Raises
    ------
    IndexError
        If ``label`` is shorter than ``reviews``.
    """
    # Tally cluster votes per user while walking the reviews once.
    group_votes = defaultdict(Counter)
    for idx, review in enumerate(reviews):
        group_votes[review['user_id']][label[idx]] += 1

    # Plain dict on purpose: with a defaultdict, merely looking up a user who
    # has no reviews would insert an empty placeholder, which later made two
    # review-less users compare as "same group".
    home_group = {}
    for user_id, votes in group_votes.items():
        home_group[user_id] = votes.most_common(1)[0][0]

    elite_friends = defaultdict(list)
    for user_id in users:
        has_home = user_id in home_group
        same_group_cnt = 0
        for friend_id in users[user_id]['friends']:
            if friend_id not in home_group:
                # Friend never wrote a review, so no cluster is known.
                print("none existing user" + friend_id)
                continue

            if has_home and home_group[friend_id] == home_group[user_id]:
                same_group_cnt += 1
        elite_friends[user_id] = same_group_cnt

    return elite_friends

In [50]:
def load_json_lines(path):
    """Read a JSON-lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document. Blank
    lines are skipped so that a stray empty line does not abort the load.
    """
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]


# Raw dataset tables, one parsed JSON record per line of each file.
reviews = load_json_lines('../data/yelp_academic_dataset_review.json')
users = load_json_lines('../data/yelp_academic_dataset_user.json')
businesses = load_json_lines('../data/yelp_academic_dataset_business.json')

In [52]:
# mapUsers / mapBusinesses are not defined in this notebook; they arrive via
# one of the star imports (util or skutil).
# The results are later indexed by user_id (userMap) and business_id
# (businessMap), so presumably each builds an id -> record mapping from the
# raw record list - verify against the helper definitions.
userMap = mapUsers(users)
businessMap = mapBusinesses(businesses)

In [54]:
# Cluster the business location of every review; label[i] belongs to reviews[i].
(centroid, label) = getKmeans(reviews=reviews, businesses=businessMap)

In [57]:
# Per user: how many friends share the user's most frequent review cluster.
localEliteFriends = getLocalEliteFriends(
    reviews=reviews,
    users=userMap,
    label=label
)

In [39]:
# Plot locations on a Lambert Conformal map centred at 38N, 104W.
# Basemap and plt are already imported in the notebook's first cell.
#
# NOTE(review): this cell used `lons` / `lats` without defining them anywhere
# in the notebook, so it raised NameError on a fresh kernel (it only worked
# with leftover state from an earlier session). Business coordinates are
# assumed to be the intended points, and businessMap is assumed to be a
# dict - confirm both.
lats = [business['latitude'] for business in businessMap.values()]
lons = [business['longitude'] for business in businessMap.values()]

# resolution=None skips processing of the boundary datasets.
m = Basemap(width=18000000, height=12000000, projection='lcc',
            resolution=None, lat_1=45., lat_2=55, lat_0=38, lon_0=-104.)
m.bluemarble()

# Convert lon/lat to map projection coordinates
# (inputs may be scalars, lists or numpy arrays).
xpt, ypt = m(lons, lats)
# Inverse transform back to lon/lat; kept in case other cells use lonpt/latpt.
lonpt, latpt = m(xpt, ypt, inverse=True)
# 'co' draws cyan circles; the low alpha lets dense areas stand out.
m.plot(xpt, ypt, 'co', markersize=14, alpha=0.3)

plt.show()