In [30]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from networkx.algorithms import bipartite
import pandas as pd
import re
import matplotlib
font = {'family' : 'DejaVu Sans', 'weight' : 'normal', 'size'   : 22}
matplotlib.rc('font', **font)

In [31]:
def degree(g, nodes=None, as_list=True):
    deg = dict(g.degree())
    if nodes: deg = dict(g.degree(nodes))
    
    if as_list: return list(deg.values())
    return deg

def degree_plot(g, nodes=None, filename=None, title=''):
    deg = degree(g, nodes=nodes)
    bins = 100
    if len(nodes) < 100:
        bins = len(nodes)
    hist = np.histogram(deg, bins=bins)
    freqs, edges = hist[0], hist[1]
    n = freqs.size
    means = [(edges[i] + edges[i+1]) / 2 for i in range(n)]
    
    # SCATTER PLOT
    plt.figure(figsize=[15,10])
    plt.plot(means, freqs, ".", markersize=20)
    plt.xlabel("k")
    plt.ylabel("frequency")
    plt.title("Degree distribution for %s" % title)
    if filename: plt.savefig('plots/%s.svg' % filename, format='svg', bbox_inches="tight")
    plt.show()
    
    # LOG LOG PLOT
    plt.figure(figsize=[15,10])
    plt.loglog(means, freqs, ".", markersize=20)
    plt.xlabel("log(k)")
    plt.ylabel("log(frequency)")
    plt.title("Log-log degree distribution for %s" % title)
    if filename: plt.savefig('plots/log_%s.svg' % filename, format='svg', bbox_inches="tight")
    plt.show()

## Motivation
### What is your dataset?
The dataset used in this project comes from the data provided for the Yelp Dataset Challenge. This dataset consists of about 1.5 million users and about 200 thousand businesses from North America.  Additionally the dataset includes just under 6 million reviews, made by users of the Yelp service, to businesses. The businesses included in the dataset are both restaurants as well as businesses offering other services, such as postal delivery. 

### Why did you choose this/these particular dataset(s)?
The dataset is tremendous
### What was your goal for the end user's experience?
The purpose of this project is to investigate properties of Yelp’s Elite users. For this paper, the focus will lie on Yelp’s two primary claims about their Elite users:

Yelp states that its Elite users have high connectivity, which means that they are connected with many other users and interact often with members of their Yelp community. 

Yelp claims that its Elite users make up the “true heart of the Yelp community.” Third, Yelp claims that its users have high contribution, which means that the user has made a large impact on the site with meaningful and high-quality reviews. 

The first goal of our project is to analyze whether the above claims about Yelp’s Elite users are quantifiably valid. For this, we will specify several characteristics which we expect Elite users to have based on these claims. We will then perform analyses on Yelp’s dataset in order to determine whether these properties are truly represented among the Elite users. The secondary goal of our project is to find which properties are most indicative of Elite status on Yelp. 

The analyses for the first goal can be used for this purpose as well. This kind of information may be useful for those who are interested in becoming Elite members on Yelp. In order to become a member of the “Elite squad,” a user must go through an application process. Despite the suggestions presented above, Yelp doesn’t provide any specific criteria on exactly what characteristics a user must have to become Elite. The mystery behind the selection process for Elite users is well-documented.




## Basic stats. Let's understand the dataset better
### Write about your choices in data cleaning and preprocessing
- Mis-formatted JSON to valid JSON

In [26]:
import pandas as pd
def cleanup(N, dataset, chunk_size=100000):
    '''
    Cleans up a JSON file by adding a trailing comma to each line,
    which is missing from the Yelp dataset files.
    A chunk size must be specified, since all the lines in the data
    files cannot be stored in memory at the same time, due to being very large!
    '''
    for k in range(N):
        dirty_path = 'yelp_dataset/yelp_academic_dataset_%s.json' % dataset
        clean_path = "cleaned/%s%i.json" % (dataset, k)
        dirty_file = open(dirty_path, "r")
        clean_file = open(clean_path, "w")


        start = chunk_size * k
        end = chunk_size * (k+1)

        content = ''
        i = 0
        for line in dirty_file:
            if i == end:
                break
            elif i >= start:
                s = line.replace('\n', ',\n')
                content += s
            i += 1
        if content:
            payload = '{"data" : \n[%s]}' % (content[:-2] + '\n')
            clean_file.write(payload)
        else:
            print("No more content.")
    
    print('Iteration', k, 'done')
    
def read_json_to_df(N, dataset):
    # Create dataframe from JSON files
    df_matrix = [None] * N
    for i in range(N):
        path = "cleaned/business%i.json" % i
        df_matrix[i] = pd.DataFrame(list(pd.read_json(path).data))
    return pd.concat(df_matrix)

### Get all restaurants from Toronto

In [27]:
# Clean business JSON files
N = 2 # There are about 200k restaurants, therefore 2 chunks of 100k elements is sufficient
dataset = 'business'
cleanup(N, dataset)

# Make dataframe from JSON data
df = read_json_to_df(N, dataset)

# Restaurants will contain the keywords 'restaurant' 
# and/or 'food' in the 'category' attribute.
keywords = ['restaurant', 'food']
idx = df.categories.str.lower().str.contains("|".join(keywords)).fillna(False)
rest = df[idx]


# Only include Toronto restaurants
rest.city = rest.city.str.lower()
rest = rest[rest.city == 'toronto']

# Drop attributes irrelevant to the analysis
rest = rest.drop(['city', 'attributes', 'categories', 'address', 'neighborhood', 'is_open', 'hours'], axis=1)

# Save dataset to CSV
rest.to_csv('toronto2/toronto_restaurants.csv', header=False)

Iteration 1 done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### Get all reviews from Toronto

In [29]:
# Clean business JSON files
N = 60 # There are about 6M reviews, therefore 60 chunks of 100k elements is sufficient
dataset = 'review'
cleanup(N, dataset)

# Make dataframe from JSON data
df = read_json_to_df(N, dataset)

# Filter out reviews of businesses outside Toronto
reviews = df[df.business_id.isin(rest.business_id)]

# Drop attributes irrelevant to the analysis
reviews = reviews.drop(['cool', 'funny', 'useful'], axis=1)

# Save dataset to CSV
reviews.to_csv('toronto2/toronto_reviews.csv')

KeyboardInterrupt: 

### Get all users in the Toronto reviews

In [None]:
# Clean business JSON files
N = 30 # A guess
dataset = 'user'
cleanup(N, dataset)

# Make dataframe from JSON data
df = read_json_to_df(N, dataset)

# Filter out users not in the Toronto reviews
toronto_users = df[df.user_id.isin(reviews.user_id)]

# Drop attributes irrelevant to the analysis
toronto_users = toronto_users.drop(['compliment_cool', 'compliment_cute',
       'compliment_funny', 'compliment_hot', 'compliment_list',
       'compliment_more', 'compliment_note', 'compliment_photos',
       'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool',
     'funny', 'fans'], axis=1)

# Save to CSV
toronto_users.to_csv('toronto/toronto_users.csv', index=False)

### Dataset stats
- Reviews: ~6 million
- Users: Many
- Businesses: ~200,000

For this project the restaurants in Toronto were the main focus, as Toronto is a big city with more than a sufficient amount of data to perform a serious analysis, but small enough for various graph algorithms to be carried out. The users considered in this project were all the users who left a review on a business in Toronto.

- Period: March 1st 2008 to August 1st 2018
- Reviews: ~380,000
- Users: ~85,000
- Elite users hereof: ~7,500
- Restaurants: ~10,000

## Tools, theory and analysis. Describe the process of theory to insight

### Talk about how you've worked with text, including regular expressions, unicode, etc.

### Describe which network science tools and data analysis strategies you've used, how those network science measures work, and why the tools you've chosen are right for the problem you're solving.
* NetworkX for creating the graphs for analysis. 
* Centrality measures: Degree and eigenvalue. Degree tells us how popular a user is in terms of friends, or how many reviews they made.
* Clustering coefficient: How connected is the graph on average?
* Degree distributions: Do the network degrees obey power law distributions?


#### Modelling the social network
The social network was created by extracting the friends of each user who made a review on a Toronto-based restaurant, and then creating a link between each user. Some users in the social network will not have placed a review on a Toronto-based restaurant, and will only be in the network due to their friendship with someoneone who has. 

#### Modelling the review network
The Toronto Yelp review network was modelled as an undirected graph, containing user nodes where the edges between two user nodes represent the fact two users have reviewed the same restaurant. 

#### Most connected subcomponent
Detecting how important the elite users were for the network was done by removing them from the network in small chunks, and then observing how the largest connected subcomponent shrinks. The elite users were deleted based on their degree centrality.

### How did you use the tools to understand your dataset?

* NetworkX for creating the graphs for analysis. 
* Centrality measures: Degree and eigenvalue. Degree tells us how popular a user is in terms of friends, or how many reviews they made.
* Clustering coefficient: How connected is the graph on average?
* Degree distributions: Do the network degrees obey power law distributions?
* Plotting: We tried plotting it, but the network is simply too large.


#### Modelling the review network
The Toronto Yelp review network was modelled as an undirected graph, containing user nodes where the edges between two user nodes represent the fact two users have reviewed the same restaurant. 

In [None]:
# Constant Strings
USER = 'user'
ELITE_USER = 'elite_user'
BIZ = 'biz'

# Read in data
biz = pd.read_csv('toronto/toronto_biz.csv')
user = pd.read_csv('toronto/toronto_users.csv')
reviews = pd.read_csv('toronto/toronto_reviews.csv')
elite_user = user[~user.elite.str.contains('None')]

print('#Reviews:', len(reviews))
print('#Users:', len(set(reviews.user_id)))
print('#Elite users:', len(elite_user))
print('#Businesses:', len(set(reviews.business_id)))

In [None]:
# A node class for storing data.
class Node:
    def __init__(self, Data, Type):
        self.Data = Data
        self.Type = Type
    
    def to_string(self):
        return "Node (%s), Data: " % (self.Type, self.Data)
    
    def __hash__(self):
        return hash(self.Data)
    def __eq__(self, other):
        return (
                self.__class__ == other.__class__ and 
                self.Data == other.Data
               )

In [None]:
# Create a NetworkX graph for the review network
review_network = nx.Graph()

# For each review, create a node for the user and business and a link between them
for r in reviews.itertuples():
    a = Node(r.user_id, ELITE_USER if r.user_id in elite_ids else USER)
    b = Node(r.business_id, BIZ)
    review_network.add_edge(a, b, weight=r.stars)

# Show the number of nodes and edges
print('Nodes:', len(review_network.nodes()))
print('Edges:', len(review_network.edges()))

### Review network measures

In [None]:
# Separate nodes based on their type
review_biz_nodes = [n for n in list(review_network.nodes()) if n.Type == BIZ]
review_user_nodes = [n for n in list(review_network.nodes()) if n.Type == USER]
review_elite_nodes = [n for n in list(review_network.nodes()) if n.Type == ELITE_USER]

#### Regular user degree distributions

In [None]:
degree_plot(review_network, review_user_nodes, title="all Toronto users", filename='reviews_degree_normal_users')

#### Elite user distribution

In [None]:
degree_plot(review_network, review_elite_nodes, title="Toronto Elite users", filename='reviews_degree_elite_users')

#### Restaurant degree distributions

In [None]:
degree_plot(review_network, review_biz_nodes, title="Toronto restaurants", filename='reviews_degree_all_biz')

#### Clustering coefficient

In [None]:
cluster_coeff_avg = nx.average_clustering(review_network)
print('Average clustering coeff. :', cluster_coeff_avg)

#### Centrality measures
* Degree centrality is a basic measure for the number of reviews a user has given, and for a restaurant it represents the number of reviews the restaurant has been given.
* Eigenvector centrality for a user node indicates to which degree they have reviewed restaurants with many reviews, and for a restaurant the measure represents how many reviews come from users who themselves have given a lot of reviews.

In [None]:
# Degree centrality
deg = nx.degree(review_network)
deg_elite_user = [deg[n] for n in deg if n.Type == ELITE_USER]
deg_user = [deg[n] for n in deg if n.Type == USER]
elite_avg_deg = np.mean(deg_elite_user)
user_avg_deg = np.mean(deg_user)
all_user_deg = np.mean(deg_elite_user + deg_user)

# Show results
print('Normal user mean degree centrality', user_avg_deg)
print('Elite user mean degree centrality', elite_avg_deg)
print('All users mean degree centrality', all_user_deg)
ratio = elite_avg_deg / user_avg_deg
print('Ratio degree (Elite : Normal): %.2f' % ratio)

In [None]:
# Eigenvalue centrality
ev = nx.eigenvector_centrality_numpy(review_network)
ev_elite_user = [ev[n] for n in ev if n.Type == ELITE_USER]
ev_user = [ev[n] for n in ev if n.Type == USER]
elite_avg_ev = np.mean(ev_elite_user)
user_avg_ev = np.mean(ev_user)
all_user_ev = np.mean(ev_elite_user + ev_user)

# Show results
print('Normal user mean EV centrality', user_avg_ev)
print('Elite user mean EV centrality', elite_avg_ev)
print('All users mean EV centrality', all_user_ev)
ratio = elite_avg_ev / user_avg_ev
print('Ratio EV (Elite : Normal): %.2f' % ratio)

### Eigenvalue centrality vs average user rating

In [None]:
# Create new column in the user dataframe
user['ev'] = 0
ev_user = {n.Data: ev[n] for n in ev if (n.Type == ELITE_USER) or (n.Type == USER)}

# Insert the eigenvalue of the user in the dataframe. This takes several minutes...
i = 1
p = int(len(ev_user) / 100)
for k in ev_user:
    if  i % p == 0: print('%i percent done' % (i/p))
    eigenvalue = ev_user[k]
    user.loc[user.user_id == k, 'ev'] = eigenvalue
    i += 1

In [None]:
# Plot the eigenvalue of the user vs. the average rating the user
plt.figure(figsize=[15,10])
plt.scatter(user.average_stars, user.ev, edgecolors='black')
plt.xlabel('Yelp average rating')
plt.ylabel('Eigenvalue centrality')
plt.title('Eigenvalue centrality vs. average user rating for Yelp users in Toronto')
plt.savefig('plots/user_rating_ev.svg', format='svg', bbox_inches="tight")
plt.show()

### Restaurants and eigenvalue centrality

In [None]:
ev_biz = {n.Data: ev[n] for n in ev if n.Type == BIZ}
deg_biz = {n.Data: deg[n] for n in deg if n.Type == BIZ}

biz['ev'] = 0.0

In [None]:
for k in ev_biz:
    eigenvalue = ev_biz[k]
    biz.loc[biz.business_id == k, 'ev'] = eigenvalue

In [None]:
plt.figure(figsize=[15,10])
plt.scatter(biz.stars, biz.ev, edgecolors='black')
plt.xlabel('Yelp rating')
plt.ylabel('Eigenvalue centrality')
plt.title('Eigenvalue centrality vs Yelp rating for restaurants in Toronto')
plt.savefig('plots/biz_rating_ev.svg', format='svg', bbox_inches="tight")
plt.show()

In [None]:
plt.figure(figsize=[15,10])
plt.scatter(biz.ev, biz.review_count, edgecolors='black')
plt.xlabel('Eigenvector centrality score')
plt.ylabel('Review count')
plt.title('Eigenvector centrality vs. number of review for restaurants in Toronto')
plt.savefig('plots/biz_ev_count.svg', format='svg', bbox_inches="tight")
plt.show()

### Review differences
For this chapter, the differences in ratings between the regular users and the elite users were investigated. 

#### Overall rating distributions
Are elite users overall harsher in their reviews? Or is it the other way around? Let us find out!

In [None]:
elite_stars = np.array(reviews[reviews.user_id.isin(elite_user.user_id)].stars)
regular_stars = np.array(reviews[~reviews.user_id.isin(elite_user.user_id)].stars)

reg = np.histogram(regular_stars, bins=[1,2,3,4,5,6])[0]
reg = reg / sum(reg)

elit = np.histogram(elite_stars, bins=[1,2,3,4,5,6])[0]
elit = elit / sum(elit)

**Conclusion:** Elite users are more moderate and peak at 4 stars, where regular users are more critical and over-enthustiastic, i.e. giving 1 star reviews, and 5 star reviews.

#### Concrete differences in ratings
Are elite users harsher in their reviews? Or is it the other way around? Let us find out!

In [None]:
# Only elite reviews
elite_biz_graph = nx.subgraph(review_network, review_elite_nodes + review_biz_nodes)
elite_weights_dict = elite_biz_graph.degree(review_biz_nodes, weight='weight')
elite_degrees_dict = elite_biz_graph.degree(review_biz_nodes)
elite_biz_ratings_dict = {
    node.Data: elite_weights_dict[node] / elite_degrees_dict[node]
    for node in review_biz_nodes
    if elite_degrees_dict[node] > 0
}

In [None]:
# All user reviews
all_weights_dict = review_network.degree(review_biz_nodes, weight='weight')
all_degrees_dict = review_network.degree(review_biz_nodes)
all_biz_ratings_dict = {
    node.Data: all_weights_dict[node] / all_degrees_dict[node] 
    for node in review_biz_nodes
    if all_degrees_dict[node] > 0
}

In [None]:
# Comparison REGULAR AND ELITE
biz_ids = [b.Data for b in review_biz_nodes]
deltas_reg = {
    biz_id: elite_biz_ratings_dict[biz_id] - reg_biz_ratings_dict[biz_id]
    for biz_id in biz_ids
    if biz_id in reg_biz_ratings_dict.keys()
    and biz_id in elite_biz_ratings_dict.keys()
}

In [None]:
# Comparison ALL AND ELITE
deltas_all = {
    biz_id: elite_biz_ratings_dict[biz_id] - all_biz_ratings_dict[biz_id]
    for biz_id in biz_ids
    if biz_id in all_biz_ratings_dict.keys()
    and biz_id in elite_biz_ratings_dict.keys()
}

In [None]:
plt.figure(figsize=[20,5])
plt.hist(100 * np.array(list(deltas_all.values())) / 5, bins=200, edgecolor='black')
plt.xlabel('Delta (%)')
plt.title('Elite reviews compared to all user reviews')
plt.show()

In [None]:
plt.figure(figsize=[20,5])
plt.hist(100 * np.array(list(deltas_reg.values())) / 5, bins=200, edgecolor='black')
plt.xlabel('Delta (%)')
plt.title('Elite reviews compared to regular user reviews')
plt.show()

## Discussion. Think critically about your creation
### What went well?
* Cleaning the dataset
* Analysing the social network
* Text analysis

### What is still missing? What could be improved?
* Review network is still kind of inconclusive, we had hoped that a better analysis would come of it
* An idea we would like to try, but requires taking a subgraph of the graph is making a review graph based on users who reviewed the same restaurant.