In [46]:
# !pip install plotly==4.8.2
# !pip install networkx
import networkx as nx
import plotly as plt
import os, shutil
import json
import tqdm

In [None]:
class GraphBuilder(object):
    """
    Builds a networkx MultiGraph from the line-delimited JSON files of the Yelp dataset.

    Nodes are keyed as "<node_type>_<id>" (e.g. "user_<user_id>") and carry the full
    JSON record as attributes. Edges are keyed by their edge type and carry the full
    JSON record of the review / tip / ... that produced them.
    """

    def __init__(self, yelp_dataset = None):
        """
        Unpacks the dataset (if not already unpacked) and populates the graph.

        yelp_dataset: path to the dataset tar archive. Only needed when
                      "./yelp_data_uncompressed" does not exist yet.

        Raises ValueError if the data directory is missing and no archive was given.
        """
        self.raw_data = "./yelp_data_uncompressed"
        if not os.path.exists(self.raw_data):
            if yelp_dataset is None:
                raise ValueError(
                    "{} does not exist and no yelp_dataset archive was given".format(self.raw_data))
            print("Unarchiving dataset...")
            shutil.unpack_archive(yelp_dataset, self.raw_data, "tar")
        self.data_files = [file for file in os.listdir(self.raw_data) if file.endswith(".json")]
        print("Found {} Files \n{}".format(len(self.data_files), self.data_files))

        print("Initializing Graph...")
        # Using MultiGraph as we have multiple types of edges between the user and business
        self.graph = nx.MultiGraph()
        self.add_Nodes("user")
        self.add_Nodes("business")
        self.add_Edges(src = "user", dest = "business", edge_type = "review")
        # The original loaded "review" a second time, which only re-read the file and
        # overwrote identical edges. "tip" is the other user<->business edge type named in
        # add_Edges' docstring.
        # NOTE(review): assumes tip records carry both user_id and business_id -- confirm.
        self.add_Edges(src = "user", dest = "business", edge_type = "tip")

    def _save_graph(self, name = "./yelp_graph.pkl"):
        """
        Serializes self.graph to the file `name` using pickle.
        """
        # Local import: pickle is not among the notebook's top-level imports.
        import pickle
        with open(name, "wb") as f:
            pickle.dump(self.graph, f, protocol=pickle.HIGHEST_PROTOCOL)

    def _find_data_file(self, kind):
        """
        Returns the path of the first data file whose name ends with "<kind>.json".

        Raises FileNotFoundError when no such file was found in the data directory.
        """
        suffix = "{}.json".format(kind)
        matches = [file for file in self.data_files if file.endswith(suffix)]
        if not matches:
            raise FileNotFoundError(
                "No file ending with '{}' in {}".format(suffix, self.raw_data))
        return os.path.join(self.raw_data, matches[0])

    def add_Nodes(self, node_type):
        """
        Populates graph with ["business", "user"] nodes.

        Every non-empty line of the "<node_type>.json" file is parsed as one JSON record
        and added as node "<node_type>_<record['<node_type>_id']>" with the record as
        its attributes.
        """
        path = self._find_data_file(node_type)
        id_field = "{}_id".format(node_type)

        print("Reading {} Json file...".format(node_type))
        added = 0
        with open(path, "rb") as f:
            for line in tqdm.tqdm(f):
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                data = json.loads(line)
                # Records are pushed into the graph as they are read, as (node_id, attributes),
                # instead of being collected in a list first.
                self.graph.add_nodes_from([("{}_{}".format(node_type, data[id_field]), data)])
                added += 1
        print("Added {} {} Nodes...".format(added, node_type))

    def add_Edges(self, src, dest, edge_type):
        """
        A utility to add an edge between src and dest of type edge_type ["review", "tip", "check_in"].

        Every non-empty line of the "<edge_type>.json" file is parsed as one JSON record and
        becomes an edge from "<src>_<record['<src>_id']>" to "<dest>_<record['<dest>_id']>",
        keyed by edge_type, with the record as its attributes.
        """
        path = self._find_data_file(edge_type)
        src_field = "{}_id".format(src)
        dest_field = "{}_id".format(dest)
        edge_key = "{}".format(edge_type)

        print("Reading {} Json file...".format(edge_type))
        added = 0
        with open(path, "rb") as f:
            for line in tqdm.tqdm(f):
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                data = json.loads(line)
                # NOTE(review): the edge key is the edge type, so several records of the same
                # type between the same two nodes presumably share one edge -- confirm intended.
                self.graph.add_edges_from(
                    [("{}_{}".format(src, data[src_field]),    # src_node
                      "{}_{}".format(dest, data[dest_field]),  # dest_node (was built from the src id)
                      edge_key,                                # edge_type
                      data)])
                added += 1
        print("Added {} {} Edges...".format(added, edge_type))
        
        
        
# Build the graph; the archive is only unpacked if ./yelp_data_uncompressed is missing.
g = GraphBuilder(yelp_dataset="./yelp_dataset.tar")

Found 5 Files 
['yelp_academic_dataset_business.json', 'yelp_academic_dataset_checkin.json', 'yelp_academic_dataset_review.json', 'yelp_academic_dataset_tip.json', 'yelp_academic_dataset_user.json']
Initializing Graph...
Reading user Json file...


530413it [00:21, 25101.71it/s]

In [None]:
# GraphBuilder keeps the networkx graph in `.graph`; the builder itself has no `.edges`.
g.graph.number_of_edges()