Merge pull request #7 from srikris/howto-80-chars
Made everything 80 chars and consistent with format.
znation committed Aug 26, 2014
2 parents 133fa9b + eb8de16 commit 84e0356
Showing 8 changed files with 52 additions and 48 deletions.
8 changes: 4 additions & 4 deletions convert_column_to_timestamp.py
@@ -1,18 +1,18 @@
 # Title: Convert a date string column to a UNIX timestamp
-# Requires you to 'pip install python-dateutil==1.5'
 import graphlab as gl
 from datetime import datetime
+# Requires you to 'pip install python-dateutil==1.5'
 from dateutil import parser

 def str_to_timestamp(the_str):
     try:
         dt = parser.parse(the_str)
     except:
         return None

     # UNIX epoch is January 1, 1970
     return (dt - datetime(1970,1,1)).total_seconds()

 # 02/29/2001 is invalid, so should be 'None' in output
-sf = gl.SFrame({'date':['2000-08-21','2013-06-08 17:25:00.12753','02/29/2001'],'id':[1,2,3]})
+sf = gl.SFrame({
+    'date':['2000-08-21','2013-06-08 17:25:00.12753','02/29/2001'],
+    'id':[1,2,3]})
 sf['date'] = sf['date'].apply(str_to_timestamp)
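
A quick way to sanity-check the conversion is to map one timestamp back to a datetime. This is a minimal sketch (not part of the file above), assuming the sf built there and that SArray.dropna() is available:

# Hypothetical check: convert the first non-None timestamp back to a UTC
# datetime with the standard library to verify the round trip.
first_valid = sf['date'].dropna()[0]
print datetime.utcfromtimestamp(first_valid)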
8 changes: 3 additions & 5 deletions join_vertex_data_on_sgraph.py
@@ -1,11 +1,9 @@
 #Title: Join vertex data from multiple graphs

 import graphlab as gl

-#### Load graph
+# Load graph
 g = gl.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', 'snap')

-#### Compute various graph statistics
+# Compute various graph statistics
 pagerank_model = gl.pagerank.create(g)
 pagerank_graph = pagerank_model['graph']
 print pagerank_graph.vertices
@@ -14,7 +12,7 @@
 triangle_counting_graph = triangle_counting_model['graph']
 print triangle_counting_graph.vertices

-#### Joined the computed statistics in a new graph
+# Joined the computed statistics in a new graph
 v = pagerank_graph.vertices.join(triangle_counting_graph.vertices)
 joined_graph = gl.SGraph(v, g.edges)
 print joined_graph.vertices
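
As a follow-on, the joined vertex table can be ranked directly; a minimal sketch assuming the joined_graph above and that SFrame.topk() is available with the toolkit's default 'pagerank' column name:

# Hypothetical example: the ten vertices with the highest pagerank,
# shown together with their joined triangle counts.
print joined_graph.vertices.topk('pagerank', k=10)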
27 changes: 18 additions & 9 deletions load_yelp_dataset.py
@@ -1,17 +1,26 @@
-import graphlab
+import graphlab as gl

-business = graphlab.SFrame.read_csv('yelp_academic_dataset_business.json', header=False, delimiter='\n', column_type_hints=dict)
-checkin = graphlab.SFrame.read_csv('yelp_academic_dataset_checkin.json', header=False, delimiter='\n', column_type_hints=dict)
-review = graphlab.SFrame.read_csv('yelp_academic_dataset_review.json', header=False, delimiter='\n', column_type_hints=dict)
-user = graphlab.SFrame.read_csv('yelp_academic_dataset_user.json', header=False, delimiter='\n', column_type_hints=dict)
-tip = graphlab.SFrame.read_csv('yelp_academic_dataset_tip.json', header=False, delimiter='\n', column_type_hints=dict)
+# Data available to download from
+# https://www.yelp.com/academic_dataset
+business = gl.SFrame.read_csv('yelp_academic_dataset_business.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+checkin = gl.SFrame.read_csv('yelp_academic_dataset_checkin.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+review = gl.SFrame.read_csv('yelp_academic_dataset_review.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+user = gl.SFrame.read_csv('yelp_academic_dataset_user.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+tip = gl.SFrame.read_csv('yelp_academic_dataset_tip.json',
+    header=False, delimiter='\n', column_type_hints=dict)

 # Changing JSON into tables, i.e. SFrames
 reviews = review.unpack('X1', column_name_prefix='')
-businesses = business.unpack('X1', column_name_prefix='', limit=['business_id', 'name', 'latitude', 'longitude', 'stars'])
+businesses = business.unpack('X1', column_name_prefix='',
+    limit=['business_id', 'name', 'latitude', 'longitude', 'stars'])

 # Build a recommender system
-m = graphlab.recommender.create(reviews, 'user_id', 'business_id')
+m = gl.recommender.create(reviews, 'user_id', 'business_id')

 # Find businesses that are similar based on users in common
-m.get_similar_items(['BVxlrYWgmi-8TPGMe6CTpg']).join(businesses, on={'similar_item':'business_id'})
+similar_items = m.get_similar_items(['BVxlrYWgmi-8TPGMe6CTpg'])
+print similar_items.join(businesses, on={'similar_item':'business_id'})
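
Beyond similar items, the trained model can also produce per-user recommendations; a minimal sketch assuming the model m built above and the recommender's standard recommend() call:

# Hypothetical usage: top-5 recommended businesses for every user seen
# during training.
recs = m.recommend(k=5)
print recs.head(10)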
6 changes: 4 additions & 2 deletions remove_duplicate_edges.py
@@ -1,17 +1,19 @@
 # Title: Remove duplicate edges from SGraph
 import graphlab as gl

 vertices = gl.SFrame({'id':[1,2,3,4,5]})
 edges = gl.SFrame({'src':[1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4],
                    'dst':[2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5]})
 edges['edata'] = edges['src'] + edges['dst']

 # Create a graph (as an example)
 g = gl.SGraph(vertices, edges, vid_field='id', src_field='src', dst_field='dst')
 print g.summary()
 print g.vertices
 print g.edges

-g2 = gl.SGraph(g.vertices, g.edges.groupby(['__src_id', '__dst_id'], {'data': gl.aggregate.SELECT_ONE('edata')}))
+# Remove duplicates
+g2 = gl.SGraph(g.vertices, g.edges.groupby(['__src_id', '__dst_id'],
+    {'data': gl.aggregate.SELECT_ONE('edata')}))
 print g2.summary()
 print g2.vertices
 print g2.edges
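
A simple check that the duplicates are gone, assuming the two graphs built above (sketch only):

# Hypothetical check: edge counts before and after deduplication; with the
# example data this should drop from 16 to 4 (one edge per src/dst pair).
print len(g.edges), len(g2.edges)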
9 changes: 3 additions & 6 deletions sframe_xml_to_dict.py
@@ -36,11 +36,8 @@ def xml_filename_to_dict(filename):
 # file-200.xml

 # Read the names of the files in to an SFrame
-all_files = gl.SFrame.read_csv('all_files.csv', header=False)
-all_files.rename({'X1': 'filename'})
+data = gl.SFrame.read_csv('all_files.csv', header=False)
+data.rename({'X1': 'filename'})

 # Parse contents of xml files into a dictionary and update the SFrame
 # with the data (one xml file per row)
-data = gl.SFrame()
-data['xml-dictionary'] = all_files['filename'].apply(lambda x: xml_filename_to_dict(x))
+data['xml-dict'] = data['filename'].apply(lambda x: xml_filename_to_dict(x))

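If per-tag columns are wanted afterwards, the dictionary column can be expanded with the same unpack() call used elsewhere in this repository; a minimal sketch assuming the data SFrame above:

# Hypothetical follow-up: one column per key in the parsed XML dicts.
flat = data.unpack('xml-dict', column_name_prefix='')
print flat.column_names()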
11 changes: 6 additions & 5 deletions sgraph_show_with_nx_layout.py
@@ -1,22 +1,23 @@
 import graphlab as gl

+# Add some example edges -- replace with your own graph
 sg = gl.SGraph()
-# add some example edges -- replace with your own graph
 sg = sg.add_edges([gl.Edge(i, i+1) for i in range(10)])

 import networkx as nx
 g = nx.Graph()

-# put the nodes and edges from the SGraph into a NetworkX graph
+# Put the nodes and edges from the SGraph into a NetworkX graph
 g.add_nodes_from(list(sg.vertices['__id']))
 g.add_edges_from([(e['__src_id'], e['__dst_id']) for e in sg.edges])

-# create the layout with NetworkX and convert to regular Python types
-# you can substitute any of the layout algorithms here for circular_layout:
+# Create the layout with NetworkX and convert to regular Python types
+# You can substitute any of the layout algorithms here for circular_layout:
 # http://networkx.github.io/documentation/latest/reference/drawing.html#module-networkx.drawing.layout
 layout = nx.circular_layout(g)
 layout = {k: map(float, list(v)) for k,v in layout.iteritems()}

-# show the SGraph in Canvas with that layout
+# Show the SGraph in Canvas with that layout
 sg.vertices['x'] = sg.vertices.apply(lambda v: layout[v['__id']][0])
 sg.vertices['y'] = sg.vertices.apply(lambda v: layout[v['__id']][1])
 sg.show(vertex_positions=('x', 'y'))
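
Any other NetworkX layout can be substituted in the same way; for example, a force-directed layout (a sketch, assuming the same NetworkX graph g as above):

# Hypothetical alternative: spring (force-directed) layout instead of circular.
layout = nx.spring_layout(g)
layout = {k: map(float, list(v)) for k, v in layout.iteritems()}
# then reassign sg.vertices['x'] / ['y'] and call sg.show() exactly as above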
10 changes: 4 additions & 6 deletions triple_apply_shortest_path.py
@@ -1,8 +1,6 @@
 # Title: Implement shortest_path using SGraph.triple_apply
 import graphlab as gl
 import time


 def sssp_update_fn(src, edge, dst):
     sdist = src['distance']
     ddist = dst['distance']
@@ -11,10 +9,10 @@ def sssp_update_fn(src, edge, dst):
         dst['distance'] = sdist + 1
     return (src, edge, dst)


 def sssp_triple_apply(input_graph, src_vid, max_distance=1e30):
     g = gl.SGraph(input_graph.vertices, input_graph.edges)
-    g.vertices['distance'] = g.vertices['__id'].apply(lambda x: max_distance if x != src_vid else 0.0)
+    g.vertices['distance'] = \
+        g.vertices['__id'].apply(lambda x: max_distance if x != src_vid else 0.0)
     it = 0
     num_changed = len(g.vertices)
     start = time.time()
@@ -27,9 +25,9 @@ def sssp_triple_apply(input_graph, src_vid, max_distance=1e30):
     print 'Triple apply sssp finished in: %f secs' % (time.time() - start)
     return g

-#### Load graph
+# Load graph
 g = gl.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', 'snap')

-#### Run triple apply sssp
+# Run triple apply sssp
 triple_apply_sssp_distance = sssp_triple_apply(g, src_vid=0)
 print triple_apply_sssp_distance
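
The hand-rolled distances can be cross-checked against GraphLab Create's built-in shortest-path toolkit; a minimal sketch, assuming the toolkit's create() takes a source_vid parameter in the same way (field names may differ between versions):

# Hypothetical cross-check against the built-in toolkit.
sp_model = gl.shortest_path.create(g, source_vid=0)
print sp_model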
21 changes: 10 additions & 11 deletions triple_apply_weighted_pagerank.py
@@ -1,24 +1,20 @@
 # Title: Implement weighted pagerank using SGraph.triple_apply
 import graphlab as gl
 import time


 def pagerank_update_fn(src, edge, dst):
     dst['pagerank'] += src['prev_pagerank'] * edge['weight']
     return (src, edge, dst)


 def sum_weight(src, edge, dst):
     src['total_weight'] += edge['weight']
     return src, edge, dst


 def normalize_weight(src, edge, dst):
     edge['weight'] /= src['total_weight']
     return src, edge, dst


-def pagerank_triple_apply(input_graph, reset_prob=0.15, threshold=1e-3, max_iterations=20):
+def pagerank_triple_apply(input_graph, reset_prob=0.15, threshold=1e-3,
+                          max_iterations=20):
     g = gl.SGraph(input_graph.vertices, input_graph.edges)

     # compute normalized edge weight
@@ -35,20 +31,23 @@ def pagerank_triple_apply(input_graph, reset_prob=0.15, threshold=1e-3, max_iter
     while(total_l1_delta > threshold and it < max_iterations):
         g.vertices['pagerank'] = 0.0
         g = g.triple_apply(pagerank_update_fn, ['pagerank'])
-        g.vertices['pagerank'] = g.vertices['pagerank'] * (1 - reset_prob) + reset_prob
-        g.vertices['l1_delta'] = (g.vertices['pagerank'] - g.vertices['prev_pagerank']).apply(lambda x: abs(x))
+        g.vertices['pagerank'] = g.vertices['pagerank'] * (1 - reset_prob) \
+            + reset_prob
+        g.vertices['l1_delta'] = (g.vertices['pagerank'] - \
+            g.vertices['prev_pagerank']).apply(lambda x: abs(x))
         total_l1_delta = g.vertices['l1_delta'].sum()
         g.vertices['prev_pagerank'] = g.vertices['pagerank']
-        print 'Iteration %d: total pagerank changed in L1 = %f' % (it, total_l1_delta)
+        print 'Iteration %d: total pagerank changed in L1 = %f' % (it,\
+            total_l1_delta)
         it = it + 1
     print 'Triple apply pagerank finished in: %f secs' % (time.time() - start)
     del g.vertices['prev_pagerank']
     return g

-#### Load graph
+# Load graph
 g = gl.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', 'snap')
 g.edges['weight'] = 1.0

-#### Run triple apply sssp
+# Run triple apply sssp
 pagerank_graph = pagerank_triple_apply(g)
 print pagerank_graph
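
Since every edge weight above is set to 1.0, the result should roughly agree with the built-in pagerank toolkit already used in join_vertex_data_on_sgraph.py; a minimal sketch, assuming reset_probability is the matching parameter name:

# Hypothetical cross-check against the built-in (unweighted) toolkit.
builtin_model = gl.pagerank.create(g, reset_probability=0.15)
print builtin_model['graph'].vertices.head(5)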
