Merge pull request #7 from srikris/howto-80-chars
Made everything 80 chars and consistent with format.
znation committed Aug 26, 2014
2 parents 133fa9b + eb8de16 commit 84e0356
Showing 8 changed files with 52 additions and 48 deletions.
8 changes: 4 additions & 4 deletions convert_column_to_timestamp.py
@@ -1,18 +1,18 @@
 # Title: Convert a date string column to a UNIX timestamp
-# Requires you to 'pip install python-dateutil==1.5'
 import graphlab as gl
 from datetime import datetime
+# Requires you to 'pip install python-dateutil==1.5'
 from dateutil import parser

 def str_to_timestamp(the_str):
     try:
         dt = parser.parse(the_str)
     except:
         return None

     # UNIX epoch is January 1, 1970
     return (dt - datetime(1970,1,1)).total_seconds()

 # 02/29/2001 is invalid, so should be 'None' in output
-sf = gl.SFrame({'date':['2000-08-21','2013-06-08 17:25:00.12753','02/29/2001'],'id':[1,2,3]})
+sf = gl.SFrame({
+    'date':['2000-08-21','2013-06-08 17:25:00.12753','02/29/2001'],
+    'id':[1,2,3]})
 sf['date'] = sf['date'].apply(str_to_timestamp)
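
A quick way to sanity-check the conversion is to map one timestamp back to a datetime. This is a minimal sketch (not part of the file above), assuming the sf built there and that SArray.dropna() is available:

# Hypothetical check: convert the first non-None timestamp back to a UTC
# datetime with the standard library to verify the round trip.
first_valid = sf['date'].dropna()[0]
print datetime.utcfromtimestamp(first_valid)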
8 changes: 3 additions & 5 deletions join_vertex_data_on_sgraph.py
@@ -1,11 +1,9 @@
 #Title: Join vertex data from multiple graphs

 import graphlab as gl

-#### Load graph
+# Load graph
 g = gl.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', 'snap')

-#### Compute various graph statistics
+# Compute various graph statistics
 pagerank_model = gl.pagerank.create(g)
 pagerank_graph = pagerank_model['graph']
 print pagerank_graph.vertices
@@ -14,7 +12,7 @@
 triangle_counting_graph = triangle_counting_model['graph']
 print triangle_counting_graph.vertices

-#### Joined the computed statistics in a new graph
+# Joined the computed statistics in a new graph
 v = pagerank_graph.vertices.join(triangle_counting_graph.vertices)
 joined_graph = gl.SGraph(v, g.edges)
 print joined_graph.vertices
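
As a follow-on, the joined vertex table can be ranked directly; a minimal sketch assuming the joined_graph above and that SFrame.topk() is available with the toolkit's default 'pagerank' column name:

# Hypothetical example: the ten vertices with the highest pagerank,
# shown together with their joined triangle counts.
print joined_graph.vertices.topk('pagerank', k=10)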
27 changes: 18 additions & 9 deletions load_yelp_dataset.py
@@ -1,17 +1,26 @@
-import graphlab
+import graphlab as gl

-business = graphlab.SFrame.read_csv('yelp_academic_dataset_business.json', header=False, delimiter='\n', column_type_hints=dict)
-checkin = graphlab.SFrame.read_csv('yelp_academic_dataset_checkin.json', header=False, delimiter='\n', column_type_hints=dict)
-review = graphlab.SFrame.read_csv('yelp_academic_dataset_review.json', header=False, delimiter='\n', column_type_hints=dict)
-user = graphlab.SFrame.read_csv('yelp_academic_dataset_user.json', header=False, delimiter='\n', column_type_hints=dict)
-tip = graphlab.SFrame.read_csv('yelp_academic_dataset_tip.json', header=False, delimiter='\n', column_type_hints=dict)
+# Data available to download from
+# https://www.yelp.com/academic_dataset
+business = gl.SFrame.read_csv('yelp_academic_dataset_business.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+checkin = gl.SFrame.read_csv('yelp_academic_dataset_checkin.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+review = gl.SFrame.read_csv('yelp_academic_dataset_review.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+user = gl.SFrame.read_csv('yelp_academic_dataset_user.json',
+    header=False, delimiter='\n', column_type_hints=dict)
+tip = gl.SFrame.read_csv('yelp_academic_dataset_tip.json',
+    header=False, delimiter='\n', column_type_hints=dict)

 # Changing JSON into tables, i.e. SFrames
 reviews = review.unpack('X1', column_name_prefix='')
-businesses = business.unpack('X1', column_name_prefix='', limit=['business_id', 'name', 'latitude', 'longitude', 'stars'])
+businesses = business.unpack('X1', column_name_prefix='',
+    limit=['business_id', 'name', 'latitude', 'longitude', 'stars'])

 # Build a recommender system
-m = graphlab.recommender.create(reviews, 'user_id', 'business_id')
+m = gl.recommender.create(reviews, 'user_id', 'business_id')

 # Find businesses that are similar based on users in common
-m.get_similar_items(['BVxlrYWgmi-8TPGMe6CTpg']).join(businesses, on={'similar_item':'business_id'})
+similar_items = m.get_similar_items(['BVxlrYWgmi-8TPGMe6CTpg'])
+print similar_items.join(businesses, on={'similar_item':'business_id'})
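
Beyond similar items, the trained model can also produce per-user recommendations; a minimal sketch assuming the model m built above and the recommender's standard recommend() call:

# Hypothetical usage: top-5 recommended businesses for every user seen
# during training.
recs = m.recommend(k=5)
print recs.head(10)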
6 changes: 4 additions & 2 deletions remove_duplicate_edges.py
@@ -1,17 +1,19 @@
 # Title: Remove duplicate edges from SGraph
 import graphlab as gl

 vertices = gl.SFrame({'id':[1,2,3,4,5]})
 edges = gl.SFrame({'src':[1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4],
                    'dst':[2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5]})
 edges['edata'] = edges['src'] + edges['dst']

 # Create a graph (as an example)
 g = gl.SGraph(vertices, edges, vid_field='id', src_field='src', dst_field='dst')
 print g.summary()
 print g.vertices
 print g.edges

-g2 = gl.SGraph(g.vertices, g.edges.groupby(['__src_id', '__dst_id'], {'data': gl.aggregate.SELECT_ONE('edata')}))
+# Remove duplicates
+g2 = gl.SGraph(g.vertices, g.edges.groupby(['__src_id', '__dst_id'],
+    {'data': gl.aggregate.SELECT_ONE('edata')}))
 print g2.summary()
 print g2.vertices
 print g2.edges
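
A simple check that the duplicates are gone, assuming the two graphs built above (sketch only):

# Hypothetical check: edge counts before and after deduplication; with the
# example data this should drop from 16 to 4 (one edge per src/dst pair).
print len(g.edges), len(g2.edges)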
9 changes: 3 additions & 6 deletions sframe_xml_to_dict.py
@@ -36,11 +36,8 @@ def xml_filename_to_dict(filename):
 # file-200.xml

 # Read the names of the files in to an SFrame
-all_files = gl.SFrame.read_csv('all_files.csv', header=False)
-all_files.rename({'X1': 'filename'})
+data = gl.SFrame.read_csv('all_files.csv', header=False)
+data.rename({'X1': 'filename'})

 # Parse contents of xml files into a dictionary and update the SFrame
 # with the data (one xml file per row)
-data = gl.SFrame()
-data['xml-dictionary'] = all_files['filename'].apply(lambda x: xml_filename_to_dict(x))
+data['xml-dict'] = data['filename'].apply(lambda x: xml_filename_to_dict(x))

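If per-tag columns are wanted afterwards, the dictionary column can be expanded with the same unpack() call used elsewhere in this repository; a minimal sketch assuming the data SFrame above:

# Hypothetical follow-up: one column per key in the parsed XML dicts.
flat = data.unpack('xml-dict', column_name_prefix='')
print flat.column_names()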
11 changes: 6 additions & 5 deletions sgraph_show_with_nx_layout.py
@@ -1,22 +1,23 @@
 import graphlab as gl

+# Add some example edges -- replace with your own graph
 sg = gl.SGraph()
-# add some example edges -- replace with your own graph
 sg = sg.add_edges([gl.Edge(i, i+1) for i in range(10)])

 import networkx as nx
 g = nx.Graph()

-# put the nodes and edges from the SGraph into a NetworkX graph
+# Put the nodes and edges from the SGraph into a NetworkX graph
 g.add_nodes_from(list(sg.vertices['__id']))
 g.add_edges_from([(e['__src_id'], e['__dst_id']) for e in sg.edges])

-# create the layout with NetworkX and convert to regular Python types
-# you can substitute any of the layout algorithms here for circular_layout:
+# Create the layout with NetworkX and convert to regular Python types
+# You can substitute any of the layout algorithms here for circular_layout:
 # http://networkx.github.io/documentation/latest/reference/drawing.html#module-networkx.drawing.layout
 layout = nx.circular_layout(g)
 layout = {k: map(float, list(v)) for k,v in layout.iteritems()}

-# show the SGraph in Canvas with that layout
+# Show the SGraph in Canvas with that layout
 sg.vertices['x'] = sg.vertices.apply(lambda v: layout[v['__id']][0])
 sg.vertices['y'] = sg.vertices.apply(lambda v: layout[v['__id']][1])
 sg.show(vertex_positions=('x', 'y'))
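
Any other NetworkX layout can be substituted in the same way; for example, a force-directed layout (a sketch, assuming the same NetworkX graph g as above):

# Hypothetical alternative: spring (force-directed) layout instead of circular.
layout = nx.spring_layout(g)
layout = {k: map(float, list(v)) for k, v in layout.iteritems()}
# then reassign sg.vertices['x'] / ['y'] and call sg.show() exactly as above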
10 changes: 4 additions & 6 deletions triple_apply_shortest_path.py
@@ -1,8 +1,6 @@
 # Title: Implement shortest_path using SGraph.triple_apply
 import graphlab as gl
 import time


 def sssp_update_fn(src, edge, dst):
     sdist = src['distance']
     ddist = dst['distance']
@@ -11,10 +9,10 @@ def sssp_update_fn(src, edge, dst):
         dst['distance'] = sdist + 1
     return (src, edge, dst)


 def sssp_triple_apply(input_graph, src_vid, max_distance=1e30):
     g = gl.SGraph(input_graph.vertices, input_graph.edges)
-    g.vertices['distance'] = g.vertices['__id'].apply(lambda x: max_distance if x != src_vid else 0.0)
+    g.vertices['distance'] = \
+        g.vertices['__id'].apply(lambda x: max_distance if x != src_vid else 0.0)
     it = 0
     num_changed = len(g.vertices)
     start = time.time()
@@ -27,9 +25,9 @@ def sssp_triple_apply(input_graph, src_vid, max_distance=1e30):
     print 'Triple apply sssp finished in: %f secs' % (time.time() - start)
     return g

-#### Load graph
+# Load graph
 g = gl.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', 'snap')

-#### Run triple apply sssp
+# Run triple apply sssp
 triple_apply_sssp_distance = sssp_triple_apply(g, src_vid=0)
 print triple_apply_sssp_distance
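
The hand-rolled distances can be cross-checked against GraphLab Create's built-in shortest-path toolkit; a minimal sketch, assuming the toolkit's create() takes a source_vid parameter in the same way (field names may differ between versions):

# Hypothetical cross-check against the built-in toolkit.
sp_model = gl.shortest_path.create(g, source_vid=0)
print sp_model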
21 changes: 10 additions & 11 deletions triple_apply_weighted_pagerank.py
@@ -1,24 +1,20 @@
 # Title: Implement weighted pagerank using SGraph.triple_apply
 import graphlab as gl
 import time


 def pagerank_update_fn(src, edge, dst):
     dst['pagerank'] += src['prev_pagerank'] * edge['weight']
     return (src, edge, dst)


 def sum_weight(src, edge, dst):
     src['total_weight'] += edge['weight']
     return src, edge, dst


 def normalize_weight(src, edge, dst):
     edge['weight'] /= src['total_weight']
     return src, edge, dst


-def pagerank_triple_apply(input_graph, reset_prob=0.15, threshold=1e-3, max_iterations=20):
+def pagerank_triple_apply(input_graph, reset_prob=0.15, threshold=1e-3,
+                          max_iterations=20):
     g = gl.SGraph(input_graph.vertices, input_graph.edges)

     # compute normalized edge weight
@@ -35,20 +31,23 @@ def pagerank_triple_apply(input_graph, reset_prob=0.15, threshold=1e-3, max_iter
     while(total_l1_delta > threshold and it < max_iterations):
         g.vertices['pagerank'] = 0.0
         g = g.triple_apply(pagerank_update_fn, ['pagerank'])
-        g.vertices['pagerank'] = g.vertices['pagerank'] * (1 - reset_prob) + reset_prob
-        g.vertices['l1_delta'] = (g.vertices['pagerank'] - g.vertices['prev_pagerank']).apply(lambda x: abs(x))
+        g.vertices['pagerank'] = g.vertices['pagerank'] * (1 - reset_prob) \
+            + reset_prob
+        g.vertices['l1_delta'] = (g.vertices['pagerank'] - \
+            g.vertices['prev_pagerank']).apply(lambda x: abs(x))
         total_l1_delta = g.vertices['l1_delta'].sum()
         g.vertices['prev_pagerank'] = g.vertices['pagerank']
-        print 'Iteration %d: total pagerank changed in L1 = %f' % (it, total_l1_delta)
+        print 'Iteration %d: total pagerank changed in L1 = %f' % (it,\
+            total_l1_delta)
         it = it + 1
     print 'Triple apply pagerank finished in: %f secs' % (time.time() - start)
     del g.vertices['prev_pagerank']
     return g

-#### Load graph
+# Load graph
 g = gl.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz', 'snap')
 g.edges['weight'] = 1.0

-#### Run triple apply sssp
+# Run triple apply sssp
 pagerank_graph = pagerank_triple_apply(g)
 print pagerank_graph
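
Since every edge weight above is set to 1.0, the result should roughly agree with the built-in pagerank toolkit already used in join_vertex_data_on_sgraph.py; a minimal sketch, assuming reset_probability is the matching parameter name:

# Hypothetical cross-check against the built-in (unweighted) toolkit.
builtin_model = gl.pagerank.create(g, reset_probability=0.15)
print builtin_model['graph'].vertices.head(5)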
