<h1>GraphX Assignment</h1>
In this assignment, you need to do the following:
<li>Read the file 201710-citibike-tripdata.csv</li>
<li>Construct a graph with stations as vertices and trips between stations as edges</li>
<li>Vertex Ids are station numbers and Vertex attributes are station names</li>
<li>Edge attributes are trip duration (durations are in seconds)</li>
<li>Then answer the questions below</li>

In [1]:
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD


Intitializing Scala interpreter ...

Spark Web UI available at http://ziyangs-mbp.attlocal.net:4040
SparkContext available as 'sc' (version = 2.4.6, master = local[*], app id = local-1596176960202)
SparkSession available as 'spark'


import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD


<h1>Step 1: Construct the graph</h1>
<li>read the data file and drop the header line</li>
<li>create a vertex rdd (the union of start stations and end stations)</li>
<li>create an edge rdd (the trips - start station id, end station id, duration)</li>
<li>create a graph</li>

In [2]:
// read the data file
val data = sc.textFile("201710-citibike-tripdata.csv")

data: org.apache.spark.rdd.RDD[String] = 201710-citibike-tripdata.csv MapPartitionsRDD[1] at textFile at <console>:30


In [3]:
data.first

res0: String = "tripduration","starttime","stoptime","start station id","start station name","start station latitude","start station longitude","end station id","end station name","end station latitude","end station longitude","bikeid","usertype","birth year","gender"


In [4]:
data.count

res1: Long = 1897593


In [6]:
// drop the header
val header_removed = data.mapPartitionsWithIndex{ (idx,iter) => if (idx==0) iter.drop(1) else iter}

header_removed: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at mapPartitionsWithIndex at <console>:31


In [7]:
header_removed.take(4)

res2: Array[String] = Array(457,"2017-10-01 00:00:00","2017-10-01 00:07:38",479,"9 Ave & W 45 St",40.76019252,-73.9912551,478,"11 Ave & W 41 St",40.76030096,-73.99884222,30951,"Subscriber",1985,1, 6462,"2017-10-01 00:00:20","2017-10-01 01:48:03",279,"Peck Slip & Front St",40.707873,-74.00167,307,"Canal St & Rutgers St",40.71427487,-73.98990025,14809,"Customer",NULL,0, 761,"2017-10-01 00:00:27","2017-10-01 00:13:09",504,"1 Ave & E 16 St",40.73221853,-73.98165557,350,"Clinton St & Grand St",40.71559509,-73.9870295,28713,"Subscriber",1992,1, 1193,"2017-10-01 00:00:29","2017-10-01 00:20:22",3236,"W 42 St & Dyer Ave",40.75898481399634,-73.99379968643188,3233,"E 48 St & 5 Ave",40.75724567911726,-73.97805914282799,16008,"Customer",1992,2)


In [8]:
header_removed.count

res3: Long = 1897592


In [20]:
header_removed.take(4)(0)

res14: String = 457,"2017-10-01 00:00:00","2017-10-01 00:07:38",479,"9 Ave & W 45 St",40.76019252,-73.9912551,478,"11 Ave & W 41 St",40.76030096,-73.99884222,30951,"Subscriber",1985,1


In [63]:
// create a vertex rdd: 'vertex_rdd'
val start_stations = header_removed.map(l => l.split(",")).map(l => (l(3).toLong, l(4)))
val end_stations = header_removed.map(l => l.split(",")).map(l => (l(7).toLong, l(8)))
val vertex_rdd = (start_stations union end_stations).distinct

start_stations: org.apache.spark.rdd.RDD[(Long, String)] = MapPartitionsRDD[58] at map at <console>:34
end_stations: org.apache.spark.rdd.RDD[(Long, String)] = MapPartitionsRDD[60] at map at <console>:35
vertex_rdd: org.apache.spark.rdd.RDD[(Long, String)] = MapPartitionsRDD[64] at distinct at <console>:36


In [67]:
vertex_rdd.take(4)

res53: Array[(Long, String)] = Array((459,"W 20 St & 11 Ave"), (228,"E 48 St & 3 Ave"), (3284,"E 88 St & Park Ave"), (3346,"Berkeley Pl & 7 Ave"))


In [30]:
// create an edge rdd: 'edge_rdd'
val edge_rdd = header_removed.map(l => l.split(",")).map(l => Edge(l(3).toLong, l(7).toLong, l(0).toDouble))

edge_rdd: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Double]] = MapPartitionsRDD[10] at map at <console>:31


In [31]:
edge_rdd.take(4)

res24: Array[org.apache.spark.graphx.Edge[Double]] = Array(Edge(479,478,457.0), Edge(279,307,6462.0), Edge(504,350,761.0), Edge(3236,3233,1193.0))


In [71]:
// create the graph
val graph: Graph[String, Double] = Graph(vertex_rdd, edge_rdd)

graph: org.apache.spark.graphx.Graph[String,Double] = org.apache.spark.graphx.impl.GraphImpl@136d0b72


<h1>Step 2: Basic questions</h1>
<li>How many citibike stations are there in the network?</li>
<li>How many trips were made in the month in question?</li>
<li>How many trips started and ended at the same station?</li>
<li>How many station to station connections are there (at least one edge exists between station i and station j and i is not equal to j)?</li>

In [68]:
// How many citibike stations?
val num_stations = vertex_rdd.count

num_stations: Long = 785


In [69]:
// How many trips?
val num_trips = edge_rdd.count

num_trips: Long = 1897592


In [76]:
graph.edges.take(4)(0)

res57: org.apache.spark.graphx.Edge[Double] = Edge(72,72,635.0)


In [79]:
// How many trips started and ended at the same station?
val num_same_stations = graph.edges.filter(l => l.srcId == l.dstId).count

num_same_stations: Long = 33245


In [86]:
// How many station to station connections are there 
// (at least one edge exists between station i and station j and i is not equal to j)?
val num_station_connections = graph.convertToCanonicalEdges().edges.filter(l => l.srcId != l.dstId).count

num_station_connections: Long = 107524


<h2>STEP 3: Find the Station from which most trips originate</h2>
<li>Note that the graph has one edge for each trip (i.e., there are many edges between two vertices)</li>
<li>The function <span style="color:blue">outDegrees</span> returns the number of outgoing edges from every vertex</li>
<li>Print the name of the station with most originating trips</li>

In [107]:
val out_most = graph.outDegrees.collect.sortBy(_._2).reverse(0)
graph.vertices.filter(l => l._1 == out_most._1).foreach(l => println(l._2))

"Pershing Square North"


out_most: (org.apache.spark.graphx.VertexId, Int) = (519,17995)


<h2>STEP 4: Proportion of trips for each station that start and end at that same station</h2>
<li>Use <span style="color:blue">aggregateMessages</span> to calculate the number of trips that start and end at the same vertex (for each vertex)</li>
<li>Divide that number by the number of trips that originate at that vertex using a <span style="color:blue">join</span> with <span style="color:blue">outDegrees</span></li>
<li>Note that some stations may have no trips at all (i.e., the denominator may be non-existent). You will need to use <b>Option</b> objects to handle this</li>
<li>Print the name of the station and the percent of trips that are self-trips from that station</li>

In [117]:
// aggregateMessages
val num_same_stations_by_vertex = graph.aggregateMessages[Int](ec => {if (ec.srcId==ec.dstId)
                                                                                  ec.sendToDst(1) 
                                                                                  else ec.sendToDst(0)},
                                                               (x,y) => x+y)

num_same_stations_by_vertex: org.apache.spark.graphx.VertexRDD[Int] = VertexRDDImpl[201] at RDD at VertexRDD.scala:57


In [145]:
num_same_stations_by_vertex.count

res100: Long = 784


In [146]:
num_same_stations_by_vertex.take(4)

res101: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((3146,24), (3454,13), (330,30), (3058,20))


In [137]:
// outDegrees
val out = graph.outDegrees

out: org.apache.spark.graphx.VertexRDD[Int] = VertexRDDImpl[173] at RDD at VertexRDD.scala:57


In [162]:
out.take(4)

res115: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((3146,1827), (3454,923), (330,2628), (3058,586))


In [141]:
out.count

res96: Long = 756


In [213]:
// use join to calculate the proportion for each station
val proportion = num_same_stations_by_vertex.fullOuterJoin(out)
                .map(t => (t._1, t._2 match {
                    case (Some(a), Some(b)) => a.toDouble/b.toDouble
                    case (Some(a), None) => "No trips originate from this station"
                    case (None, Some(b)) => 0.0
                    case (None, None) => "Unknown"
                }))

proportion: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, Any)] = MapPartitionsRDD[309] at map at <console>:34


In [214]:
proportion.take(4)

res149: Array[(org.apache.spark.graphx.VertexId, Any)] = Array((3146,0.013136288998357963), (3454,0.014084507042253521), (3344,0.04155495978552279), (3058,0.034129692832764506))


In [215]:
proportion.count

res150: Long = 785


In [216]:
// Print the name of the station and the percent of trips that are self-trips from that station
val self_trips_stations = vertex_rdd.fullOuterJoin(proportion).map(t => (t._2 match{
    case (Some(a),Some(b)) => (a,b)
    case (Some(a), None) => "Unknown"
    case (None, Some(b)) => "Unknown"
    case (None, None) => "Unknown"
}))
self_trips_stations.foreach(println)

("Atlantic Ave & Furman St",0.059593679458239276)
("Vernon Blvd & 10 St",0.05152224824355972)
("Liberty Light Rail",No trips originate from this station)
("Cherry St",0.034385569334836524)
("Pacific St & Nevins St",0.02584493041749503)
("Broadway & W 29 St",0.0077724234416291)
("Crescent St & 35 Ave",0.02280130293159609)
("5 Ave & 3 St",0.01490757304710793)
("Jackson St & Leonard St",0.024844720496894408)
("Berkeley Pl & 7 Ave",0.021597187343043698)
("Clermont Ave & Lafayette Ave",0.0102880658436214)
("Kent Ave & N 7 St",0.034230312623664425)
("E 109 St & 3 Ave",0.027972027972027972)
("State St & Smith St",0.01856763925729443)
("W 27 St & 10 Ave",0.013198416190057193)
("W 84 St & Columbus Ave",0.012247838616714697)
("John St & William St",0.01694915254237288)
("Willoughby Ave & Tompkins Ave",0.0196078431372549)
("Vernon Blvd & 30 Rd",0.02857142857142857)
("Emerson Pl & Myrtle Ave",0.013664596273291925)
("11 Ave & W 59 St",0.012727594131164928)
("Avenue D & E 8 St",0.026248399487836107)

("Monroe St & Tompkins Ave",0.015360983102918587)
("Henry St & W 9 St",0.012121212121212121)
("Old Slip & Front St",0.01167728237791932)
("E 24 St & Park Ave S",0.00825565912117177)
("Fulton St & Waverly Ave",0.012524719841793012)
("E 102 St & 1 Ave",0.036112586298459905)
("W 70 St & Amsterdam Ave",0.03)
("E 2 St & 2 Ave",0.007612966601178782)
("W 56 St & 10 Ave",0.010083036773428233)
("Grand Ave & Bergen St",0.023752969121140142)
("Hs Don't Use",0.029411764705882353)
("Degraw St & Smith St",0.007042253521126761)
("West End Ave & W 94 St",0.022329511164755584)
("E 47 St & Park Ave",0.0046696240952603316)
("44 Dr & 21 St",0.014778325123152709)
("Broadway & W 53 St",0.007886034088018317)
("LaGuardia Pl & W 3 St",0.008453436807095344)
("Metropolitan Ave & Meeker Ave",0.01260239445494644)
("Lafayette St & E 8 St",0.00710776627992056)
("3 St & 3 Ave",0.004840940525587829)
("5 Ave & E 126 St",0.07142857142857142)
("Union Ave & Wallabout St",0.03299492385786802)
("31 St & Broadway",0.01401869

("E 41 St & 5 Ave",0.005131494547787043)
("Pershing Square North",0.004445679355376493)
("DeKalb Ave & Skillman St",0.02109181141439206)
("Adam Clayton Powell Blvd & W 115 St",0.03689567430025445)
("West Drive & Prospect Park West",0.14900527811611855)
("Cadman Plaza E & Red Cross Pl",0.03351519875292284)
("27 Ave & 9 St",0.05737704918032787)
("Centre St & Chambers St",0.06588292921871174)
("Franklin Ave & Empire Blvd",0.08113590263691683)
("W 106 St & Amsterdam Ave",0.017128874388254486)
("E 103 St & Lexington Ave",0.03088803088803089)
("W Broadway & Spring Street",0.013975726369988967)
("28 St & 41 Ave",0.019230769230769232)
("Franklin St & W Broadway",0.01359039190897598)
("St. Nicholas Ave & W 126 St",0.042440318302387266)
("Gold St & Frankfort St",0.02956989247311828)
("Clinton Ave & Flushing Ave",0.0155195681511471)
("Union St & 4 Ave",0.010786802030456852)
("5 Ave & E 73 St",0.05582078589790672)
("E 17 St & Broadway",0.006823166274063846)
("2 Ave & 9 St",0.007444168734491315)
("

("Scholes St & Manhattan Ave",0.019175455417066157)
("2 Ave & E 96 St",0.015282131661442006)
("Wolcott St & Dwight St",0.0708955223880597)
("Columbus Ave & W 72 St",0.012094395280235988)
("Rogers Ave & Sterling St",0.0273972602739726)
("W 43 St & 6 Ave",0.01117212649119485)
("E 93 St & 2 Ave",0.009825327510917031)
("28 St & 38 Ave",0.009287925696594427)
("Henry St & Grand St",0.01764705882352941)
("E 11 St & 1 Ave",0.012358609132802682)
("E 2 St & Avenue C",0.013071895424836602)
("Columbia Heights & Cranberry St",0.037628278221208664)
("Lincoln Pl & Classon Ave",0.030927835051546393)
("W 110 St & Amsterdam Ave",0.01289134438305709)
("Madison Ave & E 82 St",0.020526315789473684)
("Center Blvd & Borden Ave",0.1189710610932476)
("MacDougal St & Prince St",0.010139416983523447)
("Grove St PATH",No trips originate from this station)
("Hancock St & Bedford Ave",0.024930747922437674)
("11 Ave & W 27 St",0.011262007287181186)
("Broad St & Bridge St",0.024154589371980676)
("W 53 St & 10 Ave",0.

self_trips_stations: org.apache.spark.rdd.RDD[java.io.Serializable] = MapPartitionsRDD[313] at map at <console>:35


<h2>STEP 5: Create a new graph that contains all edges except for those between the same station</h2>
<li>Print the proportion of trips that start and end at the same station</li>

In [220]:
// create the new graph excluding edges between the same station
val new_graph = graph.subgraph(e => e.srcId != e.dstId, (v_id,v_attr) => true)

// Print
println("The proportion of trips that start and end at the same station is " 
        + (1-new_graph.edges.count.toDouble/graph.edges.count.toDouble))

The proportion of trips that start and end at the same station is 0.017519572173575804


new_graph: org.apache.spark.graphx.Graph[String,Double] = org.apache.spark.graphx.impl.GraphImpl@4ee8e940


<h2>STEP 6: Calculate the average duration between every pair of stations</h2>
<li>Use groupEdges to create a new graph with edges equal to the total durations of all trips between two stations</li>
<li>Extract edges from this new graph and create a new RDD with the data format ((src,dst),total_time)</li>
<li>Use reduceByKey to calculate the count of edges between each pair of stations</li>
<li>join the two RDDs, use map to calculate averages and then construct new edges (Edge objects) with the source, destination, and the average as the edge attribute</li>
<li>make a new graph with these new edges</li>
<li>print the top 10 edges with the smallest average durations</li>

In [221]:
// repartition then 'groupEdges'
val grouped = new_graph.partitionBy(PartitionStrategy.CanonicalRandomVertexCut)
                       .groupEdges((edge1, edge2) => edge1 + edge2)


grouped: org.apache.spark.graphx.Graph[String,Double] = org.apache.spark.graphx.impl.GraphImpl@19b0bd47


In [291]:
grouped.vertices.first

res209: (org.apache.spark.graphx.VertexId, String) = (3146,"E 81 St & 3 Ave")


In [238]:
// extract edges from the new graph and create a new RDD
val new_edge_rdd = grouped.edges.map(l => ((l.srcId,l.dstId),l.attr))

new_edge_rdd: org.apache.spark.rdd.RDD[((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), Double)] = MapPartitionsRDD[351] at map at <console>:30


In [266]:
new_edge_rdd.take(4)

res195: Array[((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), Double)] = Array(((72,128),12593.0), ((72,265),3143.0), ((72,281),12355.0), ((72,293),16179.0))


In [267]:
// use reduceByKey to calculate the count of edges between each pair of stations
val new_edge_count_rdd = new_graph.edges.map(l => ((l.srcId,l.dstId),1)).reduceByKey((a,b) => a + b)

new_edge_count_rdd: org.apache.spark.rdd.RDD[((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), Int)] = ShuffledRDD[389] at reduceByKey at <console>:31


In [268]:
new_edge_count_rdd.take(4)

res196: Array[((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), Int)] = Array(((2009,361),43), ((479,449),53), ((417,3107),3), ((3256,336),20))


In [285]:
// join the two RDDs, use map to calculate averages and then construct new edges
val new_join = new_edge_rdd.fullOuterJoin(new_edge_count_rdd)
val new_averages = new_join.map(t => (t._1,t._2 match{
    case (Some(a),Some(b)) => a/b
    case (Some(a), None) => 0.0
    case (None, Some(b)) => 0.0
    case (None, None) => 0.0
}))
val new_averages_edge_rdd = new_averages.map(l => Edge(l._1._1, l._1._2, l._2))

new_join: org.apache.spark.rdd.RDD[((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), (Option[Double], Option[Int]))] = MapPartitionsRDD[424] at fullOuterJoin at <console>:36
new_averages: org.apache.spark.rdd.RDD[((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), Double)] = MapPartitionsRDD[425] at map at <console>:37
new_averages_edge_rdd: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Double]] = MapPartitionsRDD[426] at map at <console>:43


In [286]:
new_join.first

res204: ((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), (Option[Double], Option[Int])) = ((2009,361),(Some(21555.0),Some(43)))


In [287]:
new_averages.first

res205: ((org.apache.spark.graphx.VertexId, org.apache.spark.graphx.VertexId), Double) = ((2009,361),501.27906976744185)


In [290]:
new_averages_edge_rdd.first

res208: org.apache.spark.graphx.Edge[Double] = Edge(2009,361,501.27906976744185)


In [292]:
// make a new graph
val new_average_graph: Graph[String, Double] = Graph(grouped.vertices, new_averages_edge_rdd)

new_average_graph: org.apache.spark.graphx.Graph[String,Double] = org.apache.spark.graphx.impl.GraphImpl@7edb9bb7


In [302]:
// print the top 10 edges with the smallest average durations
new_average_graph.edges.sortBy(_.attr).take(10).foreach(println)

Edge(3596,3601,67.0)
Edge(3352,3333,68.0)
Edge(3551,3502,70.5)
Edge(3523,3515,76.0)
Edge(505,3464,77.5)
Edge(3071,3069,80.5)
Edge(3329,3303,83.0)
Edge(420,416,87.0)
Edge(3329,3332,87.5)
Edge(3501,3503,87.57142857142857)


<h2>STEP 7: Important stations</h2>
Citibike wants to figure out how best to deploy its workers in checking whether a station is over-full (too many bikes) or needs more bikes. It figures that the best way to do this is to find out which stations are the most important in terms of flows:
<li>A station that has high bike returns and is connected to other stations with high bike returns is more likely to have too many bikes in its station and therefore should be monitored more often</li>
<li>A station that has high bike pickups and is connected to other stations with high bike pickups is more likely to be short of bikes and therefore should be monitored more often</li>
<li>Calculate the propensities for over-fullness and emptiness for every station</li>
<li>Report the 5 most important stations for over-fullness</li>
<li>Report the 5 most important stations for emptiness</li>

In [327]:
// use the 'new_graph' from step 5
// use pagerank

// make new edges with attributes of 1 per trip to the station
val return_bikes = new_graph.mapEdges(l => 1)
// returns a new graph called 'return_bikes'

return_bikes: org.apache.spark.graphx.Graph[String,Int] = org.apache.spark.graphx.impl.GraphImpl@57beee42


In [328]:
// run pagerank
val ranks_1 = return_bikes.pageRank(0.0001).vertices
val stations_1 = return_bikes.vertices
val ranksByStation_1 = stations_1.join(ranks_1).map {
  case (id, (station, rank)) => (station, rank)
}
// Report the 5 most important stations for over-fullness
ranksByStation_1.sortBy(_._2,false).take(5).foreach(println)

("Pershing Square North",5.012871239386011)
("West St & Chambers St",3.738987724710476)
("Broadway & E 22 St",3.6361291974134105)
("E 17 St & Broadway",3.588887101309807)
("W 21 St & 6 Ave",3.48583225244726)


ranks_1: org.apache.spark.graphx.VertexRDD[Double] = VertexRDDImpl[4724] at RDD at VertexRDD.scala:57
stations_1: org.apache.spark.graphx.VertexRDD[String] = VertexRDDImpl[331] at RDD at VertexRDD.scala:57
ranksByStation_1: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[4734] at map at <console>:36


In [329]:
// Similarly for pickup by make the edge attribute negative

In [330]:
val pickup_bikes = new_graph.mapEdges(l => -1)
val ranks_2 = pickup_bikes.pageRank(0.0001).vertices
val stations_2 = return_bikes.vertices
val ranksByStation_2 = stations_2.join(ranks_2).map {
  case (id, (station, rank)) => (station, rank)
}
// Report the 5 most important stations for emptiness
ranksByStation_2.sortBy(_._2).take(5).foreach(println)

("JSQ Don't Use",0.15567242488999491)
("8D QC Station 01",0.15567242488999491)
("Astor Place",0.1559546418499317)
("City Hall",0.1559681093166172)
("Liberty Light Rail",0.15628633114127)


pickup_bikes: org.apache.spark.graphx.Graph[String,Int] = org.apache.spark.graphx.impl.GraphImpl@7a0564f9
ranks_2: org.apache.spark.graphx.VertexRDD[Double] = VertexRDDImpl[5583] at RDD at VertexRDD.scala:57
stations_2: org.apache.spark.graphx.VertexRDD[String] = VertexRDDImpl[331] at RDD at VertexRDD.scala:57
ranksByStation_2: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[5593] at map at <console>:40


<h2>STEP 8: Calculate the clustering coefficient of every station</h2>
<li>And report the top 20 stations by clustering coefficient</li>
<li>Note that you may be unable to use <span style="color:red">zip</span>. Try using <span style="color:blue">join</span> instead</li>

In [333]:
// use the 'new_graph' from step 5
val triangles = new_graph.triangleCount
val triangles_by_vertices = triangles.vertices
val possible_triangles = graph.convertToCanonicalEdges().degrees.map(d => (d._1,d._2*(d._2-1)/2))

triangles: org.apache.spark.graphx.Graph[Int,Double] = org.apache.spark.graphx.impl.GraphImpl@240be142
triangles_by_vertices: org.apache.spark.graphx.VertexRDD[Int] = VertexRDDImpl[5692] at RDD at VertexRDD.scala:57
possible_triangles: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, Int)] = MapPartitionsRDD[5713] at map at <console>:37


In [334]:
triangles_by_vertices.first

res240: (org.apache.spark.graphx.VertexId, Int) = (3146,39467)


In [335]:
possible_triangles.first

res241: (org.apache.spark.graphx.VertexId, Int) = (3146,54615)


In [342]:
val triangles_and_degrees = triangles_by_vertices.fullOuterJoin(possible_triangles)

triangles_and_degrees: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, (Option[Int], Option[Int]))] = MapPartitionsRDD[5727] at fullOuterJoin at <console>:31


In [343]:
triangles_and_degrees.first

res245: (org.apache.spark.graphx.VertexId, (Option[Int], Option[Int])) = (3146,(Some(39467),Some(54615)))


In [351]:
val clustering_coefficientients = triangles_and_degrees.map(t => (t._1,t._2 match {
                                case (Some(a),Some(b)) => a.toDouble/b.toDouble
                            }))

It would fail on the following inputs: (None, None), (None, Some(_)), (Some(_), None)
       val clustering_coefficientients = triangles_and_degrees.map(t => (t._1,t._2 match {
                                                                                ^
clustering_coefficientients: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, Double)] = MapPartitionsRDD[5729] at map at <console>:30


In [352]:
clustering_coefficientients.first

res252: (org.apache.spark.graphx.VertexId, Double) = (3146,0.7226403002838048)


In [353]:
clustering_coefficientients.sortBy(_._2,false).take(20).foreach(println)
// Report the top 20 stations by clustering coefficient: (id, clustering coefficient)

(3279,1.0)
(3192,1.0)
(3040,1.0)
(3485,1.0)
(3639,1.0)
(3647,1.0)
(3186,1.0)
(153,1.0)
(339,0.877201420748853)
(3464,0.8679573382796197)
(247,0.8602079768329604)
(3175,0.8592469808193227)
(3176,0.8568452539928423)
(3623,0.8549019607843137)
(3491,0.854122621564482)
(266,0.849218980253463)
(3441,0.8482701509017299)
(3646,0.8333333333333334)
(3642,0.832)
(444,0.8283229697508064)
