This repository has been archived by the owner on May 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 22
/
GraphClusterApp.scala
200 lines (173 loc) · 9.51 KB
/
GraphClusterApp.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/*
* Copyright (c) 2014 Oculus Info Inc.
* http://www.oculusinfo.com/
*
* Released under the MIT License.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.oculusinfo.tilegen.graph.cluster
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx._
import com.oculusinfo.tilegen.util.ArgumentParser
import com.oculusinfo.tilegen.spark.MavenReference
import com.oculusinfo.tilegen.spark.SparkConnector
/**
* Sample application for hierarchical clustering of graph data using Louvain-based community detection.
*
* Adapted from Sotera's graphX implementation of the distributed Louvain modularity algorithm
* https://github.com/Sotera/spark-distributed-louvain-modularity
*
* This application handles reading in a graph dataset from a CSV file, and performing hierarchical clustering.
*
* NOTE: It is expected that the 1st column of the CSV graph data will contain either the keyword "node"
* for data lines representing a graph's node/vertex, or the keyword "edge" for data lines representing
* a graph's edge. However, if "-onlyEdges" is set to true, then the source data can simply be a delimited file
* of only edges. Node ID's must be type Long.
*
* The following command line arguments control how the application runs:
*
* -source -- The source location at which to find the data [required].
* -onlyEdges -- [Boolean, optional] If set to true then source data can simply be a delimited file of edges only. Default = false.
* -output -- The output location where to save results [required].
* Note: the format for community results is CommunityID \t Parent community ID \t Number of internal nodes \t Community Degree
* -parts -- The number of partitions into which to break up each source file. Default = chosen automatically by Spark.
* -p -- Sets spark.default.parallelism and minSplits on the edge file. Default = based on input partitions.
 * -progMin -- Percent of nodes that must change communities for the algorithm to consider progress relative to total vertices in a level. Default = 0.15
* -progCount -- Number of times the algorithm can fail to make progress before exiting. Default = 1.
* -d -- Specify the input dataset delimiter. Default is tab-delimited.
* -nID -- The column number in the raw data of node ID's. ID's must be type Long [required if onlyEdges=false].
* -nAttr -- Column numbers of additional node metadata to parse and save with cluster results (attribute ID tags separated by commas) [optional].
* -eSrcID -- The column number of an edge's source ID. ID's must be type Long [required].
* -eDstID -- The column number of an edge's destination ID. ID's must be type Long [required].
* -eWeight -- The column number of an edge's weight. Default = -1, meaning no edge weighting is used.
*
* Results for each hierarchical level are stored in a "level_#" sub-directory, with the following data format,
* For nodes:
* "node\t" + id + "\t" + parentId + "\t" + internalNodes + "\t" + nodeDegree + "\t" + extraAttributes (tab-delimited)
* And for edges:
* "edge\t" + srcID + "\t" + dstID + "\t" + edge weight
**/
object GraphClusterApp {
	def main(args: Array[String]) {
		val argParser = new ArgumentParser(args)
		argParser.debug

		val jars =
			Seq(new MavenReference("com.oculusinfo", "tile-generation", "0.3-SNAPSHOT")) union SparkConnector.getDefaultLibrariesFromMaven
		val sc = argParser.getSparkConnector(jars).getSparkContext("Graph Clustering")

		// ---- command-line arguments -------------------------------------------
		val sourceFile = argParser.getString("source", "The source location at which to find the data")
		val bOnlyEdges = argParser.getBoolean("onlyEdges", "If set to true then source data can simply be a delimited file of edges only", Some(false))
		val outputDir = argParser.getString("output", "The output location where to save data")
		val partitions = argParser.getInt("parts", "The number of partitions into which to break up each source file", Some(0))
		val parallelism = argParser.getInt("p", "Sets spark.default.parallelism and minSplits on the edge file. Default=based on input partitions", Some(-1))
		// FIX: corrected "communites" -> "communities" in the user-visible help text.
		val minProgress = argParser.getDouble("progMin", "Percent of nodes that must change communities for the algorithm to consider progress relative to total vertices in a level. default=0.15", Some(0.15))
		val progressCounter = argParser.getInt("progCount", "Number of times the algorithm can fail to make progress before exiting. default=1", Some(1))
		val edgedelimiter = argParser.getString("d", "Specify input file edge delimiter. Default is tab-delimited", Some("\t"))
		val nodeIDindex = argParser.getInt("nID", "The column number in the raw data of node ID's", Some(-1))
		val edgeSrcIDindex = argParser.getInt("eSrcID", "The column number of an edge's source ID", Some(0))
		// FIX: help text previously said "source ID" (copy-paste error) -- this is the destination ID column.
		val edgeDstIDindex = argParser.getInt("eDstID", "The column number of an edge's destination ID", Some(1))
		val edgeWeightIndex = argParser.getInt("eWeight", "The column number of an edge's weight", Some(-1))
		val nodeAttrIndices = argParser.getString("nAttr",
			"Column numbers of additional node metadata to parse and save with cluster results (attribute ID tags separated by commas)",
			Some("-1")).split(",").map(_.trim().toInt)

		// Read the input data; a partition count of 0 means "let Spark choose".
		val rawData = if (0 == partitions) {
			sc.textFile(sourceFile)
		} else {
			sc.textFile(sourceFile, partitions)
		}

		// TODO -- edge weights are currently assumed to be Long. Should this be changed to Double?
		//         (would need to modify internal Louvain clustering code)

		// Parse one tokenized data row into a weighted edge.  When no weight column
		// is configured (eWeight == -1) every edge gets weight 1.
		// (Extracted helper: this logic was previously duplicated in both branches below.)
		def parseEdge(tokens: Array[String]): Edge[Long] = {
			val srcID = tokens(edgeSrcIDindex).toLong
			val dstID = tokens(edgeDstIDindex).toLong
			val weight = if (edgeWeightIndex == -1) 1L else tokens(edgeWeightIndex).toLong
			new Edge(srcID, dstID, weight)
		}

		// Parse the raw data into an edge RDD.
		var edgeRDD = if (bOnlyEdges) {
			// Input data is a delimited file of edges only.
			rawData.map(row => parseEdge(row.split(edgedelimiter).map(_.trim())))
		}
		else {
			// Check the first column to see which rows correspond to edges.
			rawData.flatMap(row => {
				val tokens = row.split(edgedelimiter).map(_.trim())
				if (tokens(0) == "edge") Some(parseEdge(tokens)) else None
			})
		}

		// If the parallelism option was set, map the input to the requested number of
		// partitions; otherwise parallelism is based on the number of HDFS blocks.
		if ((parallelism != -1) && (edgeRDD.partitions.size != parallelism)) {
			edgeRDD = edgeRDD.coalesce(parallelism, shuffle = true)
		}

		if (bOnlyEdges || (nodeAttrIndices(0) == -1)) {
			// No node metadata requested: build the graph from edges alone.
			val graph = Graph.fromEdges(edgeRDD, None)
			// Use a helper class to execute the Louvain algorithm and save the output.
			val runner = new HDFSLouvainRunner(minProgress, progressCounter, outputDir)
			runner.run(sc, graph)
		}
		else {
			// Column 0 holds the "node"/"edge" keyword, so a valid node-ID column is >= 1.
			if (nodeIDindex <= 0)
				throw new IllegalArgumentException("nID argument must be > 0")	// FIX: "arguement" typo in message

			// Check the first column to see which rows correspond to nodes.
			// TODO -- this flatMap operation should be combined with the one above so
			//         we aren't iterating through the dataset twice.
			var nodeRDD = rawData.flatMap(row => {
				val tokens = row.split(edgedelimiter).map(_.trim())
				if (tokens(0) == "node") {
					val nodeID = tokens(nodeIDindex).toLong
					// Tab-join the requested attribute columns; an out-of-range column
					// contributes an empty field (same behavior as the original loop).
					val nodeAttributes = nodeAttrIndices
						.map(idx => if (idx < tokens.size) tokens(idx) else "")
						.mkString("\t")
					Some((nodeID, nodeAttributes))
				}
				else {
					None
				}
			})

			// Same parallelism handling as for the edge RDD above.
			if ((parallelism != -1) && (nodeRDD.partitions.size != parallelism)) {
				nodeRDD = nodeRDD.coalesce(parallelism, shuffle = true)
			}

			// Create the graph (with default vertex attribute = an empty string).
			val graph = Graph(nodeRDD, edgeRDD, "")
			// Use a helper class to execute the Louvain algorithm and save the output.
			val runner = new HDFSLouvainRunner(minProgress, progressCounter, outputDir)
			runner.run(sc, graph)
		}

		println("DONE!!")
	}
}