In [1]:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.ml.feature.{CountVectorizer, IDF}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import java.util.Properties

import edu.umd.cloud9.collection.XMLInputFormat
import edu.stanford.nlp.ling.CoreAnnotations.{LemmaAnnotation, SentencesAnnotation, TokensAnnotation}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}

Intitializing Scala interpreter ...

Spark Web UI available at http://localhost:4040
SparkContext available as 'sc' (version = 2.4.6, master = local[*], app id = local-1602421834037)
SparkSession available as 'spark'


import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.ml.feature.{CountVectorizer, IDF}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import java.util.Properties
import edu.umd.cloud9.collection.XMLInputFormat
import edu.stanford.nlp.ling.CoreAnnotations.{LemmaAnnotation, SentencesAnnotation, TokensAnnotation}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}


In [2]:
val path = "Data/Wikipedia-Geometry.xml"
@transient val conf = new Configuration()
conf.set(XMLInputFormat.START_TAG_KEY, "<page>")
conf.set(XMLInputFormat.END_TAG_KEY, "</page>")
val kvs = spark.sparkContext.newAPIHadoopFile(path, classOf[XMLInputFormat], classOf[LongWritable], classOf[Text], conf)
val rawXmls = kvs.map(_._2.toString).toDS()

path: String = Data/Wikipedia-Geometry.xml
conf: org.apache.hadoop.conf.Configuration = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
kvs: org.apache.spark.rdd.RDD[(org.apache.hadoop.io.LongWritable, org.apache.hadoop.io.Text)] = Data/Wikipedia-Geometry.xml NewHadoopRDD[0] at newAPIHadoopFile at <console>:43
rawXmls: org.apache.spark.sql.Dataset[String] = [value: string]


In [3]:
import java.io.ByteArrayInputStream;
import info.bliki.wiki.dump._
import org.xml.sax.SAXException
import spark.implicits._

case class Page(var page: WikiArticle = new WikiArticle) {}

class ArticleFilter(val Page: Page) extends IArticleFilter {
    @throws(classOf[SAXException])
    def process(page: WikiArticle, siteinfo: Siteinfo) {
        Page.page = page
    }
}

def wikiXmlToPlainText(pageXml: String): Option[(String, String)] = {
    val Page = new Page
    try{
        val parser = new WikiXMLParser(new ByteArrayInputStream(pageXml.getBytes),new ArticleFilter(Page))
        parser.parse()
    }catch{
        case e: Exception =>
    }
    val page = Page.page
    if(page.getText != null && page.getTitle != null
      && page.getId != null && page.getRevisionId != null
      && page.getTimeStamp != null && !page.isTemplate) {
        Some((page.getTitle, page.getText))
    } else{
        None
    }
}
val docTexts = rawXmls.filter(_ != null).flatMap(wikiXmlToPlainText)

import java.io.ByteArrayInputStream
import info.bliki.wiki.dump._
import org.xml.sax.SAXException
import spark.implicits._
defined class Page
defined class ArticleFilter
wikiXmlToPlainText: (pageXml: String)Option[(String, String)]
docTexts: org.apache.spark.sql.Dataset[(String, String)] = [_1: string, _2: string]


In [4]:
def createNLPPipeline(): StanfordCoreNLP = {
    val props = new Properties()
    props.put("annotators","tokenize, ssplit, pos, lemma")
    new StanfordCoreNLP(props)
}

def isOnlyLetters(str: String): Boolean = {
    str.forall(c => Character.isLetter(c))
}

def plainTextToLemmas(text: String, stopWords: Set[String],
    pipeline: StanfordCoreNLP): Seq[String] = {
    val doc = new Annotation(text)
    pipeline.annotate(doc)
    
    val lemmas = new ArrayBuffer[String]()
    val sentences = doc.get(classOf[SentencesAnnotation])
    for (sentence <- sentences.asScala;
        token <- sentence.get(classOf[TokensAnnotation]).asScala){
        val lemma = token.get(classOf[LemmaAnnotation])
        if(lemma.length > 2 && !stopWords.contains(lemma)
          && isOnlyLetters(lemma)){
          lemmas += lemma.toLowerCase
        }
    }
    lemmas
}

createNLPPipeline: ()edu.stanford.nlp.pipeline.StanfordCoreNLP
isOnlyLetters: (str: String)Boolean
plainTextToLemmas: (text: String, stopWords: Set[String], pipeline: edu.stanford.nlp.pipeline.StanfordCoreNLP)Seq[String]


In [5]:
val stopWords = scala.io.Source.fromFile("Data/stopwords.txt").getLines().toSet
val bStopWords = spark.sparkContext.broadcast(stopWords)

val terms: Dataset[(String, Seq[String])] = 
 docTexts.mapPartitions { iter =>
     val pipeline = createNLPPipeline()
     iter.map { case(title, contents) =>
        (title, plainTextToLemmas(contents, bStopWords.value, pipeline))
     }
 }

stopWords: scala.collection.immutable.Set[String] = Set(down, it's, ourselves, that's, for, further, she'll, any, there's, this, haven't, in, ought, myself, have, your, off, once, i'll, are, is, his, why, too, why's, am, than, isn't, didn't, himself, but, you're, below, what, would, i'd, if, you'll, own, they'll, up, we're, they'd, so, our, do, all, him, had, nor, before, it, a, she's, as, hadn't, because, has, she, yours, or, above, yourself, herself, she'd, such, they, each, can't, don't, i, until, that, out, he's, cannot, to, we've, hers, you, did, let's, most, here, these, hasn't, was, there, when's, shan't, doing, at, through, been, over, i've, on, being, same, how, whom, my, after, who, itself, me, them, by, then, couldn't, he, should, few, wasn't, again, while, their, not, with, ...

In [6]:
val termsDF = terms.toDF("title", "terms")

termsDF: org.apache.spark.sql.DataFrame = [title: string, terms: array<string>]


In [7]:
val filtered = termsDF.where(size($"terms") > 1)

filtered: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [title: string, terms: array<string>]


In [8]:
filtered.show

+--------------------+--------------------+
|               title|               terms|
+--------------------+--------------------+
|Category:Discrete...|[portal, mathemat...|
|Category:Incidenc...|[commons, categor...|
|Category:Metric g...|[commons, categor...|
|Category:Integral...|[category, geomet...|
|Category:Conforma...|[cat, main, categ...|
|Category:Trigonom...|[commons, cat, tr...|
|Category:Convex g...|[commons, categor...|
|Category:Technica...|[cat, main, techn...|
|   Category:Symmetry|[commons, cat, sy...|
|Category:Homogene...|[cat, main, homog...|
|       Ambient space|[short, descripti...|
|Category:Duality ...|[portal, mathemat...|
|          Superspace|[superspace, coor...|
|     Geometry Center|[geometry, center...|
|          Dehn plane|[geometry, dehn, ...|
|Complex reflectio...|[mathematics, com...|
|Category:Geometri...|[commons, cat, ma...|
|    Lipschitz domain|[mathematics, lip...|
|        Complex line|[mathematics, com...|
|Visibility (geome...|[visibilit

In [9]:
val numTerms = 20000
val countVectorizer = new CountVectorizer().
    setInputCol("terms").
    setOutputCol("termFreqs").
    setVocabSize(numTerms)
val vocabModel = countVectorizer.fit(filtered)
val docTermFreqs = vocabModel.transform(filtered)

numTerms: Int = 20000
countVectorizer: org.apache.spark.ml.feature.CountVectorizer = cntVec_8baec16221d2
vocabModel: org.apache.spark.ml.feature.CountVectorizerModel = cntVec_8baec16221d2
docTermFreqs: org.apache.spark.sql.DataFrame = [title: string, terms: array<string> ... 1 more field]


In [10]:
docTermFreqs.cache()

res1: docTermFreqs.type = [title: string, terms: array<string> ... 1 more field]


In [11]:
val idf = new IDF().
    setInputCol("termFreqs").
    setOutputCol("tfidfVec")
val idfModel = idf.fit(docTermFreqs)
val docTermMatrix = idfModel.transform(docTermFreqs).select("title","tfidfVec")

idf: org.apache.spark.ml.feature.IDF = idf_6aad8f089a2a
idfModel: org.apache.spark.ml.feature.IDFModel = idf_6aad8f089a2a
docTermMatrix: org.apache.spark.sql.DataFrame = [title: string, tfidfVec: vector]


In [12]:
val termIds: Array[String] = vocabModel.vocabulary
val docIds = docTermFreqs.rdd.map(_.getString(0)).
    zipWithUniqueId().
    map(_.swap).
    collect().toMap

termIds: Array[String] = Array(geometry, point, space, title, line, journal, cite, group, math, plane, page, two, date, can, first, use, volume, category, frac, url, right, one, mathcal, surface, year, angle, set, vector, file, image, theory, mathematics, last, shape, publisher, isbn, book, doi, also, metric, give, time, cdot, define, function, left, field, form, triangle, theorem, symmetry, dimensional, phi, coordinate, number, example, axis, case, computational, see, issue, minkowski, geodesic, method, equation, thumb, parallel, circle, operatorname, distance, map, structure, geometric, three, dimension, pattern, model, mathbf, call, anatomy, base, mathematical, object, mathbb, transformation, problem, follow, circ, varphi, author, area, length, manifold, end, unit, euclidean, system,...

In [13]:
import org.apache.spark.mllib.linalg.{Vectors, Vector => MLLibVector}
import org.apache.spark.ml.linalg.{Vector => MLVector}

val vecRdd = docTermMatrix.select("tfidfVec").rdd.map { row =>
Vectors.fromML(row.getAs[MLVector]("tfidfVec"))
}

import org.apache.spark.mllib.linalg.{Vectors, Vector=>MLLibVector}
import org.apache.spark.ml.linalg.{Vector=>MLVector}
vecRdd: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[52] at map at <console>:52


In [14]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix

vecRdd.cache()
val mat = new RowMatrix(vecRdd)
val k = 500
val svd = mat.computeSVD(k, computeU=true)

import org.apache.spark.mllib.linalg.distributed.RowMatrix
mat: org.apache.spark.mllib.linalg.distributed.RowMatrix = org.apache.spark.mllib.linalg.distributed.RowMatrix@4b736e8a
k: Int = 500
svd: org.apache.spark.mllib.linalg.SingularValueDecomposition[org.apache.spark.mllib.linalg.distributed.RowMatrix,org.apache.spark.mllib.linalg.Matrix] =
SingularValueDecomposition(org.apache.spark.mllib.linalg.distributed.RowMatrix@1b907a40,[1256.4621355939073,929.0766540782947,833.2957789686252,678.4654323957276,656.4446372483457,643.6494433768422,617.6619508502157,564.2799052331369,521.7230256071273,520.0396076459301,503.30050831911143,463.8714120045462,423.29347308036415,406.7361268552447,391.17122596442886,371.75197242292796,345.111715344905,329.8404563381224,324.66019551297796,309.24073158383...

In [15]:
import org.apache.spark.mllib.linalg.{Matrix,SingularValueDecomposition}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

def topTermsInTopConcepts(
svd: SingularValueDecomposition[RowMatrix, Matrix],
numConcepts: Int,
numTerms: Int, termIds: Array[String])
: Seq[Seq[(String, Double)]] = {
val v = svd.V
val topTerms = new ArrayBuffer[Seq[(String, Double)]]()
val arr = v.toArray
for (i <- 0 until numConcepts) {
    val offs = i * v.numRows
    val termWeights = arr.slice(offs, offs + v.numRows).zipWithIndex
    val sorted = termWeights.sortBy(-_._1)
    topTerms += sorted.take(numTerms).map {
        case (score, id) => (termIds(id), score)
    }
}
topTerms
}

import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
topTermsInTopConcepts: (svd: org.apache.spark.mllib.linalg.SingularValueDecomposition[org.apache.spark.mllib.linalg.distributed.RowMatrix,org.apache.spark.mllib.linalg.Matrix], numConcepts: Int, numTerms: Int, termIds: Array[String])Seq[Seq[(String, Double)]]


In [16]:
def topDocsInTopConcepts(
svd: SingularValueDecomposition[RowMatrix, Matrix],
numConcepts: Int, numDocs: Int, docIds:Map[Long, String])
: Seq[Seq[(String, Double)]]= {
val u = svd.U
val topDocs = new ArrayBuffer[Seq[(String, Double)]]()
for (i <- 0 until numConcepts) {
    val docWeights = u.rows.map(_.toArray(i)).zipWithUniqueId()
    topDocs += docWeights.top(numDocs).map{
        case (score, id) => (docIds(id), score)
    }
}
topDocs
}

topDocsInTopConcepts: (svd: org.apache.spark.mllib.linalg.SingularValueDecomposition[org.apache.spark.mllib.linalg.distributed.RowMatrix,org.apache.spark.mllib.linalg.Matrix], numConcepts: Int, numDocs: Int, docIds: Map[Long,String])Seq[Seq[(String, Double)]]


In [17]:
val topConceptTerms = topTermsInTopConcepts(svd, 6, 6, termIds)
val topConceptDocs = topDocsInTopConcepts(svd, 6, 6, docIds)
for ((terms, docs) <- topConceptTerms.zip(topConceptDocs)){
    println("Concept terms: " + terms.map(_._1).mkString(", "))
    println("Concept docs: " + docs.map(_._1).mkString(", "))
    println()
}

Concept terms: category, geometry, catmain, newstub, wpss, stereochemistry
Concept docs: Category:Inversive geometry, Category:Conformal geometry, Category:Analytic geometry, Category:Geometry stubs, Category:Theorems in geometry, Category:Molecular geometry

Concept terms: snub, cdd, bmatrix, node, uniform, tiling
Concept docs: Snub (geometry), Complex reflection group, Minkowski space, Spacetime diagram, Busemann function, Moir? pattern

Concept terms: minkowski, math, mathbf, spacetime, vector, relativity
Concept docs: Minkowski space, Spacetime diagram, Busemann function, Moir? pattern, Geometry, Fat object (geometry)

Concept terms: minkowski, mathbf, vector, math, spacetime, eta
Concept docs: Minkowski space, Computational anatomy, Valuation (geometry), Spacetime diagram, Riemannian metric and Lie bracket in computational anatomy, Complex reflection group

Concept terms: moir, layer, pattern, minkowski, mathbf, spacetime
Concept docs: Moir? pattern, Line moir?, Minkowski space, P

topConceptTerms: Seq[Seq[(String, Double)]] = ArrayBuffer(ArraySeq((category,3.2526065174565133E-19), (geometry,-2.1819568721270777E-18), (catmain,-5.506282006739749E-8), (newstub,-5.616726082803211E-8), (wpss,-5.616726085166772E-8), (stereochemistry,-9.011149715819344E-8)), ArraySeq((snub,0.5016816106506126), (cdd,0.4914580527147358), (bmatrix,0.4389003606247568), (node,0.26871997759467914), (uniform,0.15838182790459227), (tiling,0.15295434166125405)), ArraySeq((minkowski,0.3162818903503688), (math,0.3013590597490821), (mathbf,0.2397618662268306), (spacetime,0.2080337169047539), (vector,0.17937256351152658), (relativity,0.17062239618071576)), ArraySeq((minkowski,0.13621843201552586), (mathbf,0.11676383518907249), (vector,0.10539298227158342), (math,0.08723626570553503), (spacetime,0.08...