In [2]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF
from pyspark.sql.types import DoubleType


In [3]:
documents = sqlContext.createDataFrame([
    ('a', "The sky is blue"),
    ('b', "The sky is blue and beautiful"),
    ('c', "Look at the bright blue sky!,"),
   ], ["doc_id", "doc_text"])


documents.printSchema()

root
 |-- doc_id: string (nullable = true)
 |-- doc_text: string (nullable = true)



In [4]:
query= sqlContext.createDataFrame([('a',"look blue")],["doc_id", "doc_text"])

In [5]:
doc=documents.rdd

In [6]:
doc.take(1)

[Row(doc_id='a', doc_text='The sky is blue')]

In [7]:
import re
#from stemming.porter2 import stem
def clean_word(w):
   # w=w.lower().strip()
    w=re.sub('\n',"",w)
    return re.sub("[^a-z| |0-9]|,\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{(?i)\b|[0-9]|((?:https?://|www\d{0,3}))|[//]|[#]|[$]|", "", (w.lower()))


In [8]:
from pyspark.ml.feature import StopWordsRemover

In [9]:
remover = StopWordsRemover(inputCol="features", outputCol="filtered")
    

In [10]:
toy=(documents.rdd.map(lambda x: (x.doc_id,clean_word (x.doc_text))).toDF().withColumnRenamed("_1","doc_id")
  .withColumnRenamed("_2","doc_text"))

In [11]:
df = (toy
      .rdd
  .map(lambda x : (x.doc_id,x.doc_text.split(" ")))
  .toDF()
  .withColumnRenamed("_1","doc_id")
  .withColumnRenamed("_2","features"))


In [12]:
d=remover.transform(df)

In [13]:
d.printSchema()

root
 |-- doc_id: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [15]:
htf = HashingTF(inputCol="filtered", outputCol="tf")#,numFeatures=10)
tf = htf.transform(d)
tf.show(truncate=False)

+------+------------------------------------+-------------------------+-------------------------------------------------------+
|doc_id|features                            |filtered                 |tf                                                     |
+------+------------------------------------+-------------------------+-------------------------------------------------------+
|a     |[the, sky, is, blue]                |[sky, blue]              |(262144,[27190,103048],[1.0,1.0])                      |
|b     |[the, sky, is, blue, and, beautiful]|[sky, blue, beautiful]   |(262144,[1998,27190,103048],[1.0,1.0,1.0])             |
|c     |[look, at, the, bright, blue, sky]  |[look, bright, blue, sky]|(262144,[27190,103048,214076,223763],[1.0,1.0,1.0,1.0])|
+------+------------------------------------+-------------------------+-------------------------------------------------------+



In [16]:
idf = IDF(inputCol="tf", outputCol="tfidf")
tfidf = idf.fit(tf).transform(tf)
tfidf.show(truncate=False)

+------+------------------------------------+-------------------------+-------------------------------------------------------+-------------------------------------------------------------------------------------+
|doc_id|features                            |filtered                 |tf                                                     |tfidf                                                                                |
+------+------------------------------------+-------------------------+-------------------------------------------------------+-------------------------------------------------------------------------------------+
|a     |[the, sky, is, blue]                |[sky, blue]              |(262144,[27190,103048],[1.0,1.0])                      |(262144,[27190,103048],[0.0,0.0])                                                    |
|b     |[the, sky, is, blue, and, beautiful]|[sky, blue, beautiful]   |(262144,[1998,27190,103048],[1.0,1.0,1.0])             |(262144,[1998,271

In [17]:
df1 = (query
      .rdd
  .map(lambda x : (x.doc_id,x.doc_text.split(" ")))
  .toDF()
  .withColumnRenamed("_1","doc_id")
  .withColumnRenamed("_2","features"))

In [22]:
d1=remover.transform(df1)
htf1 = HashingTF(inputCol="filtered", outputCol="tf")#,numFeatures=10)
tf = htf1.transform(d1)
tf.show(truncate=False)

+------+------------+------------+----------------------------------+
|doc_id|features    |filtered    |tf                                |
+------+------------+------------+----------------------------------+
|a     |[look, blue]|[look, blue]|(262144,[103048,223763],[1.0,1.0])|
+------+------------+------------+----------------------------------+



In [23]:
idf = IDF(inputCol="tf", outputCol="tfidf")
tfidf = idf.fit(tf).transform(tf)
tfidf.show(truncate=False)

+------+------------+------------+----------------------------------+----------------------------------+
|doc_id|features    |filtered    |tf                                |tfidf                             |
+------+------------+------------+----------------------------------+----------------------------------+
|a     |[look, blue]|[look, blue]|(262144,[103048,223763],[1.0,1.0])|(262144,[103048,223763],[0.0,0.0])|
+------+------------+------------+----------------------------------+----------------------------------+



In [14]:
corpus = tf.select( "tf")

In [67]:
corpus1

PythonRDD[165] at RDD at PythonRDD.scala:48

In [15]:
corpus.take(1)

[Row(tf=SparseVector(10, {2: 1.0, 8: 1.0}))]

In [12]:
from pyspark.mllib.linalg import Vectors

corpus1=corpus.rdd.map(lambda x: Vectors.dense(x[0]))  #[[0, DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0])]]
#corpus1=corpus.rdd.map(lambda x: x.tf) 

In [19]:
corpus1.take(1)  # [[0, DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0])]]

[DenseVector([0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0])]

In [13]:
corpus2= corpus1.zipWithIndex().map(lambda x: [x[1], x[0]]).cache().map(list)

In [14]:
from pyspark.mllib.clustering import LDA, LDAModel

from pyspark.mllib.linalg import SparseVector, DenseVector

In [22]:
from pyspark.mllib.clustering import DistributedLDAModel

ImportError: cannot import name 'DistributedLDAModel'

In [19]:
from pyspark.ml.clustering.DistributedLDAModel import transform

ImportError: No module named 'pyspark.ml.clustering.DistributedLDAModel'; 'pyspark.ml.clustering' is not a package

In [15]:
ldaModel1 = LDA.train(corpus2, k=2,seed=1,maxIterations=2)

In [16]:
ldaModel1

<pyspark.mllib.clustering.LDAModel at 0x7efcb5741ef0>

In [17]:
topics = ldaModel1.topicsMatrix()

In [18]:
test_results = ldaModel1.transform(corpus)

AttributeError: 'LDAModel' object has no attribute 'transform'

In [31]:
ldaModel1.describeTopics()

[([8, 2, 6, 9],
  [0.4857497269509451,
   0.4221656684670669,
   0.05847766596940288,
   0.03360693861258507]),
 ([8, 2, 9, 6],
  [0.40147369475315847,
   0.24091919630562186,
   0.19174032351125056,
   0.16586678542996905])]

In [34]:
ldaModel1.topTopicsPerDocument()

AttributeError: 'LDAModel' object has no attribute 'topTopicsPerDocument'

In [32]:
import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel}

SyntaxError: invalid syntax (<ipython-input-32-5d148187df30>, line 1)

In [24]:
idf = IDF(inputCol="tf", outputCol="tfidf")
tfidf = idf.fit(tf).transform(tf)
tfidf.show(truncate=True)

+------+--------------------+--------------------+--------------------+--------------------+
|doc_id|            features|            filtered|                  tf|               tfidf|
+------+--------------------+--------------------+--------------------+--------------------+
|     a|[the, sky, is, blue]|         [sky, blue]|(10,[2,8],[1.0,1.0])|(10,[2,8],[0.0,0.0])|
|     b|[the, sky, is, bl...|[sky, blue, beaut...|(10,[2,8],[1.0,2.0])|(10,[2,8],[0.0,0.0])|
|     c|[look, at, the, b...|[look, bright, bl...|(10,[2,6,8,9],[1....|(10,[2,6,8,9],[0....|
+------+--------------------+--------------------+--------------------+--------------------+



In [None]:
tfidf

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
mat1=tfidf.rdd.map(lambda x: (x.tfidf))

In [None]:
mat1.take(1)

In [None]:
mat1

In [None]:
import numpy as np
from scipy.sparse import csr_matrix
def as_matrix(vec):
    data, indices = vec.values, vec.indices
    shape = 1, vec.size
    return csr_matrix((data, indices, np.array([0, vec.values.size])), shape)

mats = mat1.map(as_matrix)

In [None]:
mats

In [None]:
from scipy.sparse import vstack

mat1 = mats.reduce(lambda x, y: vstack([x, y]))

In [None]:
mat1

In [None]:
cosimilarities = cosine_similarity(mat1, mat1)

In [None]:
cosimilarities

In [None]:
index=np.argsort(-cosimilarities)

In [None]:
for i in range(3):##### change here
    a=(index[i]).tolist()
    df3=doc.filter(lambda x: x[0]==a[0]).map (lambda x: x[1])
    print ('Document Similarity Analysis using Cosine Similarity\n')
    print ('='*60)
    print ("documents:",i+1,":",df3.collect())
   # print (df3.collect())    
    for j in (range(len(a)-1)):
        df2=doc.filter(lambda x: x[0]==a[j+1]).map (lambda x: x[1])        
        print (" similar documents\n","Top Doc:", j+1,":\n",df2.collect() )
       # print (df2.collect())
        print ('-'*40 )

In [None]:
a=(index[i]).tolist()

In [None]:
a

In [None]:
df3=doc.filter(lambda x: x[0]==a[0]).map (lambda x: x[1])

In [None]:
str (df3.collect())

In [None]:
df1=documents[documents.doc_id.isin([2,1, 0])]

In [None]:
df2.take(3)

In [None]:
df1.take(3)

In [None]:
df2=df1.select("doc_id")

In [None]:
mat1=df1.rdd.map(lambda x: (x[1]))

In [None]:
header = mat1.take(1)[0]
rows = mat1.filter(lambda line: line != header)

In [None]:
mat1str=[(rows.collect())]

In [None]:
df3.take(1)

In [None]:
df3str=str (df3.collect())

In [None]:
df3str

In [None]:
file = open("dump.txt", "w")
file.write ('\nDocument Similarity Analysis using Cosine Similarity\n')
for i in range(3):##### change here
    a=(index[i]).tolist()
    df3=doc.filter(lambda x: x[0]==a[0]).map (lambda x: str (x[1]))
    df3str=str (df3.collect()) 
    file.write ("\n")
    file.write ('='*60)
    file.write ("\ndocuments:")
    c=str(i+1)
    file.write (c)
    file.write (":\n")   
    file.write (df3str)
    file.write (" \nsimilar documents\n")
    for j in range(2):
        df2=doc.filter(lambda x: x[0]== a[j+1]).map (lambda x: str (x[1]))
        df2str=str (df2.collect())        
        file.write ("\nTop Doc:")
        b= str (j+1)
        file.write (b)
        file.write (":\n")        
        file.write (df2str)
        file.write ("\n")
        file.write ('-'*40 )
        file.write ("\n")
        #file.close()
file.close ()

In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
remover = StopWordsRemover()

In [None]:
remover.collect()

In [None]:
remover.transform(tags).show(truncate=False)

In [None]:
from pyspark.mllib.feature import HashingTF

In [None]:
hashingTF = HashingTF(10)
tf = hashingTF.transform(tags)

In [None]:
from pyspark.mllib.feature import IDF

In [None]:
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [None]:
from pyspark.mllib.linalg.distributed import RowMatrix,IndexedRowMatrix,CoordinateMatrix,MatrixEntry
rdd=RowMatrix (tfidf)

In [None]:
rdd1=rdd.rows

In [None]:
rd=CoordinateMatrix (rdd1.map(lambda row: MatrixEntry(*row)))

In [None]:
import numpy as np

In [None]:
rdd2=rdd1.map(lambda x: np.array(x))

In [None]:
rdd3=rd.transpose

In [None]:
exact = rdd.columnSimilarities()

In [None]:
exact.numCols()

In [None]:
rddT1 = rdd1.zipWithIndex().flatMap( lambda (x,i): [(i,j,e) for (j,e) in enumerate(x)])

In [None]:
rddT1.take(5)

In [None]:
def rddTranspose(rdd):
    rddT1 = rdd1.zipWithIndex().flatMap(lambda  (x,i): x.zipWithIndex.map(lambda (number, columnIndex): columnIndex -> (rowIndex, number)
    rddT2 = rddT1.map(lambda i,j,e: (j, (i,e))).groupByKey().sortByKey()
    rddT3 = rddT2.map(lambda i, x: sorted(list(x),cmp=lambda i1,e1,i2,e2 : cmp(i1, i2)))
    rddT4 = rddT3.map(lambda x: map(lambda i, y: y , x))
    #return rddT4.map(lambda x: np.asarray(x))

In [None]:
rdd3=rddTranspose (rdd2)

In [1]:
###################lda######################
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors


In [20]:
# Load and parse the data
data = sc.textFile("sample_lda_data.txt")



In [21]:
data.take(5)

['1 2 6 0 2 3 1 1 0 0 3',
 '1 3 0 1 3 0 0 2 0 0 1',
 '1 4 1 0 0 4 9 0 1 2 0',
 '2 1 0 3 0 0 5 0 2 3 9',
 '3 1 1 9 3 0 2 0 0 1 3']

In [22]:
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

In [23]:
parsedData.take(5)

[DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0]),
 DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0]),
 DenseVector([1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, 2.0, 0.0]),
 DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 3.0, 9.0]),
 DenseVector([3.0, 1.0, 1.0, 9.0, 3.0, 0.0, 2.0, 0.0, 0.0, 1.0, 3.0])]

In [24]:
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA

# Output topics. Each is a distribution over words (matching word count vectors)


In [26]:
corpus.take(2)  

[[0, DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0])],
 [1, DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0])]]

In [None]:
ldaModel = LDA.train(corpus, k=3)

In [None]:
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
      + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
sameModel = LDAModel\
    .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")

In [14]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector, DenseVector

In [21]:
data = [
...     [1, SparseVector(2, {1: 1.0})],
...     [2, SparseVector(2, {0: 1.0})],
... ]

In [22]:
rdd =  sc.parallelize(data)


In [25]:
rdd.take(2)

[[1, SparseVector(2, {1: 1.0})], [2, SparseVector(2, {0: 1.0})]]

In [26]:
model = LDA.train(data, k=2, seed=1)

Py4JError: An error occurred while calling o188.trainLDAModel. Trace:
py4j.Py4JException: Method trainLDAModel([class java.util.ArrayList, class java.lang.Integer, class java.lang.Integer, class java.lang.Double, class java.lang.Double, class java.lang.Integer, class java.lang.Integer, class java.lang.String]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:272)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)

