In [5]:
import pyspark 
sc = pyspark.SparkContext.getOrCreate()
sparkSql = pyspark.SQLContext(sc)


In [None]:
sc.stop()

In [6]:
sc

In [9]:
import pickle

with open('filename.pickle', 'rb') as handle: 

    b = pickle.load(handle) 

In [7]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = sparkSql.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[0,5,9,17],[0...|
|  0.0|(20,[2,7,9,13,15]...|
|  1.0|(20,[4,6,13,15,18...|
+-----+--------------------+



In [10]:
# b['DF'][0]

In [11]:
sentenceData = sparkSql.createDataFrame([b['DF'][0]])



In [12]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = sparkSql.createDataFrame(b['DF'])

tokenizer = Tokenizer(inputCol="article", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("label", "features").show()



+--------+--------------------+
|   label|            features|
+--------+--------------------+
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
|Business|(20,[0,1,2,3,4,5,...|
+--------+--------------------+
only showing top 20 rows



In [14]:
rescaledData.select("label", "features").collect()

[Row(label='Business', features=SparseVector(20, {0: 0.206, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0915, 7: 0.0, 8: 0.1602, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.156})),
 Row(label='Business', features=SparseVector(20, {0: 0.032, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0687, 7: 0.0, 8: 0.0275, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.055})),
 Row(label='Business', features=SparseVector(20, {0: 0.357, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.1785, 7: 0.0, 8: 0.4943, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.3303})),
 Row(label='Business', features=SparseVector(20, {0: 0.4165, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.238, 7: 0.0, 8: 0.4302, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.4128})),
 Row(label='Business', features=SparseVector(20, {0: 0.2517, 1: 0.

In [49]:
from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = sparkSql.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.0267522901297,0.0757169947028,-0.0334546715021]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [0.0251003167193,-0.00408402511052,-0.035211261761]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.0148567939177,0.0169823542237,-0.0607494233991]



In [34]:
op.collect()

[Row(id=0, raw=['I', 'saw', 'the', 'red', 'balloon'], filtered=['saw', 'red', 'balloon']),
 Row(id=1, raw=['Mary', 'had', 'a', 'little', 'lamb'], filtered=['Mary', 'little', 'lamb'])]

In [51]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

sentenceDataFrame = sparkSql.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java javay javaset could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["id", "sentence"])

sentenceDataFrame = sentenceData

tokenizer = Tokenizer(inputCol="article", outputCol="words")

regexTokenizer = RegexTokenizer(inputCol="article", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("article", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("article", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [55]:
regexTokenized.select('words','label').show()

+--------------------+--------+
|               words|   label|
+--------------------+--------+
|[chaim, and, chay...|Business|
+--------------------+--------+



In [57]:
from pyspark.ml.feature import StopWordsRemover

#sentenceData = sparkSql.createDataFrame(b['DF'])

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
op = remover.transform(regexTokenized)

In [61]:
op.select('filtered','label').collect()

[Row(filtered=['chaim', 'chaya', 'meet', 'cute', '1908', 'halifax', 'new', 'arrivals', 'jews', 'fled', 'romania', 'shunted', 'line', 'sick', 'might', 'contracted', 'typhus', 'says', 'rash', 'might', 'caught', 'sister', 'tuberculosis', 'thinks', 'cough', 'two', 'traumatized', 'kids', 'fall', 'love', 'immigration', 'let', 'live', 'long', 'enough', 'work', 'mingled', 'genres', 'strong', 'flavors', 'old', 'stock', 'refugee', 'love', 'story', 'produced', '2b', 'theater', 'company', '59e59', 'theaters', 'mixes', 'bitter', 'herbs', 'apples', 'honey', 'didactic', 'anarchic', 'tragic', 'comic', 'klezmer', 'musical', 'love', 'story', 'particular', 'family', 'history', 'chaim', 'chaya', 'based', 'canadian', 'playwright', 'hannah', 'moscovitch', 'great', 'grandparents', 'broad', 'allegory', 'refugee', 'crisis', 'present', 'style', 'mostly', 'story', 'theater', 'narrated', 'singer', 'songwriter', 'ben', 'caplan', 'created', 'piece', 'ms', 'moscovitch', 'director', 'songwriter', 'christian', 'barry'