In [5]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext,SparkSession
from pyspark.ml.feature import Tokenizer,HashingTF,IDF
conf = SparkConf().setMaster('local')
spark = SparkSession \
    .builder \
    .appName("pyspark") \
    .config(conf = conf) \
    .enableHiveSupport()\
    .getOrCreate()
sc = spark.sparkContext

In [23]:
from pyspark.ml import Pipeline
sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),\
                                     (0, "I wish Java could use case classes"),\
                                     (1, "Logistic regression models are neat")],["label","sentence"])
tokenizer = Tokenizer(inputCol="sentence",outputCol="words")
hashingTf =HashingTF(numFeatures=2000,inputCol="words",outputCol="rawFeatures")
wordsData = tokenizer.transform(sentenceData)
featureizedData = hashingTf.transform(wordsData)
idf = IDF(inputCol='rawFeatures',outputCol='features')
idfModel = idf.fit(featureizedData)
rescaledData = idfModel.transform(featureizedData)
# featureizedData.select('rawFeatures','label','words').head()
rescaledData.select('features','words').take(3)

[Row(features=SparseVector(2000, {240: 0.6931, 333: 0.6931, 1105: 1.3863, 1329: 0.5754, 1357: 0.6931, 1777: 0.6931}), words=[u'i', u'heard', u'about', u'spark', u'and', u'i', u'love', u'spark']),
 Row(features=SparseVector(2000, {213: 0.6931, 342: 0.6931, 489: 0.6931, 495: 0.6931, 1329: 0.2877, 1809: 0.6931, 1967: 0.6931}), words=[u'i', u'wish', u'java', u'could', u'use', u'case', u'classes']),
 Row(features=SparseVector(2000, {286: 0.6931, 695: 0.6931, 1138: 0.6931, 1193: 0.6931, 1604: 0.6931}), words=[u'logistic', u'regression', u'models', u'are', u'neat'])]

In [45]:
pipeline = Pipeline(stages=[tokenizer,hashingTf,idfModel])
pipelineModel = pipeline.fit('sentenceData')
rescaledData = pipelineModel.transform(sentenceData)
for i in pipelineModel.transform(sentenceData).select('features','words').take(3):
    for j in zip(i[1],filter(lambda x:x!=0,list(i[0].toArray()))):
        print ":".join([str(x) for x in j])


i:0.69314718056
heard:0.69314718056
about:1.38629436112
spark:0.575364144904
and:0.69314718056
i:0.69314718056
i:0.69314718056
wish:0.69314718056
java:0.69314718056
could:0.69314718056
use:0.287682072452
case:0.69314718056
classes:0.69314718056
logistic:0.69314718056
regression:0.69314718056
models:0.69314718056
are:0.69314718056
neat:0.69314718056


In [46]:
sc.stop()
spark.stop()

In [None]:
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import HashingTF

if __name__ == "__main__":
    sc = SparkContext(appName="PythonBookExample")

    # Load 2 types of emails from text files: spam and ham (non-spam).
    # Each line has text from one email.
    spam = sc.textFile("files/spam.txt")
    ham = sc.textFile("files/ham.txt")

    # Create a HashingTF instance to map email text to vectors of 100 features.
    tf = HashingTF(numFeatures = 100)
    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (ham) examples.
    positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
    negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))
    training_data = positiveExamples.union(negativeExamples)
    training_data.cache() # Cache data since Logistic Regression is an iterative algorithm.

    # Run Logistic Regression using the SGD optimizer.
    # regParam is model regularization, which can make models more robust.
    model = LogisticRegressionWithSGD.train(training_data)

    # Test on a positive example (spam) and a negative one (ham).
    # First apply the same HashingTF feature transformation used on the training data.
    posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
    negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))

    # Now use the learned model to predict spam/ham for new emails.
    print "Prediction for positive test example: %g" % model.predict(posTestExample)
    print "Prediction for negative test example: %g" % model.predict(negTestExample)

    sc.stop()