In [None]:
from pyspark.mllib.feature import HashingTF, IDF

# spark.mllib's HashingTF works on an RDD whose elements are lists of terms,
# not on a DataFrame, so tokenize the `diff` text of every row first
# (plain split on single spaces).
diff_tokens = df_withdiff.rdd.map(lambda row: row.diff.split(" "))

# Hash every term into one of 1000 buckets and count occurrences per document.
hashingTF = HashingTF(1000)
tf = hashingTF.transform(diff_tokens)

# While applying HashingTF only needs a single pass over the data, applying IDF
# needs two passes: the first computes the IDF vector, the second scales the
# term frequencies by it. Cache the term frequencies so they are not recomputed.
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# spark.mllib's IDF implementation provides an option for ignoring terms
# that occur in fewer than a minimum number of documents.
# In such cases, the IDF for these terms is set to 0.
# This feature is enabled by passing the minDocFreq value to the IDF constructor.
idfIgnore = IDF(minDocFreq=2).fit(tf)
tfidfIgnore = idfIgnore.transform(tf)

In [None]:
# I used alias to avoid confusion with the mllib library
from pyspark.ml.feature import HashingTF as MLHashingTF
from pyspark.ml.feature import IDF as MLIDF
from pyspark.sql.types import DoubleType

from pyspark.sql.functions import col, split

df_withdiff.printSchema()

# Tokenize `diff` in place (split on single spaces) instead of round-tripping
# through an RDD of (title_page, tokens) tuples: the round trip kept only those
# two columns, dropping every other column of df_withdiff that later cells read.
doc = df_withdiff.withColumn("diff", split(col("diff"), " "))

# Term frequencies: hash each token list into a sparse count vector.
htf = MLHashingTF(inputCol="diff", outputCol="tf")
tf = htf.transform(doc)
tf.select("title_page", "diff", "tf").show(truncate=False)

# Fit the IDF weights on the term frequencies, then rescale them.
# The TF-IDF vector is stored in the "idf" column.
idf = MLIDF(inputCol="tf", outputCol="idf")
tfidf = idf.fit(tf).transform(tf)
tfidf.select("title_page", "diff", "tf", "idf").show(truncate=False)


def _with_idf_total(row):
    """Return (title_page, tf, idf, sum of the idf vector's stored values).

    The sum is None when the row has no idf vector.
    """
    idf_total = None if row.idf is None else row.idf.values.sum()
    return (row.title_page, row.tf, row.idf, idf_total)


res = tfidf.rdd.map(_with_idf_total)

for r in res.take(10):
    print(r)

In [None]:
# 70/30 train/test split. The seed is fixed so that a fresh
# Restart & Run All reproduces the same partition (and the same metrics).
trainingData, testData = tfidf.randomSplit([0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

# The TF-IDF vectors live in the "idf" column (the outputCol chosen for MLIDF),
# not in the estimator's default "features" column, so the feature column has
# to be named explicitly.
# NOTE(review): trainingData must also carry a numeric "label" column - confirm
# that the upstream cells keep it.
lr = LogisticRegression(
    featuresCol="idf",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,  # 0 => pure L2 (ridge) penalty
)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Show the ten rows predicted as class 0, ordered by the probability vector
# in descending order.
(predictions
    .filter(predictions["prediction"] == 0)
    .select("diff", "label_string", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30))