# Term Frequency-Inverse Document Frequency (TF-IDF)
In information retrieval, TF-IDF is a numerical statistic that helps decide how important a word is to a document in a collection or corpus. It is one of the most popular term-weighting schemes today (83% of text-based recommender systems - 1985).

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

# Build a TF-IDF representation of every document in a tab-separated file.
#
# NOTE(review): `sc` is not created in this file; it is assumed to be the
# SparkContext supplied by the notebook session -- confirm.

# Column positions within each tab-separated input line.
NAME_COLUMN = 1  # used below as the document name
TEXT_COLUMN = 3  # used below as the document body

# Load documents (one per line) and split each line into its columns.
raw_data = sc.textFile("/user/student/subset-small.tsv")

# Drop lines that are too short to contain the text column. Without this,
# a single malformed line raises IndexError inside a task and aborts the
# whole job. The filter is applied once, before both maps below, so that
# `documents` and `document_names` stay aligned row for row.
fields = raw_data.map(lambda x: x.split("\t")).filter(
    lambda x: len(x) > TEXT_COLUMN
)

# Each document becomes a list of space-separated tokens.
documents = fields.map(lambda x: x[TEXT_COLUMN].split(" "))

# Store the document names for later:
document_names = fields.map(lambda x: x[NAME_COLUMN])

# Now hash the words in each document to their term frequencies:
hashing_TF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashing_TF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document.
# `tf` is read twice (once by fit, once by transform), so cache it first.
tf.cache()
# minDocFreq=2: a term must appear in at least two documents to be kept.
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)


In [None]:
# Find the document whose TF*IDF score is highest for a single search term.
search_text = "Gettysburg"

# Hash the search term as a one-word document; the only populated index of
# the resulting vector is the bucket that term hashes to.
text_TF = hashing_TF.transform([search_text])
first_index = text_TF.indices[0]
text_hash_value = int(first_index)

# Pull the TF*IDF score stored at that bucket out of every document vector,
# giving a new RDD with one score per document.
text_relevance = tfidf.map(lambda doc_vector: doc_vector[text_hash_value])

# Pair each score with its document name so we can see which is which.
results = text_relevance.zip(document_names)

# Report the (score, name) pair with the maximum TF*IDF value.
header = "Best document for {} is:".format(search_text)
print(header)
best_match = results.max()
print(best_match)