In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

In [5]:
import pandas as pd
from pyspark.ml.feature import Tokenizer

# `spark` is used below but no cell in this notebook creates it, so the cell
# only ran when the kernel happened to provide one. getOrCreate() returns the
# already-running session when there is one and builds a session otherwise.
spark = SparkSession.builder.getOrCreate()

# Read the pipe-delimited items export with pandas, then convert it to a
# Spark DataFrame. The pandas frame gets its own name so `df` is only ever
# the Spark DataFrame.
items_pdf = pd.read_csv("/home/pygmy/Projects/Moti/public.items.csv", sep="|")
df = spark.createDataFrame(items_pdf)

# Split the free-text `items` column into a `words` array column.
# NOTE(review): the displayed tokens keep trailing punctuation (e.g.
# "shrimp,"), so "shrimp," and "shrimp" would count as different terms
# downstream. Consider RegexTokenizer if that is not intended.
words_tokenizer = Tokenizer(inputCol="items", outputCol="words")

# The tokenized DataFrame stays bound to `tokenizer` because the later
# hashing cell reads it under that name.
tokenizer = words_tokenizer.transform(df)
tokenizer.show()

+---+--------------------+--------------------+
| id|               items|               words|
+---+--------------------+--------------------+
|  0|Raw Shrimp, Seedl...|[raw, shrimp,, se...|
|  1|Cracked Wheat, St...|[cracked, wheat,,...|
|  2|Beet Apple Carrot...|[beet, apple, car...|
|  3|               Vodka|             [vodka]|
|  4|Globe Eggplant, P...|[globe, eggplant,...|
|  5|Organic Baby Spin...|[organic, baby, s...|
|  6|Reduced Fat Crack...|[reduced, fat, cr...|
|  7|Organic Red Onion...|[organic, red, on...|
|  8|Organic Cripps Pi...|[organic, cripps,...|
|  9|Organic Baby Spin...|[organic, baby, s...|
| 10|Uncured Beef Hot ...|[uncured, beef, h...|
| 11|Donut House Choco...|[donut, house, ch...|
| 12|[Concentrated But...|[[concentrated, b...|
| 13|Raspberries, Gree...|[raspberries,, gr...|
| 14|Original Tofurky ...|[original, tofurk...|
| 15|Extra Hold Non-Ae...|[extra, hold, non...|
| 16|Organic Coconut M...|[organic, coconut...|
| 17|No. 485 Gin, Mont...|[no., 485, gin

In [7]:
from pyspark.ml.feature import HashingTF

# Term-frequency stage: hashes each entry of `words` into a 10-slot vector
# stored in `features_hashtf`.
# NOTE(review): with only 10 buckets most rows in the displayed output fill
# nearly every index, which suggests heavy hash collisions - confirm that
# such a small feature space is intentional.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="features_hashtf",
    numFeatures=10,
)

In [8]:
# Apply the hashing stage to the tokenized DataFrame (bound to `tokenizer`),
# adding the `features_hashtf` column.
hashdf = hashingTF.transform(dataset=tokenizer)

In [9]:
# Preview the hashed term-frequency rows (default preview: 20 rows, truncated cells).
hashdf.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+
| id|               items|               words|     features_hashtf|
+---+--------------------+--------------------+--------------------+
|  0|Raw Shrimp, Seedl...|[raw, shrimp,, se...|(10,[1,2,3,4,5,6,...|
|  1|Cracked Wheat, St...|[cracked, wheat,,...|(10,[0,1,2,3,4,5,...|
|  2|Beet Apple Carrot...|[beet, apple, car...|(10,[0,1,4,5,6,7,...|
|  3|               Vodka|             [vodka]|      (10,[4],[1.0])|
|  4|Globe Eggplant, P...|[globe, eggplant,...|(10,[0,1,2,3,4,5,...|
|  5|Organic Baby Spin...|[organic, baby, s...|(10,[0,1,2,3,4,5,...|
|  6|Reduced Fat Crack...|[reduced, fat, cr...|(10,[0,1,2,3,4,5,...|
|  7|Organic Red Onion...|[organic, red, on...|(10,[0,1,2,3,4,5,...|
|  8|Organic Cripps Pi...|[organic, cripps,...|(10,[3,5,6,8,9],[...|
|  9|Organic Baby Spin...|[organic, baby, s...|(10,[0,1,2,3,4,5,...|
| 10|Uncured Beef Hot ...|[uncured, beef, h...|(10,[0,1,2,3,4,5,...|
| 11|Donut House Choco...|[donut, 

In [10]:
# Evaluate the DataFrame on its own to display its column names and types.
hashdf

DataFrame[id: bigint, items: string, words: array<string>, features_hashtf: vector]

In [11]:
from pyspark.ml.feature import IDF

# Inverse-document-frequency estimator: reads the hashed term frequencies
# from `features_hashtf` and writes the rescaled vectors to `idf`, with the
# minimum document-frequency threshold set to 3.
idf = IDF(
    inputCol="features_hashtf",
    outputCol="idf",
    minDocFreq=3,
)

In [12]:
# Fit the IDF estimator on the hashed term frequencies to obtain the fitted model.
model = idf.fit(dataset=hashdf)

In [13]:
# Despite its name, `newmodel` is the transformed DataFrame (the input
# columns plus `idf`), not a model object.
newmodel = model.transform(dataset=hashdf)

In [14]:
# Preview the rows with the `idf` column added (default preview: 20 rows, truncated cells).
newmodel.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+--------------------+
| id|               items|               words|     features_hashtf|                 idf|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|Raw Shrimp, Seedl...|[raw, shrimp,, se...|(10,[1,2,3,4,5,6,...|(10,[1,2,3,4,5,6,...|
|  1|Cracked Wheat, St...|[cracked, wheat,,...|(10,[0,1,2,3,4,5,...|(10,[0,1,2,3,4,5,...|
|  2|Beet Apple Carrot...|[beet, apple, car...|(10,[0,1,4,5,6,7,...|(10,[0,1,4,5,6,7,...|
|  3|               Vodka|             [vodka]|      (10,[4],[1.0])|(10,[4],[0.160373...|
|  4|Globe Eggplant, P...|[globe, eggplant,...|(10,[0,1,2,3,4,5,...|(10,[0,1,2,3,4,5,...|
|  5|Organic Baby Spin...|[organic, baby, s...|(10,[0,1,2,3,4,5,...|(10,[0,1,2,3,4,5,...|
|  6|Reduced Fat Crack...|[reduced, fat, cr...|(10,[0,1,2,3,4,5,...|(10,[0,1,2,3,4,5,...|
|  7|Organic Red Onion...|[organic, red, on...|(10,[0,1,2,3,4,5,...|(10,[0,1,2,3,4,5,...|
|  8|Organ

In [15]:
# Minimal toy corpus: a single row whose `words` column holds three tokens.
dief = spark.createDataFrame(
    data=[(["a", "b", "c"],)],
    schema=["words"],
)

In [16]:
# Build the hashing stage and apply it in a single cell. Previously `ha` was
# first the HashingTF transformer and was then rebound, in the following
# cell, to the DataFrame it produced, so re-running either cell out of order
# failed. The transformer now has its own name; `ha` is only ever the
# resulting DataFrame, which is what the later cells read.
toy_hashing_tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
ha = toy_hashing_tf.transform(dief)

In [18]:
# Preview the toy corpus with its hashed `features` column (default preview: 20 rows, truncated cells).
ha.show(n=20, truncate=True)

+---------+--------------------+
|    words|            features|
+---------+--------------------+
|[a, b, c]|(10,[0,1,2],[1.0,...|
+---------+--------------------+



In [19]:
# Evaluate the DataFrame on its own to display its column names and types.
ha

DataFrame[words: array<string>, features: vector]

In [20]:
# Configure, fit and apply IDF on the toy corpus in a single cell. Previously
# `aidief` was rebound three times across three cells (estimator -> fitted
# model -> DataFrame), so re-running the fit or transform cell after the last
# one failed. Each stage now has its own name; `aidief` is only ever the
# final DataFrame, which is what the following cell displays.
#
# NOTE(review): `ha` holds a single row, so no term can appear in 3
# documents. With minDocFreq=3 every IDF weight is presumably forced to 0,
# which would explain the 0.0 values in the displayed output - confirm that
# this is the behaviour being demonstrated.
toy_idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")
toy_idf_model = toy_idf.fit(ha)
aidief = toy_idf_model.transform(ha)

In [23]:
# Preview the toy corpus with its `idf` column (default preview: 20 rows, truncated cells).
aidief.show(n=20, truncate=True)

+---------+--------------------+--------------------+
|    words|            features|                 idf|
+---------+--------------------+--------------------+
|[a, b, c]|(10,[0,1,2],[1.0,...|(10,[0,1,2],[0.0,...|
+---------+--------------------+--------------------+

